Best Approach to Zipping up a Folder in Google Cloud Storage using Go?

Question

My Google App Engine Go project creates a zip of multiple files in a "folder" that's in Google Cloud Storage. It used to be pretty quick when it was implemented in the Blobstore using the now deprecated and removed Files API. I recently converted the code to use Google Cloud Storage, and now the performance is really bad and sometimes it times out. The files being zipped are between 1K and 2M in size.

I'm looking for any advice to improve zipping up file contents. The code below is what I wrote to compress multiple files in the cloud into a new zip file, also stored in the cloud. It can take a long time to execute and requires loading the entire contents of each file into memory (see PERFORMANCE ISSUE below) before writing it to the zip. There has to be a better way.

// Pack a folder into zip file
func (cloud *Cloud) Pack(srcFolder string, fileName string, contentType string, metaData *map[string]string) {

	log.Infof(cloud.c, "Packing bucket %v folder %v to file %v", cloud.bucket, srcFolder, fileName)	

	srcFolder = fmt.Sprintf("%v/", srcFolder)
	query := &storage.Query{Prefix: srcFolder, Delimiter: "/"}
	
	objs, err := storage.ListObjects(cloud.ctx, cloud.bucket, query)
	if err != nil {
		log.Errorf(cloud.c, "Packing failed to list bucket %q: %v", cloud.bucket, err)
		return
	}
	
	totalFiles := len(objs.Results)
	if totalFiles == 0 {
		log.Errorf(cloud.c, "Packing failed to find objects found in folder %q: %v", cloud.bucket, srcFolder)
		return
	}

	// create storage file for writing
	log.Infof(cloud.c, "Writing new zip file to %v/%v for %v files", cloud.bucket, fileName, totalFiles)
	storageWriter := storage.NewWriter(cloud.ctx, cloud.bucket, fileName)

	// add optional content type and meta data	
	if len(contentType) > 0 {
		storageWriter.ContentType = contentType
	}
	if metaData != nil {
		storageWriter.Metadata = *metaData
	}
			
	// Create a buffer to write our archive to.
	buf := new(bytes.Buffer)
	
	// Create a new zip archive to memory buffer
	zipWriter := zip.NewWriter(buf)

	// go through each file in the folder
	for _, obj := range objs.Results {

		log.Infof(cloud.c, "Packing file %v of size %v to zip file", obj.Name, obj.Size)
		//d.dumpStats(obj)
		
		// read file in our source folder from storage - io.ReadCloser returned from storage
		storageReader, err := storage.NewReader(cloud.ctx, cloud.bucket, obj.Name)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to read from bucket %q file %q: %v", cloud.bucket, obj.Name, err)
			return	
		}
		defer storageReader.Close()

		// PERFORMANCE ISSUE: have to load the entire file into memory to get random access from the cloud
		slurp, err := ioutil.ReadAll(storageReader)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to read data from bucket %q file %q: %v", cloud.bucket, obj.Name, err)
			return
		}
		
		// grab just the filename from directory listing (don't want to store paths in zip)
		_, zipFileName := filepath.Split(obj.Name)
		
		newFileName := strings.ToLower(zipFileName)
		
		// add filename to zip
		zipFile, err := zipWriter.Create(newFileName)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to create zip file from bucket %q file %q: %v", cloud.bucket, zipFileName, err)
			return
		}
		
		// write entire file into zip archive
		_, err = zipFile.Write(slurp)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to write zip file from bucket %q file %q: %v", cloud.bucket, zipFileName, err)
			return
		}
		
		// flush that to buffer so we can write it off now
		//err = zipFile.Flush()
		//if err != nil {
		//	d.errorf("pack: unable to flush write of zip file from bucket %q, file %q: %v", cloud.bucket, zipFileName, err)
		//	//return
		//}
		
		// now drain all that buffered zip data to the cloud storage file	
		log.Infof(cloud.c, "Writing zip buffer of size %v to cloud storage file %v", buf.Len(), fileName)	
		_, err = buf.WriteTo(storageWriter)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to write data to bucket %q file %q: %v", cloud.bucket, fileName, err)	
			return			
		}
	}
	
	// Make sure to check the error on Close.
	log.Infof(cloud.c, "Closing zip writer")	
	err = zipWriter.Close()
	if err != nil {
		log.Errorf(cloud.c, "Packing failed to close zip file writer from bucket %q file %q : %v", cloud.bucket, fileName, err)
	}

	// write any leftover data
	if buf.Len() > 0 {
		// now drain all that buffered zip data to the cloud storage file	
		// log.Infof(cloud.c, "Packing zip buffer of size %v to cloud storage file %v", buf.Len(), fileName)	
		_, err := buf.WriteTo(storageWriter)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to write data to bucket %q file %q: %v", cloud.bucket, fileName, err)				
		}
	}

	// close up final write file
	//log.Infof(cloud.c, "Closing cloud storage file %v", fileName)	
	if err := storageWriter.Close(); err != nil {
		log.Errorf(cloud.c, "Packing failed to close bucket %q file %q: %v", cloud.bucket, fileName, err)
		return
	}

	// success!
	log.Infof(cloud.c, "Packed files to new cloud storage file %v successful!", fileName)	
}

Answer 1

Score: 4

Thanks to Stephen, who suggested not loading the files into memory buffers when writing to the zip. Here is the fixed code for reference:

// Pack a folder into zip file
func (cloud *Cloud) Pack(srcFolder string, fileName string, contentType string, metaData *map[string]string) bool {

	log.Infof(cloud.c, "Packing bucket %v folder %v to file %v", cloud.bucket, srcFolder, fileName)	

	srcFolder = fmt.Sprintf("%v/", srcFolder)
	query := &storage.Query{Prefix: srcFolder, Delimiter: "/"}
	
	objs, err := storage.ListObjects(cloud.ctx, cloud.bucket, query)
	if err != nil {
		log.Errorf(cloud.c, "Packing failed to list bucket %q: %v", cloud.bucket, err)
		return false
	}
	
	totalFiles := len(objs.Results)
	if totalFiles == 0 {
		log.Errorf(cloud.c, "Packing failed to find objects found in folder %q: %v", cloud.bucket, srcFolder)
		return false
	}

	// create storage file for writing
	log.Infof(cloud.c, "Writing new zip file to %v/%v for %v files", cloud.bucket, fileName, totalFiles)
	storageWriter := storage.NewWriter(cloud.ctx, cloud.bucket, fileName)
	defer storageWriter.Close() // safety net for early returns; the success path closes and checks explicitly below

	// add optional content type and meta data	
	if len(contentType) > 0 {
		storageWriter.ContentType = contentType
	}
	if metaData != nil {
		storageWriter.Metadata = *metaData
	}
			
	// Create a new zip archive that streams directly into the storage writer
	zipWriter := zip.NewWriter(storageWriter)

	// go through each file in the folder
	for _, obj := range objs.Results {

		log.Infof(cloud.c, "Packing file %v of size %v to zip file", obj.Name, obj.Size)
		//d.dumpStats(obj)
		
		// read file in our source folder from storage - io.ReadCloser returned from storage
		storageReader, err := storage.NewReader(cloud.ctx, cloud.bucket, obj.Name)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to read from bucket %q file %q: %v", cloud.bucket, obj.Name, err)
			return false
		}
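		// NOTE: these deferred closes accumulate until Pack returns; for
		// folders with many objects, consider closing each reader explicitly
		// right after the io.Copy below instead.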
		defer storageReader.Close()
		
		// grab just the filename from directory listing (don't want to store paths in zip)
		_, zipFileName := filepath.Split(obj.Name)
		newFileName := strings.ToLower(zipFileName)
		
		// add filename to zip
		zipFile, err := zipWriter.Create(newFileName)
		if err != nil {
			log.Errorf(cloud.c, "Packing failed to create zip file from bucket %q file %q: %v", cloud.bucket, zipFileName, err)
			return false
		}
		
		// copy from storage reader to zip writer 	
		_, err = io.Copy(zipFile, storageReader)
		if err != nil {
			log.Errorf(cloud.c, "Failed to copy from storage reader to zip file: %v", err)
			return false
		}	
	}
	
	// Make sure to check the errors on Close: closing the zip writer flushes
	// the central directory, and closing the storage writer commits the object.
	log.Infof(cloud.c, "Closing zip writer")
	err = zipWriter.Close()
	if err != nil {
		log.Errorf(cloud.c, "Packing failed to close zip file writer from bucket %q file %q: %v", cloud.bucket, fileName, err)
		return false
	}

	if err := storageWriter.Close(); err != nil {
		log.Errorf(cloud.c, "Packing failed to close bucket %q file %q: %v", cloud.bucket, fileName, err)
		return false
	}

	// success!
	log.Infof(cloud.c, "Packed files to new cloud storage file %v successful!", fileName)	
	return true
}
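
The listing above targets the App Engine-era storage package (storage.NewWriter, storage.ListObjects). For readers on the current cloud.google.com/go/storage client, below is a minimal sketch of the same streaming pattern. It is illustrative only: the function name packFolder, its parameters, and the error wrapping are assumptions, not part of the original code.

import (
	"archive/zip"
	"context"
	"fmt"
	"io"
	"path"
	"strings"

	"cloud.google.com/go/storage"
	"google.golang.org/api/iterator"
)

// packFolder streams every object under srcFolder/ in bucket into a single
// zip object named destName, never buffering whole files in memory.
func packFolder(ctx context.Context, client *storage.Client, bucket, srcFolder, destName string) error {
	bkt := client.Bucket(bucket)

	// The zip writer writes straight into the Cloud Storage object writer.
	w := bkt.Object(destName).NewWriter(ctx)
	w.ContentType = "application/zip"
	zw := zip.NewWriter(w)

	it := bkt.Objects(ctx, &storage.Query{Prefix: srcFolder + "/", Delimiter: "/"})
	for {
		attrs, err := it.Next()
		if err == iterator.Done {
			break
		}
		if err != nil {
			return fmt.Errorf("listing folder %q: %v", srcFolder, err)
		}
		if attrs.Name == "" {
			continue // synthetic prefix entry for a sub-folder; nothing to read
		}

		r, err := bkt.Object(attrs.Name).NewReader(ctx)
		if err != nil {
			return fmt.Errorf("opening %q: %v", attrs.Name, err)
		}

		// Store just the lowercased base name, as the original code does.
		zf, err := zw.Create(strings.ToLower(path.Base(attrs.Name)))
		if err == nil {
			_, err = io.Copy(zf, r) // stream object bytes straight into the archive
		}
		r.Close() // close each reader as soon as it is drained
		if err != nil {
			return fmt.Errorf("archiving %q: %v", attrs.Name, err)
		}
	}

	// Closing the zip writer flushes the central directory; closing the
	// storage writer commits the object. Both errors matter.
	if err := zw.Close(); err != nil {
		return fmt.Errorf("closing zip: %v", err)
	}
	return w.Close()
}

The design point is the same in both versions: zip.NewWriter accepts any io.Writer, so the archive can be assembled and uploaded in a single pass with roughly constant memory use, regardless of how many files the folder contains.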
