How do I download all files from a Google Drive folder with more than 50 files?


Question

I cannot figure out how to write a program to download all files from a publicly accessible Google Drive folder, which has more than 1,000 of them.

This is what I've tried so far:

import gdown
url = 'https://drive.google.com/drive/folders/MY-PUBLICLY-ACCESSIBLE-FOLDER-ID?usp=drive_link'
gdown.download_folder(url, quiet=True, remaining_ok=True, use_cookies=False)

But it only downloads 50 of the files.


Answer 1

Score: 5

You can use the Google Drive API:

https://developers.google.com/drive/api/quickstart/python

Here is a script I have used in the past:

from googleapiclient.discovery import build
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
import io
import os

# Define the scopes
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# Obtain your Google credentials
def get_credentials():
    flow = InstalledAppFlow.from_client_secrets_file('credentials.json', SCOPES)
    creds = flow.run_local_server(port=0)
    return creds

# Build the downloader
creds = get_credentials()
drive_downloader = build('drive', 'v3', credentials=creds)

# Replace 'FOLDER_ID' with your actual Google Drive folder ID
folder_id = 'FOLDER_ID'
query = f"'{folder_id}' in parents"
results = drive_downloader.files().list(q=query, pageSize=1000).execute()
items = results.get('files', [])

# Download the files
for item in items:
    request = drive_downloader.files().get_media(fileId=item['id'])
    f = io.FileIO(item['name'], 'wb')
    downloader = MediaIoBaseDownload(f, request)
    done = False
    while done is False:
        status, done = downloader.next_chunk()
        print(f"Download {int(status.progress() * 100)}.")

print(f"Downloaded {len(items)} files from the folder.")

I hope this helps.


Answer 2

Score: 1


Workaround:
-----------

Since `gdown`'s `download_folder` function doesn't care how many subfolders of 50 files or fewer it has to download, we can work around the limit with the following function: it creates a `temp_folder` and moves all of the files into subfolders of 50 files or fewer, after which `gdown`'s `download_folder` function can be run as usual:

import os

def organize_folder_into_subfolders(path_to_original_folder, max_number_of_files_per_subfolder=50):
    '''Moves all files in a folder into newly created subfolders of max_number_of_files_per_subfolder files or fewer'''
    files_in_folder = os.listdir(path_to_original_folder)
    if not path_to_original_folder.endswith('/'):
        path_to_original_folder += '/'
    temp_path_to_original_folder = path_to_original_folder + 'temp_folder'
    os.makedirs(temp_path_to_original_folder)
    subfolders_dict = {'temp_subfolder_0': []}
    os.makedirs(temp_path_to_original_folder + '/' + 'temp_subfolder_0')
    for _file_name in files_in_folder:
        if len(subfolders_dict['temp_subfolder_' + str(len(subfolders_dict) - 1)]) == max_number_of_files_per_subfolder:
            subfolders_dict['temp_subfolder_' + str(len(subfolders_dict))] = []
            os.makedirs(temp_path_to_original_folder + '/' + 'temp_subfolder_' + str(len(subfolders_dict) - 1))
        subfolders_dict['temp_subfolder_' + str(len(subfolders_dict) - 1)].append(_file_name)
    for _file_subfolder_path, _file_names in subfolders_dict.items():
        for _file_name in _file_names:
            os.rename(path_to_original_folder + _file_name, temp_path_to_original_folder + '/' + _file_subfolder_path + '/' + _file_name)
    return subfolders_dict

And then run the download_folder function:

import gdown
url = 'https://drive.google.com/drive/folders/1OXV4qhFF_qJ8VqyrXpR7CzHDsToaqY_W?usp=drive_link'
gdown.download_folder(url, quiet=True, use_cookies=False, remaining_ok=True)

Then, if you don't want your original and new folders left organized into subfolders, we can use this function to "undo" the reorganization, putting the files back into the original and new folders and deleting the temp subfolders:

import os

def undo_organize_folder_into_subfolders(path_to_original_folder, path_to_new_folder, subfolders_dict):
    '''Moves the files organized as subfolders back into the original & new folders and deletes the subfolders'''
    if not path_to_original_folder.endswith('/'):
        path_to_original_folder += '/'
    if not path_to_new_folder.endswith('/'):
        path_to_new_folder += '/'
    temp_path_to_original_folder = path_to_original_folder + 'temp_folder'
    temp_path_to_new_folder = path_to_new_folder + 'temp_folder'
    for _file_subfolder_path, _file_names in subfolders_dict.items():
        for _file_name in _file_names:
            os.rename(temp_path_to_original_folder + '/' + _file_subfolder_path + '/' + _file_name, path_to_original_folder + _file_name)
            os.rename(temp_path_to_new_folder + '/' + _file_subfolder_path + '/' + _file_name, path_to_new_folder + _file_name)
        os.rmdir(temp_path_to_original_folder + '/' + _file_subfolder_path)
        os.rmdir(temp_path_to_new_folder + '/' + _file_subfolder_path)
    os.rmdir(temp_path_to_original_folder)
    os.rmdir(temp_path_to_new_folder)

And just make sure you have your current working directory set:

from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd '/content/drive/My Drive/Colab Notebooks/'
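
For reference, a minimal end-to-end sketch of how these pieces might fit together; the paths are hypothetical placeholders, and the output argument is assumed to point gdown at the destination folder:

import gdown

# Hypothetical paths; adjust to your own Drive-mounted and local locations.
path_to_original_folder = '/content/drive/My Drive/my_big_folder'
path_to_new_folder = '/content/downloads/my_big_folder'
url = 'https://drive.google.com/drive/folders/MY-PUBLICLY-ACCESSIBLE-FOLDER-ID?usp=drive_link'

# 1. Split the source folder into temp subfolders of 50 files or fewer.
subfolders_dict = organize_folder_into_subfolders(path_to_original_folder)

# 2. Download the folder; gdown now sees only subfolders of 50 files or fewer.
gdown.download_folder(url, output=path_to_new_folder, quiet=True, use_cookies=False, remaining_ok=True)

# 3. Flatten both folders again and remove the temp subfolders.
undo_organize_folder_into_subfolders(path_to_original_folder, path_to_new_folder, subfolders_dict)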

Answer 3

Score: 1

The reason you are only able to download 50 files is that the gdown library, by default, uses the "cookie" method to access Google Drive files, and Google Drive has a limit of 50 files for direct downloads using cookies.
You can use the Google Drive API, with the google-api-python-client library, to download more than 1,000 files.

You'll need to install the google-api-python-client library to work with the Google Drive API. Then, create a project in the Google Developers Console, enable the Drive API, and create credentials (an OAuth 2.0 client ID). Download the credentials as a JSON file and place it in your project directory.
Here's a Python script that does this:

import os
import json
from googleapiclient.discovery import build
from google.oauth2.credentials import Credentials

def authenticate(credentials_file):
    creds = Credentials.from_authorized_user_file(credentials_file)
    return build('drive', 'v3', credentials=creds)

def download_files(service, folder_id, output_dir):
    page_token = None
    while True:
        response = service.files().list(
            q=f"'{folder_id}' in parents",
            spaces='drive',
            fields='nextPageToken, files(id, name, mimeType)',
            pageToken=page_token
        ).execute()

        for file in response.get('files', []):
            file_id = file['id']
            file_name = file['name']
            mime_type = file['mimeType']
            
            if mime_type == 'application/vnd.google-apps.folder':
                # If the file is a subfolder, create it locally and recursively download its contents.
                subfolder_path = os.path.join(output_dir, file_name)
                os.makedirs(subfolder_path, exist_ok=True)
                download_files(service, file_id, subfolder_path)
            else:
                # If the file is not a folder, download it.
                request = service.files().get_media(fileId=file_id)
                file_path = os.path.join(output_dir, file_name)
                with open(file_path, 'wb') as f:
                    f.write(request.execute())

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

if __name__ == '__main__':
    credentials_file = 'path_to_your_credentials.json' 
    folder_id = 'MY-PUBLICLY-ACCESSIBLE-FOLDER-ID'
    output_directory = 'path_to_output_directory'

    if not os.path.exists(output_directory):
        os.makedirs(output_directory)

    service = authenticate(credentials_file)
    download_files(service, folder_id, output_directory)

Please change the file paths as needed.
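
One note on the credentials file: Credentials.from_authorized_user_file expects an authorized-user token JSON (one that already contains a refresh token), not the raw client-secrets download from the console. If you only have the client-secrets file, a one-time sketch like the following (using InstalledAppFlow, as in the other answers) can produce such a token file; the file names here are assumptions:

from google_auth_oauthlib.flow import InstalledAppFlow

SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

# One-time step: exchange the client-secrets JSON from the Developers Console
# for an authorized-user token file that authenticate() can load later.
flow = InstalledAppFlow.from_client_secrets_file('client_secrets.json', SCOPES)
creds = flow.run_local_server(port=0)
with open('path_to_your_credentials.json', 'w') as token_file:
    token_file.write(creds.to_json())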


Answer 4

Score: 0

Since the gdown library has a limit of 50 files, you might want to check out something like the ZDrive library.

It is pretty simple if you check it out. And since you know your folder ID, it should be easy to use the downloader from the library.

Supporting the same idea as the gdown library, it can be done in a few lines of code (based on the library documentation):

from zdrive import Downloader
output_directory = "/home/abhinav/Documents"
d = Downloader()
folder_id = 'XXXX-YYYY-ZZZZ'
d.downloadFolder(folder_id, destinationFolder=output_directory)
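
If you only have the sharing URL rather than the ID itself, the folder ID is the path segment after /folders/, minus any ?usp=... query string, for example:

# Extract the folder ID from a sharing URL like the one in the question.
url = 'https://drive.google.com/drive/folders/MY-PUBLICLY-ACCESSIBLE-FOLDER-ID?usp=drive_link'
folder_id = url.split('/folders/')[-1].split('?')[0]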

Answer 5

Score: 0


There are good answers above, and they can be built on. The reason your code only downloads 50 of the files is that the gdown library has a limit of 50 files for direct downloads using cookies. To download more than 50 files, start by installing google-api-python-client and the related auth libraries:

pip install --upgrade google-api-python-client google-auth google-auth-httplib2 google-auth-oauthlib pickle5

Then you can use this code:

import io
import os
import pickle
from googleapiclient.http import MediaIoBaseDownload
from google_auth_oauthlib.flow import InstalledAppFlow
from google.auth.transport.requests import Request
from googleapiclient.discovery import build

# If modifying these SCOPES, delete the file token.pickle.
SCOPES = ['https://www.googleapis.com/auth/drive.readonly']

def main():
    creds = None
    # The file token.pickle stores the user's access and refresh tokens, and is
    # created automatically when the authorization flow completes for the first time.

    if os.path.exists('token.pickle'):
        with open('token.pickle', 'rb') as token:
            creds = pickle.load(token)
    # If there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                'credentials.json', SCOPES)
            creds = flow.run_local_server(port=0)
        # Save the credentials for the next run
        with open('token.pickle', 'wb') as token:
            pickle.dump(creds, token)

    # Call the Drive v3 API
    drive_service = build('drive', 'v3', credentials=creds)

    # Get the folder ID from the URL (dropping any query string such as ?usp=drive_link).
    url = 'YOUR_GOOGLE_DRIVE_URL'  # Replace with your Google Drive URL
    folder_id = url.split("/")[-1].split("?")[0]

    # Create a directory to store the downloaded files.
    download_dir = os.path.join(os.getcwd(), "downloaded_files")
    if not os.path.exists(download_dir):
        os.mkdir(download_dir)

    # Download the files.
    page_token = None
    while True:
        response = drive_service.files().list(q="'{}' in parents".format(folder_id),
                                              spaces='drive',
                                              fields='nextPageToken, files(id, name)',
                                              pageToken=page_token).execute()
        files = response.get('files', [])
        for file in files:
            request = drive_service.files().get_media(fileId=file['id'])
            fh = io.FileIO(os.path.join(download_dir, file['name']), 'wb')
            downloader = MediaIoBaseDownload(fh, request)
            done = False
            while done is False:
                status, done = downloader.next_chunk()
                print(f'Download {int(status.progress() * 100)}.')

        page_token = response.get('nextPageToken', None)
        if page_token is None:
            break

if __name__ == '__main__':
    main()
