英文:
FileNotFoundError: [Errno 2] No such file or directory: while exporting a parquet file from pandas dataframe
问题
以下是您要翻译的代码部分:
I am basically trying to export a parquet file inside GCS cloud bucket as shown below in my code which is a GCP cloud function where i am getting error in the line "chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy')" saying - " No such file or directory: 'new_folder_20230206_065500/table1-20230206_065638.parquet". The folder is getting created successfully inside bucket but i am not sure why parquet file is not getting generated inside it.
import mysql.connector
import pandas as pd
from google.cloud import storage
from datetime import datetime, timedelta
import os
def extract_data_to_gcs(request):
connection = mysql.connector.connect(
host=os.getenv('..'),
user=os.getenv('...'),
password=os.getenv('...'),
database='....'
)
cursor = connection.cursor(buffered=True)
tables = ["table1", "table2", "table3"]
client = storage.Client()
bucket = client.bucket('data-lake-archive')
# Create a timestamp-based folder name
now = datetime.now()
folder_name = now.strftime("new_folder_%Y%m%d_%H%M%S")
folder_path = f"{folder_name}/"
# Create the folder in the GCS bucket
blob = bucket.blob(folder_path)
blob.upload_from_string("", content_type="application/octet-stream")
for table in tables:
cursor.execute("SELECT * FROM {}".format(table))
chunks = pd.read_sql_query("SELECT * FROM {}".format(table), connection, chunksize=5000000)
for i, chunk in enumerate(chunks):
chunk.columns = [str(col) for col in chunk.columns]
ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
parquet_file_path = folder_path + f"{table}-{i}.parquet"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# parquet_file_path = folder_path + f'abc.parquet'
print(f'folder path is {folder_path}')
print(f'parquet file path is {parquet_file_path}')
chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy')
# blob = bucket.blob(folder_path + f'{table}-{i}.parquet')
# blob.upload_from_filename(folder_path + f'{table}-{i}.parquet')
cursor.execute("SELECT table_name, column_name FROM information_schema.key_column_usage WHERE referenced_table_name = '{}'".format(table))
referenced_tables = cursor.fetchall()
for referenced_table in referenced_tables:
chunks = pd.read_sql_query("SELECT * FROM {}".format(referenced_table[0]), connection, chunksize=5000000)
for i, chunk in enumerate(chunks):
chunk.columns = [str(col) for col in chunk.columns]
ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
chunk.to_parquet(f"{folder_path}{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet", engine='fastparquet', compression='snappy')
blob = bucket.blob(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')
blob.upload_from_filename(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')
return 'Data extracted and uploaded to GCS'
请注意,我已经将代码中的双引号翻译为单引号,以便在代码中的字符串中使用双引号。如果您有任何其他疑问或需要进一步的帮助,请告诉我。
英文:
I am basically trying to export a parquet file inside GCS cloud bucket as shown below in my code which is a GCP cloud function where i am getting error in the line "chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy')" saying -" No such file or directory: 'new_folder_20230206_065500/table1-20230206_065638.parquet". The folder is getting created successfully inside bucket but i am not sure why parquet file is not getting generated inside it.
import mysql.connector
import pandas as pd
from google.cloud import storage
from datetime import datetime, timedelta
import os
def extract_data_to_gcs(request):
connection = mysql.connector.connect(
host=os.getenv('..'),
user=os.getenv('...'),
password=os.getenv('...'),
database='....'
)
cursor = connection.cursor(buffered=True)
tables = ["table1", "table2", "table3"]
client = storage.Client()
bucket = client.bucket('data-lake-archive')
# Create a timestamp-based folder name
now = datetime.now()
folder_name = now.strftime("new_folder_%Y%m%d_%H%M%S")
folder_path = f"{folder_name}/"
# Create the folder in the GCS bucket
blob = bucket.blob(folder_path)
blob.upload_from_string("", content_type="application/octet-stream")
for table in tables:
cursor.execute("SELECT * FROM {}".format(table))
chunks = pd.read_sql_query("SELECT * FROM {}".format(table), connection, chunksize=5000000)
for i, chunk in enumerate(chunks):
chunk.columns = [str(col) for col in chunk.columns]
ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
parquet_file_path = folder_path + f"{table}-{i}.parquet"
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
# parquet_file_path = folder_path + f'abc.parquet'
print(f'folder path is {folder_path}')
print(f'parquet file path is {parquet_file_path}')
chunk.to_parquet(parquet_file_path, engine='fastparquet', compression='snappy')
# blob = bucket.blob(folder_path + f'{table}-{i}.parquet')
# blob.upload_from_filename(folder_path + f'{table}-{i}.parquet')
cursor.execute("SELECT table_name, column_name FROM information_schema.key_column_usage WHERE referenced_table_name = '{}'".format(table))
referenced_tables = cursor.fetchall()
for referenced_table in referenced_tables:
chunks = pd.read_sql_query("SELECT * FROM {}".format(referenced_table[0]), connection, chunksize=5000000)
for i, chunk in enumerate(chunks):
chunk.columns = [str(col) for col in chunk.columns]
ingestion_timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
chunk.to_parquet(f"{folder_path}{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet", engine='fastparquet', compression='snappy')
blob = bucket.blob(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')
blob.upload_from_filename(folder_path + f'{referenced_table[0]}-{ingestion_timestamp}-{i}.parquet')
return 'Data extracted and uploaded to GCS'
答案1
得分: 1
Do you need to create the folder first? I'm not familiar with Google Cloud, but that might be a cause of the issue. folder_path = f"{folder_name}/"
Create this folder before doing, chunk.to_parquet(...)
Where exactly are the errors thrown? There are two lines with chunk.to_parquet(). Can you reduce the error down to a specific line?
英文:
Do you need to create the folder first? I'm not familiar with Google Cloud, but that might be a cause of the issue. folder_path = f"{folder_name}/"
Create this folder before doing, chunk.to_parquet(...)
Where exactly are the errors thrown? There are two lines with chunk.to_parquet(). Can you reduce the error down to a specific line?
通过集体智慧和协作来改善编程学习和解决问题的方式。致力于成为全球开发者共同参与的知识库,让每个人都能够通过互相帮助和分享经验来进步。
评论