Upload the text file to the cloud with Google Cloud Storage, then import it into BigQuery.
Code example:
from google.cloud import storage

def upload_to_gcs(bucket_name, local_file_path, destination_blob_name):
    """Upload a local file to Google Cloud Storage."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(local_file_path)

# Example
bucket_name = 'my_bucket'
local_file_path = '/path/to/local/file.csv'
destination_blob_name = 'uploaded_file.csv'
upload_to_gcs(bucket_name, local_file_path, destination_blob_name)
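Before kicking off the BigQuery load, it can be worth confirming that the object actually landed in the bucket. A minimal sketch using the client library's Blob.exists() check (the bucket and blob names below are just the example values from above):

from google.cloud import storage

def blob_exists(bucket_name, blob_name):
    """Return True if the named blob exists in the bucket."""
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    # exists() issues a metadata request only; it does not download the object
    return bucket.blob(blob_name).exists()

print(blob_exists('my_bucket', 'uploaded_file.csv'))  # expected: True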
from google.cloud import bigquery

def import_to_bigquery(dataset_id, table_id, source_url):
    """Import a text file from Google Cloud Storage into a BigQuery table."""
    client = bigquery.Client()
    # Fully qualified destination table: project.dataset.table
    # (Client.dataset() is deprecated, so build the table ID as a string)
    destination = f"{client.project}.{dataset_id}.{table_id}"
    job_config = bigquery.LoadJobConfig()
    job_config.source_format = bigquery.SourceFormat.CSV
    job_config.skip_leading_rows = 1  # skip the CSV header row
    # Infer the schema from the file; alternatively supply an explicit
    # schema or load into a table that already exists
    job_config.autodetect = True
    # source_url is already a gs://bucket-name/filepath URI, so pass it
    # through unchanged (re-splitting it would drop the bucket name)
    load_job = client.load_table_from_uri(
        source_url, destination, job_config=job_config
    )
    # Print the job status; result() blocks until the job completes
    print(f"Starting job {load_job.job_id}")
    load_job.result()
    print(f"Job {load_job.job_id} completed.")
# Example
dataset_id = 'my_dataset'
table_id = 'my_table'
source_url = 'gs://my_bucket/uploaded_file.csv'
import_to_bigquery(dataset_id, table_id, source_url)
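Once the load job has finished, you can sanity-check the result by fetching the destination table's metadata and reading its row count. A minimal sketch, assuming the same dataset and table names as in the example above:

from google.cloud import bigquery

def print_row_count(dataset_id, table_id):
    """Print how many rows the destination table now contains."""
    client = bigquery.Client()
    # get_table() returns a Table object whose num_rows reflects the loaded data
    table = client.get_table(f"{client.project}.{dataset_id}.{table_id}")
    print(f"Table {table.table_id} now has {table.num_rows} rows.")

print_row_count('my_dataset', 'my_table')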