You can solve this with a custom operator that transfers exactly one file and lets you choose the destination file name. Override execute and do the transfer there with GCSHook and S3Hook: download the object from GCS to a temporary file, then upload it to S3 under the key you specify. Since execute is reimplemented from scratch, it is simpler to subclass BaseOperator directly than GCSToS3Operator, whose constructor expects a bucket and prefix rather than a single source/destination object pair.
Example code:
from tempfile import NamedTemporaryFile

from airflow.models import BaseOperator
from airflow.providers.amazon.aws.hooks.s3 import S3Hook
from airflow.providers.google.cloud.hooks.gcs import GCSHook


class GCSFileToS3Operator(BaseOperator):
    """Copy a single object from GCS to S3 under an explicit destination key."""

    # Make the bucket/object parameters templatable with Jinja.
    template_fields = (
        "source_bucket",
        "source_object",
        "destination_bucket",
        "destination_object",
    )

    def __init__(
        self,
        source_bucket: str,
        source_object: str,
        destination_bucket: str,
        destination_object: str,
        gcp_conn_id: str = "google_cloud_default",
        aws_conn_id: str = "aws_default",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.source_bucket = source_bucket
        self.source_object = source_object
        self.destination_bucket = destination_bucket
        self.destination_object = destination_object
        self.gcp_conn_id = gcp_conn_id
        self.aws_conn_id = aws_conn_id

    def execute(self, context):
        gcs_hook = GCSHook(gcp_conn_id=self.gcp_conn_id)
        s3_hook = S3Hook(aws_conn_id=self.aws_conn_id)
        self.log.info("Downloading gs://%s/%s", self.source_bucket, self.source_object)
        # Download the object from GCS into a local temporary file.
        with NamedTemporaryFile() as f:
            gcs_hook.download(
                bucket_name=self.source_bucket,
                object_name=self.source_object,
                filename=f.name,
            )
            # Upload the temporary file to S3 under the requested key.
            s3_hook.load_file(
                filename=f.name,
                key=self.destination_object,
                bucket_name=self.destination_bucket,
                replace=True,
            )
        self.log.info(
            "File has been uploaded to s3://%s/%s",
            self.destination_bucket,
            self.destination_object,
        )
To use the custom operator in a DAG file, you only need to point it at the object path on GCS and the file name you want on S3:
from datetime import datetime

from airflow import DAG
# Assumed module path; adjust the import to wherever you saved GCSFileToS3Operator.
from my_company.operators.gcs_file_to_s3 import GCSFileToS3Operator
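A minimal DAG wiring the operator up could look like the sketch below; the DAG id, schedule, bucket names, and object keys are all placeholder assumptions, and it uses the Airflow 2.4+ schedule argument (older versions take schedule_interval instead):

with DAG(
    dag_id="gcs_file_to_s3_example",
    start_date=datetime(2024, 1, 1),
    schedule=None,
    catchup=False,
) as dag:
    # Copy one object and give it a new name on the S3 side.
    transfer_file = GCSFileToS3Operator(
        task_id="transfer_file",
        source_bucket="my-gcs-bucket",
        source_object="data/input.csv",
        destination_bucket="my-s3-bucket",
        destination_object="renamed/output.csv",
    )

You can then sanity-check the task without scheduling it, for example with airflow tasks test gcs_file_to_s3_example transfer_file 2024-01-01.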