In Airflow, you can use the PythonOperator to run custom Python code. Below is an example solution:
from airflow import DAG
from airflow.operators.python_operator import PythonOperator
from airflow.utils.dates import days_ago
import boto3
import email
import imaplib
import os
# Function that downloads the latest email from an IMAP inbox and uploads it to S3
def download_email_to_s3():
    s3_client = boto3.client('s3')
    s3_bucket = 'your-s3-bucket'
    s3_folder = 'email'
    email_username = 'your-email-username'
    email_password = 'your-email-password'
    email_host = 'your-email-host'
    email_port = 993
    
    # Connect to the email server over SSL
    mail = imaplib.IMAP4_SSL(email_host, email_port)
    mail.login(email_username, email_password)
    mail.select("inbox")
    
    # Search for and fetch the latest email
    result, data = mail.search(None, "ALL")
    latest_email_id = data[0].split()[-1]
    result, data = mail.fetch(latest_email_id, "(RFC822)")
    raw_email = data[0][1]
    
    # Parse the email content
    msg = email.message_from_bytes(raw_email)
    subject = msg['subject']
    body = ""
    
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == "text/plain":
                body = part.get_payload(decode=True)
    else:
        body = msg.get_payload(decode=True)
    
    # Save the email body to S3
    s3_file_path = os.path.join(s3_folder, f"{latest_email_id.decode()}.txt")
    s3_client.put_object(Body=body, Bucket=s3_bucket, Key=s3_file_path)
    
    print(f"Email downloaded to S3: {s3_file_path}")
# Define the DAG
default_args = {
    'owner': 'airflow',
    'start_date': days_ago(1)
}
with DAG('download_email_to_s3', default_args=default_args, schedule_interval=None) as dag:
    download_email_task = PythonOperator(
        task_id='download_email_task',
        python_callable=download_email_to_s3
    )
Note that some values in the code above need to be replaced with your own, such as the S3 bucket name, the email credentials, and the host. You can also customize it as needed, for example by adding error handling or more logging.
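For example, a minimal sketch of adding retries and basic error handling is shown below. It is illustrative only: it reuses the variables from the function above (email_host, email_username, s3_bucket, s3_folder, and so on), and the retry values are arbitrary assumptions rather than part of the original solution.

import logging
from datetime import timedelta

log = logging.getLogger(__name__)

default_args = {
    'owner': 'airflow',
    'start_date': days_ago(1),
    'retries': 2,                         # re-run the task up to 2 times on failure
    'retry_delay': timedelta(minutes=5),  # wait 5 minutes between attempts
}

def download_email_to_s3():
    mail = imaplib.IMAP4_SSL(email_host, email_port)
    try:
        mail.login(email_username, email_password)
        mail.select("inbox")
        # ... search, fetch, parse, and upload exactly as in the function above ...
        log.info("Email uploaded to s3://%s/%s", s3_bucket, s3_folder)
    except Exception:
        log.exception("Failed to download the email or upload it to S3")
        raise  # re-raise so Airflow marks the task as failed and triggers a retry
    finally:
        mail.logout()  # always close the IMAP session, even on error

With these default_args, a failed run is retried automatically, and the finally block guarantees the IMAP connection is closed whether the upload succeeds or not.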