import os
import sys
import boto3
from awsglue.utils import getResolvedOptions

# JOB_NAME is the standard Glue job argument; JOB_RUN_ID is assumed here
# to be passed in as a custom job argument so the current run can be looked up
args = getResolvedOptions(sys.argv, ['JOB_NAME', 'JOB_RUN_ID'])
job_name = args['JOB_NAME']

client = boto3.client('glue')
response = client.get_job_run(JobName=job_name, RunId=args['JOB_RUN_ID'])

# Local staging directory used as redshift_tmp_dir (example path)
redshift_tmp_dir = '/tmp/redshift_tmp_dir'

# Maximum allowed size of the redshift_tmp_dir directory, in bytes (100 MiB)
max_size_bytes = 104857600

def dir_size_bytes(path):
    # Total size, in bytes, of the regular files directly inside `path`
    return sum(
        os.path.getsize(os.path.join(path, f))
        for f in os.listdir(path)
        if os.path.isfile(os.path.join(path, f))
    )

# If the directory is over the limit, delete the oldest files first
# until the total size drops back under the limit
if dir_size_bytes(redshift_tmp_dir) > max_size_bytes:
    file_list = sorted(
        os.listdir(redshift_tmp_dir),
        key=lambda f: os.stat(os.path.join(redshift_tmp_dir, f)).st_mtime,
    )
    for filename in file_list:
        file_path = os.path.join(redshift_tmp_dir, filename)
        if os.path.isfile(file_path):
            os.remove(file_path)
            if dir_size_bytes(redshift_tmp_dir) <= max_size_bytes:
                break
First, create a bucket on Amazon S3 to serve as the location for the log files.
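If you prefer to script this step instead of using the console, a minimal boto3 sketch might look like the following; the bucket name my-glue-job-logs is a placeholder, and regions other than us-east-1 also need a CreateBucketConfiguration:

import boto3

s3 = boto3.client('s3')
# Create the bucket that will hold the job's log files
# ('my-glue-job-logs' is a placeholder name; shown for us-east-1)
s3.create_bucket(Bucket='my-glue-job-logs')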
Next, create a new logger for the job in the AWS Glue console.
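This console step has no single API equivalent; as a sketch under that assumption, one common pattern is to configure a standard Python logger inside the job script, whose output Glue's continuous logging can forward to CloudWatch:

import logging

# Standard Python logger; 'cleanup-job' is a hypothetical logger name
logger = logging.getLogger('cleanup-job')
logger.setLevel(logging.INFO)
handler = logging.StreamHandler()
handler.setFormatter(logging.Formatter('%(asctime)s %(levelname)s %(message)s'))
logger.addHandler(handler)
logger.info('redshift_tmp_dir cleanup check started')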
Finally, update the AWS Glue job script again so that the cleanup command runs whenever the directory reaches a certain size. In this example, if the files in the redshift_tmp_dir directory grow past the 100 MB limit, the oldest files are deleted first until the directory is back under the limit.
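Once the updated script is saved, a run can be triggered with boto3 to exercise the cleanup; 'my-cleanup-job' below is a placeholder for the actual job name:

import boto3

client = boto3.client('glue')
# Placeholder job name; start_job_run returns the new run's JobRunId
run = client.start_job_run(JobName='my-cleanup-job')
print('Started run', run['JobRunId'])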