# Read the CSV from S3 and write it back out as date-partitioned Parquet
df = spark.read.format("csv").load("s3://bucket/file.csv")
df.write.partitionBy("date").parquet("s3://bucket/data.parquet")
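As a usage note, here is a slightly fuller variant of the same read/write, assuming the CSV has a header row and actually contains a "date" column; the header/inferSchema options and the overwrite mode are standard Spark reader/writer settings added for illustration, and the S3 paths are placeholders.

df = (spark.read.format("csv")
      .option("header", "true")       # assume the file has a header row
      .option("inferSchema", "true")  # infer column types instead of reading everything as strings
      .load("s3://bucket/file.csv"))

(df.write
   .mode("overwrite")                 # replace any existing output
   .partitionBy("date")               # one sub-directory per distinct date value
   .parquet("s3://bucket/data.parquet"))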
import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

# Glue jobs are created via awsglue.job.Job (there is no glue_context.create_job);
# init() takes the job name plus the resolved job arguments
args = getResolvedOptions(sys.argv, ["JOB_NAME"])
sc = SparkContext()
glue_context = GlueContext(sc)
job = Job(glue_context)
job.init(args["JOB_NAME"], args)
spark = glue_context.spark_session

# Disable dynamic allocation so the executor count stays fixed.
# Note: this setting is only read when the SparkContext starts, so setting it here,
# after the context already exists, has no effect; it needs to be supplied before
# the context is created (see the sketch after the next snippet).
spark.conf.set("spark.dynamicAllocation.enabled", "false")
# Number of executors currently registered (the returned map also includes the driver)
glue_context._jsc.sc().getExecutorMemoryStatus().size()

# There is no setExecutorLimit API on SparkContext. With dynamic allocation disabled,
# the executor count is fixed by spark.executor.instances and, ultimately, by the
# worker capacity provisioned for the Glue job.
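Because spark.dynamicAllocation.enabled is a startup setting, one way to apply it together with a fixed executor count is to build the SparkContext from a SparkConf inside the script. This is a minimal sketch: spark.executor.instances and the value 10 are illustrative, and the effective ceiling is still whatever worker capacity the Glue job was provisioned with.

from pyspark import SparkConf
from pyspark.context import SparkContext
from awsglue.context import GlueContext

# Static settings must be in place before the SparkContext is created
conf = (SparkConf()
        .set("spark.dynamicAllocation.enabled", "false")  # fixed-size executor pool
        .set("spark.executor.instances", "10"))           # illustrative executor count

sc = SparkContext(conf=conf)
glue_context = GlueContext(sc)
spark = glue_context.spark_session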
# Write ORC instead of Parquet as an alternative columnar output format
df.write.orc("s3://bucket/data.orc")
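Putting the pieces above together, here is a minimal end-to-end sketch of the job using the Parquet output path from earlier; the bucket paths, the "date" column, the JOB_NAME argument, and the header option are placeholders or assumptions.

import sys
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.context import SparkContext

args = getResolvedOptions(sys.argv, ["JOB_NAME"])

sc = SparkContext()
glue_context = GlueContext(sc)
spark = glue_context.spark_session

job = Job(glue_context)
job.init(args["JOB_NAME"], args)

# Read the source CSV and write it back out as date-partitioned Parquet
df = spark.read.format("csv").option("header", "true").load("s3://bucket/file.csv")
df.write.mode("overwrite").partitionBy("date").parquet("s3://bucket/data.parquet")

job.commit()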
The approaches above can all help shorten the time an AWS Glue PySpark job spends writing Parquet to S3.