import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from pyspark.context import SparkContext
from pyspark.sql import SparkSession

# Initialize the Spark session and Glue context
spark = SparkSession.builder.appName("DuplicateRecordRemover").getOrCreate()
glueContext = GlueContext(SparkContext.getOrCreate())

# Source catalog table and S3 output location
table_name = "my_table_name"
output_location = "s3://my-bucket-name/my-path"
# Read the source table from the AWS Glue Data Catalog into a DynamicFrame
my_dyframe = glueContext.create_dynamic_frame.from_catalog(
    database="my_database_name",
    table_name=table_name
)
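
# Convert the DynamicFrame to a Spark DataFrame and drop exact duplicate rows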
my_df = my_dyframe.toDF()
my_df = my_df.drop_duplicates()
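
# Write the deduplicated data to the S3 output location as Parquet
# ('append' keeps any files already at this path; use 'overwrite' to replace them)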
my_df.write.mode('append').parquet(output_location)

# Convert the deduplicated DataFrame back to a DynamicFrame
my_dyframe = DynamicFrame.fromDF(my_df, glueContext, "my_dyframe")
# Write the deduplicated DynamicFrame back to the catalog table
glueContext.write_dynamic_frame.from_catalog(
    frame=my_dyframe,
    database="my_database_name",
    table_name=table_name
)