当 AWS Glue 作业提取到目录表中不存在的列时,需要检查源表与目录表的列是否一致。可以使用下面的代码示例找出源表中存在、但目录表中缺失的列:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
# Resolve the job arguments from the command line; only JOB_NAME is requested here.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
# Create the Spark context and wrap it in a GlueContext.
sc = SparkContext()
glueContext = GlueContext(sc)
# Spark session exposed by the GlueContext.
spark = glueContext.spark_session
# Create the Job object and initialise it with the job name and the resolved arguments.
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
# Read the source table through the Glue Data Catalog and keep its column names.
# NOTE(review): "database_name", "source_table" and "catalog_table" look like
# placeholders -- confirm the real names before running.
source_datasource = glueContext.create_dynamic_frame.from_catalog(
    database="database_name",
    table_name="source_table",
)
source_df = source_datasource.toDF()
source_columns = source_df.columns

# Same steps for the table the source is compared against.
catalog_datasource = glueContext.create_dynamic_frame.from_catalog(
    database="database_name",
    table_name="catalog_table",
)
catalog_df = catalog_datasource.toDF()
catalog_columns = catalog_df.columns
# Find columns in the source table that are not in the catalog table.
# The set difference removes duplicates; sorting makes the reported order
# deterministic from run to run (a plain set has no defined order).
missing_columns = sorted(set(source_columns) - set(catalog_columns))

# Report the outcome. The branch bodies must be indented, otherwise the
# script fails with an IndentationError before anything runs.
if missing_columns:
    print("Columns missing in catalog table:")
    print(missing_columns)
else:
    print("All columns present in catalog table.")

# Commit the job once the comparison has been reported.
job.commit()
通过比较两个表的列,可以找出源表中存在、但目录表中不存在的列。如果发现不匹配的列,可以手动更正目录表,或进行其他必要的更改,使两个表的列保持一致。