是的,AWS Glue可以连接到托管在VPC中的具有专用承租人的数据存储(RDS)。连接需要确保AWS Glue和RDS实例都在同一个VPC中,并且AWS Glue需要在该VPC的子网中拥有资源,以便可以连接到RDS实例。
以下是使用AWS Glue连接到托管在VPC中的RDS实例的Python示例代码:
import sys

from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue import DynamicFrame
from awsglue.context import GlueContext
from awsglue.job import Job  # FIX: Job was used below but never imported (NameError at job = Job(...))
from pyspark.context import SparkContext
from pyspark.sql.functions import col, from_unixtime, unix_timestamp

# Resolve the required job parameters passed to the Glue job.
# NOTE(review): passing RDS_PASSWORD as a plain job argument exposes it in the
# Glue console and CloudWatch logs — prefer AWS Secrets Manager or a Glue
# Connection for production jobs.
args = getResolvedOptions(
    sys.argv,
    ['JOB_NAME', 'RDS_ENDPOINT', 'RDS_DATABASE', 'RDS_USER', 'RDS_PASSWORD'],
)

# Initialize the Glue job context.
glue_context = GlueContext(SparkContext.getOrCreate())
job = Job(glue_context)
job.init(args['JOB_NAME'], args)

# Set up the JDBC connection details for the PostgreSQL RDS instance.
rds_endpoint = args['RDS_ENDPOINT']
rds_database = args['RDS_DATABASE']
rds_user = args['RDS_USER']
rds_password = args['RDS_PASSWORD']
jdbc_url = "jdbc:postgresql://{}/{}".format(rds_endpoint, rds_database)
# NOTE(review): connection_properties is not referenced in the visible code;
# kept in case a spark.read.jdbc(...) call later in the file uses it.
connection_properties = {
    'user': rds_user,
    'password': rds_password,
    'driver': 'org.postgresql.Driver',
}

# Read the source table from RDS into a DynamicFrame over JDBC.
# FIX: removed the spurious "redshiftTmpDir" option — it applies only to
# Redshift connection types, not plain JDBC/PostgreSQL reads.
rds_dynamic_frame = glue_context.create_dynamic_frame.from_options(
    connection_type="jdbc",
    connection_options={
        "url": jdbc_url,
        "dbtable": "my_table",
        "user": rds_user,
        "password": rds_password,
    },
    transformation_ctx="rds_dynamic_frame",
)

# Convert the DynamicFrame to a Spark DataFrame for SQL-style transformations.
rds_dataframe = rds_dynamic_frame.toDF()

# Placeholder: apply transformations to the DataFrame here.
transformed_dataframe = rds_dataframe