在 AWS Glue 作业中,连接是可选的;但如果您要使用数据目录,或访问其他 AWS 服务(例如在 RDS 中运行的数据库),则需要创建连接。下面是一个使用 AWS Glue 连接的示例:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from awsglue.dynamicframe import DynamicFrame
from awsglue.context import GlueContext
from pyspark.context import SparkContext
from pyspark.sql.functions import *
# Reuse the active SparkContext if there is one and wrap it in a GlueContext.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)

# JDBC settings for the MySQL source.
# GlueContext has no create_connection() method, so the former call raised
# AttributeError before any data was read. The settings are kept as a plain
# dict under the same name (`connection`) and handed to the reader below,
# which also removes the duplicated copy of these options.
# NOTE(review): all values are placeholders; real credentials should not be
# hard-coded in the script -- confirm how secrets are supplied to this job.
connection = {
    "url": "jdbc:mysql://your-mysql-hostname:3306/",
    "user": "your-mysql-username",
    "password": "your-mysql-password",
    "dbtable": "your-mysql-dbtable",
}

# Read the MySQL table into a DynamicFrame.
datasource = glueContext.create_dynamic_frame.from_options(
    connection_type="mysql",  # database type
    connection_options=connection,
)
# Transform the data.
# Placeholder: the frame read above is passed through unchanged so that
# `transformed_data` is defined for the write below (it was previously never
# assigned, which raised NameError). Replace this with the real transforms.
transformed_data = datasource

# Write the result to S3.
glueContext.write_dynamic_frame.from_options(
    frame=transformed_data,
    connection_type="s3",
    connection_options={
        "path": "s3://your-bucket/path/",
    },
    # Name the output format explicitly so the files written under the path
    # have a known layout; switch to "json", "csv", ... as required.
    format="parquet",
)