When using the Google BigQuery Connector in AWS Glue, you can indeed write custom queries. The connector is built on the open-source spark-bigquery-connector, so a Scala Glue job can read the result of a query into a DataFrame via format("bigquery") and the query option. Below is a basic example of running a custom query:
import com.amazonaws.services.glue.util.GlueArgParser
import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SparkSession}

object CustomQuery {
  def main(sysArgs: Array[String]): Unit = {
    // Parse the Glue job arguments (JOB_NAME is supplied by Glue at run time).
    val args = GlueArgParser.getResolvedOptions(sysArgs, Seq("JOB_NAME").toArray)

    // Google BigQuery related settings.
    val projectId = "my-big-query-project"
    val datasetName = "my-dataset"
    val customQuery = "SELECT COUNT(*) FROM `my-dataset.my-table`"

    // Set up the Spark context and session.
    val sparkContext = new SparkContext()
    val sparkSession = SparkSession
      .builder()
      .appName(args("JOB_NAME"))
      .getOrCreate()

    // Load a DataFrame from the custom query.
    // The "query" option requires viewsEnabled=true and a materializationDataset,
    // because the connector materializes the query result into a temporary table
    // in that dataset before reading it.
    val df: DataFrame = sparkSession.read
      .format("bigquery")
      .option("parentProject", projectId)
      .option("viewsEnabled", "true")
      .option("materializationDataset", datasetName)
      .option("query", customQuery)
      .load()

    df.show()
  }
}
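A few notes on the example above. The query option only works together with viewsEnabled set to "true" and a materializationDataset, because the connector materializes the query result into a temporary table in that dataset and reads it back through the BigQuery Storage Read API; unlike the older Hadoop BigQuery connector, no GCS staging bucket is needed on the read path. The job also needs the connector on its classpath (for example, by attaching the Google BigQuery Connector for AWS Glue from AWS Marketplace) and valid GCP credentials. The sketch below shows one way to pass credentials explicitly through the connector's credentials option; it reuses sparkSession, projectId, datasetName and customQuery from the job above, and BQ_CREDENTIALS_BASE64 is a hypothetical placeholder for wherever you keep the base64-encoded service-account key (in practice this is usually a secret in AWS Secrets Manager referenced from a Glue connection):

// A minimal sketch, not the only way to authenticate. BQ_CREDENTIALS_BASE64 is a
// hypothetical environment variable; in a real job the key normally comes from a
// Glue connection backed by AWS Secrets Manager.
val encodedServiceAccountKey: String = sys.env("BQ_CREDENTIALS_BASE64")

val dfWithCredentials = sparkSession.read
  .format("bigquery")
  .option("credentials", encodedServiceAccountKey) // base64-encoded service-account key JSON
  .option("parentProject", projectId)
  .option("viewsEnabled", "true")
  .option("materializationDataset", datasetName)
  .option("query", customQuery)
  .load()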