使用 Airflow 提供的迭代器 API，显式地通过迭代器来处理查询结果。
在 BigQueryGetDataOperator 的 execute 方法中，用 BigQueryCursorIterator 对游标结果进行迭代，并返回该迭代器供后续任务使用。以下是代码示例：
from airflow.contrib.hooks.bigquery_hook import BigQueryHook
from airflow.contrib.utils.bigquery_iterator import BigQueryCursorIterator
from airflow.models import BaseOperator
from airflow.utils.decorators import apply_defaults
class BigQueryGetDataOperator(BaseOperator):
    """Run a SQL statement on BigQuery and return an iterator over its cursor.

    :param sql: the statement handed to ``cursor.execute``.
    :param destination_dataset_table: stored on the instance; this class
        does not read it anywhere else.
    :param bigquery_conn_id: connection id forwarded to ``BigQueryHook``.
    :param delegate_to: forwarded unchanged to ``BigQueryHook``.
    :param use_legacy_sql: forwarded unchanged to ``BigQueryHook``
        (presumably selects the legacy SQL dialect -- confirm against the
        hook's documentation).
    """

    @apply_defaults
    def __init__(
            self,
            sql,
            destination_dataset_table=None,
            bigquery_conn_id='bigquery_default',
            delegate_to=None,
            use_legacy_sql=True,
            *args,
            **kwargs):
        super(BigQueryGetDataOperator, self).__init__(*args, **kwargs)
        self.sql = sql
        # Kept for interface compatibility; execute() never consults it.
        self.destination_dataset_table = destination_dataset_table
        self.bigquery_conn_id = bigquery_conn_id
        self.delegate_to = delegate_to
        self.use_legacy_sql = use_legacy_sql

    def execute(self, context):
        """Execute ``self.sql`` and return a ``BigQueryCursorIterator``.

        :param context: task context supplied by the caller; not read by
            this method.
        :return: a ``BigQueryCursorIterator`` wrapping the cursor on which
            the statement was executed.

        NOTE(review): the cursor and connection are intentionally left open
        because the returned iterator is built on the live cursor. The
        scheduler normally pushes ``execute``'s return value to XCom; a
        cursor-backed iterator is unlikely to be serializable -- confirm
        before relying on this across tasks.
        """
        hook = BigQueryHook(
            bigquery_conn_id=self.bigquery_conn_id,
            use_legacy_sql=self.use_legacy_sql,
            delegate_to=self.delegate_to
        )
        conn = hook.get_conn()
        cursor = conn.cursor()
        cursor.execute(self.sql)
        iterator = BigQueryCursorIterator(cursor)
        return iterator
使用此操作符时，先取得 execute 返回的迭代器，再逐行遍历处理，例如：
# Usage sketch: build the operator and drive it by hand, outside a DAG run.
data = BigQueryGetDataOperator(
    task_id='get_data',
    sql='SELECT * FROM my_table',
)
# execute() does not read its context argument, so None is sufficient here.
# `process` is a placeholder for the caller's own per-row handling.
for row in data.execute(None):
    process(row)