要使用Amazon Textract提取键值对,您可以遵循以下步骤。首先,导入boto3并创建Textract客户端:
import boto3
# Create the Amazon Textract client (assumes AWS credentials are already configured).
textract_client = boto3.client(service_name='textract')
# Call StartDocumentAnalysis with the FORMS feature to start an asynchronous
# analysis job. StartDocumentTextDetection is not sufficient for this task: it
# only detects lines and words and never returns KEY_VALUE_SET blocks.
response = textract_client.start_document_analysis(
    DocumentLocation={
        'S3Object': {
            'Bucket': 'your-bucket-name',
            'Name': 'your-document-key',
        }
    },
    FeatureTypes=['FORMS'],
)
# Keep the job ID; it is required to fetch the results.
job_id = response['JobId']

# Call GetDocumentAnalysis to fetch the results. The job runs asynchronously,
# so poll until its status is no longer IN_PROGRESS.
import time  # only needed for the polling delay below

while True:
    response = textract_client.get_document_analysis(JobId=job_id)
    if response['JobStatus'] != 'IN_PROGRESS':
        break
    time.sleep(5)
if response['JobStatus'] not in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
    raise RuntimeError(
        'Textract job {} ended with status {}: {}'.format(
            job_id, response['JobStatus'], response.get('StatusMessage', '')
        )
    )

# Results are paginated: follow NextToken to collect the blocks of every page.
blocks = list(response.get('Blocks', []))
while response.get('NextToken'):
    response = textract_client.get_document_analysis(
        JobId=job_id, NextToken=response['NextToken']
    )
    blocks.extend(response.get('Blocks', []))

# Index the blocks by Id once so relationships can be resolved without
# rescanning the whole list for every child.
blocks_by_id = {block['Id']: block for block in blocks}


def _block_text(block, blocks_by_id):
    """Return the text of *block*, assembled from its CHILD blocks."""
    parts = []
    # 'Relationships' is absent on blocks that have no children.
    for relationship in block.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id[child_id]
            if child['BlockType'] == 'WORD':
                parts.append(child['Text'])
            elif child['BlockType'] == 'SELECTION_ELEMENT':
                # Check boxes carry a selection status instead of text.
                parts.append(child['SelectionStatus'])
    return ' '.join(parts)


# Keys and values are both KEY_VALUE_SET blocks, told apart by EntityTypes.
# A KEY block points to its VALUE block(s) through a 'VALUE' relationship.
key_value_pairs = []
for block in blocks:
    if block['BlockType'] != 'KEY_VALUE_SET':
        continue
    if 'KEY' not in block.get('EntityTypes', []):
        continue
    value_parts = []
    for relationship in block.get('Relationships', []):
        if relationship['Type'] != 'VALUE':
            continue
        for value_id in relationship['Ids']:
            value_text = _block_text(blocks_by_id[value_id], blocks_by_id)
            if value_text:
                value_parts.append(value_text)
    key_value_pairs.append({
        'Key': _block_text(block, blocks_by_id),
        'Value': ' '.join(value_parts),
    })
完整代码示例:
import time

# Seconds to wait between two status checks of the asynchronous job.
POLL_INTERVAL_SECONDS = 5


def _block_text(block, blocks_by_id):
    """Return the text of *block*, assembled from its CHILD blocks.

    WORD children contribute their text; SELECTION_ELEMENT children (check
    boxes) contribute their selection status. Blocks without children yield
    an empty string.
    """
    parts = []
    # 'Relationships' is absent on blocks that have no children.
    for relationship in block.get('Relationships', []):
        if relationship['Type'] != 'CHILD':
            continue
        for child_id in relationship['Ids']:
            child = blocks_by_id[child_id]
            if child['BlockType'] == 'WORD':
                parts.append(child['Text'])
            elif child['BlockType'] == 'SELECTION_ELEMENT':
                parts.append(child['SelectionStatus'])
    return ' '.join(parts)


def extract_key_value_pairs(blocks):
    """Build ``{'Key': ..., 'Value': ...}`` dicts from Textract form blocks.

    Keys and values are both reported as KEY_VALUE_SET blocks and are told
    apart by ``EntityTypes``. A KEY block points to its VALUE block(s) through
    a relationship of type ``VALUE``; the visible text of either side comes
    from its CHILD blocks.

    Args:
        blocks: The combined ``Blocks`` list of a document analysis result.

    Returns:
        One dict per KEY block, in document order. ``Value`` is an empty
        string when the key has no detected value text.

    Raises:
        KeyError: If a relationship references a block Id missing from
            *blocks* (for example when not all result pages were collected).
    """
    # Index once so relationships resolve in O(1) instead of rescanning.
    blocks_by_id = {block['Id']: block for block in blocks}
    pairs = []
    for block in blocks:
        if block['BlockType'] != 'KEY_VALUE_SET':
            continue
        if 'KEY' not in block.get('EntityTypes', []):
            continue
        value_parts = []
        for relationship in block.get('Relationships', []):
            if relationship['Type'] != 'VALUE':
                continue
            for value_id in relationship['Ids']:
                value_text = _block_text(blocks_by_id[value_id], blocks_by_id)
                if value_text:
                    value_parts.append(value_text)
        pairs.append({
            'Key': _block_text(block, blocks_by_id),
            'Value': ' '.join(value_parts),
        })
    return pairs


def fetch_analysis_blocks(textract_client, job_id,
                          poll_interval=POLL_INTERVAL_SECONDS):
    """Wait for an analysis job to finish and return all of its blocks.

    Args:
        textract_client: A boto3 Textract client.
        job_id: The ``JobId`` returned by ``start_document_analysis``.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The blocks of every result page, concatenated in order.

    Raises:
        RuntimeError: If the job ends in a status other than SUCCEEDED or
            PARTIAL_SUCCESS.
    """
    # The job is asynchronous: results only exist once it leaves IN_PROGRESS.
    while True:
        response = textract_client.get_document_analysis(JobId=job_id)
        status = response['JobStatus']
        if status != 'IN_PROGRESS':
            break
        time.sleep(poll_interval)
    if status not in ('SUCCEEDED', 'PARTIAL_SUCCESS'):
        raise RuntimeError(
            'Textract job {} ended with status {}: {}'.format(
                job_id, status, response.get('StatusMessage', '')
            )
        )

    # Results are paginated: follow NextToken until the last page.
    blocks = list(response.get('Blocks', []))
    next_token = response.get('NextToken')
    while next_token:
        response = textract_client.get_document_analysis(
            JobId=job_id, NextToken=next_token
        )
        blocks.extend(response.get('Blocks', []))
        next_token = response.get('NextToken')
    return blocks


def main():
    """Analyze a document stored in S3 and print its key-value pairs."""
    # Imported here so the parsing helpers above stay importable without the
    # AWS SDK installed.
    import boto3

    # Create the Textract client.
    textract_client = boto3.client('textract')

    # Start the analysis. The FORMS feature is what makes Textract return
    # KEY_VALUE_SET blocks; plain text detection never does.
    response = textract_client.start_document_analysis(
        DocumentLocation={
            'S3Object': {
                'Bucket': 'your-bucket-name',
                'Name': 'your-document-key',
            }
        },
        FeatureTypes=['FORMS'],
    )
    job_id = response['JobId']

    # Wait for the job and collect the blocks of all result pages.
    blocks = fetch_analysis_blocks(textract_client, job_id)

    # Print the key-value pairs.
    for pair in extract_key_value_pairs(blocks):
        print('Key:', pair['Key'])
        print('Value:', pair['Value'])
        print('---')


if __name__ == '__main__':
    main()
请注意,上述示例假设您已经配置了适当的AWS凭证;运行前请将占位参数(例如存储桶名称和文档键)替换为您自己的值。