import json
import logging
import re
import urllib.parse
import uuid

import boto3
from dateutil import parser
# Arrange logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Initialize AWS purchasers
s3_client = boto3.shopper(‘s3’)
textract_client = boto3.shopper(‘textract’)
comprehend_client = boto3.shopper(‘comprehend’)
def lambda_handler(occasion, context):
attempt:
# Log the occasion obtained from S3
logger.data(f”Acquired occasion: {json.dumps(occasion)}”)
# Get the bucket identify and doc key from the occasion
bucket = occasion[‘Records’][0][‘s3’][‘bucket’][‘name’]
key = occasion[‘Records’][0][‘s3’][‘object’][‘key’]
logger.data(f”Bucket: {bucket}, Key: {key}”)
# URL-encode the thing key to deal with areas and particular characters
encoded_key = urllib.parse.quote(key)
logger.data(f”Encoded Key: {encoded_key}”)
# Skip processing if the file is within the outcome bucket
if bucket == ‘your-result-bucket’:
logger.data(“Skipping processing for output bucket.”)
return
# Name Textract to investigate the doc
response = textract_client.analyze_document(
Doc={‘S3Object’: {‘Bucket’: bucket, ‘Title’: encoded_key}},
FeatureTypes=[‘TABLES’, ‘FORMS’]
)
# Extract the textual content so as
text_blocks = extract_text_blocks(response)
combined_text = “n”.be a part of(text_blocks)
# Name Comprehend to investigate the textual content
comprehend_response = comprehend_client.detect_entities(
Textual content=combined_text,
LanguageCode=’en’
)
# Extract the structured knowledge
extracted_data = extract_data(text_blocks, comprehend_response)
logger.data(f”Extracted Information: {extracted_data}”)
# Save the extracted knowledge to a different S3 bucket
save_to_s3(extracted_data, key)
return {
‘statusCode’: 200,
‘physique’: json.dumps(‘Receipt processed efficiently!’)
}
besides Exception as e:
logger.error(f”An error occurred: {str(e)}”)
return {
‘statusCode’: 500,
‘physique’: json.dumps(‘An inside error occurred.’)
}
def extract_text_blocks(response):
text_blocks = []
for block in response[‘Blocks’]:
if block[‘BlockType’] == ‘LINE’:
text_blocks.append(block[‘Text’])
return text_blocks
def extract_data(text_blocks, comprehend_response):
knowledge = {‘ReceiptId’: str(uuid.uuid4())}
entities = comprehend_response[‘Entities’]
vendor_name = None
total_amount = None
date = None
for entity in entities:
if entity[‘Type’] == ‘ORGANIZATION’ and vendor_name is None:
vendor_name = entity[‘Text’]
elif entity[‘Type’] == ‘DATE’ and date is None:
date = entity[‘Text’]
# Extract complete quantity extra precisely
for line in text_blocks:
if ‘complete’ in line.decrease():
elements = line.cut up()
for half in elements:
if half.change(‘.’, ”, 1).isdigit():
total_amount = half
break
knowledge[‘Vendor’] = vendor_name if vendor_name else ‘N/A’
knowledge[‘Total’] = total_amount if total_amount else ‘N/A’
knowledge[‘Date’] = extract_date(text_blocks)
return knowledge
def extract_date(text_blocks):
date_patterns = [
r’bd{1,2}[/-]d{1,2}[/-]d{2,4}b’, # Matches dates like MM/DD/YY or MM/DD/YYYY
]
date_regex = re.compile(‘|’.be a part of(date_patterns))
for line in text_blocks:
match = date_regex.search(line)
if match:
date_str = match.group(0)
attempt:
date = parser.parse(date_str, fuzzy=False)
logger.data(f”Parsed date: {date.strftime(‘%Y-%m-%d’)} from line: {line}”)
if date.yr > 1900 and date.yr < 2100:
return date.strftime(‘%Y-%m-%d’)
besides ValueError:
logger.data(f”Did not parse date from line: {line}”)
proceed
return ‘N/A’
def save_to_s3(knowledge, original_key):
result_bucket = ‘hijaz-receipt-results’ # Exchange along with your outcome bucket identify
result_key = ‘outcomes/’ + original_key.cut up(‘/’)[-1].change(‘.jpg’, ‘.json’).change(‘.png’, ‘.json’)
s3_client.put_object(
Bucket=result_bucket,
Key=result_key,
Physique=json.dumps(knowledge),
ContentType=’utility/json’
)