import json
import logging
import re
import urllib.parse
import uuid

import boto3
from dateutil import parser
# Arrange logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)
# Initialize AWS purchasers
s3_client = boto3.shopper(‘s3’)
textract_client = boto3.shopper(‘textract’)
comprehend_client = boto3.shopper(‘comprehend’)
def lambda_handler(occasion, context):
attempt:
# Log the occasion obtained from S3
logger.data(f”Acquired occasion: {json.dumps(occasion)}”)
# Get the bucket identify and doc key from the occasion
bucket = occasion[‘Records’][0][‘s3’][‘bucket’][‘name’]
key = occasion[‘Records’][0][‘s3’][‘object’][‘key’]
logger.data(f”Bucket: {bucket}, Key: {key}”)
# URL-encode the thing key to deal with areas and particular characters
encoded_key = urllib.parse.quote(key)
logger.data(f”Encoded Key: {encoded_key}”)
# Skip processing if the file is within the outcome bucket
if bucket == ‘your-result-bucket’:
logger.data(“Skipping processing for output bucket.”)
return
# Name Textract to investigate the doc
response = textract_client.analyze_document(
Doc={‘S3Object’: {‘Bucket’: bucket, ‘Title’: encoded_key}},
FeatureTypes=[‘TABLES’, ‘FORMS’]
)
# Extract the textual content so as
text_blocks = extract_text_blocks(response)
combined_text = “n”.be a part of(text_blocks)
# Name Comprehend to investigate the textual content
comprehend_response = comprehend_client.detect_entities(
Textual content=combined_text,
LanguageCode=’en’
)
# Extract the structured knowledge
extracted_data = extract_data(text_blocks, comprehend_response)
logger.data(f”Extracted Information: {extracted_data}”)
# Save the extracted knowledge to a different S3 bucket
save_to_s3(extracted_data, key)
return {
‘statusCode’: 200,
‘physique’: json.dumps(‘Receipt processed efficiently!’)
}
besides Exception as e:
logger.error(f”An error occurred: {str(e)}”)
return {
‘statusCode’: 500,
‘physique’: json.dumps(‘An inside error occurred.’)
}
def extract_text_blocks(response):
text_blocks = []
for block in response[‘Blocks’]:
if block[‘BlockType’] == ‘LINE’:
text_blocks.append(block[‘Text’])
return text_blocks
def extract_data(text_blocks, comprehend_response):
knowledge = {‘ReceiptId’: str(uuid.uuid4())}
entities = comprehend_response[‘Entities’]
vendor_name = None
total_amount = None
date = None
for entity in entities:
if entity[‘Type’] == ‘ORGANIZATION’ and vendor_name is None:
vendor_name = entity[‘Text’]
elif entity[‘Type’] == ‘DATE’ and date is None:
date = entity[‘Text’]
# Extract complete quantity extra precisely
for line in text_blocks:
if ‘complete’ in line.decrease():
elements = line.cut up()
for half in elements:
if half.change(‘.’, ”, 1).isdigit():
total_amount = half
break
knowledge[‘Vendor’] = vendor_name if vendor_name else ‘N/A’
knowledge[‘Total’] = total_amount if total_amount else ‘N/A’
knowledge[‘Date’] = extract_date(text_blocks)
return knowledge
def extract_date(text_blocks):
date_patterns = [
r’bd{1,2}[/-]d{1,2}[/-]d{2,4}b’, # Matches dates like MM/DD/YY or MM/DD/YYYY
]
date_regex = re.compile(‘|’.be a part of(date_patterns))
for line in text_blocks:
match = date_regex.search(line)
if match:
date_str = match.group(0)
attempt:
date = parser.parse(date_str, fuzzy=False)
logger.data(f”Parsed date: {date.strftime(‘%Y-%m-%d’)} from line: {line}”)
if date.yr > 1900 and date.yr < 2100:
return date.strftime(‘%Y-%m-%d’)
besides ValueError:
logger.data(f”Did not parse date from line: {line}”)
proceed
return ‘N/A’
def save_to_s3(knowledge, original_key):
result_bucket = ‘hijaz-receipt-results’ # Exchange along with your outcome bucket identify
result_key = ‘outcomes/’ + original_key.cut up(‘/’)[-1].change(‘.jpg’, ‘.json’).change(‘.png’, ‘.json’)
s3_client.put_object(
Bucket=result_bucket,
Key=result_key,
Physique=json.dumps(knowledge),
ContentType=’utility/json’
)