🧾 Smart Receipt Analyzer

A legacy-level real-world Python OCR project built for Africa — Powered by NITDA & NCAIR

📜 Full Python Code


            
import pytesseract
import cv2
import csv
import re
from PIL import Image

# Path to Tesseract OCR
# NOTE: this is a Windows path; change it when the tesseract binary lives
# somewhere else.
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load and preprocess image
img = cv2.imread('receipt.jpg')
# cv2.imread reports failure by returning None rather than raising, so check
# here; otherwise cvtColor below fails with an opaque OpenCV error.
if img is None:
    raise FileNotFoundError("Could not read image file 'receipt.jpg'")
# OCR is run on the grayscale version of the image.
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
raw_text = pytesseract.image_to_string(gray)

# Clean lines: trim surrounding whitespace and drop lines that end up empty.
lines = [text for text in map(str.strip, raw_text.split('\n')) if text]
print("Cleaned Lines:")
for line in lines:
    print(f"-> {line}")

# Categorization setup
# Collected expense records (one dict per recognised receipt line).
expenses = []
# Keyword -> category. A keyword matches when it occurs anywhere inside the
# lower-cased item name; earlier entries win when several keywords match.
category_map = {
    "indomie": "Food",
    "milk": "Beverage",
    "detergent": "Cleaning",
    "tooth": "Personal Care",
    # ... more keywords ...
}


def categorize(item_name: str) -> str:
    """Return the category for *item_name* based on ``category_map``.

    The item name is lower-cased and each keyword of ``category_map`` is
    looked for as a substring, in the map's iteration order. The category of
    the first keyword found is returned; ``"Uncategorized"`` is returned when
    no keyword occurs in the name.
    """
    lowered = item_name.lower()
    matches = (label for word, label in category_map.items() if word in lowered)
    return next(matches, "Uncategorized")

# Extract item/price from text
# Words that mark footer/summary lines rather than purchased items.
_SKIP_WORDS = ('total', 'thank', 'change')
# Item text, whitespace, then an amount with an optional currency marker.
# The naira sign (U+20A6) is written as an escape so the pattern does not
# depend on how this source file is encoded; a plain "N" is accepted too.
_ITEM_PRICE_RE = re.compile(r'(.+?)\s+([\u20a6N]?\s?[\d.,]+)')
# A comma followed by exactly three digits is a thousands separator.
_THOUSANDS_SEP_RE = re.compile(r',(?=\d{3}(?!\d))')

for line in lines:
    lowered = line.lower()
    if any(word in lowered for word in _SKIP_WORDS):
        continue
    match = _ITEM_PRICE_RE.search(line)
    if match is None:
        continue
    item = match.group(1).strip()
    # Remove the currency marker in either spelling before parsing; leaving
    # the "N" in place would make float() reject an otherwise valid price.
    price_str = match.group(2).lstrip('\u20a6N').strip()
    # "1,500" / "1,500.00" mean 1500: drop thousands separators. Any other
    # comma is still treated as a decimal point ("150,50" -> 150.50).
    price_str = _THOUSANDS_SEP_RE.sub('', price_str).replace(',', '.')
    try:
        # Prices are stored rounded to the nearest whole number.
        price = round(float(price_str))
    except (ValueError, OverflowError):
        # The token was not a usable number (e.g. "." or "1.2.3"): skip the
        # line, but let any unrelated error surface instead of hiding it.
        continue
    expenses.append({'item': item, 'price': price, 'category': categorize(item)})

# Print and save to CSV
print("\nStructured Data:")
for e in expenses:
    # \u20a6 is the naira sign, escaped so it survives source re-encoding.
    print(f"{e['item']} - \u20a6{e['price']} - [{e['category']}]")

# newline='' leaves line-ending handling to the csv module. The encoding is
# set explicitly because OCR text may contain non-ASCII characters, which the
# platform's default encoding may be unable to write.
with open('expenses.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Item', 'Price', 'Category'])
    writer.writerows([e['item'], e['price'], e['category']] for e in expenses)