A legacy-level real-world Python OCR project built for Africa — Powered by NITDA & NCAIR
import pytesseract
import cv2
import csv
import re
from PIL import Image
# Path to Tesseract OCR (Windows install path; adjust if Tesseract lives elsewhere)
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Load and preprocess image
img = cv2.imread('receipt.jpg')
if img is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising; fail here with a clear message instead of an
    # opaque error from cvtColor below.
    raise FileNotFoundError("Could not read image file 'receipt.jpg'")
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
raw_text = pytesseract.image_to_string(gray)

# Clean lines: trim surrounding whitespace and drop lines that are empty
lines = [line.strip() for line in raw_text.split('\n') if line.strip()]
print("Cleaned Lines:")
for line in lines:
    print(f"-> {line}")
# Categorization setup

# Parsed receipt rows are appended here as
# {'item': ..., 'price': ..., 'category': ...} dicts.
expenses = []

# Keyword -> category lookup. A keyword is matched as a substring of the
# lower-cased item name, so keys must be lower case; entries are tried in
# the order they are written here.
category_map = {
    "indomie": "Food",
    "milk": "Beverage",
    "detergent": "Cleaning",
    "tooth": "Personal Care",
    # ... more keywords ...
}
def categorize(item_name, mapping=None):
    """Return the category of the first keyword found in *item_name*.

    The item name is lower-cased before matching, and each keyword is
    tested as a plain substring, so keywords are expected to be lower case.
    Keywords are tried in the mapping's iteration order; the first hit wins.

    Args:
        item_name: Item text as read from the receipt.
        mapping: Optional keyword -> category dict. Defaults to the
            module-level ``category_map`` when omitted.

    Returns:
        The matching category, or ``"Uncategorized"`` if no keyword occurs
        in the item name.
    """
    if mapping is None:
        mapping = category_map
    item_name = item_name.lower()
    for keyword, category in mapping.items():
        if keyword in item_name:
            return category
    return "Uncategorized"
# Extract item/price from text

# Lines containing any of these words are receipt footer/summary lines, not items.
_SKIP_WORDS = ('total', 'thank', 'change')

# Item text, whitespace, then an amount with an optional naira marker
# (the sign U+20A6 or a plain 'N'). The amount must close the line so that a
# quantity earlier in the line is not mistaken for the price. The naira sign
# is written as an escape so the pattern survives re-encoding of this file.
_ITEM_PRICE_RE = re.compile(r'(.+?)\s+((?:\u20a6|N)?\s?[\d.,]+)$')


def _parse_price(price_str):
    """Convert an OCR'd amount such as 'N1,500.00' to a whole-number price.

    Any currency marker and whitespace is discarded. When both ',' and '.'
    occur, whichever comes last is taken as the decimal separator and the
    other as digit grouping. A comma-only amount in strict 3-digit groups
    ('1,500') is treated as grouped thousands; any other comma is treated
    as a decimal comma.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    amount = re.sub(r'[^\d.,]', '', price_str)
    last_comma = amount.rfind(',')
    last_dot = amount.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_comma > last_dot:
            # e.g. '1.500,00' -> '1500.00'
            amount = amount.replace('.', '').replace(',', '.')
        else:
            # e.g. '1,500.00' -> '1500.00'
            amount = amount.replace(',', '')
    elif re.fullmatch(r'\d{1,3}(?:,\d{3})+', amount):
        # e.g. '1,500' or '12,000,000' -> grouped thousands
        amount = amount.replace(',', '')
    else:
        # e.g. '12,50' -> decimal comma
        amount = amount.replace(',', '.')
    return round(float(amount))


for line in lines:
    lowered = line.lower()
    if any(word in lowered for word in _SKIP_WORDS):
        continue
    match = _ITEM_PRICE_RE.search(line)
    if not match:
        continue
    item = match.group(1).strip()
    try:
        price = _parse_price(match.group(2))
    except (ValueError, OverflowError):
        # Amount is OCR noise (e.g. a lone '.'): skip this line.
        continue
    expenses.append({'item': item, 'price': price, 'category': categorize(item)})
# Print and save to CSV
print("\nStructured Data:")
for e in expenses:
    # '\u20a6' is the naira sign, written as an escape so it cannot be
    # corrupted if this file is re-saved in a different encoding.
    print(f"{e['item']} - \u20a6{e['price']} - [{e['category']}]")

# newline='' lets the csv module control line endings; the explicit UTF-8
# encoding keeps item names with non-ASCII characters from failing or being
# written differently depending on the platform's default locale.
with open('expenses.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Item', 'Price', 'Category'])
    writer.writerows([e['item'], e['price'], e['category']] for e in expenses)
Setup: We load all libraries and define the OCR path for Windows using pytesseract.tesseract_cmd.
🧾 Image to Text: Using OpenCV, we load the image, convert it to grayscale, and use Tesseract to read the content into raw_text.
🧹 Clean Lines: We split the text into lines and remove empty entries.
💡 Keyword Categorization: A dictionary of keywords is used to detect what category each item belongs to — if no match is found, we mark it as Uncategorized.
🧠 Regex Matching: We use a smart regular expression to extract item names and price values from the end of each line.
💾 Save to CSV: Finally, we write our clean data to a CSV file named expenses.csv for reporting and dashboarding.