Final Script: receipt_reader.py
import pytesseract
import cv2
import csv
import re
from PIL import Image
# Set the path to Tesseract OCR (adjust if needed). It is only applied when the
# file actually exists, so machines that have tesseract on PATH keep working
# with pytesseract's default command.
TESSERACT_CMD = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Naira sign (U+20A6), written as an escape so it survives the file being
# saved or read with the wrong text encoding.
NAIRA = '\u20a6'

# Lines containing any of these substrings are receipt boilerplate, not items.
SKIP_KEYWORDS = ('total', 'cash', 'change', 'receipt', 'thank', 'approval', 'code')

# "<item name> <optional NAIRA or N><amount>" with an optional trailing ")".
# Compiled once instead of on every line.
PRICE_LINE_RE = re.compile(r'(.+?)\s+([\u20a6N]?\s?[\d.,]+)\)?$')


def configure_tesseract(cmd=TESSERACT_CMD):
    """Point pytesseract at *cmd* if that executable exists on this machine."""
    import os  # local import: the file's top-level imports do not include os

    if os.path.isfile(cmd):
        pytesseract.pytesseract.tesseract_cmd = cmd


# === PHASE 1: Load image and extract raw text ===
def read_receipt_lines(image_path='receipt.jpg'):
    """Run OCR on the image and return its non-empty, stripped text lines.

    Raises:
        FileNotFoundError: if OpenCV could not load *image_path*
            (cv2.imread signals failure by returning None, not by raising).
    """
    img = cv2.imread(image_path)  # Load receipt image
    if img is None:
        raise FileNotFoundError(f"Could not read image file: {image_path!r}")
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
    raw_text = pytesseract.image_to_string(gray)  # Extract raw text
    stripped = (text_line.strip() for text_line in raw_text.split('\n'))
    return [text_line for text_line in stripped if text_line]


# === PHASE 2: Extract item names and prices ===
def parse_price(text):
    """Convert an OCR'd amount such as 'N 1,500.00' to a whole-unit integer.

    Separator handling:
      * both ',' and '.' present -> whichever comes last is the decimal mark,
        the other one is a thousands separator;
      * only ',' -> thousands separator when it repeats or is followed by
        exactly three digits ('1,500'), otherwise a decimal comma ('12,50');
      * several '.' and no ',' -> thousands separators.

    The result is rounded to the nearest whole currency unit with round().

    Raises:
        ValueError: if no valid number remains after cleaning.
    """
    cleaned = text.replace(NAIRA, '').replace('N', '').replace(' ', '').strip()
    if not re.fullmatch(r'[\d.,]+', cleaned):
        raise ValueError(f"not a price: {text!r}")

    last_comma = cleaned.rfind(',')
    last_dot = cleaned.rfind('.')
    if last_comma != -1 and last_dot != -1:
        if last_dot > last_comma:
            cleaned = cleaned.replace(',', '')  # 1,500.00
        else:
            cleaned = cleaned.replace('.', '').replace(',', '.')  # 1.500,00
    elif last_comma != -1:
        digits_after = len(cleaned) - last_comma - 1
        if cleaned.count(',') > 1 or digits_after == 3:
            cleaned = cleaned.replace(',', '')  # 1,500 / 1,234,567
        else:
            cleaned = cleaned.replace(',', '.')  # 12,50
    elif cleaned.count('.') > 1:
        cleaned = cleaned.replace('.', '')  # 1.234.567

    return round(float(cleaned))  # float('') etc. raises ValueError


def extract_expenses(lines):
    """Return [{'item': str, 'price': int}, ...] for every line that ends in a price.

    Lines containing any SKIP_KEYWORDS substring (case-insensitive) and lines
    whose amount cannot be parsed are skipped.
    """
    expenses = []
    for raw_line in lines:
        line = raw_line.strip()
        lowered = line.lower()
        if any(word in lowered for word in SKIP_KEYWORDS):
            continue
        match = PRICE_LINE_RE.search(line)
        if not match:
            continue
        try:
            price = parse_price(match.group(2))
        except ValueError:
            continue  # OCR noise that only looked like a number
        expenses.append({'item': match.group(1).strip(), 'price': price})
    return expenses


# === Save to CSV file ===
def save_expenses(expenses, csv_path='expenses.csv'):
    """Write the expenses to *csv_path* with an 'Item', 'Price' header row."""
    # Explicit UTF-8: OCR'd item names may contain characters that the
    # platform default encoding cannot represent.
    with open(csv_path, 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['Item', 'Price'])
        writer.writerows([e['item'], e['price']] for e in expenses)


def main(image_path='receipt.jpg', csv_path='expenses.csv'):
    """Scan the receipt image, print what was found, and save it as CSV."""
    configure_tesseract()

    lines = read_receipt_lines(image_path)
    print("Cleaned Lines:")
    for line in lines:
        print(f"-> {line}")

    expenses = extract_expenses(lines)

    # === Show the results ===
    print("\nStructured Data:")
    for e in expenses:
        print(f"{e['item']} - {NAIRA}{e['price']}")

    save_expenses(expenses, csv_path)


if __name__ == '__main__':
    main()
Your Folder Should Look Like This
receipt_reader.py — your Python file
receipt.jpg — the image you want to scan
expenses.csv — created automatically after running the script
How to Run It
python receipt_reader.py
You should see the cleaned lines and the extracted items printed in your terminal, and expenses.csv saved in the same folder.