import re

import cv2
import numpy as np
import pytesseract

# Lines that are purely alphabetic but are never the holder's name: the
# typical printed header text and the gender line. Without this filter the
# "first clean alphabetic line" heuristic returns them as the name.
_NON_NAME_LINE = re.compile(
    r'government\s+of\s+india'
    r'|unique\s+identification\s+authority'
    r'|^(?:male|female)$',
    re.IGNORECASE,
)

# Exactly three 4-digit groups. The lookarounds reject a 12-digit slice of a
# longer same-line run of 4-digit groups (e.g. a 16-digit Virtual ID).
_AADHAAR_NUMBER = re.compile(
    r'(?<!\d{4}[ \t])\b\d{4}\s\d{4}\s\d{4}\b(?![ \t]\d{4}\b)'
)


def _parse_aadhaar_text(text):
    """Extract Aadhaar fields from OCR text; fields not found are None."""
    aadhaar_match = _AADHAAR_NUMBER.search(text)

    # Prefer a full DD/MM/YYYY date; fall back to the year-only form.
    dob_match = re.search(r'\b\d{2}/\d{2}/\d{4}\b', text)
    yob_match = re.search(
        r'Year\s*of\s*Birth[:\s]*(\d{4})', text, re.IGNORECASE
    )
    if dob_match:
        dob = dob_match.group()
    elif yob_match:
        dob = yob_match.group(1)
    else:
        dob = None

    # The word boundary keeps "Female" from matching the "Male" pattern.
    gender = None
    if re.search(r'\bMale\b', text, re.IGNORECASE):
        gender = 'Male'
    elif re.search(r'\bFemale\b', text, re.IGNORECASE):
        gender = 'Female'

    # Name heuristic: first alphabetic line (spaces allowed) longer than
    # three characters that is not known boilerplate.
    name = None
    for raw_line in text.split('\n'):
        line = raw_line.strip()
        if (
            len(line) > 3
            and line.replace(' ', '').isalpha()
            and not _NON_NAME_LINE.search(line)
        ):
            name = line
            break

    return {
        'aadhaar': aadhaar_match.group() if aadhaar_match else None,
        'name': name,
        'dob': dob,
        'gender': gender,
        'raw_text': text,
    }


def extract_from_image(image_path, *, threshold=150, lang='eng'):
    """Run OCR on an Aadhaar card image and extract its key fields.

    The image is converted to grayscale and binarised with a fixed
    threshold before being passed to Tesseract.

    Args:
        image_path: Path of the image file to read.
        threshold: Grayscale cut-off passed to ``cv2.threshold``; pixels
            above it become white (255), the rest black.
        lang: Tesseract language code passed to ``image_to_string``.

    Returns:
        A dict with keys ``'aadhaar'`` (``'NNNN NNNN NNNN'``), ``'name'``,
        ``'dob'`` (``'DD/MM/YYYY'`` or a 4-digit year), ``'gender'``
        (``'Male'`` / ``'Female'``) and ``'raw_text'`` (the full OCR
        output). Any field that could not be found is ``None``.

    Raises:
        ValueError: If the image cannot be read or decoded.
    """
    img = cv2.imread(image_path)
    # imread signals failure by returning None instead of raising; fail here
    # with a clear message rather than with an opaque cvtColor assertion.
    if img is None:
        raise ValueError(f'Could not read image: {image_path!r}')

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarise to improve OCR accuracy.
    _, binary = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    text = pytesseract.image_to_string(binary, lang=lang)
    return _parse_aadhaar_text(text)
