import pickle
import re
from PIL import Image
from transformers import pipeline
import io

def clean_text(text):
    """Remove angle-bracket tags from *text* and normalize its whitespace.

    Three steps are applied in order:

    1. Every ``<...>`` span containing at least one character other than
       ``>`` is deleted (nothing is inserted in its place, so text on both
       sides of a tag ends up adjacent).
    2. Leading and trailing whitespace is stripped.
    3. Each remaining run of whitespace is replaced by a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty if nothing but tags/whitespace was given.
    """
    without_tags = re.sub(r'<[^>]+>', '', text)
    trimmed = without_tags.strip()
    return re.sub(r'\s+', ' ', trimmed)

# Built once at module level so every extract_text() call reuses the same
# pipeline object instead of constructing a new one per image.
pipe = pipeline(task="image-to-text", model="jinhybr/OCR-Donut-CORD")

def extract_text(binary_image):
    """Run the module-level OCR pipeline on an encoded image and clean the result.

    The bytes are wrapped in an in-memory buffer, opened with PIL, and passed
    to ``pipe``. The ``'generated_text'`` entry of the first prediction is then
    normalized with ``clean_text``.

    Args:
        binary_image: The raw contents of an image file (for example what
            ``open(path, "rb").read()`` returns).

    Returns:
        The cleaned text produced for the image.

    No exceptions are caught here, so any error raised while opening the
    image or running the pipeline propagates to the caller.
    """
    buffer = io.BytesIO(binary_image)
    predictions = pipe(Image.open(buffer))
    # Only the first prediction is used.
    first_prediction = predictions[0]
    return clean_text(first_prediction['generated_text'])

# Manual smoke test, left disabled; uncomment to OCR a sample image:
# print(extract_text(open("pictures/users/2.jpg", "rb").read()))

# Executed at module level, so this prints whenever the file is imported or
# run, once the `pipe` assignment earlier in the module has completed.
print("OCR pipeline loaded successfully!")