Update app.py
app.py CHANGED
@@ -1,125 +1,77 @@
 import gradio as gr
 from PIL import Image
-import numpy as np
-import cv2
 import torch
-
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from transformers import AutoProcessor, AutoModelForCausalLM
 
 # =========================
-# Model
+# Model Setup
 # =========================
-
-
+# Florence-2 is much more robust for full-page handwriting than TrOCR
+model_id = 'microsoft/Florence-2-large'
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-
-
-
-if processor is None or model is None:
-    processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
-    model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-    model.to(device)
-
-
-# =========================
-# Line Segmentation Logic
-# =========================
-def segment_lines(image: Image.Image):
-    """
-    Splits image into individual text lines using horizontal projection
-    """
-
-    # Convert to grayscale
-    gray = np.array(image.convert("L"))
-
-    # Apply thresholding
-    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
-
-    # Horizontal projection
-    horizontal_sum = np.sum(thresh, axis=1)
-
-    lines = []
-    start = None
-
-    for i, val in enumerate(horizontal_sum):
-        if val > 0 and start is None:
-            start = i
-        elif val == 0 and start is not None:
-            end = i
-            lines.append((start, end))
-            start = None
-
-    # Edge case: last line
-    if start is not None:
-        lines.append((start, len(horizontal_sum)))
-
-    # Crop line images
-    line_images = []
-    for (s, e) in lines:
-        # Add small padding
-        top = max(0, s - 5)
-        bottom = min(image.height, e + 5)
-
-        cropped = image.crop((0, top, image.width, bottom))
-
-        # Skip very small/noise regions
-        if bottom - top > 10:
-            line_images.append(cropped)
-
-    return line_images
-
-
-# =========================
-# OCR Prediction
-# =========================
-def predict(image):
-    load_model()
+# Load model and processor with trust_remote_code=True for the Florence architecture
+model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device).eval()
+processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
 
+def run_ocr(image):
     if image is None:
         return "⚠️ Please upload an image."
 
-    # ... (old lines 82-111 were not rendered in the diff view)
+    # Florence-2 uses specific task prompts.
+    # <OCR_WITH_REGION> is best for messy handwriting and preserving layout.
+    prompt = "<OCR_WITH_REGION>"
+
+    # Preprocess image
+    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
+
+    # Generate text
+    with torch.no_grad():
+        generated_ids = model.generate(
+            input_ids=inputs["input_ids"],
+            pixel_values=inputs["pixel_values"],
+            max_new_tokens=1024,
+            do_sample=False,
+            num_beams=3
+        )
+
+    # Decode result; keep special tokens so post_process_generation can parse the task/region tags
+    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=False)[0]
+
+    # Post-process to clean up the Florence-specific tags
+    parsed_answer = processor.post_process_generation(
+        generated_text,
+        task=prompt,
+        image_size=(image.width, image.height)
+    )
+
+    # Extract the plain text from the parsed dictionary
+    result = parsed_answer.get(prompt, "Could not parse text.")
+
+    # If the result is a dict (region based), extract just the labels/text
+    if isinstance(result, dict) and 'labels' in result:
+        return "\n".join(result['labels'])
+
+    return str(result)
 
 # =========================
 # Gradio UI
 # =========================
-# ... (old lines 116-121 were not rendered in the diff view)
-)
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🖋️ Advanced Handwritten Note Extractor")
+    gr.Markdown("Using **Florence-2-Large** for contextual OCR. Better for full letters and messy notes.")
+
+    with gr.Row():
+        input_img = gr.Image(type="pil", label="Upload Handwritten Letter")
+        output_text = gr.Textbox(label="Extracted Text", lines=15)
+
+    btn = gr.Button("Extract Text", variant="primary")
+    btn.click(fn=run_ocr, inputs=input_img, outputs=output_text)
+
+    gr.Examples(
+        examples=[],  # You can add paths to example images here
+        inputs=input_img
+    )
 
 if __name__ == "__main__":
     demo.launch()
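
A quick way to sanity-check the new Florence-2 path outside the Gradio UI is to call run_ocr directly. A minimal sketch, assuming the module-level model/processor load succeeds; sample_letter.png is a hypothetical path, not a file in this repo:

    # Minimal smoke test (hypothetical image path). Importing app triggers the
    # model download; demo.launch() stays behind the __main__ guard, so no server starts.
    from PIL import Image
    import app

    img = Image.open("sample_letter.png").convert("RGB")
    print(app.run_ocr(img))   # one line per detected text region
    print(app.run_ocr(None))  # "⚠️ Please upload an image."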
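One caveat with <OCR_WITH_REGION>: joining result['labels'] keeps the model's emission order, which usually but not always matches reading order. If line order ever looks scrambled, the quad boxes in the parsed result can be used to sort regions top-to-bottom first. A sketch, assuming the parsed dict carries the usual quad_boxes/labels keys (eight floats per quad, x/y interleaved):

    # Sort OCR regions by the topmost y of each quad before joining.
    def regions_to_text(result: dict) -> str:
        pairs = zip(result["quad_boxes"], result["labels"])
        # y values sit at the odd indices of the 8-float quad (x1, y1, ..., x4, y4)
        ordered = sorted(pairs, key=lambda p: min(p[0][1::2]))
        return "\n".join(label for _, label in ordered)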
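If plain transcription is enough and the boxes are never used, Florence-2 also supports a simpler <OCR> task that returns a single string, so the region handling at the end of run_ocr could be dropped. A sketch reusing the model, processor, and device already defined in app.py; sample_letter.png is again a hypothetical path:

    # Alternative task prompt: plain "<OCR>" yields one transcription string.
    from PIL import Image

    image = Image.open("sample_letter.png").convert("RGB")
    prompt = "<OCR>"
    inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        ids = model.generate(input_ids=inputs["input_ids"],
                             pixel_values=inputs["pixel_values"],
                             max_new_tokens=1024, num_beams=3)
    text = processor.batch_decode(ids, skip_special_tokens=False)[0]
    parsed = processor.post_process_generation(text, task=prompt,
                                               image_size=(image.width, image.height))
    print(parsed["<OCR>"])  # plain text, no quad boxes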