Update app.py
app.py (CHANGED)
@@ -1,31 +1,38 @@
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import AutoProcessor, AutoModelForCausalLM
+from transformers import AutoProcessor, AutoModelForCausalLM, AutoConfig

 # =========================
-# Model Setup
+# Model Setup & Patch
 # =========================
-# Florence-2 is much more robust for full-page handwriting than TrOCR
 model_id = 'microsoft/Florence-2-large'
 device = "cuda" if torch.cuda.is_available() else "cpu"

-#
-
+# PATCH: Explicitly handle the Florence2 configuration bug
+config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
+if not hasattr(config, 'forced_bos_token_id'):
+    config.forced_bos_token_id = None
+
+# Load model and processor
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    config=config,
+    trust_remote_code=True
+).to(device).eval()
+
 processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

 def run_ocr(image):
     if image is None:
         return "⚠️ Please upload an image."

-    #
-    #
-    prompt = "<OCR_WITH_REGION>"
+    # Using <DETAILED_CAPTION> or <OCR> task for better text flow
+    # Florence-2 works best with these specific task tags
+    prompt = "<OCR>"

-    # Preprocess image
     inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

-    # Generate text
     with torch.no_grad():
         generated_ids = model.generate(
             input_ids=inputs["input_ids"],
@@ -35,43 +42,29 @@ def run_ocr(image):
             num_beams=3
         )

-    # Decode result
     generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

-    #
+    # Clean up the output
     parsed_answer = processor.post_process_generation(
         generated_text,
         task=prompt,
         image_size=(image.width, image.height)
     )

-
-    result = parsed_answer.get(prompt, "Could not parse text.")
-
-    # If the result is a dict (region based), we extract just the labels/text
-    if isinstance(result, dict) and 'labels' in result:
-        return "\n".join(result['labels'])
-
-    return str(result)
+    return parsed_answer[prompt]

 # =========================
 # Gradio UI
 # =========================
-with gr.Blocks(
-    gr.Markdown("# 🖋️
-    gr.Markdown("Using **Florence-2-Large** for contextual OCR. Better for full letters and messy notes.")
+with gr.Blocks() as demo:
+    gr.Markdown("## 🖋️ Handwritten Note to Text (Florence-2)")

     with gr.Row():
-        input_img = gr.Image(type="pil"
-        output_text = gr.Textbox(label="Extracted Text", lines=
+        input_img = gr.Image(type="pil")
+        output_text = gr.Textbox(label="Extracted Text", lines=10)

-    btn = gr.Button("
+    btn = gr.Button("Convert to Text", variant="primary")
     btn.click(fn=run_ocr, inputs=input_img, outputs=output_text)

-    gr.Examples(
-        examples=[], # You can add paths to example images here
-        inputs=input_img
-    )
-
 if __name__ == "__main__":
     demo.launch()
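For a quick sanity check of the committed pipeline outside Gradio, the sketch below runs the same Florence-2 <OCR> flow on a local file. It is a minimal sketch, not part of this commit: the "sample.png" path is a placeholder, and the pixel_values/max_new_tokens generation arguments (which sit in the diff's elided context lines) follow the Florence-2 model card's documented usage.

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForCausalLM

model_id = "microsoft/Florence-2-large"
device = "cuda" if torch.cuda.is_available() else "cpu"

# Same load path as the committed app.py; trust_remote_code pulls in the
# custom Florence-2 modeling code from the Hub.
model = AutoModelForCausalLM.from_pretrained(model_id, trust_remote_code=True).to(device).eval()
processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)

image = Image.open("sample.png").convert("RGB")  # placeholder path
prompt = "<OCR>"
inputs = processor(text=prompt, images=image, return_tensors="pt").to(device)

with torch.no_grad():
    generated_ids = model.generate(
        input_ids=inputs["input_ids"],
        pixel_values=inputs["pixel_values"],  # assumed; from the diff's elided context lines
        max_new_tokens=1024,                  # assumed; pick a budget that fits a full page
        num_beams=3,
    )

generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
parsed = processor.post_process_generation(
    generated_text, task=prompt, image_size=(image.width, image.height)
)
print(parsed[prompt])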
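The commit drops the old region-handling branch because <OCR> post-processes to a plain string. If region-level output is ever needed again, a small helper in the spirit of the removed code could flatten it; extract_text below is a hypothetical name, and it assumes region tasks such as <OCR_WITH_REGION> post-process to a dict with a 'labels' list, which is the shape the removed branch handled.

def extract_text(parsed_answer, task):
    # Hypothetical helper mirroring the removed branch: region tasks like
    # <OCR_WITH_REGION> are assumed to yield {'quad_boxes': ..., 'labels': ...},
    # while <OCR> yields a plain string.
    result = parsed_answer.get(task, "Could not parse text.")
    if isinstance(result, dict) and 'labels' in result:
        return "\n".join(result['labels'])  # keep only the recognized text lines
    return str(result)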