Spaces:

IFMedTechdemo
/

Multi-Model-OCR

Running on Zero

App Files Community

IFMedTechdemo committed 25 days ago

Commit

e71abcc

verified ·

1 Parent(s): 161e0b2

Update app.py

Browse files

Files changed (1) hide show

app.py +23 -68

app.py CHANGED Viewed

@@ -19,28 +19,6 @@ import time
 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# Load Chandra-OCR using AutoModel
-MODEL_ID_V = "datalab-to/chandra"
-processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-model_v = AutoModel.from_pretrained(
-    MODEL_ID_V,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="sdpa",
-    device_map="auto"
-).eval()
-# Load Nanonets-OCR2-3B using AutoModel
-MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = AutoModel.from_pretrained(
-    MODEL_ID_X,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="sdpa",
-    device_map="auto"
-).eval()
 # Load Dots.OCR
 MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
@@ -52,9 +30,9 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
-# Load olmOCR-2-7B-1025-FP8 using AutoModel
-MODEL_ID_M = "allenai/olmOCR-2-7B-1025-FP8"
-processor_m = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True)
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
@@ -74,9 +52,6 @@ model_ds = AutoModel.from_pretrained(
     device_map="auto"
 ).eval().to(torch.bfloat16)
-# Rest of your code remains the same...
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
@@ -91,7 +66,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     # Handle DeepSeek-OCR separately due to different API
     if model_name == "DeepSeek-OCR":
-        # DeepSeek-OCR resolution configs
         resolution_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -101,18 +75,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         }
         config = resolution_configs[resolution_mode]
-        # Save image temporarily
         temp_image_path = "/tmp/temp_ocr_image.jpg"
         image.save(temp_image_path)
-        # DeepSeek-OCR uses special prompt format
         if not text:
             text = "Free OCR."
         prompt_ds = f"<image>\n{text}"
         try:
-            # DeepSeek-OCR's custom infer method
             result = model_ds.infer(
                 tokenizer_ds,
                 prompt=prompt_ds,
@@ -128,21 +98,14 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         except Exception as e:
             yield f"Error: {str(e)}", f"Error: {str(e)}"
         finally:
-            # Clean up temp file
             if os.path.exists(temp_image_path):
                 os.remove(temp_image_path)
         return
     # Handle other models with standard API
-    if model_name == "olmOCR-2-7B-1025-FP8":
         processor = processor_m
         model = model_m
-    elif model_name == "Nanonets-OCR2-3B":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Chandra-OCR":
-        processor = processor_v
-        model = model_v
     elif model_name == "Dots.OCR":
         processor = processor_d
         model = model_d
@@ -154,9 +117,10 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         "role": "user",
         "content": [
             {"type": "image"},
-            {"type": "text", "text": text},
         ]
     }]
     prompt_full = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
@@ -215,26 +179,18 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
         """
         # 🔍 Multi-Model OCR Comparison Space
-        Compare five state-of-the-art OCR models on your images:
-        - **Chandra-OCR**: Specialized OCR model
-        - **Nanonets-OCR2-3B**: High-accuracy OCR
-        - **Dots.OCR**: Lightweight OCR solution
-        - **olmOCR-2-7B-1025-FP8**: Advanced FP8 quantized OCR model
-        - **DeepSeek-OCR**: Context compression OCR with 10× compression ratio (97% accuracy)
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             model_selector = gr.Dropdown(
-                choices=[
-                    "Chandra-OCR",
-                    "Nanonets-OCR2-3B",
-                    "Dots.OCR",
-                    "olmOCR-2-7B-1025-FP8",
-                    "DeepSeek-OCR"
-                ],
-                value="DeepSeek-OCR",
                 label="Select OCR Model",
                 elem_classes=["model-selector"]
             )
@@ -243,8 +199,8 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
                 choices=["Tiny", "Small", "Base", "Large", "Gundam"],
                 value="Gundam",
                 label="DeepSeek-OCR Resolution Mode",
-                info="Only applies to DeepSeek-OCR. Gundam mode recommended for best results.",
-                visible=True
             )
             image_input = gr.Image(type="pil", label="Upload Image")
@@ -339,20 +295,19 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
     gr.Markdown(
         """
-        ### Model Information:
-        **DeepSeek-OCR Modes:**
-        - **Tiny**: 64 tokens @ 512×512 (fastest, basic documents)
-        - **Small**: 100 tokens @ 640×640 (good for simple pages)
-        - **Base**: 256 tokens @ 1024×1024 (standard documents)
-        - **Large**: 400 tokens @ 1280×1280 (complex layouts)
-        - **Gundam**: Dynamic multi-view (recommended for best accuracy)
         ### Tips:
-        - Upload clear images for best results
-        - DeepSeek-OCR excels at table extraction and markdown conversion
         - Adjust temperature for more creative or conservative outputs
-        - Try different models to compare performance on your specific use case
         """
     )

 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # Load Dots.OCR
 MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
     trust_remote_code=True
 ).eval()
+# Load olmOCR-2-7B-1025 (non-FP8 version for simplicity)
+MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
     device_map="auto"
 ).eval().to(torch.bfloat16)
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
     # Handle DeepSeek-OCR separately due to different API
     if model_name == "DeepSeek-OCR":
         resolution_configs = {
             "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
             "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
         }
         config = resolution_configs[resolution_mode]
         temp_image_path = "/tmp/temp_ocr_image.jpg"
         image.save(temp_image_path)
         if not text:
             text = "Free OCR."
         prompt_ds = f"<image>\n{text}"
         try:
             result = model_ds.infer(
                 tokenizer_ds,
                 prompt=prompt_ds,
         except Exception as e:
             yield f"Error: {str(e)}", f"Error: {str(e)}"
         finally:
             if os.path.exists(temp_image_path):
                 os.remove(temp_image_path)
         return
     # Handle other models with standard API
+    if model_name == "olmOCR-2-7B-1025":
         processor = processor_m
         model = model_m
     elif model_name == "Dots.OCR":
         processor = processor_d
         model = model_d
         "role": "user",
         "content": [
             {"type": "image"},
+            {"type": "text", "text": text if text else "Perform OCR on this image."},
         ]
     }]
     prompt_full = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
         """
         # 🔍 Multi-Model OCR Comparison Space
+        Compare three state-of-the-art OCR models:
+        - **Dots.OCR**: Lightweight and efficient OCR
+        - **olmOCR-2-7B-1025**: Advanced OCR for math, tables, and complex layouts (82.4% accuracy)
+        - **DeepSeek-OCR**: Context compression OCR with 10× compression (97% accuracy)
         """
     )
     with gr.Row():
         with gr.Column(scale=1):
             model_selector = gr.Dropdown(
+                choices=["Dots.OCR", "olmOCR-2-7B-1025", "DeepSeek-OCR"],
+                value="olmOCR-2-7B-1025",
                 label="Select OCR Model",
                 elem_classes=["model-selector"]
             )
                 choices=["Tiny", "Small", "Base", "Large", "Gundam"],
                 value="Gundam",
                 label="DeepSeek-OCR Resolution Mode",
+                info="Only applies to DeepSeek-OCR. Gundam mode recommended.",
+                visible=False
             )
             image_input = gr.Image(type="pil", label="Upload Image")
     gr.Markdown(
         """
+        ### Model Strengths:
+        **Dots.OCR**: Fast and lightweight, great for simple documents and quick processing
+        **olmOCR-2-7B-1025**: Best for complex documents with tables, LaTeX equations, multi-column layouts, and handwritten text
+        **DeepSeek-OCR**: Excellent for markdown conversion, table extraction, and efficient context compression (10× smaller output)
         ### Tips:
+        - Upload clear, well-lit images for best results
+        - Use olmOCR for academic papers and technical documents
+        - Use DeepSeek for efficient processing of large document batches
         - Adjust temperature for more creative or conservative outputs
         """
     )