Spaces:

IFMedTechdemo
/

Multi-Model-OCR

Running on Zero

App Files Files Community

IFMedTechdemo committed 25 days ago

Commit

962d22d

verified ·

1 Parent(s): ccbebae

Update app.py

Browse files

Files changed (1) hide show

app.py +198 -112

app.py CHANGED Viewed

@@ -1,6 +1,12 @@
 import os
 import time
 import torch
 from threading import Thread
 from PIL import Image
 from transformers import (
@@ -23,55 +29,89 @@ MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
 # Load Chandra-OCR
-MODEL_ID_V = "datalab-to/chandra"
-processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-if Qwen3VLForConditionalGeneration:
-    model_v = Qwen3VLForConditionalGeneration.from_pretrained(
-        MODEL_ID_V,
-        trust_remote_code=True,
-        torch_dtype=torch.float16
-    ).to(device).eval()
-else:
     model_v = None
 # Load Nanonets-OCR2-3B
-MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_X,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
-# Load Dots.OCR from the local, patched directory
-MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
-processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
-model_d = AutoModelForCausalLM.from_pretrained(
-    MODEL_PATH_D,
-    attn_implementation="flash_attention_2",
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    trust_remote_code=True
-).eval()
 # Load olmOCR-2-7B-1025
-MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
-processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
-model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_M,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 # Load DeepSeek-OCR
-MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
-processor_ds = AutoProcessor.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
-model_ds = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_DS,
-    trust_remote_code=True,
-    torch_dtype=torch.float16
-).to(device).eval()
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
@@ -81,6 +121,9 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
     Args:
         model_name: Name of the OCR model to use
         text: Prompt text for the model
@@ -94,25 +137,40 @@ def generate_image(model_name: str, text: str, image: Image.Image,
     Yields:
         tuple: (raw_text, markdown_text)
     """
     # Select model and processor based on model_name
     if model_name == "olmOCR-2-7B-1025":
         processor = processor_m
-        model = model_m
     elif model_name == "Nanonets-OCR2-3B":
         processor = processor_x
-        model = model_x
     elif model_name == "Chandra-OCR":
         if model_v is None:
-            yield "Chandra-OCR model not available.", "Chandra-OCR model not available."
             return
         processor = processor_v
-        model = model_v
     elif model_name == "Dots.OCR":
         processor = processor_d
-        model = model_d
     elif model_name == "DeepSeek-OCR":
         processor = processor_ds
-        model = model_ds
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -121,89 +179,108 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         yield "Please upload an image.", "Please upload an image."
         return
-    # Prepare messages in chat format
-    messages = [{
-        "role": "user",
-        "content": [
-            {"type": "image"},
-            {"type": "text", "text": text},
-        ]
-    }]
-    # Apply chat template
-    prompt_full = processor.apply_chat_template(
-        messages,
-        tokenize=False,
-        add_generation_prompt=True
-    )
-    # Process inputs
-    inputs = processor(
-        text=[prompt_full],
-        images=[image],
-        return_tensors="pt",
-        padding=True
-    ).to(device)
-    # Setup streaming generation
-    streamer = TextIteratorStreamer(
-        processor,
-        skip_prompt=True,
-        skip_special_tokens=True
-    )
-    generation_kwargs = {
-        **inputs,
-        "streamer": streamer,
-        "max_new_tokens": max_new_tokens,
-        "do_sample": True,
-        "temperature": temperature,
-        "top_p": top_p,
-        "top_k": top_k,
-        "repetition_penalty": repetition_penalty,
-    }
-    # Start generation in separate thread
-    thread = Thread(target=model.generate, kwargs=generation_kwargs)
-    thread.start()
-    # Stream the results
-    buffer = ""
-    for new_text in streamer:
-        buffer += new_text
-        buffer = buffer.replace("<|im_end|>", "")
-        time.sleep(0.01)
-        yield buffer, buffer
-    # Ensure thread completes
-    thread.join()
 # Example usage for Gradio interface
 if __name__ == "__main__":
     import gradio as gr
-    with gr.Blocks() as demo:
-        gr.Markdown("# Multi-Model OCR Application")
-        gr.Markdown("Upload an image and select a model to extract text")
         with gr.Row():
             with gr.Column():
                 model_selector = gr.Dropdown(
-                    choices=[
-                        "olmOCR-2-7B-1025",
-                        "Nanonets-OCR2-3B",
-                        "Chandra-OCR",
-                        "Dots.OCR",
-                        "DeepSeek-OCR"
-                    ],
-                    value="DeepSeek-OCR",
                     label="Select OCR Model"
                 )
                 image_input = gr.Image(type="pil", label="Upload Image")
                 text_input = gr.Textbox(
                     value="Extract all text from this image.",
-                    label="Prompt"
                 )
                 with gr.Accordion("Advanced Settings", open=False):
@@ -249,6 +326,15 @@ if __name__ == "__main__":
                 output_text = gr.Textbox(label="Extracted Text", lines=20)
                 output_markdown = gr.Markdown(label="Formatted Output")
         submit_btn.click(
             fn=generate_image,
             inputs=[

+"""
+OCR Application with Multiple Models including DeepSeek OCR
+Fixed version with @spaces.GPU decorator for Hugging Face Spaces
+"""
 import os
 import time
 import torch
+import spaces
 from threading import Thread
 from PIL import Image
 from transformers import (
 device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+print(f"Initial Device: {device}")
+print(f"CUDA Available: {torch.cuda.is_available()}")
 # Load Chandra-OCR
+try:
+    MODEL_ID_V = "datalab-to/chandra"
+    processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
+    if Qwen3VLForConditionalGeneration:
+        model_v = Qwen3VLForConditionalGeneration.from_pretrained(
+            MODEL_ID_V,
+            trust_remote_code=True,
+            torch_dtype=torch.float16
+        ).eval()
+        print("✓ Chandra-OCR loaded")
+    else:
+        model_v = None
+        print("✗ Chandra-OCR: Qwen3VL not available")
+except Exception as e:
     model_v = None
+    processor_v = None
+    print(f"✗ Chandra-OCR: Failed to load - {str(e)}")
 # Load Nanonets-OCR2-3B
+try:
+    MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
+    processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
+    model_x = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_X,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).eval()
+    print("✓ Nanonets-OCR2-3B loaded")
+except Exception as e:
+    model_x = None
+    processor_x = None
+    print(f"✗ Nanonets-OCR2-3B: Failed to load - {str(e)}")
+# Load Dots.OCR - will be moved to GPU when needed
+try:
+    MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
+    processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
+    model_d = AutoModelForCausalLM.from_pretrained(
+        MODEL_PATH_D,
+        attn_implementation="flash_attention_2",
+        torch_dtype=torch.bfloat16,
+        trust_remote_code=True
+    ).eval()
+    print("✓ Dots.OCR loaded")
+except Exception as e:
+    model_d = None
+    processor_d = None
+    print(f"✗ Dots.OCR: Failed to load - {str(e)}")
 # Load olmOCR-2-7B-1025
+try:
+    MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
+    processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
+    model_m = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_M,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).eval()
+    print("✓ olmOCR-2-7B-1025 loaded")
+except Exception as e:
+    model_m = None
+    processor_m = None
+    print(f"✗ olmOCR-2-7B-1025: Failed to load - {str(e)}")
 # Load DeepSeek-OCR
+try:
+    MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
+    processor_ds = AutoProcessor.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
+    model_ds = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+        MODEL_ID_DS,
+        trust_remote_code=True,
+        torch_dtype=torch.float16
+    ).eval()
+    print("✓ DeepSeek-OCR loaded")
+except Exception as e:
+    model_ds = None
+    processor_ds = None
+    print(f"✗ DeepSeek-OCR: Failed to load - {str(e)}")
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
+    This function is decorated with @spaces.GPU to ensure it runs on GPU
+    when available in Hugging Face Spaces.
     Args:
         model_name: Name of the OCR model to use
         text: Prompt text for the model
     Yields:
         tuple: (raw_text, markdown_text)
     """
+    # Device will be cuda when @spaces.GPU decorator activates
+    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
     # Select model and processor based on model_name
     if model_name == "olmOCR-2-7B-1025":
+        if model_m is None:
+            yield "olmOCR-2-7B-1025 is not available.", "olmOCR-2-7B-1025 is not available."
+            return
         processor = processor_m
+        model = model_m.to(device)
     elif model_name == "Nanonets-OCR2-3B":
+        if model_x is None:
+            yield "Nanonets-OCR2-3B is not available.", "Nanonets-OCR2-3B is not available."
+            return
         processor = processor_x
+        model = model_x.to(device)
     elif model_name == "Chandra-OCR":
         if model_v is None:
+            yield "Chandra-OCR is not available.", "Chandra-OCR is not available."
             return
         processor = processor_v
+        model = model_v.to(device)
     elif model_name == "Dots.OCR":
+        if model_d is None:
+            yield "Dots.OCR is not available.", "Dots.OCR is not available."
+            return
         processor = processor_d
+        model = model_d.to(device)
     elif model_name == "DeepSeek-OCR":
+        if model_ds is None:
+            yield "DeepSeek-OCR is not available.", "DeepSeek-OCR is not available."
+            return
         processor = processor_ds
+        model = model_ds.to(device)
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         yield "Please upload an image.", "Please upload an image."
         return
+    try:
+        # Prepare messages in chat format
+        messages = [{
+            "role": "user",
+            "content": [
+                {"type": "image"},
+                {"type": "text", "text": text},
+            ]
+        }]
+        # Apply chat template
+        prompt_full = processor.apply_chat_template(
+            messages,
+            tokenize=False,
+            add_generation_prompt=True
+        )
+        # Process inputs
+        inputs = processor(
+            text=[prompt_full],
+            images=[image],
+            return_tensors="pt",
+            padding=True
+        ).to(device)
+        # Setup streaming generation
+        streamer = TextIteratorStreamer(
+            processor,
+            skip_prompt=True,
+            skip_special_tokens=True
+        )
+        generation_kwargs = {
+            **inputs,
+            "streamer": streamer,
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": temperature,
+            "top_p": top_p,
+            "top_k": top_k,
+            "repetition_penalty": repetition_penalty,
+        }
+        # Start generation in separate thread
+        thread = Thread(target=model.generate, kwargs=generation_kwargs)
+        thread.start()
+        # Stream the results
+        buffer = ""
+        for new_text in streamer:
+            buffer += new_text
+            buffer = buffer.replace("<|im_end|>", "")
+            time.sleep(0.01)
+            yield buffer, buffer
+        # Ensure thread completes
+        thread.join()
+    except Exception as e:
+        error_msg = f"Error during generation: {str(e)}"
+        yield error_msg, error_msg
 # Example usage for Gradio interface
 if __name__ == "__main__":
     import gradio as gr
+    # Determine available models
+    available_models = []
+    if model_m is not None:
+        available_models.append("olmOCR-2-7B-1025")
+    if model_x is not None:
+        available_models.append("Nanonets-OCR2-3B")
+    if model_v is not None:
+        available_models.append("Chandra-OCR")
+    if model_d is not None:
+        available_models.append("Dots.OCR")
+    if model_ds is not None:
+        available_models.append("DeepSeek-OCR")
+    if not available_models:
+        print("ERROR: No models were loaded successfully!")
+        exit(1)
+    print(f"\n✓ Available models: {', '.join(available_models)}")
+    with gr.Blocks(title="Multi-Model OCR") as demo:
+        gr.Markdown("# 🔍 Multi-Model OCR Application")
+        gr.Markdown("Upload an image and select a model to extract text. Models run on GPU via Hugging Face Spaces.")
         with gr.Row():
             with gr.Column():
                 model_selector = gr.Dropdown(
+                    choices=available_models,
+                    value=available_models[0] if available_models else None,
                     label="Select OCR Model"
                 )
                 image_input = gr.Image(type="pil", label="Upload Image")
                 text_input = gr.Textbox(
                     value="Extract all text from this image.",
+                    label="Prompt",
+                    lines=2
                 )
                 with gr.Accordion("Advanced Settings", open=False):
                 output_text = gr.Textbox(label="Extracted Text", lines=20)
                 output_markdown = gr.Markdown(label="Formatted Output")
+        gr.Markdown("""
+        ### Available Models:
+        - **olmOCR-2-7B-1025**: Allen AI's OCR model
+        - **Nanonets-OCR2-3B**: Nanonets OCR model
+        - **Chandra-OCR**: Datalab OCR model
+        - **Dots.OCR**: Stranger Vision OCR model
+        - **DeepSeek-OCR**: DeepSeek AI's OCR model
+        """)
         submit_btn.click(
             fn=generate_image,
             inputs=[