Spaces:

IFMedTechdemo
/

Multi-Model-OCR

Running on Zero

App Files Files Community

IFMedTechdemo committed 28 days ago

Commit

0eb08d6

verified ·

1 Parent(s): 5594430

Update app.py

Browse files

Files changed (1) hide show

app.py +183 -31

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 """
 OCR Application with Multiple Models including DeepSeek OCR
-Final fixed version with proper tokenizer handling
 """
 import os
@@ -8,11 +8,17 @@ import time
 import torch
 import spaces
 import warnings
 from threading import Thread
 from PIL import Image
 from transformers import (
     AutoProcessor,
     AutoModelForCausalLM,
     Qwen2_5_VLForConditionalGeneration,
     TextIteratorStreamer
 )
@@ -101,41 +107,172 @@ except Exception as e:
     processor_m = None
     print(f"✗ olmOCR-2-7B-1025: Failed to load - {str(e)}")
-# Load DeepSeek-OCR with proper tokenizer handling
 try:
     MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
-    processor_ds = AutoProcessor.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
-    model_ds = Qwen2_5_VLForConditionalGeneration.from_pretrained(
         MODEL_ID_DS,
         trust_remote_code=True,
-        torch_dtype=torch.float16
     ).eval()
-    # Fix tokenizer chat template - access the correct tokenizer attribute
     try:
-        # The tokenizer might be nested under processor_ds.tokenizer
-        tokenizer = processor_ds.tokenizer if hasattr(processor_ds, 'tokenizer') else processor_ds
-        if not hasattr(tokenizer, 'chat_template') or tokenizer.chat_template is None:
-            # Use a standard Qwen-style chat template
-            tokenizer.chat_template = "{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n{% endif %}<|im_start|>{{ message['role'] }}\n{% if message['content'] is string %}{{ message['content'] }}{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'text' %}{{ content['text'] }}{% endif %}{% endfor %}{% endif %}<|im_end|>\n{% endfor %}<|im_start|>assistant\n"
-            print("✓ DeepSeek-OCR loaded (with custom chat template)")
-        else:
-            print("✓ DeepSeek-OCR loaded")
-    except Exception as tokenizer_error:
-        print(f"  Warning: Could not set chat template - {tokenizer_error}")
-        print("  Model loaded but may need fallback prompting")
-except Exception as e:
-    model_ds = None
-    processor_ds = None
-    print(f"✗ DeepSeek-OCR: Failed to load - {str(e)}")
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
-                   top_k: int, repetition_penalty: float):
     """
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
@@ -152,10 +289,16 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         top_p: Nucleus sampling parameter
         top_k: Top-k sampling parameter
         repetition_penalty: Penalty for repeating tokens
     Yields:
         tuple: (raw_text, markdown_text)
     """
     # Device will be cuda when @spaces.GPU decorator activates
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
@@ -184,12 +327,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
             return
         processor = processor_d
         model = model_d.to(device)
-    elif model_name == "DeepSeek-OCR":
-        if model_ds is None:
-            yield "DeepSeek-OCR is not available.", "DeepSeek-OCR is not available."
-            return
-        processor = processor_ds
-        model = model_ds.to(device)
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
@@ -218,7 +355,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
         except Exception as template_error:
             # Fallback: create a simple prompt without chat template
             print(f"Chat template error: {template_error}. Using fallback prompt.")
-            # Simple format that most models understand
             prompt_full = f"{text}"
         # Process inputs
@@ -347,6 +483,14 @@ if __name__ == "__main__":
                         step=0.1,
                         label="Repetition Penalty"
                     )
                 submit_btn = gr.Button("Extract Text", variant="primary")
@@ -360,7 +504,14 @@ if __name__ == "__main__":
         - **Nanonets-OCR2-3B**: Nanonets OCR model
         - **Chandra-OCR**: Datalab OCR model
         - **Dots.OCR**: Stranger Vision OCR model
-        - **DeepSeek-OCR**: DeepSeek AI's OCR model (experimental)
         """)
         submit_btn.click(
@@ -373,7 +524,8 @@ if __name__ == "__main__":
                 temperature,
                 top_p,
                 top_k,
-                repetition_penalty
             ],
             outputs=[output_text, output_markdown]
         )

 """
 OCR Application with Multiple Models including DeepSeek OCR
+Merged version with working DeepSeek implementation
 """
 import os
 import torch
 import spaces
 import warnings
+import tempfile
+import sys
+from io import StringIO
+from contextlib import contextmanager
 from threading import Thread
 from PIL import Image
 from transformers import (
     AutoProcessor,
     AutoModelForCausalLM,
+    AutoModel,
+    AutoTokenizer,
     Qwen2_5_VLForConditionalGeneration,
     TextIteratorStreamer
 )
     processor_m = None
     print(f"✗ olmOCR-2-7B-1025: Failed to load - {str(e)}")
+# Load DeepSeek-OCR through the repository's own code (trust_remote_code=True):
+# a tokenizer plus an AutoModel that generate_image_deepseek() later drives via
+# model_ds.infer(). Any failure is printed and leaves both globals set to None,
+# so the rest of the module keeps loading without this model.
 try:
     MODEL_ID_DS = "deepseek-ai/deepseek-ocr"
+    tokenizer_ds = AutoTokenizer.from_pretrained(MODEL_ID_DS, trust_remote_code=True)
+    model_ds = AutoModel.from_pretrained(
         MODEL_ID_DS,
+        # NOTE(review): requests FlashAttention-2; presumably this needs the
+        # flash-attn package to be installed at load time - confirm for the Space image.
+        _attn_implementation="flash_attention_2",
         trust_remote_code=True,
+        use_safetensors=True,
     ).eval()
+    print("✓ DeepSeek-OCR loaded")
+except Exception as e:
+    # Best-effort load: generate_image_deepseek() checks model_ds for None.
+    model_ds = None
+    tokenizer_ds = None
+    print(f"✗ DeepSeek-OCR: Failed to load - {str(e)}")
+@contextmanager
+def capture_stdout():
+    """Temporarily swap ``sys.stdout`` for an in-memory buffer.
+    Yields:
+        StringIO: buffer that receives everything printed inside the ``with`` body.
+    The previous ``sys.stdout`` is put back on exit, even if the body raises.
+    """
+    buffer = StringIO()
+    # Right-hand side is evaluated first, so saved_stdout holds the old stream.
+    saved_stdout, sys.stdout = sys.stdout, buffer
+    try:
+        yield buffer
+    finally:
+        sys.stdout = saved_stdout
+# Resolution/cropping settings passed to model_ds.infer() for each UI preset.
+_DS_PRESETS = {
+    "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
+    "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
+    "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+    "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+    "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
+}
+# Line prefixes treated as debug output while scanning captured stdout.
+_DS_DEBUG_PREFIXES = ("image size:", "valid image", "output texts", "compression")
+# Substrings (lower-case) that mark a leftover debug line in the final text.
+_DS_DEBUG_MARKERS = ("image size:", "valid image", "compression ratio", "save results:", "output texts")
+def _save_as_rgb_jpeg(image, path):
+    """Write *image* to *path* as a JPEG, flattening any transparency onto white."""
+    if image.mode in ("RGBA", "LA", "P"):
+        # Converting to RGBA first gives LA and transparent-palette images a real
+        # alpha band, so they are masked the same way RGBA images are.
+        rgba = image.convert("RGBA")
+        flattened = Image.new("RGB", rgba.size, (255, 255, 255))
+        flattened.paste(rgba, mask=rgba.split()[3])
+        flattened.save(path, "JPEG", quality=95)
+        return
+    try:
+        image.save(path, "JPEG", quality=95)
+    except OSError:
+        # Pillow refuses some modes as JPEG (e.g. "I", "F"); retry as RGB.
+        image.convert("RGB").save(path, "JPEG", quality=95)
+def _extract_text_from_stdout(captured):
+    """Return the OCR text found in the stdout captured around infer(), or ""."""
+    kept = []
+    capturing = False
+    for line in captured.split("\n"):
+        # Capture starts at the first markdown-looking line (heading or bold).
+        if "# " in line or line.strip().startswith("**"):
+            capturing = True
+        if not capturing:
+            continue
+        # "====" always counts as a separator; "---" only when longer than 10
+        # characters, so a short markdown rule does not end the capture.
+        is_separator = line.startswith("====") or (line.startswith("---") and len(line) > 10)
+        if is_separator:
+            if kept:  # only stop once something has been collected
+                break
+        elif line.strip() and not line.startswith(_DS_DEBUG_PREFIXES):
+            kept.append(line)
+    return "\n".join(kept)
+def _text_from_result(result):
+    """Return text carried by infer()'s return value (str, or first item of a sequence), else ""."""
+    if isinstance(result, str):
+        return result
+    if isinstance(result, (list, tuple)) and result:
+        first = result[0]
+        if isinstance(first, str):
+            return first
+        if hasattr(first, "text"):
+            return first.text
+    return ""
+def _text_from_saved_result(output_dir):
+    """Return the contents of a saved result file in *output_dir*, or "" if there is none."""
+    # NOTE(review): assumes infer(save_results=True) writes its text to
+    # "result.mmd" inside output_path - confirm against the model's remote code.
+    # Harmless when the file is absent.
+    saved_path = os.path.join(output_dir, "result.mmd")
+    if not os.path.isfile(saved_path):
+        return ""
+    with open(saved_path, encoding="utf-8") as saved_file:
+        return saved_file.read()
+def _strip_debug_lines(text):
+    """Drop lines containing known debug markers and trim surrounding whitespace."""
+    return "\n".join(
+        line for line in text.split("\n")
+        if not any(marker in line.lower() for marker in _DS_DEBUG_MARKERS)
+    ).strip()
+@spaces.GPU
+def generate_image_deepseek(text: str, image: Image.Image,
+                            preset: str = "gundam"):
+    """
+    Special generation function for DeepSeek-OCR using its native infer method.
+    The image is written to a temporary JPEG, model_ds.infer() is run with stdout
+    captured, and the OCR text is taken from (in order) the captured stdout, the
+    return value of infer(), or a saved result file. The model is moved back to
+    the CPU afterwards, whether or not inference succeeded.
+    Args:
+        text: Prompt text; if it contains "markdown" or "convert" the markdown
+            conversion prompt is used, otherwise the "Free OCR" prompt
+        image: PIL Image object to process
+        preset: Model preset configuration (one of the keys of _DS_PRESETS)
+    Yields:
+        tuple: (raw_text, markdown_text) - the same string twice; on failure
+        both entries hold an error message instead
+    """
+    if model_ds is None:
+        yield "DeepSeek-OCR is not available.", "DeepSeek-OCR is not available."
+        return
+    if image is None:
+        yield "Please upload an image.", "Please upload an image."
+        return
+    config = _DS_PRESETS.get(preset)
+    if config is None:
+        # Report a readable message instead of a bare KeyError.
+        preset_msg = f"Unknown DeepSeek preset '{preset}'. Choose one of: {', '.join(_DS_PRESETS)}."
+        yield preset_msg, preset_msg
+        return
+    # Determine task type from prompt (a missing prompt means plain OCR)
+    requested = (text or "").lower()
+    if "markdown" in requested or "convert" in requested:
+        prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+    else:
+        prompt = "<image>\nFree OCR. "
+    try:
+        try:
+            # Move model to GPU
+            device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+            model_ds.to(device).to(torch.bfloat16)
+            # Temp directory holds the input JPEG and anything infer() saves
+            with tempfile.TemporaryDirectory() as temp_dir:
+                temp_image_path = os.path.join(temp_dir, "input_image.jpg")
+                _save_as_rgb_jpeg(image, temp_image_path)
+                # infer() prints its output, so capture stdout while it runs
+                with capture_stdout() as output:
+                    result = model_ds.infer(
+                        tokenizer_ds,
+                        prompt=prompt,
+                        image_file=temp_image_path,
+                        output_path=temp_dir,
+                        base_size=config["base_size"],
+                        image_size=config["image_size"],
+                        crop_mode=config["crop_mode"],
+                        save_results=True,
+                        test_compress=True,
+                    )
+                    captured_output = output.getvalue()
+                # Must run before temp_dir is deleted: the last fallback reads from it
+                extracted_text = (
+                    _extract_text_from_stdout(captured_output)
+                    or _text_from_result(result)
+                    or _text_from_saved_result(temp_dir)
+                )
+        finally:
+            # Free GPU memory even when inference failed
+            model_ds.to("cpu")
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+        # Clean up any remaining markers from the text
+        if extracted_text:
+            extracted_text = _strip_debug_lines(extracted_text)
+        final_text = extracted_text if extracted_text else "No text could be extracted from the image."
+        yield final_text, final_text
+    except Exception as e:
+        error_msg = f"Error during DeepSeek generation: {str(e)}"
+        print(f"Full error: {e}")
+        import traceback
+        traceback.print_exc()
+        yield error_msg, error_msg
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
+                   top_k: int, repetition_penalty: float, deepseek_preset: str = "gundam"):
     """
     Generates responses using the selected model for image input.
     Yields raw text and Markdown-formatted text.
         top_p: Nucleus sampling parameter
         top_k: Top-k sampling parameter
         repetition_penalty: Penalty for repeating tokens
+        deepseek_preset: Preset for DeepSeek model
     Yields:
         tuple: (raw_text, markdown_text)
     """
+    # Special handling for DeepSeek-OCR
+    if model_name == "DeepSeek-OCR":
+        yield from generate_image_deepseek(text, image, deepseek_preset)
+        return
     # Device will be cuda when @spaces.GPU decorator activates
     device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
             return
         processor = processor_d
         model = model_d.to(device)
     else:
         yield "Invalid model selected.", "Invalid model selected."
         return
         except Exception as template_error:
             # Fallback: create a simple prompt without chat template
             print(f"Chat template error: {template_error}. Using fallback prompt.")
             prompt_full = f"{text}"
         # Process inputs
                         step=0.1,
                         label="Repetition Penalty"
                     )
+                    gr.Markdown("### DeepSeek-OCR Specific Settings")
+                    deepseek_preset = gr.Radio(
+                        choices=["gundam", "base", "large", "small", "tiny"],
+                        value="gundam",
+                        label="DeepSeek Preset",
+                        info="Only applies when DeepSeek-OCR is selected"
+                    )
                 submit_btn = gr.Button("Extract Text", variant="primary")
         - **Nanonets-OCR2-3B**: Nanonets OCR model
         - **Chandra-OCR**: Datalab OCR model
         - **Dots.OCR**: Stranger Vision OCR model
+        - **DeepSeek-OCR**: DeepSeek AI's OCR model (uses native inference method)
+        ### DeepSeek-OCR Presets:
+        - **Gundam** (Recommended): Balanced performance with crop mode
+        - **Base**: Standard quality without cropping
+        - **Large**: Highest quality for complex documents
+        - **Small**: Faster processing, good for simple text
+        - **Tiny**: Fastest, suitable for clear printed text
         """)
         submit_btn.click(
                 temperature,
                 top_p,
                 top_k,
+                repetition_penalty,
+                deepseek_preset
             ],
             outputs=[output_text, output_markdown]
         )