Spaces:

akhaliq
/

DeepSeek-OCR

Running on Zero

App Files Community

akhaliq (HF Staff) committed 9 days ago

Commit

1a07c5d

verified ·

1 Parent(s): bd0cfb9

Update Gradio app with multiple files

Browse files

Files changed (1) hide show

app.py +88 -85

app.py CHANGED Viewed

@@ -2,10 +2,9 @@ import gradio as gr
 import torch
 from transformers import AutoModel, AutoTokenizer
 from PIL import Image
-import io
 import os
-from typing import Optional
 import spaces
 # Set CUDA device
 os.environ["CUDA_VISIBLE_DEVICES"] = '0'
@@ -26,9 +25,7 @@ model = model.eval()
 def ocr_process(
     image_input: Image.Image,
     task_type: str = "ocr",
-    base_size: int = 1024,
-    image_size: int = 640,
-    crop_mode: bool = True,
 ) -> str:
     """
     Process image and extract text using DeepSeek-OCR model.
@@ -36,9 +33,7 @@ def ocr_process(
     Args:
         image_input: Input image
         task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
-        base_size: Base size for model processing
-        image_size: Target image size
-        crop_mode: Whether to use crop mode
     Returns:
         Extracted text or markdown content
@@ -50,42 +45,57 @@ def ocr_process(
         # Move model to GPU and set dtype
         model.cuda().to(torch.bfloat16)
-        # Save image temporarily
-        temp_image_path = "/tmp/temp_ocr_image.jpg"
-        image_input.save(temp_image_path)
-        # Create output directory
-        output_path = "/tmp/ocr_output"
-        os.makedirs(output_path, exist_ok=True)
-        # Set prompt based on task type
-        if task_type == "markdown":
-            prompt = "<image>\n<|grounding|>Convert the document to markdown. "
-        else:
-            prompt = "<image>\nFree OCR. "
-        # Run inference
-        output = model.infer(
-            tokenizer,
-            prompt=prompt,
-            image_file=temp_image_path,
-            output_path=output_path,
-            base_size=base_size,
-            image_size=image_size,
-            crop_mode=crop_mode,
-            save_results=False,
-            test_compress=False,
-        )
-        # Clean up temp file
-        if os.path.exists(temp_image_path):
-            os.remove(temp_image_path)
         # Move model back to CPU to free GPU memory
         model.to("cpu")
         torch.cuda.empty_cache()
-        return output if output else "No text detected in image."
     except Exception as e:
         # Ensure model is moved back to CPU on error
@@ -95,7 +105,7 @@ def ocr_process(
 # Create Gradio interface
-with gr.Blocks(title="DeepSeek OCR") as demo:
     gr.HTML(
         """
         <div style="text-align: center; margin-bottom: 20px;">
@@ -108,86 +118,79 @@ with gr.Blocks(title="DeepSeek OCR") as demo:
     with gr.Row():
         with gr.Column(scale=1):
-            gr.Markdown("### Upload Image")
             image_input = gr.Image(
                 label="Input Image",
                 type="pil",
                 sources=["upload", "webcam", "clipboard"],
             )
-            gr.Markdown("### Settings")
             task_type = gr.Radio(
                 choices=["ocr", "markdown"],
                 value="ocr",
                 label="Task Type",
-                info="OCR: Extract text | Markdown: Convert document to markdown",
             )
-            base_size = gr.Slider(
-                minimum=512,
-                maximum=1280,
-                step=128,
-                value=1024,
-                label="Base Size",
-                info="Model processing size - Tiny: 512, Small: 640, Base: 1024, Large: 1280",
             )
-            image_size = gr.Slider(
-                minimum=512,
-                maximum=1280,
-                step=128,
-                value=640,
-                label="Image Size",
-                info="Target image size - Gundam mode: 640 with crop, others match base_size",
-            )
-            crop_mode = gr.Checkbox(
-                value=True,
-                label="Crop Mode",
-                info="Enable crop mode for better processing",
-            )
             submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
         with gr.Column(scale=1):
-            gr.Markdown("### Output")
             output_text = gr.Textbox(
                 label="Extracted Text",
-                lines=10,
                 interactive=False,
-                placeholder="Text will appear here...",
             )
-            copy_btn = gr.Button("📋 Copy Output")
     # Event handlers
     submit_btn.click(
         fn=ocr_process,
-        inputs=[image_input, task_type, base_size, image_size, crop_mode],
-        outputs=output_text,
-    )
-    copy_btn.click(
-        fn=lambda text: text,
-        inputs=output_text,
         outputs=output_text,
-        js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
     )
     # Examples section
-    gr.Markdown("### Examples")
     gr.Examples(
         examples=[
-            ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
-            [
-                "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
-                "markdown",
-            ],
         ],
-        inputs=[image_input, task_type],
-        label="Try these examples",
     )
 if __name__ == "__main__":
     demo.launch(share=False)

 import torch
 from transformers import AutoModel, AutoTokenizer
 from PIL import Image
 import os
 import spaces
+import tempfile
 # Set CUDA device
 os.environ["CUDA_VISIBLE_DEVICES"] = '0'
 def ocr_process(
     image_input: Image.Image,
     task_type: str = "ocr",
+    preset: str = "gundam",
 ) -> str:
     """
     Process image and extract text using DeepSeek-OCR model.
     Args:
         image_input: Input image
         task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
+        preset: Preset configuration for model parameters
     Returns:
         Extracted text or markdown content
         # Move model to GPU and set dtype
         model.cuda().to(torch.bfloat16)
+        # Create temp directory for this session
+        with tempfile.TemporaryDirectory() as temp_dir:
+            # Save image with proper format
+            temp_image_path = os.path.join(temp_dir, "input_image.jpg")
+            # Convert RGBA to RGB if necessary
+            if image_input.mode == 'RGBA':
+                rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
+                rgb_image.paste(image_input, mask=image_input.split()[3])
+                rgb_image.save(temp_image_path, 'JPEG')
+            else:
+                image_input.save(temp_image_path, 'JPEG')
+            # Set parameters based on preset
+            presets = {
+                "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
+                "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
+                "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
+                "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
+                "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
+            }
+            config = presets[preset]
+            # Set prompt based on task type
+            if task_type == "markdown":
+                prompt = "<image>\n<|grounding|>Convert the document to markdown. "
+            else:
+                prompt = "<image>\nFree OCR. "
+            # Run inference
+            result = model.infer(
+                tokenizer,
+                prompt=prompt,
+                image_file=temp_image_path,
+                output_path=temp_dir,  # Use temp directory for output
+                base_size=config["base_size"],
+                image_size=config["image_size"],
+                crop_mode=config["crop_mode"],
+                save_results=False,
+                test_compress=False,
+            )
         # Move model back to CPU to free GPU memory
         model.to("cpu")
         torch.cuda.empty_cache()
+        # Return the result
+        if result:
+            return result
+        else:
+            return "No text detected in the image. Please try a different preset or ensure the image contains readable text."
     except Exception as e:
         # Ensure model is moved back to CPU on error
 # Create Gradio interface
+with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
     gr.HTML(
         """
         <div style="text-align: center; margin-bottom: 20px;">
     with gr.Row():
         with gr.Column(scale=1):
+            gr.Markdown("### 📤 Upload Image")
             image_input = gr.Image(
                 label="Input Image",
                 type="pil",
                 sources=["upload", "webcam", "clipboard"],
+                height=300,
             )
+            gr.Markdown("### ⚙️ Settings")
             task_type = gr.Radio(
                 choices=["ocr", "markdown"],
                 value="ocr",
                 label="Task Type",
+                info="OCR: Extract text | Markdown: Convert document to markdown format",
             )
+            preset = gr.Radio(
+                choices=["gundam", "tiny", "small", "base", "large"],
+                value="gundam",
+                label="Model Preset",
+                info="Gundam: Optimized for mixed content | Tiny/Small: Fast | Base/Large: High quality",
             )
+            with gr.Accordion("Preset Details", open=False):
+                gr.Markdown("""
+                - **Gundam**: base_size=1024, image_size=640, crop_mode=True (Recommended)
+                - **Tiny**: base_size=512, image_size=512, crop_mode=False (Fastest)
+                - **Small**: base_size=640, image_size=640, crop_mode=False
+                - **Base**: base_size=1024, image_size=1024, crop_mode=False
+                - **Large**: base_size=1280, image_size=1280, crop_mode=False (Best quality)
+                """)
             submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
+            clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
         with gr.Column(scale=1):
+            gr.Markdown("### 📝 Output")
             output_text = gr.Textbox(
                 label="Extracted Text",
+                lines=15,
+                max_lines=30,
                 interactive=False,
+                placeholder="Extracted text will appear here...",
+                show_copy_button=True,
             )
     # Event handlers
     submit_btn.click(
         fn=ocr_process,
+        inputs=[image_input, task_type, preset],
         outputs=output_text,
     )
     # Examples section
+    gr.Markdown("### 📚 Examples")
     gr.Examples(
         examples=[
+            ["example1.jpg", "ocr", "gundam"],
+            ["example2.jpg", "markdown", "gundam"],
         ],
+        inputs=[image_input, task_type, preset],
+        label="Try these examples (upload your own images for testing)",
     )
+    gr.Markdown("""
+    ### 💡 Tips
+    - For general OCR, use the "gundam" preset (optimized balance)
+    - For high-quality scanned documents, try "base" or "large" presets
+    - For handwritten text, "large" preset may work better
+    - Use "markdown" mode for structured documents with formatting
+    - If processing fails, try a different preset
+    """)
 if __name__ == "__main__":
     demo.launch(share=False)