akhaliq HF Staff committed on
Commit
1a07c5d
·
verified ·
1 Parent(s): bd0cfb9

Update Gradio app with multiple files

Browse files
Files changed (1) hide show
  1. app.py +88 -85
app.py CHANGED
@@ -2,10 +2,9 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoModel, AutoTokenizer
4
  from PIL import Image
5
- import io
6
  import os
7
- from typing import Optional
8
  import spaces
 
9
 
10
  # Set CUDA device
11
  os.environ["CUDA_VISIBLE_DEVICES"] = '0'
@@ -26,9 +25,7 @@ model = model.eval()
26
  def ocr_process(
27
  image_input: Image.Image,
28
  task_type: str = "ocr",
29
- base_size: int = 1024,
30
- image_size: int = 640,
31
- crop_mode: bool = True,
32
  ) -> str:
33
  """
34
  Process image and extract text using DeepSeek-OCR model.
@@ -36,9 +33,7 @@ def ocr_process(
36
  Args:
37
  image_input: Input image
38
  task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
39
- base_size: Base size for model processing
40
- image_size: Target image size
41
- crop_mode: Whether to use crop mode
42
 
43
  Returns:
44
  Extracted text or markdown content
@@ -50,42 +45,57 @@ def ocr_process(
50
  # Move model to GPU and set dtype
51
  model.cuda().to(torch.bfloat16)
52
 
53
- # Save image temporarily
54
- temp_image_path = "/tmp/temp_ocr_image.jpg"
55
- image_input.save(temp_image_path)
56
-
57
- # Create output directory
58
- output_path = "/tmp/ocr_output"
59
- os.makedirs(output_path, exist_ok=True)
60
-
61
- # Set prompt based on task type
62
- if task_type == "markdown":
63
- prompt = "<image>\n<|grounding|>Convert the document to markdown. "
64
- else:
65
- prompt = "<image>\nFree OCR. "
66
-
67
- # Run inference
68
- output = model.infer(
69
- tokenizer,
70
- prompt=prompt,
71
- image_file=temp_image_path,
72
- output_path=output_path,
73
- base_size=base_size,
74
- image_size=image_size,
75
- crop_mode=crop_mode,
76
- save_results=False,
77
- test_compress=False,
78
- )
79
-
80
- # Clean up temp file
81
- if os.path.exists(temp_image_path):
82
- os.remove(temp_image_path)
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  # Move model back to CPU to free GPU memory
85
  model.to("cpu")
86
  torch.cuda.empty_cache()
87
 
88
- return output if output else "No text detected in image."
 
 
 
 
89
 
90
  except Exception as e:
91
  # Ensure model is moved back to CPU on error
@@ -95,7 +105,7 @@ def ocr_process(
95
 
96
 
97
  # Create Gradio interface
98
- with gr.Blocks(title="DeepSeek OCR") as demo:
99
  gr.HTML(
100
  """
101
  <div style="text-align: center; margin-bottom: 20px;">
@@ -108,86 +118,79 @@ with gr.Blocks(title="DeepSeek OCR") as demo:
108
 
109
  with gr.Row():
110
  with gr.Column(scale=1):
111
- gr.Markdown("### Upload Image")
112
  image_input = gr.Image(
113
  label="Input Image",
114
  type="pil",
115
  sources=["upload", "webcam", "clipboard"],
 
116
  )
117
 
118
- gr.Markdown("### Settings")
119
  task_type = gr.Radio(
120
  choices=["ocr", "markdown"],
121
  value="ocr",
122
  label="Task Type",
123
- info="OCR: Extract text | Markdown: Convert document to markdown",
124
  )
125
 
126
- base_size = gr.Slider(
127
- minimum=512,
128
- maximum=1280,
129
- step=128,
130
- value=1024,
131
- label="Base Size",
132
- info="Model processing size - Tiny: 512, Small: 640, Base: 1024, Large: 1280",
133
  )
134
 
135
- image_size = gr.Slider(
136
- minimum=512,
137
- maximum=1280,
138
- step=128,
139
- value=640,
140
- label="Image Size",
141
- info="Target image size - Gundam mode: 640 with crop, others match base_size",
142
- )
143
-
144
- crop_mode = gr.Checkbox(
145
- value=True,
146
- label="Crop Mode",
147
- info="Enable crop mode for better processing",
148
- )
149
 
150
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
 
151
 
152
  with gr.Column(scale=1):
153
- gr.Markdown("### Output")
154
  output_text = gr.Textbox(
155
  label="Extracted Text",
156
- lines=10,
 
157
  interactive=False,
158
- placeholder="Text will appear here...",
 
159
  )
160
 
161
- copy_btn = gr.Button("📋 Copy Output")
162
-
163
  # Event handlers
164
  submit_btn.click(
165
  fn=ocr_process,
166
- inputs=[image_input, task_type, base_size, image_size, crop_mode],
167
- outputs=output_text,
168
- )
169
-
170
- copy_btn.click(
171
- fn=lambda text: text,
172
- inputs=output_text,
173
  outputs=output_text,
174
- js="(text) => { navigator.clipboard.writeText(text); alert('Copied to clipboard!'); return text; }",
175
  )
176
 
177
  # Examples section
178
- gr.Markdown("### Examples")
179
  gr.Examples(
180
  examples=[
181
- ["https://images.unsplash.com/photo-1507003211169-0a1dd7228f2d?w=500", "ocr"],
182
- [
183
- "https://images.unsplash.com/photo-1481627834876-b7833e8f5570?w=500",
184
- "markdown",
185
- ],
186
  ],
187
- inputs=[image_input, task_type],
188
- label="Try these examples",
189
  )
190
 
 
 
 
 
 
 
 
 
 
191
 
192
  if __name__ == "__main__":
193
  demo.launch(share=False)
 
2
  import torch
3
  from transformers import AutoModel, AutoTokenizer
4
  from PIL import Image
 
5
  import os
 
6
  import spaces
7
+ import tempfile
8
 
9
  # Set CUDA device
10
  os.environ["CUDA_VISIBLE_DEVICES"] = '0'
 
25
  def ocr_process(
26
  image_input: Image.Image,
27
  task_type: str = "ocr",
28
+ preset: str = "gundam",
 
 
29
  ) -> str:
30
  """
31
  Process image and extract text using DeepSeek-OCR model.
 
33
  Args:
34
  image_input: Input image
35
  task_type: Type of task - "ocr" for text extraction or "markdown" for document conversion
36
+ preset: Preset configuration for model parameters
 
 
37
 
38
  Returns:
39
  Extracted text or markdown content
 
45
  # Move model to GPU and set dtype
46
  model.cuda().to(torch.bfloat16)
47
 
48
+ # Create temp directory for this session
49
+ with tempfile.TemporaryDirectory() as temp_dir:
50
+ # Save image with proper format
51
+ temp_image_path = os.path.join(temp_dir, "input_image.jpg")
52
+ # Convert RGBA to RGB if necessary
53
+ if image_input.mode == 'RGBA':
54
+ rgb_image = Image.new('RGB', image_input.size, (255, 255, 255))
55
+ rgb_image.paste(image_input, mask=image_input.split()[3])
56
+ rgb_image.save(temp_image_path, 'JPEG')
57
+ else:
58
+ image_input.save(temp_image_path, 'JPEG')
59
+
60
+ # Set parameters based on preset
61
+ presets = {
62
+ "tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
63
+ "small": {"base_size": 640, "image_size": 640, "crop_mode": False},
64
+ "base": {"base_size": 1024, "image_size": 1024, "crop_mode": False},
65
+ "large": {"base_size": 1280, "image_size": 1280, "crop_mode": False},
66
+ "gundam": {"base_size": 1024, "image_size": 640, "crop_mode": True},
67
+ }
68
+
69
+ config = presets[preset]
70
+
71
+ # Set prompt based on task type
72
+ if task_type == "markdown":
73
+ prompt = "<image>\n<|grounding|>Convert the document to markdown. "
74
+ else:
75
+ prompt = "<image>\nFree OCR. "
76
+
77
+ # Run inference
78
+ result = model.infer(
79
+ tokenizer,
80
+ prompt=prompt,
81
+ image_file=temp_image_path,
82
+ output_path=temp_dir, # Use temp directory for output
83
+ base_size=config["base_size"],
84
+ image_size=config["image_size"],
85
+ crop_mode=config["crop_mode"],
86
+ save_results=False,
87
+ test_compress=False,
88
+ )
89
 
90
  # Move model back to CPU to free GPU memory
91
  model.to("cpu")
92
  torch.cuda.empty_cache()
93
 
94
+ # Return the result
95
+ if result:
96
+ return result
97
+ else:
98
+ return "No text detected in the image. Please try a different preset or ensure the image contains readable text."
99
 
100
  except Exception as e:
101
  # Ensure model is moved back to CPU on error
 
105
 
106
 
107
  # Create Gradio interface
108
+ with gr.Blocks(title="DeepSeek OCR", theme=gr.themes.Soft()) as demo:
109
  gr.HTML(
110
  """
111
  <div style="text-align: center; margin-bottom: 20px;">
 
118
 
119
  with gr.Row():
120
  with gr.Column(scale=1):
121
+ gr.Markdown("### 📤 Upload Image")
122
  image_input = gr.Image(
123
  label="Input Image",
124
  type="pil",
125
  sources=["upload", "webcam", "clipboard"],
126
+ height=300,
127
  )
128
 
129
+ gr.Markdown("### ⚙️ Settings")
130
  task_type = gr.Radio(
131
  choices=["ocr", "markdown"],
132
  value="ocr",
133
  label="Task Type",
134
+ info="OCR: Extract text | Markdown: Convert document to markdown format",
135
  )
136
 
137
+ preset = gr.Radio(
138
+ choices=["gundam", "tiny", "small", "base", "large"],
139
+ value="gundam",
140
+ label="Model Preset",
141
+ info="Gundam: Optimized for mixed content | Tiny/Small: Fast | Base/Large: High quality",
 
 
142
  )
143
 
144
+ with gr.Accordion("Preset Details", open=False):
145
+ gr.Markdown("""
146
+ - **Gundam**: base_size=1024, image_size=640, crop_mode=True (Recommended)
147
+ - **Tiny**: base_size=512, image_size=512, crop_mode=False (Fastest)
148
+ - **Small**: base_size=640, image_size=640, crop_mode=False
149
+ - **Base**: base_size=1024, image_size=1024, crop_mode=False
150
+ - **Large**: base_size=1280, image_size=1280, crop_mode=False (Best quality)
151
+ """)
 
 
 
 
 
 
152
 
153
  submit_btn = gr.Button("🚀 Extract Text", variant="primary", size="lg")
154
+ clear_btn = gr.ClearButton([image_input], value="🗑️ Clear")
155
 
156
  with gr.Column(scale=1):
157
+ gr.Markdown("### 📝 Output")
158
  output_text = gr.Textbox(
159
  label="Extracted Text",
160
+ lines=15,
161
+ max_lines=30,
162
  interactive=False,
163
+ placeholder="Extracted text will appear here...",
164
+ show_copy_button=True,
165
  )
166
 
 
 
167
  # Event handlers
168
  submit_btn.click(
169
  fn=ocr_process,
170
+ inputs=[image_input, task_type, preset],
 
 
 
 
 
 
171
  outputs=output_text,
 
172
  )
173
 
174
  # Examples section
175
+ gr.Markdown("### 📚 Examples")
176
  gr.Examples(
177
  examples=[
178
+ ["example1.jpg", "ocr", "gundam"],
179
+ ["example2.jpg", "markdown", "gundam"],
 
 
 
180
  ],
181
+ inputs=[image_input, task_type, preset],
182
+ label="Try these examples (upload your own images for testing)",
183
  )
184
 
185
+ gr.Markdown("""
186
+ ### 💡 Tips
187
+ - For general OCR, use the "gundam" preset (optimized balance)
188
+ - For high-quality scanned documents, try "base" or "large" presets
189
+ - For handwritten text, "large" preset may work better
190
+ - Use "markdown" mode for structured documents with formatting
191
+ - If processing fails, try a different preset
192
+ """)
193
+
194
 
195
  if __name__ == "__main__":
196
  demo.launch(share=False)