IFMedTechdemo committed
Commit e71abcc · verified · Parent: 161e0b2

Update app.py

Files changed (1): app.py (+23 −68)
app.py CHANGED
@@ -19,28 +19,6 @@ import time
 # Device setup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# Load Chandra-OCR using AutoModel
-MODEL_ID_V = "datalab-to/chandra"
-processor_v = AutoProcessor.from_pretrained(MODEL_ID_V, trust_remote_code=True)
-model_v = AutoModel.from_pretrained(
-    MODEL_ID_V,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="sdpa",
-    device_map="auto"
-).eval()
-
-# Load Nanonets-OCR2-3B using AutoModel
-MODEL_ID_X = "nanonets/Nanonets-OCR2-3B"
-processor_x = AutoProcessor.from_pretrained(MODEL_ID_X, trust_remote_code=True)
-model_x = AutoModel.from_pretrained(
-    MODEL_ID_X,
-    trust_remote_code=True,
-    torch_dtype=torch.float16,
-    attn_implementation="sdpa",
-    device_map="auto"
-).eval()
-
 # Load Dots.OCR
 MODEL_PATH_D = "strangervisionhf/dots.ocr-base-fix"
 processor_d = AutoProcessor.from_pretrained(MODEL_PATH_D, trust_remote_code=True)
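Most of the loads in this file follow the same `AutoProcessor`/`AutoModel` pattern. For reference, a minimal sketch of that pattern as a reusable helper (the `load_ocr_model` name is hypothetical and not part of this commit):

```python
import torch
from transformers import AutoModel, AutoProcessor

def load_ocr_model(model_id: str):
    # Hypothetical helper mirroring the load pattern in app.py:
    # trust_remote_code pulls in each repo's custom model class,
    # fp16 + SDPA attention keeps memory and latency down, and
    # device_map="auto" places weights on the available GPU(s).
    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        model_id,
        trust_remote_code=True,
        torch_dtype=torch.float16,
        attn_implementation="sdpa",
        device_map="auto",
    ).eval()
    return processor, model
```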
@@ -52,9 +30,9 @@ model_d = AutoModelForCausalLM.from_pretrained(
     trust_remote_code=True
 ).eval()
 
-# Load olmOCR-2-7B-1025-FP8 using AutoModel
-MODEL_ID_M = "allenai/olmOCR-2-7B-1025-FP8"
-processor_m = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", trust_remote_code=True)
+# Load olmOCR-2-7B-1025 (non-FP8 version for simplicity)
+MODEL_ID_M = "allenai/olmOCR-2-7B-1025"
+processor_m = AutoProcessor.from_pretrained(MODEL_ID_M, trust_remote_code=True)
 model_m = AutoModel.from_pretrained(
     MODEL_ID_M,
     trust_remote_code=True,
@@ -74,9 +52,6 @@ model_ds = AutoModel.from_pretrained(
     device_map="auto"
 ).eval().to(torch.bfloat16)
 
-# Rest of your code remains the same...
-
-
 @spaces.GPU
 def generate_image(model_name: str, text: str, image: Image.Image,
                    max_new_tokens: int, temperature: float, top_p: float,
@@ -91,7 +66,6 @@ def generate_image(model_name: str, text: str, image: Image.Image,
 
     # Handle DeepSeek-OCR separately due to different API
     if model_name == "DeepSeek-OCR":
-        # DeepSeek-OCR resolution configs
         resolution_configs = {
            "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
            "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -101,18 +75,14 @@
         }
 
         config = resolution_configs[resolution_mode]
-
-        # Save image temporarily
         temp_image_path = "/tmp/temp_ocr_image.jpg"
         image.save(temp_image_path)
 
-        # DeepSeek-OCR uses special prompt format
         if not text:
             text = "Free OCR."
         prompt_ds = f"<image>\n{text}"
 
         try:
-            # DeepSeek-OCR's custom infer method
             result = model_ds.infer(
                 tokenizer_ds,
                 prompt=prompt_ds,
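The hunk above is truncated mid-call, so for context: a sketch of how the selected resolution config plausibly feeds DeepSeek-OCR's custom `infer` method (the keyword arguments past `prompt` and the Gundam config values are assumptions; the diff cuts off before them):

```python
# Sketch only: the "Gundam" dynamic multi-view entry is assumed to be
# {"base_size": 1024, "image_size": 640, "crop_mode": True}.
config = resolution_configs["Gundam"]
result = model_ds.infer(
    tokenizer_ds,
    prompt="<image>\nFree OCR.",           # prompt format shown in the hunk
    image_file="/tmp/temp_ocr_image.jpg",  # assumed kwarg; the hunk is cut off here
    base_size=config["base_size"],
    image_size=config["image_size"],
    crop_mode=config["crop_mode"],
)
```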
@@ -128,21 +98,14 @@
         except Exception as e:
             yield f"Error: {str(e)}", f"Error: {str(e)}"
         finally:
-            # Clean up temp file
             if os.path.exists(temp_image_path):
                 os.remove(temp_image_path)
         return
 
     # Handle other models with standard API
-    if model_name == "olmOCR-2-7B-1025-FP8":
+    if model_name == "olmOCR-2-7B-1025":
         processor = processor_m
         model = model_m
-    elif model_name == "Nanonets-OCR2-3B":
-        processor = processor_x
-        model = model_x
-    elif model_name == "Chandra-OCR":
-        processor = processor_v
-        model = model_v
     elif model_name == "Dots.OCR":
         processor = processor_d
         model = model_d
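With only two chat-template models left, the `if`/`elif` dispatch could also be a lookup table; a hypothetical refactor sketch, not something this commit does:

```python
# Hypothetical registry mapping dropdown names to (processor, model) pairs.
MODEL_REGISTRY = {
    "olmOCR-2-7B-1025": (processor_m, model_m),
    "Dots.OCR": (processor_d, model_d),
}

try:
    processor, model = MODEL_REGISTRY[model_name]
except KeyError:
    yield "Invalid model selected.", "Invalid model selected."
    return
```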
@@ -154,9 +117,10 @@
         "role": "user",
         "content": [
             {"type": "image"},
-            {"type": "text", "text": text},
+            {"type": "text", "text": text if text else "Perform OCR on this image."},
         ]
     }]
+
     prompt_full = processor.apply_chat_template(
         messages, tokenize=False, add_generation_prompt=True
     )
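Downstream of `apply_chat_template`, these branches follow the standard transformers vision-language flow. A minimal sketch of the typical continuation (not shown in this hunk; variable names mirror the surrounding code):

```python
# Tokenize the prompt together with the image, then sample a response.
inputs = processor(text=[prompt_full], images=[image], return_tensors="pt").to(model.device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=max_new_tokens,
    temperature=temperature,
    top_p=top_p,
    do_sample=True,
)
# Decode only the newly generated tokens, not the echoed prompt.
new_tokens = output_ids[:, inputs["input_ids"].shape[1]:]
text_out = processor.batch_decode(new_tokens, skip_special_tokens=True)[0]
```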
@@ -215,26 +179,18 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
         """
         # 🔍 Multi-Model OCR Comparison Space
 
-        Compare five state-of-the-art OCR models on your images:
-        - **Chandra-OCR**: Specialized OCR model
-        - **Nanonets-OCR2-3B**: High-accuracy OCR
-        - **Dots.OCR**: Lightweight OCR solution
-        - **olmOCR-2-7B-1025-FP8**: Advanced FP8 quantized OCR model
-        - **DeepSeek-OCR**: Context compression OCR with 10× compression ratio (97% accuracy)
+        Compare three state-of-the-art OCR models:
+        - **Dots.OCR**: Lightweight and efficient OCR
+        - **olmOCR-2-7B-1025**: Advanced OCR for math, tables, and complex layouts (82.4% accuracy)
+        - **DeepSeek-OCR**: Context compression OCR with 10× compression (97% accuracy)
         """
     )
 
     with gr.Row():
         with gr.Column(scale=1):
             model_selector = gr.Dropdown(
-                choices=[
-                    "Chandra-OCR",
-                    "Nanonets-OCR2-3B",
-                    "Dots.OCR",
-                    "olmOCR-2-7B-1025-FP8",
-                    "DeepSeek-OCR"
-                ],
-                value="DeepSeek-OCR",
+                choices=["Dots.OCR", "olmOCR-2-7B-1025", "DeepSeek-OCR"],
+                value="olmOCR-2-7B-1025",
                 label="Select OCR Model",
                 elem_classes=["model-selector"]
             )
@@ -243,8 +199,8 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
             choices=["Tiny", "Small", "Base", "Large", "Gundam"],
             value="Gundam",
             label="DeepSeek-OCR Resolution Mode",
-            info="Only applies to DeepSeek-OCR. Gundam mode recommended for best results.",
-            visible=True
+            info="Only applies to DeepSeek-OCR. Gundam mode recommended.",
+            visible=False
         )
 
         image_input = gr.Image(type="pil", label="Upload Image")
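Since this hunk flips the resolution picker to `visible=False`, something presumably re-shows it when DeepSeek-OCR is chosen. A hypothetical wiring sketch (the handler and the `resolution_selector` name are assumptions, not visible in the diff):

```python
# Hypothetical: reveal the resolution dropdown only for DeepSeek-OCR.
def toggle_resolution(model_name):
    return gr.update(visible=(model_name == "DeepSeek-OCR"))

model_selector.change(
    fn=toggle_resolution,
    inputs=model_selector,
    outputs=resolution_selector,  # assumed variable name for the dropdown
)
```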
@@ -339,20 +295,19 @@ with gr.Blocks(css=css, title="Multi-Model OCR Space") as demo:
 
     gr.Markdown(
         """
-        ### Model Information:
-
-        **DeepSeek-OCR Modes:**
-        - **Tiny**: 64 tokens @ 512×512 (fastest, basic documents)
-        - **Small**: 100 tokens @ 640×640 (good for simple pages)
-        - **Base**: 256 tokens @ 1024×1024 (standard documents)
-        - **Large**: 400 tokens @ 1280×1280 (complex layouts)
-        - **Gundam**: Dynamic multi-view (recommended for best accuracy)
+        ### Model Strengths:
+
+        **Dots.OCR**: Fast and lightweight, great for simple documents and quick processing
+
+        **olmOCR-2-7B-1025**: Best for complex documents with tables, LaTeX equations, multi-column layouts, and handwritten text
+
+        **DeepSeek-OCR**: Excellent for markdown conversion, table extraction, and efficient context compression (10× smaller output)
 
         ### Tips:
-        - Upload clear images for best results
-        - DeepSeek-OCR excels at table extraction and markdown conversion
+        - Upload clear, well-lit images for best results
+        - Use olmOCR for academic papers and technical documents
+        - Use DeepSeek for efficient processing of large document batches
         - Adjust temperature for more creative or conservative outputs
-        - Try different models to compare performance on your specific use case
         """
     )
313