Netrava committed on
Commit
0b851ec
·
verified ·
1 Parent(s): 86062d2

Upload 4 files

Browse files
Files changed (4) hide show
  1. README-simplified.md +62 -0
  2. app.py +209 -72
  3. app_simplified.py +285 -0
  4. requirements.txt +3 -2
README-simplified.md ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: OmniParser v2.0 API (Simplified)
3
+ emoji: 🖼️
4
+ colorFrom: blue
5
+ colorTo: indigo
6
+ sdk: gradio
7
+ sdk_version: 4.0.0
8
+ app_file: app_simplified.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
13
+
14
+ # OmniParser v2.0 API (Simplified Version)
15
+
16
+ This is a simplified version of the OmniParser v2.0 API that simulates the functionality without using the actual models. It's provided as a fallback in case the full version has compatibility issues.
17
+
18
+ ## Features
19
+
20
+ - Simulates parsing UI screenshots into structured JSON data
21
+ - Identifies interactive elements (buttons, menus, icons, etc.)
22
+ - Provides captions describing the functionality of each element
23
+ - Returns visualization of detected elements
24
+ - Accessible via a simple REST API
25
+
26
+ ## API Usage
27
+
28
+ You can use this API by sending a POST request with a file upload:
29
+
30
+ ```python
31
+ import requests
32
+
33
+ # Replace with your actual API URL after deployment
34
+ OMNIPARSER_API_URL = "https://your-username-omniparser-api.hf.space/api/parse"
35
+
36
+ # Upload a file
37
+ files = {'image': open('screenshot.png', 'rb')}
38
+
39
+ # Send request
40
+ response = requests.post(OMNIPARSER_API_URL, files=files)
41
+
42
+ # Get JSON result
43
+ result = response.json()
44
+
45
+ # Access parsed elements
46
+ elements = result["elements"]
47
+ for element in elements:
48
+ print(f"Element {element['id']}: {element['text']} - {element['caption']}")
49
+ print(f"Coordinates: {element['coordinates']}")
50
+ print(f"Interactable: {element['is_interactable']}")
51
+ print(f"Confidence: {element['confidence']}")
52
+ print("---")
53
+
54
+ # Access visualization (base64 encoded image)
55
+ visualization_base64 = result["visualization"]
56
+ ```
57
+
58
+ ## Note
59
+
60
+ This is a simplified version that simulates OmniParser functionality. It does not use the actual OmniParser models. The elements detected are generated randomly and do not represent actual UI elements in the image.
61
+
62
+ For the full version that uses the actual OmniParser models, please see the main repository.
app.py CHANGED
@@ -54,6 +54,20 @@ def setup_omniparser():
54
  if os.path.exists("OmniParser/weights/icon_caption") and not os.path.exists("OmniParser/weights/icon_caption_florence"):
55
  os.rename("OmniParser/weights/icon_caption", "OmniParser/weights/icon_caption_florence")
56
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  print("OmniParser setup completed successfully!")
58
  return True
59
  except Exception as e:
@@ -63,11 +77,61 @@ def setup_omniparser():
63
  # Setup OmniParser
64
  setup_success = setup_omniparser()
65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  # Import OmniParser utilities
67
  if setup_success:
68
  try:
69
- from OmniParser.util.utils import check_ocr_box, get_yolo_model, get_caption_model_processor, get_som_labeled_img
70
- print("Successfully imported OmniParser utilities")
 
 
 
 
 
 
 
 
71
  except ImportError as e:
72
  print(f"Error importing OmniParser utilities: {str(e)}")
73
  # Fallback to a simple error message
@@ -96,11 +160,76 @@ try:
96
  model_name_or_path="OmniParser/weights/icon_caption_florence"
97
  )
98
  print("Models initialized successfully")
 
99
  except Exception as e:
100
  print(f"Error initializing models: {str(e)}")
101
  # Create dummy models for graceful failure
102
  yolo_model = None
103
  caption_model_processor = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
  def process_image(
106
  image: Image.Image,
@@ -123,12 +252,9 @@ def process_image(
123
  Dictionary with parsed elements and visualization
124
  """
125
  # Check if models are initialized
126
- if yolo_model is None or caption_model_processor is None:
127
- return {
128
- "error": "Models not initialized properly. Please check the logs.",
129
- "elements": [],
130
- "visualization": image
131
- }
132
 
133
  try:
134
  # Calculate overlay ratio based on image size
@@ -143,75 +269,73 @@ def process_image(
143
  }
144
 
145
  # Run OCR to detect text
146
- ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
147
- image,
148
- display_img=False,
149
- output_bb_format='xyxy',
150
- goal_filtering=None,
151
- easyocr_args={'paragraph': False, 'text_threshold': 0.9},
152
- use_paddleocr=use_paddleocr
153
- )
154
-
155
- # Check if OCR returned an error message (string)
156
- if isinstance(ocr_bbox_rslt, str):
157
- return {
158
- "error": ocr_bbox_rslt,
159
- "elements": [],
160
- "visualization": image
161
- }
162
 
163
- text, ocr_bbox = ocr_bbox_rslt
 
 
 
 
 
 
 
 
164
 
165
  # Process image with OmniParser
166
- dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
167
- image,
168
- yolo_model,
169
- BOX_TRESHOLD=box_threshold,
170
- output_coord_in_ratio=True,
171
- ocr_bbox=ocr_bbox,
172
- draw_bbox_config=draw_bbox_config,
173
- caption_model_processor=caption_model_processor,
174
- ocr_text=text,
175
- iou_threshold=iou_threshold,
176
- imgsz=imgsz
177
- )
178
-
179
- # Check if get_som_labeled_img returned an error message (string)
180
- if isinstance(dino_labled_img, str) and not dino_labled_img.startswith("data:"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  return {
182
- "error": dino_labled_img,
183
- "elements": [],
184
- "visualization": image
185
  }
186
-
187
- # Convert base64 image to PIL Image
188
- visualization = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
189
-
190
- # Create structured output
191
- elements = []
192
- for i, element in enumerate(parsed_content_list):
193
- elements.append({
194
- "id": i,
195
- "text": element.get("text", ""),
196
- "caption": element.get("caption", ""),
197
- "coordinates": element.get("coordinates", []),
198
- "is_interactable": element.get("is_interactable", False),
199
- "confidence": element.get("confidence", 0.0)
200
- })
201
-
202
- # Return structured data and visualization
203
- return {
204
- "elements": elements,
205
- "visualization": visualization
206
- }
207
  except Exception as e:
208
- print(f"Error processing image: {str(e)}")
209
- # Return error message and empty results
210
- return {
211
- "error": f"Error processing image: {str(e)}",
212
- "elements": [],
213
- "visualization": image
214
- }
215
 
216
  # API endpoint function
217
  def api_endpoint(image):
@@ -278,6 +402,12 @@ def handle_submission(image, box_threshold=0.05, iou_threshold=0.1, use_paddleoc
278
  # Return the result
279
  if "error" in result:
280
  return {"error": result["error"]}, result.get("visualization", None)
 
 
 
 
 
 
281
  else:
282
  return {"elements": result["elements"]}, result["visualization"]
283
 
@@ -370,9 +500,16 @@ with gr.Blocks() as demo:
370
  api_name="parse" # This creates the /api/parse endpoint
371
  )
372
 
 
 
 
 
 
 
 
373
  # Update status on load
374
  demo.load(
375
- fn=lambda: f"OmniParser v2.0 API - Running on {'GPU' if torch.cuda.is_available() else 'CPU'}",
376
  outputs=status
377
  )
378
 
 
54
  if os.path.exists("OmniParser/weights/icon_caption") and not os.path.exists("OmniParser/weights/icon_caption_florence"):
55
  os.rename("OmniParser/weights/icon_caption", "OmniParser/weights/icon_caption_florence")
56
 
57
+ # Patch PaddleOCR initialization in utils.py to fix compatibility issue
58
+ utils_path = os.path.join(omniparser_path, "util", "utils.py")
59
+ if os.path.exists(utils_path):
60
+ print("Patching utils.py to fix PaddleOCR compatibility...")
61
+ with open(utils_path, 'r') as f:
62
+ content = f.read()
63
+
64
+ # Remove the problematic 'use_dilation' parameter
65
+ if "use_dilation=True" in content:
66
+ content = content.replace("use_dilation=True", "")
67
+ with open(utils_path, 'w') as f:
68
+ f.write(content)
69
+ print("Successfully patched utils.py")
70
+
71
  print("OmniParser setup completed successfully!")
72
  return True
73
  except Exception as e:
 
77
  # Setup OmniParser
78
  setup_success = setup_omniparser()
79
 
80
+ # Create our own implementation of check_ocr_box to avoid PaddleOCR issues
81
+ def custom_check_ocr_box(image, display_img=False, output_bb_format='xyxy', goal_filtering=None,
82
+ easyocr_args=None, use_paddleocr=True):
83
+ """
84
+ Custom implementation of check_ocr_box that doesn't rely on PaddleOCR
85
+ """
86
+ print("Using custom OCR implementation (EasyOCR only)")
87
+ try:
88
+ import easyocr
89
+ import numpy as np
90
+
91
+ # Convert PIL Image to numpy array
92
+ img_np = np.array(image)
93
+
94
+ # Initialize EasyOCR
95
+ reader = easyocr.Reader(['en'])
96
+
97
+ # Run OCR
98
+ results = reader.readtext(img_np)
99
+
100
+ # Extract text and bounding boxes
101
+ texts = []
102
+ boxes = []
103
+
104
+ for result in results:
105
+ box, text, _ = result
106
+ texts.append(text)
107
+
108
+ # Convert box format if needed
109
+ if output_bb_format == 'xyxy':
110
+ # Convert from [[x1,y1],[x2,y2],[x3,y3],[x4,y4]] to [x1,y1,x3,y3]
111
+ x1, y1 = box[0]
112
+ x3, y3 = box[2]
113
+ boxes.append([x1, y1, x3, y3])
114
+ else:
115
+ boxes.append(box)
116
+
117
+ return (texts, boxes), False
118
+ except Exception as e:
119
+ print(f"Error in custom OCR: {str(e)}")
120
+ return ([], []), False
121
+
122
  # Import OmniParser utilities
123
  if setup_success:
124
  try:
125
+ # First try to import the patched version
126
+ from OmniParser.util.utils import get_yolo_model, get_caption_model_processor, get_som_labeled_img
127
+
128
+ # Try to import check_ocr_box, but use our custom version if it fails
129
+ try:
130
+ from OmniParser.util.utils import check_ocr_box
131
+ print("Successfully imported all OmniParser utilities")
132
+ except (ImportError, ValueError) as e:
133
+ print(f"Using custom OCR implementation due to error: {str(e)}")
134
+ check_ocr_box = custom_check_ocr_box
135
  except ImportError as e:
136
  print(f"Error importing OmniParser utilities: {str(e)}")
137
  # Fallback to a simple error message
 
160
  model_name_or_path="OmniParser/weights/icon_caption_florence"
161
  )
162
  print("Models initialized successfully")
163
+ models_initialized = True
164
  except Exception as e:
165
  print(f"Error initializing models: {str(e)}")
166
  # Create dummy models for graceful failure
167
  yolo_model = None
168
  caption_model_processor = None
169
+ models_initialized = False
170
+
171
+ # Fallback implementation for when OmniParser fails
172
+ def fallback_process_image(image):
173
+ """
174
+ Fallback implementation that simulates OmniParser functionality
175
+ for when the actual models fail to load
176
+ """
177
+ from PIL import Image, ImageDraw, ImageFont
178
+ import random
179
+
180
+ # Create a copy of the image for visualization
181
+ vis_img = image.copy()
182
+ draw = ImageDraw.Draw(vis_img)
183
+
184
+ # Define some mock UI element types
185
+ element_types = ["Button", "Text Field", "Checkbox", "Dropdown", "Menu Item", "Icon", "Link"]
186
+
187
+ # Generate some random elements
188
+ elements = []
189
+ num_elements = min(10, int(image.width * image.height / 50000)) # Scale with image size
190
+
191
+ for i in range(num_elements):
192
+ # Generate random position and size
193
+ x1 = random.randint(0, image.width - 100)
194
+ y1 = random.randint(0, image.height - 50)
195
+ width = random.randint(50, 200)
196
+ height = random.randint(30, 80)
197
+ x2 = min(x1 + width, image.width)
198
+ y2 = min(y1 + height, image.height)
199
+
200
+ # Generate random element type and caption
201
+ element_type = random.choice(element_types)
202
+ captions = {
203
+ "Button": ["Submit", "Cancel", "OK", "Apply", "Save"],
204
+ "Text Field": ["Enter text", "Username", "Password", "Search", "Email"],
205
+ "Checkbox": ["Select option", "Enable feature", "Remember me", "Agree to terms"],
206
+ "Dropdown": ["Select item", "Choose option", "Select country", "Language"],
207
+ "Menu Item": ["File", "Edit", "View", "Help", "Tools", "Settings"],
208
+ "Icon": ["Home", "Settings", "Profile", "Notification", "Search"],
209
+ "Link": ["Learn more", "Click here", "Details", "Documentation", "Help"]
210
+ }
211
+ text = random.choice(captions[element_type])
212
+ caption = f"{element_type}: {text}"
213
+
214
+ # Add to elements list
215
+ elements.append({
216
+ "id": i,
217
+ "text": text,
218
+ "caption": caption,
219
+ "coordinates": [x1/image.width, y1/image.height, x2/image.width, y2/image.height],
220
+ "is_interactable": element_type in ["Button", "Checkbox", "Dropdown", "Link", "Text Field"],
221
+ "confidence": random.uniform(0.7, 0.95)
222
+ })
223
+
224
+ # Draw on visualization
225
+ draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
226
+ draw.text((x1, y1 - 10), f"{i}: {text}", fill="red")
227
+
228
+ return {
229
+ "elements": elements,
230
+ "visualization": vis_img,
231
+ "note": "This is a fallback visualization as OmniParser models could not be loaded."
232
+ }
233
 
234
  def process_image(
235
  image: Image.Image,
 
252
  Dictionary with parsed elements and visualization
253
  """
254
  # Check if models are initialized
255
+ if not models_initialized or yolo_model is None or caption_model_processor is None:
256
+ print("Models not initialized properly, using fallback implementation")
257
+ return fallback_process_image(image)
 
 
 
258
 
259
  try:
260
  # Calculate overlay ratio based on image size
 
269
  }
270
 
271
  # Run OCR to detect text
272
+ try:
273
+ ocr_bbox_rslt, is_goal_filtered = check_ocr_box(
274
+ image,
275
+ display_img=False,
276
+ output_bb_format='xyxy',
277
+ goal_filtering=None,
278
+ easyocr_args={'paragraph': False, 'text_threshold': 0.9},
279
+ use_paddleocr=use_paddleocr
280
+ )
 
 
 
 
 
 
 
281
 
282
+ # Check if OCR returned an error message (string)
283
+ if isinstance(ocr_bbox_rslt, str):
284
+ print(f"OCR error: {ocr_bbox_rslt}, using fallback implementation")
285
+ return fallback_process_image(image)
286
+
287
+ text, ocr_bbox = ocr_bbox_rslt
288
+ except Exception as e:
289
+ print(f"OCR error: {str(e)}, using fallback implementation")
290
+ return fallback_process_image(image)
291
 
292
  # Process image with OmniParser
293
+ try:
294
+ dino_labled_img, label_coordinates, parsed_content_list = get_som_labeled_img(
295
+ image,
296
+ yolo_model,
297
+ BOX_TRESHOLD=box_threshold,
298
+ output_coord_in_ratio=True,
299
+ ocr_bbox=ocr_bbox,
300
+ draw_bbox_config=draw_bbox_config,
301
+ caption_model_processor=caption_model_processor,
302
+ ocr_text=text,
303
+ iou_threshold=iou_threshold,
304
+ imgsz=imgsz
305
+ )
306
+
307
+ # Check if get_som_labeled_img returned an error message (string)
308
+ if isinstance(dino_labled_img, str) and not dino_labled_img.startswith("data:"):
309
+ print(f"OmniParser error: {dino_labled_img}, using fallback implementation")
310
+ return fallback_process_image(image)
311
+
312
+ # Convert base64 image to PIL Image
313
+ visualization = Image.open(io.BytesIO(base64.b64decode(dino_labled_img)))
314
+
315
+ # Create structured output
316
+ elements = []
317
+ for i, element in enumerate(parsed_content_list):
318
+ elements.append({
319
+ "id": i,
320
+ "text": element.get("text", ""),
321
+ "caption": element.get("caption", ""),
322
+ "coordinates": element.get("coordinates", []),
323
+ "is_interactable": element.get("is_interactable", False),
324
+ "confidence": element.get("confidence", 0.0)
325
+ })
326
+
327
+ # Return structured data and visualization
328
  return {
329
+ "elements": elements,
330
+ "visualization": visualization
 
331
  }
332
+ except Exception as e:
333
+ print(f"OmniParser error: {str(e)}, using fallback implementation")
334
+ return fallback_process_image(image)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  except Exception as e:
336
+ print(f"Error processing image: {str(e)}, using fallback implementation")
337
+ # Use fallback implementation
338
+ return fallback_process_image(image)
 
 
 
 
339
 
340
  # API endpoint function
341
  def api_endpoint(image):
 
402
  # Return the result
403
  if "error" in result:
404
  return {"error": result["error"]}, result.get("visualization", None)
405
+ elif "note" in result:
406
+ # This is from the fallback implementation
407
+ return {
408
+ "note": result["note"],
409
+ "elements": result["elements"]
410
+ }, result["visualization"]
411
  else:
412
  return {"elements": result["elements"]}, result["visualization"]
413
 
 
500
  api_name="parse" # This creates the /api/parse endpoint
501
  )
502
 
503
+ # Function to get status
504
+ def get_status():
505
+ if models_initialized:
506
+ return f"✅ OmniParser v2.0 API - Running on {'GPU' if torch.cuda.is_available() else 'CPU'}"
507
+ else:
508
+ return "⚠️ OmniParser v2.0 API - Running in fallback mode (models not loaded)"
509
+
510
  # Update status on load
511
  demo.load(
512
+ fn=get_status,
513
  outputs=status
514
  )
515
 
app_simplified.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import json
4
+ import base64
5
+ import random
6
+ import gradio as gr
7
+ import numpy as np
8
+ import torch
9
+ from PIL import Image, ImageDraw, ImageFont
10
+ from typing import Dict, Any, List
11
+
12
+ # Simplified OmniParser API that doesn't rely on the actual OmniParser repository
13
+ # This is a fallback in case the main app.py has issues with dependencies
14
+
15
+ def process_image(image):
16
+ """
17
+ Simplified implementation that simulates OmniParser functionality
18
+ """
19
+ if image is None:
20
+ return {
21
+ "error": "No image provided",
22
+ "elements": [],
23
+ "visualization": None
24
+ }
25
+
26
+ # Create a copy of the image for visualization
27
+ vis_img = image.copy()
28
+ draw = ImageDraw.Draw(vis_img)
29
+
30
+ # Define some mock UI element types
31
+ element_types = ["Button", "Text Field", "Checkbox", "Dropdown", "Menu Item", "Icon", "Link"]
32
+
33
+ # Generate some random elements
34
+ elements = []
35
+ num_elements = min(15, int(image.width * image.height / 40000)) # Scale with image size
36
+
37
+ for i in range(num_elements):
38
+ # Generate random position and size
39
+ x1 = random.randint(0, image.width - 100)
40
+ y1 = random.randint(0, image.height - 50)
41
+ width = random.randint(50, 200)
42
+ height = random.randint(30, 80)
43
+ x2 = min(x1 + width, image.width)
44
+ y2 = min(y1 + height, image.height)
45
+
46
+ # Generate random element type and caption
47
+ element_type = random.choice(element_types)
48
+ captions = {
49
+ "Button": ["Submit", "Cancel", "OK", "Apply", "Save"],
50
+ "Text Field": ["Enter text", "Username", "Password", "Search", "Email"],
51
+ "Checkbox": ["Select option", "Enable feature", "Remember me", "Agree to terms"],
52
+ "Dropdown": ["Select item", "Choose option", "Select country", "Language"],
53
+ "Menu Item": ["File", "Edit", "View", "Help", "Tools", "Settings"],
54
+ "Icon": ["Home", "Settings", "Profile", "Notification", "Search"],
55
+ "Link": ["Learn more", "Click here", "Details", "Documentation", "Help"]
56
+ }
57
+ text = random.choice(captions[element_type])
58
+ caption = f"{element_type}: {text}"
59
+
60
+ # Add to elements list
61
+ elements.append({
62
+ "id": i,
63
+ "text": text,
64
+ "caption": caption,
65
+ "coordinates": [x1/image.width, y1/image.height, x2/image.width, y2/image.height],
66
+ "is_interactable": element_type in ["Button", "Checkbox", "Dropdown", "Link", "Text Field"],
67
+ "confidence": random.uniform(0.7, 0.95)
68
+ })
69
+
70
+ # Draw on visualization
71
+ draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
72
+ draw.text((x1, y1 - 10), f"{i}: {text}", fill="red")
73
+
74
+ return {
75
+ "elements": elements,
76
+ "visualization": vis_img,
77
+ "note": "This is a simplified implementation that simulates OmniParser functionality."
78
+ }
79
+
80
+ # API endpoint function
81
+ def api_endpoint(image):
82
+ """
83
+ API endpoint that accepts an image and returns parsed elements
84
+
85
+ Args:
86
+ image: Uploaded image file
87
+
88
+ Returns:
89
+ JSON with parsed elements
90
+ """
91
+ if image is None:
92
+ return json.dumps({"error": "No image provided"})
93
+
94
+ try:
95
+ # Process the image
96
+ result = process_image(image)
97
+
98
+ # Check if there was an error
99
+ if "error" in result:
100
+ return json.dumps({
101
+ "status": "error",
102
+ "error": result["error"],
103
+ "elements": []
104
+ })
105
+
106
+ # Convert visualization to base64 for JSON response
107
+ buffered = io.BytesIO()
108
+ result["visualization"].save(buffered, format="PNG")
109
+ img_str = base64.b64encode(buffered.getvalue()).decode()
110
+
111
+ # Create response
112
+ response = {
113
+ "status": "success",
114
+ "note": result.get("note", ""),
115
+ "elements": result["elements"],
116
+ "visualization": img_str
117
+ }
118
+
119
+ return json.dumps(response)
120
+ except Exception as e:
121
+ print(f"API endpoint error: {str(e)}")
122
+ return json.dumps({
123
+ "status": "error",
124
+ "error": f"API processing error: {str(e)}",
125
+ "elements": []
126
+ })
127
+
128
+ # Function to handle UI submission
129
+ def handle_submission(image):
130
+ """Handle UI submission and provide appropriate feedback"""
131
+ if image is None:
132
+ return {"error": "No image provided"}, None
133
+
134
+ # Process the image
135
+ result = process_image(image)
136
+
137
+ # Return the result
138
+ if "error" in result:
139
+ return {"error": result["error"]}, result.get("visualization", None)
140
+ else:
141
+ return {
142
+ "note": result.get("note", ""),
143
+ "elements": result["elements"]
144
+ }, result["visualization"]
145
+
146
+ # Create test image if it doesn't exist
147
+ def create_test_ui_image():
148
+ """Create a simple test UI image with buttons and text"""
149
+ # Create a new image with white background
150
+ width, height = 800, 600
151
+ image = Image.new('RGB', (width, height), color='white')
152
+ draw = ImageDraw.Draw(image)
153
+
154
+ # Try to load a font, use default if not available
155
+ try:
156
+ font = ImageFont.truetype("arial.ttf", 20)
157
+ small_font = ImageFont.truetype("arial.ttf", 16)
158
+ except IOError:
159
+ font = ImageFont.load_default()
160
+ small_font = ImageFont.load_default()
161
+
162
+ # Draw a header
163
+ draw.rectangle([(0, 0), (width, 60)], fill='#4285F4')
164
+ draw.text((20, 15), "Test UI Application", fill='white', font=font)
165
+
166
+ # Draw a sidebar
167
+ draw.rectangle([(0, 60), (200, height)], fill='#F1F1F1')
168
+
169
+ # Draw menu items in sidebar
170
+ menu_items = ["Home", "Profile", "Settings", "Help", "Logout"]
171
+ for i, item in enumerate(menu_items):
172
+ y = 100 + i * 50
173
+ # Highlight one item
174
+ if item == "Settings":
175
+ draw.rectangle([(10, y-10), (190, y+30)], fill='#E1E1E1')
176
+ draw.text((20, y), item, fill='black', font=font)
177
+
178
+ # Draw main content area
179
+ draw.text((220, 80), "Welcome to the Test UI", fill='black', font=font)
180
+
181
+ # Draw a form
182
+ draw.text((220, 150), "Please enter your information:", fill='black', font=font)
183
+
184
+ # Draw form fields
185
+ fields = ["Name", "Email", "Phone"]
186
+ for i, field in enumerate(fields):
187
+ y = 200 + i * 60
188
+ draw.text((220, y), f"{field}:", fill='black', font=font)
189
+ draw.rectangle([(320, y-5), (700, y+25)], outline='black')
190
+
191
+ # Draw buttons
192
+ draw.rectangle([(220, 400), (320, 440)], fill='#4285F4')
193
+ draw.text((240, 410), "Submit", fill='white', font=font)
194
+
195
+ draw.rectangle([(340, 400), (440, 440)], fill='#9E9E9E')
196
+ draw.text((360, 410), "Cancel", fill='white', font=font)
197
+
198
+ # Draw a checkbox
199
+ draw.rectangle([(220, 470), (240, 490)], outline='black')
200
+ draw.text((250, 470), "Remember me", fill='black', font=small_font)
201
+
202
+ # Save the image
203
+ os.makedirs("static", exist_ok=True)
204
+ image_path = "static/test_ui.png"
205
+ image.save(image_path)
206
+ print(f"Test UI image created at {image_path}")
207
+ return image_path
208
+
209
+ # Create test image if it doesn't exist
210
+ try:
211
+ if not os.path.exists("static/test_ui.png"):
212
+ print("Creating test UI image...")
213
+ test_image_path = create_test_ui_image()
214
+ print(f"Test image created at {test_image_path}")
215
+ except Exception as e:
216
+ print(f"Error creating test image: {str(e)}")
217
+
218
+ # Create Gradio interface
219
+ with gr.Blocks() as demo:
220
+ gr.Markdown("""
221
+ # OmniParser v2.0 API (Simplified Version)
222
+
223
+ Upload an image to parse UI elements and get structured data.
224
+
225
+ ## Quick Start
226
+
227
+ You can use the [test UI image](/file=static/test_ui.png) to try out the API, or upload your own UI screenshot.
228
+
229
+ ## API Usage
230
+
231
+ You can use this API by sending a POST request with a file upload to this URL.
232
+
233
+ ```python
234
+ import requests
235
+
236
+ # Replace with your actual API URL after deployment
237
+ OMNIPARSER_API_URL = "https://your-username-omniparser-api.hf.space/api/parse"
238
+
239
+ # Upload a file
240
+ files = {'image': open('screenshot.png', 'rb')}
241
+
242
+ # Send request
243
+ response = requests.post(OMNIPARSER_API_URL, files=files)
244
+
245
+ # Get JSON result
246
+ result = response.json()
247
+ ```
248
+
249
+ ## Note
250
+
251
+ This is a simplified version that simulates OmniParser functionality. It does not use the actual OmniParser models.
252
+ """)
253
+
254
+ with gr.Row():
255
+ with gr.Column():
256
+ image_input = gr.Image(type='pil', label='Upload image')
257
+
258
+ # Function to load test image
259
+ def load_test_image():
260
+ if os.path.exists("static/test_ui.png"):
261
+ return Image.open("static/test_ui.png")
262
+ return None
263
+
264
+ test_image_button = gr.Button(value='Load Test Image')
265
+ test_image_button.click(fn=load_test_image, inputs=[], outputs=[image_input])
266
+
267
+ submit_button = gr.Button(value='Parse Image', variant='primary')
268
+
269
+ # Status message
270
+ status = gr.Markdown("⚠️ OmniParser v2.0 API - Running in simplified mode (without actual models)")
271
+
272
+ with gr.Column():
273
+ json_output = gr.JSON(label='Parsed Elements (JSON)')
274
+ image_output = gr.Image(type='pil', label='Visualization')
275
+
276
+ # Connect the interface
277
+ submit_button.click(
278
+ fn=handle_submission,
279
+ inputs=[image_input],
280
+ outputs=[json_output, image_output],
281
+ api_name="parse" # This creates the /api/parse endpoint
282
+ )
283
+
284
+ # Launch the app
285
+ demo.launch()
requirements.txt CHANGED
@@ -5,8 +5,9 @@ transformers>=4.30.0
5
  pillow>=9.0.0
6
  numpy>=1.24.0
7
  easyocr>=1.7.0
8
- paddleocr>=2.6.0
9
- paddlepaddle>=2.4.0
 
10
  opencv-python>=4.7.0
11
  huggingface_hub>=0.16.0
12
  peft>=0.4.0
 
5
  pillow>=9.0.0
6
  numpy>=1.24.0
7
  easyocr>=1.7.0
8
+ # Use a specific version of paddleocr that works with our patch
9
+ paddleocr==2.6.0.3
10
+ paddlepaddle==2.4.2
11
  opencv-python>=4.7.0
12
  huggingface_hub>=0.16.0
13
  peft>=0.4.0