Mohammed422 committed on
Commit
714bf44
·
1 Parent(s): 81a90a1

small changes

Browse files
Files changed (1) hide show
  1. app.py +343 -39
app.py CHANGED
@@ -1,3 +1,250 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import spaces
3
  from gradio.themes.base import Base
@@ -9,6 +256,7 @@ import os
9
  import json
10
  import fitz # PyMuPDF
11
 
 
12
  # Define a custom theme inheriting from the soft theme
13
  class CustomTheme(Base):
14
  def __init__(self):
@@ -16,10 +264,12 @@ class CustomTheme(Base):
16
  self.primary_hue = "blue"
17
  self.secondary_hue = "sky"
18
 
 
19
  custom_theme = CustomTheme()
20
 
21
  DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
22
 
 
23
  def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
24
  if image_filepath is None:
25
  raise ValueError("No image provided.")
@@ -31,6 +281,7 @@ def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
31
 
32
  return os.path.abspath(image_filepath), img.width, img.height
33
 
 
34
  def convert_pdf_to_images(pdf_path):
35
  """Opens a PDF and converts each page into a high-resolution PNG image."""
36
  image_paths = []
@@ -40,21 +291,21 @@ def convert_pdf_to_images(pdf_path):
40
  for i, page in enumerate(doc):
41
  pix = page.get_pixmap(dpi=200)
42
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43
- image_path = f"{base_name}_page_{i+1}_{timestamp}.png"
44
  pix.save(image_path)
45
  image_paths.append(image_path)
46
 
47
  doc.close()
48
  return image_paths
49
 
 
50
  # Initialize the model and processor
51
  model = Qwen2VLForConditionalGeneration.from_pretrained(
52
- "Qwen/Qwen2-VL-7B-Instruct",
53
- torch_dtype="auto",
54
- device_map="auto"
55
  )
56
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
57
 
 
58
  @spaces.GPU
59
  def run_inference(uploaded_files, text_input):
60
  results = []
@@ -67,7 +318,9 @@ def run_inference(uploaded_files, text_input):
67
  )
68
 
69
  if not uploaded_files:
70
- error_json = json.dumps({"error": "No file provided. Please upload an image or PDF."}, indent=4)
 
 
71
  return error_json, gr.Button(interactive=False)
72
 
73
  image_paths_to_process = []
@@ -76,56 +329,92 @@ def run_inference(uploaded_files, text_input):
76
  file_path = file_obj.name
77
  temp_files_to_clean.append(file_path)
78
 
79
- if file_path.lower().endswith('.pdf'):
80
  pdf_page_images = convert_pdf_to_images(file_path)
81
  image_paths_to_process.extend(pdf_page_images)
82
  temp_files_to_clean.extend(pdf_page_images)
83
- elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp')):
 
 
84
  image_paths_to_process.append(file_path)
85
  else:
86
  unsupported_files.append(os.path.basename(file_path))
87
 
88
  if unsupported_files:
89
  unsupported_str = ", ".join(unsupported_files)
90
- results.append(json.dumps({
91
- "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
92
- "details": "Please upload only images (PNG, JPG, etc.) or PDF files."
93
- }, indent=4))
 
 
 
 
 
94
 
95
  for image_file in image_paths_to_process:
96
  try:
97
  image_path, width, height = array_to_image_path(image_file)
98
 
99
  messages = [
100
- {"role": "user", "content": [
101
- {"type": "image", "image": image_path, "resized_height": height, "resized_width": width},
102
- {"type": "text", "text": json_prompt}
103
- ]}
 
 
 
 
 
 
 
 
104
  ]
105
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
106
  image_inputs, video_inputs = process_vision_info(messages)
107
- inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to("cuda")
 
 
 
 
 
 
108
 
109
  generated_ids = model.generate(**inputs, max_new_tokens=4096)
110
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
111
- raw_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)
 
 
 
 
 
 
 
112
  raw_text = raw_output[0]
113
 
114
  try:
115
- start_index = raw_text.find('{')
116
- end_index = raw_text.rfind('}') + 1
117
  if start_index != -1 and end_index != 0:
118
  json_string = raw_text[start_index:end_index]
119
  parsed_json = json.loads(json_string)
120
- parsed_json['source_page'] = os.path.basename(image_path)
121
  formatted_json = json.dumps(parsed_json, indent=4)
122
  results.append(formatted_json)
123
  else:
124
- results.append(f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
 
 
125
  except json.JSONDecodeError:
126
- results.append(f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
 
 
127
  except Exception as e:
128
- results.append(f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}')
 
 
129
 
130
  for f in temp_files_to_clean:
131
  if os.path.exists(f):
@@ -153,20 +442,30 @@ def generate_explanation(json_text):
153
  )
154
 
155
  messages = [{"role": "user", "content": explanation_prompt}]
156
- text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
 
 
157
  inputs = processor(text=[text], return_tensors="pt").to("cuda")
158
 
159
  generated_ids = model.generate(**inputs, max_new_tokens=2048)
160
- generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
161
- explanation_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
 
 
 
 
 
 
 
162
 
163
  return explanation_output
164
 
165
- # --- FINAL AND MOST ROBUST CSS FIX ---
 
166
  css = """
167
  .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
168
 
169
- /* --- Light Mode Styles --- */
170
  #output-code, #output-code pre, #output-code code {
171
  background-color: #f0f0f0;
172
  border: 1px solid #e0e0e0;
@@ -185,7 +484,7 @@ css = """
185
  border-radius: 7px;
186
  }
187
 
188
- /* --- Dark Mode Overrides targeting Gradio's .dark class --- */
189
  .dark #output-code, .dark #output-code pre, .dark #output-code code {
190
  background-color: #2b2b2b !important;
191
  border: 1px solid #444 !important;
@@ -194,11 +493,9 @@ css = """
194
  .dark #explanation-box {
195
  border: 1px solid #444 !important;
196
  }
197
- /* This is a catch-all to ensure all parts of the syntax start light-colored */
198
  .dark #output-code code span {
199
  color: #f0f0f0 !important;
200
  }
201
- /* Then, we apply specific colors for syntax highlighting on top */
202
  .dark #output-code .token.punctuation { color: #ccc !important; }
203
  .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
204
  .dark #output-code .token.number { color: #add8e6 !important; }
@@ -214,7 +511,7 @@ with gr.Blocks(theme=custom_theme, css=css) as demo:
214
  input_files = gr.Files(label="Upload Images or PDFs")
215
  text_input = gr.Textbox(
216
  label="Your Query",
217
- placeholder="e.g., Extract the total amount from this receipt."
218
  )
219
  submit_btn = gr.Button("Analyze File(s)", variant="primary")
220
 
@@ -223,22 +520,29 @@ with gr.Blocks(theme=custom_theme, css=css) as demo:
223
  label="Full JSON Response",
224
  language="json",
225
  elem_id="output-code",
226
- interactive=False # This makes the output field read-only
 
 
 
 
 
 
227
  )
228
- explanation_btn = gr.Button("📄 Generate Detailed Explanation", interactive=False)
229
- explanation_output = gr.Markdown(label="Detailed Explanation", elem_id="explanation-box")
230
 
 
231
  submit_btn.click(
232
  fn=run_inference,
233
  inputs=[input_files, text_input],
234
- outputs=[output_text, explanation_btn]
 
235
  )
236
 
237
  explanation_btn.click(
238
  fn=generate_explanation,
239
  inputs=[output_text],
240
  outputs=[explanation_output],
241
- show_progress='full'
 
242
  )
243
 
244
  demo.queue()
 
1
+ # import gradio as gr
2
+ # import spaces
3
+ # from gradio.themes.base import Base
4
+ # from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
5
+ # from qwen_vl_utils import process_vision_info
6
+ # from PIL import Image
7
+ # from datetime import datetime
8
+ # import os
9
+ # import json
10
+ # import fitz # PyMuPDF
11
+
12
+ # # Define a custom theme inheriting from the soft theme
13
+ # class CustomTheme(Base):
14
+ # def __init__(self):
15
+ # super().__init__()
16
+ # self.primary_hue = "blue"
17
+ # self.secondary_hue = "sky"
18
+
19
+ # custom_theme = CustomTheme()
20
+
21
+ # DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
22
+
23
+ # def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
24
+ # if image_filepath is None:
25
+ # raise ValueError("No image provided.")
26
+
27
+ # img = Image.open(image_filepath)
28
+ # width, height = img.size
29
+ # if width > max_width or height > max_height:
30
+ # img.thumbnail((max_width, max_height))
31
+
32
+ # return os.path.abspath(image_filepath), img.width, img.height
33
+
34
+ # def convert_pdf_to_images(pdf_path):
35
+ # """Opens a PDF and converts each page into a high-resolution PNG image."""
36
+ # image_paths = []
37
+ # doc = fitz.open(pdf_path)
38
+ # base_name = os.path.splitext(os.path.basename(pdf_path))[0]
39
+
40
+ # for i, page in enumerate(doc):
41
+ # pix = page.get_pixmap(dpi=200)
42
+ # timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
43
+ # image_path = f"{base_name}_page_{i+1}_{timestamp}.png"
44
+ # pix.save(image_path)
45
+ # image_paths.append(image_path)
46
+
47
+ # doc.close()
48
+ # return image_paths
49
+
50
+ # # Initialize the model and processor
51
+ # model = Qwen2VLForConditionalGeneration.from_pretrained(
52
+ # "Qwen/Qwen2-VL-7B-Instruct",
53
+ # torch_dtype="auto",
54
+ # device_map="auto"
55
+ # )
56
+ # processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
57
+
58
+ # @spaces.GPU
59
+ # def run_inference(uploaded_files, text_input):
60
+ # results = []
61
+ # temp_files_to_clean = []
62
+
63
+ # json_prompt = (
64
+ # f"{text_input}\n\nBased on the image and the query, respond ONLY with a single, "
65
+ # "valid JSON object. This object should be well-structured, using nested objects "
66
+ # "and arrays to logically represent the information."
67
+ # )
68
+
69
+ # if not uploaded_files:
70
+ # error_json = json.dumps({"error": "No file provided. Please upload an image or PDF."}, indent=4)
71
+ # return error_json, gr.Button(interactive=False)
72
+
73
+ # image_paths_to_process = []
74
+ # unsupported_files = []
75
+ # for file_obj in uploaded_files:
76
+ # file_path = file_obj.name
77
+ # temp_files_to_clean.append(file_path)
78
+
79
+ # if file_path.lower().endswith('.pdf'):
80
+ # pdf_page_images = convert_pdf_to_images(file_path)
81
+ # image_paths_to_process.extend(pdf_page_images)
82
+ # temp_files_to_clean.extend(pdf_page_images)
83
+ # elif file_path.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.webp')):
84
+ # image_paths_to_process.append(file_path)
85
+ # else:
86
+ # unsupported_files.append(os.path.basename(file_path))
87
+
88
+ # if unsupported_files:
89
+ # unsupported_str = ", ".join(unsupported_files)
90
+ # results.append(json.dumps({
91
+ # "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
92
+ # "details": "Please upload only images (PNG, JPG, etc.) or PDF files."
93
+ # }, indent=4))
94
+
95
+ # for image_file in image_paths_to_process:
96
+ # try:
97
+ # image_path, width, height = array_to_image_path(image_file)
98
+
99
+ # messages = [
100
+ # {"role": "user", "content": [
101
+ # {"type": "image", "image": image_path, "resized_height": height, "resized_width": width},
102
+ # {"type": "text", "text": json_prompt}
103
+ # ]}
104
+ # ]
105
+ # text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
106
+ # image_inputs, video_inputs = process_vision_info(messages)
107
+ # inputs = processor(text=[text], images=image_inputs, videos=video_inputs, padding=True, return_tensors="pt").to("cuda")
108
+
109
+ # generated_ids = model.generate(**inputs, max_new_tokens=4096)
110
+ # generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
111
+ # raw_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)
112
+ # raw_text = raw_output[0]
113
+
114
+ # try:
115
+ # start_index = raw_text.find('{')
116
+ # end_index = raw_text.rfind('}') + 1
117
+ # if start_index != -1 and end_index != 0:
118
+ # json_string = raw_text[start_index:end_index]
119
+ # parsed_json = json.loads(json_string)
120
+ # parsed_json['source_page'] = os.path.basename(image_path)
121
+ # formatted_json = json.dumps(parsed_json, indent=4)
122
+ # results.append(formatted_json)
123
+ # else:
124
+ # results.append(f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
125
+ # except json.JSONDecodeError:
126
+ # results.append(f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}')
127
+ # except Exception as e:
128
+ # results.append(f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}')
129
+
130
+ # for f in temp_files_to_clean:
131
+ # if os.path.exists(f):
132
+ # try:
133
+ # os.remove(f)
134
+ # except OSError as e:
135
+ # print(f"Error deleting file {f}: {e}")
136
+
137
+ # final_json = "\n---\n".join(results)
138
+ # is_error = '"error":' in final_json
139
+ # return final_json, gr.Button(interactive=not is_error)
140
+
141
+
142
+ # @spaces.GPU
143
+ # def generate_explanation(json_text):
144
+ # if not json_text or '"error":' in json_text:
145
+ # return "Cannot generate an explanation. Please produce a valid JSON output first. 🙁"
146
+
147
+ # explanation_prompt = (
148
+ # "You are an expert data analyst. Your task is to provide a comprehensive, human-readable explanation "
149
+ # "of the following JSON data, which may represent one or more pages from a document. First, provide a textual explanation. "
150
+ # "If the JSON contains data from multiple sources (pages), explain each one. Then, if the JSON data represents a table, "
151
+ # "a list of items, or a receipt, you **must** re-format the key information into a Markdown table for clarity.\n\n"
152
+ # f"JSON Data:\n```json\n{json_text}\n```"
153
+ # )
154
+
155
+ # messages = [{"role": "user", "content": explanation_prompt}]
156
+ # text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
157
+ # inputs = processor(text=[text], return_tensors="pt").to("cuda")
158
+
159
+ # generated_ids = model.generate(**inputs, max_new_tokens=2048)
160
+ # generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
161
+ # explanation_output = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=True)[0]
162
+
163
+ # return explanation_output
164
+
165
+ # # --- FINAL AND MOST ROBUST CSS FIX ---
166
+ # css = """
167
+ # .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
168
+
169
+ # /* --- Light Mode Styles --- */
170
+ # #output-code, #output-code pre, #output-code code {
171
+ # background-color: #f0f0f0;
172
+ # border: 1px solid #e0e0e0;
173
+ # border-radius: 7px;
174
+ # color: #333;
175
+ # }
176
+ # #output-code .token.punctuation { color: #393a34; }
177
+ # #output-code .token.property, #output-code .token.string { color: #0b7500; }
178
+ # #output-code .token.number { color: #2973b7; }
179
+ # #output-code .token.boolean { color: #9a050f; }
180
+
181
+ # #explanation-box {
182
+ # min-height: 200px;
183
+ # border: 1px solid #e0e0e0;
184
+ # padding: 15px;
185
+ # border-radius: 7px;
186
+ # }
187
+
188
+ # /* --- Dark Mode Overrides targeting Gradio's .dark class --- */
189
+ # .dark #output-code, .dark #output-code pre, .dark #output-code code {
190
+ # background-color: #2b2b2b !important;
191
+ # border: 1px solid #444 !important;
192
+ # color: #f0f0f0 !important;
193
+ # }
194
+ # .dark #explanation-box {
195
+ # border: 1px solid #444 !important;
196
+ # }
197
+ # /* This is a catch-all to ensure all parts of the syntax start light-colored */
198
+ # .dark #output-code code span {
199
+ # color: #f0f0f0 !important;
200
+ # }
201
+ # /* Then, we apply specific colors for syntax highlighting on top */
202
+ # .dark #output-code .token.punctuation { color: #ccc !important; }
203
+ # .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
204
+ # .dark #output-code .token.number { color: #add8e6 !important; }
205
+ # .dark #output-code .token.boolean { color: #f08080 !important; }
206
+ # """
207
+
208
+ # with gr.Blocks(theme=custom_theme, css=css) as demo:
209
+ # gr.Markdown("# Sparrow Qwen2-VL-7B Vision AI 👁️")
210
+ # gr.Markdown(DESCRIPTION)
211
+
212
+ # with gr.Row():
213
+ # with gr.Column(scale=1):
214
+ # input_files = gr.Files(label="Upload Images or PDFs")
215
+ # text_input = gr.Textbox(
216
+ # label="Your Query",
217
+ # placeholder="e.g., Extract the total amount from this receipt."
218
+ # )
219
+ # submit_btn = gr.Button("Analyze File(s)", variant="primary")
220
+
221
+ # with gr.Column(scale=2):
222
+ # output_text = gr.Code(
223
+ # label="Full JSON Response",
224
+ # language="json",
225
+ # elem_id="output-code",
226
+ # interactive=False # This makes the output field read-only
227
+ # )
228
+ # explanation_btn = gr.Button("📄 Generate Detailed Explanation", interactive=False)
229
+ # explanation_output = gr.Markdown(label="Detailed Explanation", elem_id="explanation-box")
230
+
231
+ # submit_btn.click(
232
+ # fn=run_inference,
233
+ # inputs=[input_files, text_input],
234
+ # outputs=[output_text, explanation_btn]
235
+ # )
236
+
237
+ # explanation_btn.click(
238
+ # fn=generate_explanation,
239
+ # inputs=[output_text],
240
+ # outputs=[explanation_output],
241
+ # show_progress='full'
242
+ # )
243
+
244
+ # demo.queue()
245
+ # demo.launch(debug=True)
246
+
247
+
248
  import gradio as gr
249
  import spaces
250
  from gradio.themes.base import Base
 
256
  import json
257
  import fitz # PyMuPDF
258
 
259
+
260
  # Define a custom theme inheriting from the soft theme
261
  class CustomTheme(Base):
262
  def __init__(self):
 
264
  self.primary_hue = "blue"
265
  self.secondary_hue = "sky"
266
 
267
+
268
  custom_theme = CustomTheme()
269
 
270
  DESCRIPTION = "A powerful vision-language model that can understand images and text to provide detailed analysis."
271
 
272
+
273
  def array_to_image_path(image_filepath, max_width=1250, max_height=1750):
274
  if image_filepath is None:
275
  raise ValueError("No image provided.")
 
281
 
282
  return os.path.abspath(image_filepath), img.width, img.height
283
 
284
+
285
  def convert_pdf_to_images(pdf_path):
286
  """Opens a PDF and converts each page into a high-resolution PNG image."""
287
  image_paths = []
 
291
  for i, page in enumerate(doc):
292
  pix = page.get_pixmap(dpi=200)
293
  timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
294
+ image_path = f"{base_name}_page_{i + 1}_{timestamp}.png"
295
  pix.save(image_path)
296
  image_paths.append(image_path)
297
 
298
  doc.close()
299
  return image_paths
300
 
301
+
302
  # Initialize the model and processor
303
  model = Qwen2VLForConditionalGeneration.from_pretrained(
304
+ "Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto"
 
 
305
  )
306
  processor = AutoProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
307
 
308
+
309
  @spaces.GPU
310
  def run_inference(uploaded_files, text_input):
311
  results = []
 
318
  )
319
 
320
  if not uploaded_files:
321
+ error_json = json.dumps(
322
+ {"error": "No file provided. Please upload an image or PDF."}, indent=4
323
+ )
324
  return error_json, gr.Button(interactive=False)
325
 
326
  image_paths_to_process = []
 
329
  file_path = file_obj.name
330
  temp_files_to_clean.append(file_path)
331
 
332
+ if file_path.lower().endswith(".pdf"):
333
  pdf_page_images = convert_pdf_to_images(file_path)
334
  image_paths_to_process.extend(pdf_page_images)
335
  temp_files_to_clean.extend(pdf_page_images)
336
+ elif file_path.lower().endswith(
337
+ (".png", ".jpg", ".jpeg", ".bmp", ".gif", ".webp")
338
+ ):
339
  image_paths_to_process.append(file_path)
340
  else:
341
  unsupported_files.append(os.path.basename(file_path))
342
 
343
  if unsupported_files:
344
  unsupported_str = ", ".join(unsupported_files)
345
+ results.append(
346
+ json.dumps(
347
+ {
348
+ "error": f"Unsupported file type(s) were ignored: {unsupported_str}",
349
+ "details": "Please upload only images (PNG, JPG, etc.) or PDF files.",
350
+ },
351
+ indent=4,
352
+ )
353
+ )
354
 
355
  for image_file in image_paths_to_process:
356
  try:
357
  image_path, width, height = array_to_image_path(image_file)
358
 
359
  messages = [
360
+ {
361
+ "role": "user",
362
+ "content": [
363
+ {
364
+ "type": "image",
365
+ "image": image_path,
366
+ "resized_height": height,
367
+ "resized_width": width,
368
+ },
369
+ {"type": "text", "text": json_prompt},
370
+ ],
371
+ }
372
  ]
373
+ text = processor.apply_chat_template(
374
+ messages, tokenize=False, add_generation_prompt=True
375
+ )
376
  image_inputs, video_inputs = process_vision_info(messages)
377
+ inputs = processor(
378
+ text=[text],
379
+ images=image_inputs,
380
+ videos=video_inputs,
381
+ padding=True,
382
+ return_tensors="pt",
383
+ ).to("cuda")
384
 
385
  generated_ids = model.generate(**inputs, max_new_tokens=4096)
386
+ generated_ids_trimmed = [
387
+ out_ids[len(in_ids) :]
388
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
389
+ ]
390
+ raw_output = processor.batch_decode(
391
+ generated_ids_trimmed,
392
+ skip_special_tokens=True,
393
+ clean_up_tokenization_spaces=True,
394
+ )
395
  raw_text = raw_output[0]
396
 
397
  try:
398
+ start_index = raw_text.find("{")
399
+ end_index = raw_text.rfind("}") + 1
400
  if start_index != -1 and end_index != 0:
401
  json_string = raw_text[start_index:end_index]
402
  parsed_json = json.loads(json_string)
403
+ parsed_json["source_page"] = os.path.basename(image_path)
404
  formatted_json = json.dumps(parsed_json, indent=4)
405
  results.append(formatted_json)
406
  else:
407
+ results.append(
408
+ f'{{"error": "Model did not return valid JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}'
409
+ )
410
  except json.JSONDecodeError:
411
+ results.append(
412
+ f'{{"error": "Failed to decode JSON.", "source_page": "{os.path.basename(image_path)}", "raw_response": "{raw_text}"}}'
413
+ )
414
  except Exception as e:
415
+ results.append(
416
+ f'{{"error": "An unexpected error occurred during processing.", "details": "{str(e)}"}}'
417
+ )
418
 
419
  for f in temp_files_to_clean:
420
  if os.path.exists(f):
 
442
  )
443
 
444
  messages = [{"role": "user", "content": explanation_prompt}]
445
+ text = processor.apply_chat_template(
446
+ messages, tokenize=False, add_generation_prompt=True
447
+ )
448
  inputs = processor(text=[text], return_tensors="pt").to("cuda")
449
 
450
  generated_ids = model.generate(**inputs, max_new_tokens=2048)
451
+ generated_ids_trimmed = [
452
+ out_ids[len(in_ids) :]
453
+ for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
454
+ ]
455
+ explanation_output = processor.batch_decode(
456
+ generated_ids_trimmed,
457
+ skip_special_tokens=True,
458
+ clean_up_tokenization_spaces=True,
459
+ )[0]
460
 
461
  return explanation_output
462
 
463
+
464
+ # Define the Gradio UI
465
  css = """
466
  .gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
467
 
468
+ /* Default (Light Mode) Styles */
469
  #output-code, #output-code pre, #output-code code {
470
  background-color: #f0f0f0;
471
  border: 1px solid #e0e0e0;
 
484
  border-radius: 7px;
485
  }
486
 
487
+ /* Dark Mode Overrides targeting Gradio's .dark class */
488
  .dark #output-code, .dark #output-code pre, .dark #output-code code {
489
  background-color: #2b2b2b !important;
490
  border: 1px solid #444 !important;
 
493
  .dark #explanation-box {
494
  border: 1px solid #444 !important;
495
  }
 
496
  .dark #output-code code span {
497
  color: #f0f0f0 !important;
498
  }
 
499
  .dark #output-code .token.punctuation { color: #ccc !important; }
500
  .dark #output-code .token.property, .dark #output-code .token.string { color: #90ee90 !important; }
501
  .dark #output-code .token.number { color: #add8e6 !important; }
 
511
  input_files = gr.Files(label="Upload Images or PDFs")
512
  text_input = gr.Textbox(
513
  label="Your Query",
514
+ placeholder="e.g., Extract the total amount from this receipt.",
515
  )
516
  submit_btn = gr.Button("Analyze File(s)", variant="primary")
517
 
 
520
  label="Full JSON Response",
521
  language="json",
522
  elem_id="output-code",
523
+ interactive=False,
524
+ )
525
+ explanation_btn = gr.Button(
526
+ "📄 Generate Detailed Explanation", interactive=False
527
+ )
528
+ explanation_output = gr.Markdown(
529
+ label="Detailed Explanation", elem_id="explanation-box"
530
  )
 
 
531
 
532
+ # Add api_name to create stable API endpoints
533
  submit_btn.click(
534
  fn=run_inference,
535
  inputs=[input_files, text_input],
536
+ outputs=[output_text, explanation_btn],
537
+ api_name="analyze_document",
538
  )
539
 
540
  explanation_btn.click(
541
  fn=generate_explanation,
542
  inputs=[output_text],
543
  outputs=[explanation_output],
544
+ show_progress="full",
545
+ api_name="generate_explanation",
546
  )
547
 
548
  demo.queue()