Toulik committed on
Commit
55e1313
·
verified ·
1 Parent(s): 4975bf7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +324 -286
app.py CHANGED
@@ -1,10 +1,16 @@
1
  # app.py
2
  """
3
- Gradio app: upload PDF / Image -> extract text (PyMuPDF + Tesseract fallback) ->
4
- call GPT-5 (OpenAI new client) to produce machine-parseable metadata JSON (between markers) ->
5
- validate JSON (jsonschema) -> show JSON and allow download.
6
-
7
- Requirements (add to requirements.txt for HF Space or local venv):
 
 
 
 
 
 
8
  gradio>=3.0
9
  PyMuPDF
10
  pytesseract
@@ -12,11 +18,11 @@ Requirements (add to requirements.txt for HF Space or local venv):
12
  openai>=1.0.0
13
  jsonschema
14
 
15
- System packages required (HF Spaces apt-packages):
16
  tesseract-ocr
17
  poppler-utils
18
 
19
- Put OPENAI_API_KEY into your environment/Space Secrets.
20
  """
21
 
22
  import os
@@ -31,20 +37,18 @@ from PIL import Image
31
  import fitz # PyMuPDF
32
  import pytesseract
33
  from jsonschema import validate as json_validate, ValidationError
34
-
35
- # new OpenAI client surface
36
  from openai import OpenAI
37
 
38
  # -----------------------
39
- # Config / client
40
  # -----------------------
41
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
42
  if not OPENAI_API_KEY:
43
- raise RuntimeError("OPENAI_API_KEY not found in environment. Add to HF Space Secrets or env var.")
44
 
45
  client = OpenAI(api_key=OPENAI_API_KEY)
46
 
47
- LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change if you have a different model id
48
  MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
49
 
50
  # -----------------------
@@ -53,17 +57,8 @@ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
53
  METADATA_SCHEMA = {
54
  "type": "object",
55
  "required": [
56
- "doc_id",
57
- "title",
58
- "summary",
59
- "doc_type",
60
- "source",
61
- "tags",
62
- "tag_confidences",
63
- "taxonomy_path",
64
- "extracted_entities",
65
- "raw_url",
66
- "ingest_timestamp",
67
  ],
68
  "properties": {
69
  "doc_id": {"type": "string"},
@@ -82,33 +77,38 @@ METADATA_SCHEMA = {
82
  }
83
 
84
  # -----------------------
85
- # Extraction helpers
86
  # -----------------------
87
- def extract_text_from_pdf(path: str) -> str:
 
88
  try:
89
  doc = fitz.open(path)
90
  except Exception as e:
91
  raise RuntimeError(f"Failed to open PDF: {e}")
92
-
93
  texts: List[str] = []
94
  for i in range(len(doc)):
95
  page = doc.load_page(i)
96
  txt = page.get_text("text").strip()
97
  if txt:
 
98
  texts.append(txt)
99
  else:
100
- # render and OCR
101
  pix = page.get_pixmap(dpi=200)
102
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
103
  pix.save(tmp.name)
104
  ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
 
105
  texts.append(ocr_text)
106
  return "\n\n".join(texts).strip()
107
 
108
 
109
- def extract_text_from_image(path: str) -> str:
 
110
  img = Image.open(path).convert("RGB")
111
- return pytesseract.image_to_string(img).strip()
 
 
112
 
113
 
114
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
@@ -127,17 +127,10 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
127
  return chunks
128
 
129
  # -----------------------
130
- # Utilities for robust upload handling
131
  # -----------------------
132
- def save_uploaded_to_tmp(file_obj):
133
- """
134
- Accepts common Gradio upload types:
135
- - file-like (has .read())
136
- - dict-like {"name": ..., "data": b'...'}
137
- - path string
138
- - objects with .name attribute pointing to a path (NamedString)
139
- Returns (tmp_path, original_filename)
140
- """
141
  # file-like
142
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
143
  try:
@@ -148,13 +141,13 @@ def save_uploaded_to_tmp(file_obj):
148
  suffix = os.path.splitext(name)[1] or ""
149
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
150
  tmp.write(content)
 
151
  return tmp.name, os.path.basename(name)
152
- except Exception:
153
- pass
154
-
155
  # dict-like
156
- if isinstance(file_obj, dict):
157
- if "data" in file_obj and "name" in file_obj:
158
  data = file_obj["data"]
159
  if isinstance(data, str):
160
  data = data.encode("utf-8")
@@ -162,11 +155,14 @@ def save_uploaded_to_tmp(file_obj):
162
  suffix = os.path.splitext(name)[1] or ""
163
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
164
  tmp.write(data)
 
165
  return tmp.name, os.path.basename(name)
166
-
 
167
  # path string
168
  if isinstance(file_obj, str):
169
  if os.path.exists(file_obj):
 
170
  return file_obj, os.path.basename(file_obj)
171
  try:
172
  with open(file_obj, "rb") as f:
@@ -174,11 +170,11 @@ def save_uploaded_to_tmp(file_obj):
174
  suffix = os.path.splitext(file_obj)[1] or ""
175
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
176
  tmp.write(data)
 
177
  return tmp.name, os.path.basename(file_obj)
178
- except Exception:
179
- pass
180
-
181
- # object with .name attribute referencing existing path
182
  name = getattr(file_obj, "name", None)
183
  if name and isinstance(name, str):
184
  try:
@@ -187,21 +183,16 @@ def save_uploaded_to_tmp(file_obj):
187
  suffix = os.path.splitext(name)[1] or ""
188
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
189
  tmp.write(data)
 
190
  return tmp.name, os.path.basename(name)
191
- except Exception:
192
- pass
193
-
194
  raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
195
 
196
-
197
  # -----------------------
198
- # JSON extraction & validation helpers
199
  # -----------------------
200
  def extract_json_from_text(text: str) -> str:
201
- """
202
- Prefer explicit markers <<BEGIN_JSON>> ... <<END_JSON>>.
203
- Otherwise try to get the last {...} block, then first {...} block.
204
- """
205
  m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
206
  if m:
207
  return m.group(1).strip()
@@ -215,79 +206,42 @@ def extract_json_from_text(text: str) -> str:
215
 
216
 
217
  def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
218
- """
219
- Returns (ok, parsed_dict_or_none, error_message_or_empty)
220
- """
221
  try:
222
  parsed = json.loads(json_text)
223
  except Exception as e:
224
  return False, None, f"json.loads error: {e}"
225
-
226
  try:
227
  json_validate(parsed, METADATA_SCHEMA)
228
  except ValidationError as e:
229
  return False, parsed, f"schema validation error: {e}"
230
  except Exception as e:
231
- # other validation errors
232
  return False, parsed, f"schema validation unexpected error: {e}"
233
-
234
  return True, parsed, ""
235
 
236
-
237
  # -----------------------
238
- # LLM call with retries + repair logic
239
  # -----------------------
240
- def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], max_attempts: int = 3) -> Dict[str, Any]:
241
- """
242
- Robust LLM call:
243
- - uses system message to enforce JSON-only output between markers
244
- - retries up to max_attempts
245
- - if model returns partial/invalid JSON, asks model to repair it
246
- - validates the JSON against METADATA_SCHEMA
247
- Returns:
248
- - valid metadata dict OR dict with keys like _parsing_error/raw_output for UI consumption
249
- """
250
  system_msg = (
251
  "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
252
- "When producing output for this task you MUST return ONLY a JSON object and NOTHING ELSE. "
253
- "Wrap the JSON in explicit markers: <<BEGIN_JSON>> and <<END_JSON>>. "
254
- "Do not include any commentary, explanation, or text outside those markers."
255
- )
256
-
257
- prompt_intro = (
258
- f"Document title: {title}\n\n"
259
- f"Short document text (first ~1000 chars): {short_text}\n\n"
260
- "Top content chunks (short):\n"
261
  )
262
-
263
  prompt_chunks = ""
264
  for i, c in enumerate(top_chunks[:6]):
265
  chunk_text_clean = c[:800].replace("\n", " ")
266
  prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
267
-
268
  prompt_end = (
269
- "Task: Produce a single JSON object with EXACT keys:\n"
270
- "doc_id, title, summary, doc_type, source, tags (array of strings), tag_confidences (map tag->float), "
271
- "taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp\n\n"
272
- "Guidelines:\n"
273
- "- summary: 1-2 sentences.\n"
274
- "- doc_type: short enum-like string (e.g., architecture_comparison).\n"
275
- "- tags: up to 8 short tags like arch:docai.\n"
276
- "- tag_confidences: floats 0-1 for each tag.\n"
277
- "- taxonomy_path: hierarchical list.\n\n"
278
- "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>.\n"
279
  )
280
-
281
- user_prompt = prompt_intro + prompt_chunks + prompt_end
282
-
283
- messages = [
284
- {"role": "system", "content": system_msg},
285
- {"role": "user", "content": user_prompt},
286
- ]
287
-
288
  last_raw = None
289
 
290
  for attempt in range(1, max_attempts + 1):
 
291
  try:
292
  resp = client.chat.completions.create(
293
  model=LLM_MODEL,
@@ -295,147 +249,79 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], m
295
  max_completion_tokens=MAX_COMPLETION_TOKENS,
296
  )
297
  except Exception as e:
298
- return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}
 
299
 
300
- # extract text
301
  try:
302
- text = resp.choices[0].message["content"].strip()
303
  except Exception:
304
  try:
305
- text = resp.choices[0].message.content.strip()
306
  except Exception:
307
- text = str(resp)
308
-
309
- last_raw = text
310
 
311
- # extract the JSON
312
- json_text = extract_json_from_text(text)
313
  if not json_text:
314
- # prepare a repair prompt and retry if attempts left
315
  if attempt < max_attempts:
316
- fix_prompt = (
317
- "The previous response did not include a JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> markers, "
318
- "or returned invalid JSON. Here is the raw output:\n\n"
319
- f"{text}\n\n"
320
- "Please return ONLY a valid JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
321
- "Do not include anything else."
322
- )
323
  messages = [
324
  {"role": "system", "content": system_msg},
325
- {"role": "user", "content": fix_prompt},
326
  ]
327
  continue
328
  else:
329
- return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found between markers or as object."}
330
 
331
  ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
332
  if ok:
333
- return parsed_or_partial
 
 
334
  else:
335
- # parsed_or_partial may be dict (parsed but schema-failed) or None
336
  if attempt < max_attempts:
337
- repair_prompt = (
338
- "The JSON you returned is invalid or does not meet the schema. Here is the JSON you returned:\n\n"
339
- f"{json_text}\n\n"
340
- "Please return ONLY a corrected JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> that includes the required keys: "
341
- "doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
342
- "If you must guess missing fields, use reasonable defaults (empty string or empty list/map)."
343
- )
344
  messages = [
345
  {"role": "system", "content": system_msg},
346
- {"role": "user", "content": repair_prompt},
347
  ]
348
  continue
349
  else:
350
- return {
351
- "_parsing_error": True,
352
- "raw_output": last_raw,
353
- "parsed_partial": parsed_or_partial,
354
- "parse_error": parse_err,
355
- }
356
 
357
- return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
358
 
359
-
360
- # -----------------------
361
- # process file (save -> extract -> chunk -> call LLM)
362
- # -----------------------
363
- def process_file(file_obj) -> Dict[str, Any]:
364
- try:
365
- tmp_path, orig_name = save_uploaded_to_tmp(file_obj)
366
- except Exception as e:
367
- return {"error": f"Failed to save uploaded file: {e}"}
368
-
369
- # extract text
370
- try:
371
- if orig_name.lower().endswith(".pdf"):
372
- extracted_text = extract_text_from_pdf(tmp_path)
373
  else:
374
- extracted_text = extract_text_from_image(tmp_path)
375
- except Exception as e:
376
- return {"error": f"Text extraction failed: {e}"}
377
-
378
- if not extracted_text:
379
- return {"error": "No text found in document after extraction."}
380
-
381
- chunks = chunk_text(extracted_text)
382
- sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
383
- top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
384
-
385
- short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
386
-
387
- metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks, max_attempts=3)
388
-
389
- # If API error
390
- if metadata.get("_api_error"):
391
- return {"error": metadata.get("error")}
392
-
393
- # If parsing/validation error, include raw_output so UI can show & repair
394
- if metadata.get("_parsing_error"):
395
- return {
396
- "error": "LLM output parsing failed. See raw_output.",
397
- "raw_output": metadata.get("raw_output"),
398
- "parsed_partial": metadata.get("parsed_partial"),
399
- "parse_error": metadata.get("parse_error"),
400
- }
401
-
402
- # Ensure minimal keys and timestamp
403
- now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
404
- metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
405
- metadata.setdefault("title", orig_name)
406
- metadata.setdefault("source", "user_upload")
407
- metadata.setdefault("raw_url", "")
408
- metadata.setdefault("ingest_timestamp", now)
409
-
410
- return metadata
411
 
412
-
413
- # -----------------------
414
- # Repair-only function (user-triggered) - repair raw_output into valid JSON
415
- # -----------------------
416
- def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
417
- """
418
- Send the raw output back to the model and ask for corrected JSON between markers.
419
- This function is useful if the initial parsing failed and you want a manual 'Repair' button in UI.
420
- """
421
  system_msg = (
422
- "You are an automated assistant. The user previously received a response that was intended to be a JSON object "
423
- "but it may be malformed or contain extra text. Your job: RETURN ONLY a corrected JSON object wrapped between "
424
- "<<BEGIN_JSON>> and <<END_JSON>>. Do NOT include any other text."
425
  )
426
-
427
  repair_prompt = (
428
- "Here is the raw output that failed to parse:\n\n"
429
- f"{raw_output}\n\n"
430
- "Please return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
431
- "Ensure the object contains keys: doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
432
- "If a field is missing, use a reasonable default (empty string, empty list, or empty map)."
433
  )
434
-
435
- messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt}]
436
-
437
  last_raw = None
438
  for attempt in range(1, max_attempts + 1):
 
439
  try:
440
  resp = client.chat.completions.create(
441
  model=LLM_MODEL,
@@ -443,116 +329,268 @@ def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
443
  max_completion_tokens=MAX_COMPLETION_TOKENS,
444
  )
445
  except Exception as e:
446
- return {"_api_error": True, "error": f"OpenAI API call failed: {e}"}
447
-
448
  try:
449
- text = resp.choices[0].message["content"].strip()
450
  except Exception:
451
  try:
452
- text = resp.choices[0].message.content.strip()
453
  except Exception:
454
- text = str(resp)
455
-
456
- last_raw = text
457
- json_text = extract_json_from_text(text)
458
  if not json_text:
 
459
  if attempt < max_attempts:
460
- messages = [
461
- {"role": "system", "content": system_msg},
462
- {"role": "user", "content": "Your previous reply did not include a JSON block. Please return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
463
- ]
464
  continue
465
  else:
466
- return {"_parsing_error": True, "raw_output": last_raw, "error": "no JSON found after repair attempts"}
467
-
468
  ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
469
  if ok:
470
- return parsed_or_partial
 
471
  else:
 
472
  if attempt < max_attempts:
473
- messages = [
474
- {"role": "system", "content": system_msg},
475
- {"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
476
- ]
477
  continue
478
  else:
479
- return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err}
 
480
 
481
- return {"_parsing_error": True, "raw_output": last_raw or "", "error": "exhausted retries"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
 
483
  # -----------------------
484
  # Gradio UI
485
  # -----------------------
486
- with gr.Blocks(title="DocClassify — Gradio GPT-5 Taxonomy & Tagging") as demo:
487
- gr.Markdown("## 📂 Upload a PDF or Image the app will classify, tag, and propose a taxonomy using GPT-5")
488
  with gr.Row():
489
  with gr.Column(scale=1):
490
  uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
491
  run_button = gr.Button("Process document")
492
  status = gr.Textbox(label="Status", value="", interactive=False)
493
  download_button = gr.File(label="Download metadata JSON", visible=False)
494
- repair_button = gr.Button("Repair last raw output", visible=True)
 
 
 
495
  with gr.Column(scale=1):
496
- output_json = gr.JSON(label="Document metadata (JSON)")
497
- raw_output_box = gr.Textbox(label="Raw LLM output / parse errors", interactive=False)
498
-
499
- # State holders
500
- last_raw_state = gr.State(value=None) # stores raw_output when parsing fails
501
- last_metadata_file = gr.State(value=None) # stores path to last generated metadata file (for download)
502
-
503
- def on_process(file_obj, last_raw_state):
504
- status = "Processing..."
505
- # initial empty responses
506
- empty_val = {}
 
507
  try:
508
  result = process_file(file_obj)
509
  except Exception as e:
510
- return empty_val, f"Failed: {e}", None, None
511
-
 
 
512
  if result.get("error"):
513
- # if LLM returned parsing error, store raw_output in state and show it
514
- raw = result.get("raw_output", "")
515
- # prepare displayed payload that includes the error note
516
- display_obj = {"error": result.get("error")}
517
- if result.get("parsed_partial") is not None:
518
- display_obj["parsed_partial"] = result.get("parsed_partial")
519
- # Save raw_output to state for potential repair
520
- return display_obj, f"Error: {result.get('error')}", None, raw
521
-
522
- # success: return JSON and create downloadable temp file
 
523
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
524
  with open(tmpf.name, "w", encoding="utf8") as f:
525
- json.dump(result, f, indent=2, ensure_ascii=False)
526
-
527
- return result, "Done", tmpf.name, None
528
-
529
- def on_repair(raw_output):
530
- if not raw_output:
531
- return {}, "No raw_output available to repair.", None
532
- try:
533
- repaired = repair_raw_output(raw_output, max_attempts=2)
534
- except Exception as e:
535
- return {}, f"Repair failed: {e}", None
536
-
537
  if repaired.get("_api_error"):
538
- return {}, f"Repair API error: {repaired.get('error')}", None
539
-
540
  if repaired.get("_parsing_error"):
541
- # still failed; show raw_output and parsed_partial
542
- display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial")}
543
- return display, "Repair failed: parsing error", None
544
-
545
- # success -> create download file
546
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
547
  with open(tmpf.name, "w", encoding="utf8") as f:
548
- json.dump(repaired, f, indent=2, ensure_ascii=False)
549
-
550
- return repaired, "Repair succeeded", tmpf.name
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
551
 
552
- # Wire up buttons
553
- run_button.click(on_process, inputs=[uploader, last_raw_state], outputs=[output_json, status, download_button, raw_output_box])
554
- repair_button.click(on_repair, inputs=[raw_output_box], outputs=[output_json, status, download_button])
555
 
556
- # launch
557
  if __name__ == "__main__":
558
  demo.launch()
 
1
  # app.py
2
  """
3
+ Final Gradio app robust document tagging + automated taxonomy via GPT-5 (OpenAI new client).
4
+ Features:
5
+ - Upload PDF or Image
6
+ - Extract text (PyMuPDF + Tesseract fallback)
7
+ - Chunk text, call GPT-5 to produce JSON metadata between markers <<BEGIN_JSON>><<END_JSON>>
8
+ - Validate JSON with jsonschema
9
+ - Automatic repair attempts + manual-repair (paste raw output)
10
+ - Detailed step-by-step logs displayed on the UI and full GPT response shown
11
+ - Download metadata JSON on success
12
+
13
+ Requirements (requirements.txt):
14
  gradio>=3.0
15
  PyMuPDF
16
  pytesseract
 
18
  openai>=1.0.0
19
  jsonschema
20
 
21
+ System packages (apt-packages for HF Spaces):
22
  tesseract-ocr
23
  poppler-utils
24
 
25
+ Put OPENAI_API_KEY into HF Space Secrets or environment.
26
  """
27
 
28
  import os
 
37
  import fitz # PyMuPDF
38
  import pytesseract
39
  from jsonschema import validate as json_validate, ValidationError
 
 
40
  from openai import OpenAI
41
 
42
  # -----------------------
43
+ # Config & OpenAI client
44
  # -----------------------
45
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
46
  if not OPENAI_API_KEY:
47
+ raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to HF Space Secrets or env var.")
48
 
49
  client = OpenAI(api_key=OPENAI_API_KEY)
50
 
51
+ LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5") # change if needed
52
  MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
53
 
54
  # -----------------------
 
57
  METADATA_SCHEMA = {
58
  "type": "object",
59
  "required": [
60
+ "doc_id", "title", "summary", "doc_type", "source", "tags",
61
+ "tag_confidences", "taxonomy_path", "extracted_entities", "raw_url", "ingest_timestamp"
 
 
 
 
 
 
 
 
 
62
  ],
63
  "properties": {
64
  "doc_id": {"type": "string"},
 
77
  }
78
 
79
  # -----------------------
80
+ # Helpers: extraction & chunking
81
  # -----------------------
82
+ def extract_text_from_pdf(path: str, log: List[str]) -> str:
83
+ log.append(f"Opening PDF: {path}")
84
  try:
85
  doc = fitz.open(path)
86
  except Exception as e:
87
  raise RuntimeError(f"Failed to open PDF: {e}")
 
88
  texts: List[str] = []
89
  for i in range(len(doc)):
90
  page = doc.load_page(i)
91
  txt = page.get_text("text").strip()
92
  if txt:
93
+ log.append(f"Page {i+1}: text extracted ({len(txt)} chars)")
94
  texts.append(txt)
95
  else:
96
+ log.append(f"Page {i+1}: no text found, performing OCR fallback")
97
  pix = page.get_pixmap(dpi=200)
98
  with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
99
  pix.save(tmp.name)
100
  ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
101
+ log.append(f"Page {i+1}: OCR extracted ({len(ocr_text)} chars)")
102
  texts.append(ocr_text)
103
  return "\n\n".join(texts).strip()
104
 
105
 
106
+ def extract_text_from_image(path: str, log: List[str]) -> str:
107
+ log.append(f"OCR on image: {path}")
108
  img = Image.open(path).convert("RGB")
109
+ txt = pytesseract.image_to_string(img).strip()
110
+ log.append(f"OCR extracted ({len(txt)} chars)")
111
+ return txt
112
 
113
 
114
  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
 
127
  return chunks
128
 
129
  # -----------------------
130
+ # Upload handling
131
  # -----------------------
132
+ def save_uploaded_to_tmp(file_obj, log: List[str]):
133
+ log.append(f"Saving uploaded object of type {type(file_obj)}")
 
 
 
 
 
 
 
134
  # file-like
135
  if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
136
  try:
 
141
  suffix = os.path.splitext(name)[1] or ""
142
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
143
  tmp.write(content)
144
+ log.append(f"Saved uploaded file-like as {tmp.name}")
145
  return tmp.name, os.path.basename(name)
146
+ except Exception as e:
147
+ log.append(f"file-like save failed: {e}")
 
148
  # dict-like
149
+ if isinstance(file_obj, dict) and "data" in file_obj and "name" in file_obj:
150
+ try:
151
  data = file_obj["data"]
152
  if isinstance(data, str):
153
  data = data.encode("utf-8")
 
155
  suffix = os.path.splitext(name)[1] or ""
156
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
157
  tmp.write(data)
158
+ log.append(f"Saved dict-like upload as {tmp.name}")
159
  return tmp.name, os.path.basename(name)
160
+ except Exception as e:
161
+ log.append(f"dict-like save failed: {e}")
162
  # path string
163
  if isinstance(file_obj, str):
164
  if os.path.exists(file_obj):
165
+ log.append(f"Upload was path string existing on disk: {file_obj}")
166
  return file_obj, os.path.basename(file_obj)
167
  try:
168
  with open(file_obj, "rb") as f:
 
170
  suffix = os.path.splitext(file_obj)[1] or ""
171
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
172
  tmp.write(data)
173
+ log.append(f"Copied path-string file to {tmp.name}")
174
  return tmp.name, os.path.basename(file_obj)
175
+ except Exception as e:
176
+ log.append(f"path-string handling failed: {e}")
177
+ # object with .name attr
 
178
  name = getattr(file_obj, "name", None)
179
  if name and isinstance(name, str):
180
  try:
 
183
  suffix = os.path.splitext(name)[1] or ""
184
  with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
185
  tmp.write(data)
186
+ log.append(f"Saved file from .name attr to {tmp.name}")
187
  return tmp.name, os.path.basename(name)
188
+ except Exception as e:
189
+ log.append(f".name-based save failed: {e}")
 
190
  raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
191
 
 
192
  # -----------------------
193
+ # JSON extraction & validation
194
  # -----------------------
195
  def extract_json_from_text(text: str) -> str:
 
 
 
 
196
  m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
197
  if m:
198
  return m.group(1).strip()
 
206
 
207
 
208
  def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
 
 
 
209
  try:
210
  parsed = json.loads(json_text)
211
  except Exception as e:
212
  return False, None, f"json.loads error: {e}"
 
213
  try:
214
  json_validate(parsed, METADATA_SCHEMA)
215
  except ValidationError as e:
216
  return False, parsed, f"schema validation error: {e}"
217
  except Exception as e:
 
218
  return False, parsed, f"schema validation unexpected error: {e}"
 
219
  return True, parsed, ""
220
 
 
221
  # -----------------------
222
+ # LLM interactions (metadata, repair, autocomplete)
223
  # -----------------------
224
+ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
225
+ log.append("Preparing prompt for metadata generation")
 
 
 
 
 
 
 
 
226
  system_msg = (
227
  "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
228
+ "Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
 
 
 
 
 
 
 
 
229
  )
230
+ prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
231
  prompt_chunks = ""
232
  for i, c in enumerate(top_chunks[:6]):
233
  chunk_text_clean = c[:800].replace("\n", " ")
234
  prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
 
235
  prompt_end = (
236
+ "Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
237
+ "tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp. "
238
+ "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>."
 
 
 
 
 
 
 
239
  )
240
+ messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt_intro + prompt_chunks + prompt_end}]
 
 
 
 
 
 
 
241
  last_raw = None
242
 
243
  for attempt in range(1, max_attempts + 1):
244
+ log.append(f"Calling OpenAI (attempt {attempt})")
245
  try:
246
  resp = client.chat.completions.create(
247
  model=LLM_MODEL,
 
249
  max_completion_tokens=MAX_COMPLETION_TOKENS,
250
  )
251
  except Exception as e:
252
+ log.append(f"OpenAI API call failed: {e}")
253
+ return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
254
 
255
+ # extract full model response text for UI logs
256
  try:
257
+ full_text = resp.choices[0].message["content"].strip()
258
  except Exception:
259
  try:
260
+ full_text = resp.choices[0].message.content.strip()
261
  except Exception:
262
+ full_text = str(resp)
263
+ last_raw = full_text
264
+ log.append("OpenAI response received (raw length: " + str(len(full_text)) + ")")
265
 
266
+ # attempt to extract JSON
267
+ json_text = extract_json_from_text(full_text)
268
  if not json_text:
269
+ log.append("No JSON found in response")
270
  if attempt < max_attempts:
 
 
 
 
 
 
 
271
  messages = [
272
  {"role": "system", "content": system_msg},
273
+ {"role": "user", "content": "Previous response lacked JSON markers. Return only JSON between <<BEGIN_JSON>> and <<END_JSON>>."},
274
  ]
275
  continue
276
  else:
277
+ return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
278
 
279
  ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
280
  if ok:
281
+ log.append("JSON parsed and validated successfully")
282
+ # attach model raw response as well
283
+ return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
284
  else:
285
+ log.append(f"JSON parsed but schema validation failed: {parse_err}")
286
  if attempt < max_attempts:
 
 
 
 
 
 
 
287
  messages = [
288
  {"role": "system", "content": system_msg},
289
+ {"role": "user", "content": "The JSON you returned is invalid vs schema. Return corrected JSON only between markers."},
290
  ]
291
  continue
292
  else:
293
+ return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
 
 
 
 
 
294
 
295
+ return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
296
 
297
def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
    """Recover a valid metadata JSON from a malformed model response.

    If ``manual_pasted_json`` is provided, it is parsed and schema-validated
    directly (no API call). Otherwise the LLM is asked, up to ``max_attempts``
    times, to repair ``raw_output``.

    Args:
        raw_output: The malformed raw model output to repair (may be None/empty).
        manual_pasted_json: JSON text the user pasted manually, if any.
        log: Shared step-by-step log list; appended to in place.
        max_attempts: Maximum number of model repair attempts.

    Returns:
        ``{"metadata": ..., "log": ..., "raw_response": ...}`` on success, or a
        dict flagged with ``"_api_error"`` / ``"_parsing_error"`` on failure.
    """
    log.append("Starting repair flow")
    # If the user pasted JSON manually, try to parse + validate it directly.
    if manual_pasted_json:
        log.append("User provided manual pasted JSON — trying to parse and validate")
        jtxt = extract_json_from_text(manual_pasted_json) or manual_pasted_json
        ok, parsed, err = try_parse_and_validate(jtxt)
        if ok:
            log.append("Manual pasted JSON validated successfully")
            return {"metadata": parsed, "log": log, "raw_response": manual_pasted_json}
        log.append(f"Manual pasted JSON validation failed: {err}")
        return {"_parsing_error": True, "raw_output": manual_pasted_json, "parsed_partial": parsed, "parse_error": err, "log": log}

    # Otherwise instruct the model to repair the raw_output.
    system_msg = (
        "You are an assistant that must extract and/or correct a malformed JSON from the user's raw_output. "
        "Return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
    )
    repair_prompt = (
        "Here is the raw output (possibly containing a malformed JSON). Extract and return a corrected JSON object "
        "containing keys: doc_id,title,summary,doc_type,source,tags,tag_confidences,taxonomy_path,extracted_entities,raw_url,ingest_timestamp. "
        "If fields are missing, use reasonable defaults (empty string, empty list or empty map)."
    )
    messages = [
        {"role": "system", "content": system_msg},
        {"role": "user", "content": repair_prompt + "\n\nRaw output:\n\n" + (raw_output or "")},
    ]
    last_raw = None
    for attempt in range(1, max_attempts + 1):
        log.append(f"Repair attempt {attempt}")
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            log.append(f"Repair API call failed: {e}")
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
        # openai>=1.0 returns message objects with a .content attribute
        # (the old dict-style message["content"] subscript always raised).
        try:
            full_text = (resp.choices[0].message.content or "").strip()
        except Exception:
            full_text = str(resp)
        last_raw = full_text
        log.append("Repair model response received (raw length: " + str(len(full_text)) + ")")
        json_text = extract_json_from_text(full_text)
        if not json_text:
            log.append("Repair response contained no JSON")
            if attempt < max_attempts:
                # BUGFIX: keep the full conversation (including the raw output)
                # so the model still has something to repair on retry; the
                # previous version replaced `messages` and dropped all context.
                messages.append({"role": "assistant", "content": full_text})
                messages.append({"role": "user", "content": "Your previous reply did not include the JSON. Return ONLY the corrected JSON between markers."})
                continue
            return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
        ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
        if ok:
            log.append("Repair produced valid JSON")
            return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
        log.append(f"Repair produced JSON but validation failed: {parse_err}")
        if attempt < max_attempts:
            messages.append({"role": "assistant", "content": full_text})
            messages.append({"role": "user", "content": "Your JSON is invalid. Please correct and return ONLY the corrected JSON between markers."})
            continue
        return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
    # Defensive fallback (loop always returns above when max_attempts >= 1).
    return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
363
 
364
def auto_complete_partial(parsed_partial: Dict[str, Any], orig_name: str, extracted_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
    """Ask the LLM to fill in missing fields of a partially parsed metadata JSON.

    Args:
        parsed_partial: JSON object that parsed but failed schema validation.
        orig_name: Original uploaded filename, used as context.
        extracted_text: Full extracted document text (truncated in the prompt).
        top_chunks: Most informative document chunks for context.
        log: Shared step-by-step log list; appended to in place.
        max_attempts: Maximum number of model attempts.

    Returns:
        ``{"metadata": ..., "log": ..., "raw_response": ...}`` on success, or a
        dict flagged with ``"_api_error"`` / ``"_parsing_error"`` on failure.
    """
    log.append("Starting auto-complete for parsed partial")
    system_msg = (
        "You are an assistant that must fill missing metadata fields for a document. "
        "Return ONLY a single JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> with the exact keys: "
        "doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
        "If you cannot infer a value, use reasonable defaults."
    )
    partial_str = json.dumps(parsed_partial, ensure_ascii=False)
    # Keep the prompt bounded: only a short excerpt plus a few chunks.
    short_text = (extracted_text[:1200] + "...") if len(extracted_text) > 1200 else extracted_text
    prompt = f"Original filename: {orig_name}\n\nPreviously parsed partial JSON:\n{partial_str}\n\nDocument short text:\n{short_text}\n\nTop chunks:\n"
    for i, c in enumerate(top_chunks[:6]):
        prompt += f"CHUNK_{i+1}: {c[:900].replace(chr(10), ' ')}\n\n"
    prompt += ("Task: Fill any missing or empty fields in the JSON above using the document context. "
               "Return ONLY the completed JSON wrapped between <<BEGIN_JSON>> and <<END_JSON>>.")
    messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt}]
    last_raw = None
    for attempt in range(1, max_attempts + 1):
        log.append(f"Auto-complete attempt {attempt}")
        try:
            resp = client.chat.completions.create(
                model=LLM_MODEL,
                messages=messages,
                max_completion_tokens=MAX_COMPLETION_TOKENS,
            )
        except Exception as e:
            log.append(f"Auto-complete API call failed: {e}")
            return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log}
        # openai>=1.0 message objects expose .content (dict subscript raises).
        try:
            full_text = (resp.choices[0].message.content or "").strip()
        except Exception:
            full_text = str(resp)
        last_raw = full_text
        log.append("Auto-complete model response received")
        json_text = extract_json_from_text(full_text)
        if not json_text:
            log.append("Auto-complete response had no JSON")
            if attempt < max_attempts:
                # BUGFIX: retry with the full conversation instead of a bare
                # instruction — the previous version threw away the partial
                # JSON and all document context on retry.
                messages.append({"role": "assistant", "content": full_text})
                messages.append({"role": "user", "content": "Return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."})
                continue
            return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
        ok, parsed_or_partial2, parse_err = try_parse_and_validate(json_text)
        if ok:
            log.append("Auto-complete succeeded and validated")
            return {"metadata": parsed_or_partial2, "log": log, "raw_response": full_text}
        log.append(f"Auto-complete produced JSON but validation failed: {parse_err}")
        if attempt < max_attempts:
            messages.append({"role": "assistant", "content": full_text})
            messages.append({"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."})
            continue
        return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial2, "parse_error": parse_err, "log": log, "raw_response": full_text}
    # Defensive fallback (loop always returns above when max_attempts >= 1).
    return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
421
+
422
+ # -----------------------
423
+ # Orchestration: process file
424
+ # -----------------------
425
def process_file(file_obj):
    """End-to-end pipeline: save upload -> extract text -> chunk -> LLM metadata.

    Args:
        file_obj: The Gradio upload object (file-like / tempfile wrapper).

    Returns:
        On success: ``{"metadata": ..., "log": [...], "raw_response": ...}``.
        On failure: ``{"error": ..., "log": [...], "raw_response": ...}`` plus
        optional ``raw_output`` / ``parsed_partial`` / ``parse_error`` keys
        for manual repair in the UI.
    """
    ui_log: List[str] = []

    def _merge_log(other):
        # The helper calls receive ui_log, mutate it in place, AND return it
        # as "log". Guard against extending ui_log with itself, which would
        # duplicate every line (bug in the previous version:
        # `ui_log += result.get("log", [])`).
        if other and other is not ui_log:
            ui_log.extend(other)

    def _apply_defaults(metadata):
        # Fill bookkeeping fields the model may have omitted.
        now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
        metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
        metadata.setdefault("title", orig_name)
        metadata.setdefault("source", "user_upload")
        metadata.setdefault("raw_url", "")
        metadata.setdefault("ingest_timestamp", now)
        return metadata

    try:
        tmp_path, orig_name = save_uploaded_to_tmp(file_obj, ui_log)
    except Exception as e:
        ui_log.append(f"Failed to save upload: {e}")
        return {"error": f"Failed to save uploaded file: {e}", "log": ui_log, "raw_response": ""}

    try:
        if orig_name.lower().endswith(".pdf"):
            extracted_text = extract_text_from_pdf(tmp_path, ui_log)
        else:
            extracted_text = extract_text_from_image(tmp_path, ui_log)
    except Exception as e:
        ui_log.append(f"Text extraction failed: {e}")
        return {"error": f"Text extraction failed: {e}", "log": ui_log, "raw_response": ""}

    if not extracted_text:
        ui_log.append("No text found after extraction.")
        return {"error": "No text found in document after extraction.", "log": ui_log, "raw_response": ""}

    chunks = chunk_text(extracted_text)
    ui_log.append(f"Document split into {len(chunks)} chunks")
    # Prefer the longest chunks as the most information-dense LLM context.
    sorted_chunks = sorted(chunks, key=len, reverse=True)
    top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
    short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text

    # Primary LLM call.
    result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=2)

    if result.get("_api_error"):
        _merge_log(result.get("log"))
        return {"error": result.get("error"), "log": ui_log, "raw_response": result.get("raw_response")}

    if result.get("_parsing_error"):
        _merge_log(result.get("log"))
        raw_out = result.get("raw_output", result.get("raw_response", ""))
        parsed_partial = result.get("parsed_partial", {})
        ui_log.append("Initial parse failed; attempting auto-complete if partial available")
        if parsed_partial:
            ac = auto_complete_partial(parsed_partial, orig_name, extracted_text, top_chunks, ui_log, max_attempts=2)
            _merge_log(ac.get("log"))
            if ac.get("_api_error"):
                return {"error": "Auto-complete API error", "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
            if ac.get("_parsing_error"):
                return {"error": "LLM output parsing failed. See raw_output.", "raw_output": ac.get("raw_output", raw_out), "parsed_partial": ac.get("parsed_partial"), "parse_error": ac.get("parse_error"), "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
            # Auto-complete success.
            metadata = _apply_defaults(ac.get("metadata"))
            ui_log.append("Auto-complete produced metadata")
            return {"metadata": metadata, "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
        ui_log.append("No parsed_partial to auto-complete; returning raw output for manual repair")
        return {"error": "LLM output parsing failed. See raw_output.", "raw_output": raw_out, "parsed_partial": parsed_partial, "parse_error": result.get("parse_error"), "log": ui_log, "raw_response": result.get("raw_response", raw_out)}

    # Success path.
    _merge_log(result.get("log"))
    metadata = _apply_defaults(result.get("metadata"))
    ui_log.append("Metadata generation successful")
    return {"metadata": metadata, "log": ui_log, "raw_response": result.get("raw_response")}
501
 
502
  # -----------------------
503
  # Gradio UI
504
  # -----------------------
505
# -----------------------
# Gradio UI
# -----------------------
with gr.Blocks(title="DocClassify — Final Robust") as demo:
    gr.Markdown("## 📂 Upload PDF / Image automated taxonomy & tagging (GPT-5). Logs & GPT response shown below.")
    with gr.Row():
        with gr.Column(scale=1):
            uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
            run_button = gr.Button("Process document")
            status = gr.Textbox(label="Status", value="", interactive=False)
            download_button = gr.File(label="Download metadata JSON", visible=False)
            gr.Markdown("### Manual repair (paste raw LLM output if needed)")
            manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
            repair_from_paste_btn = gr.Button("Repair from pasted raw output")
            repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
        with gr.Column(scale=1):
            output_json = gr.JSON(label="Metadata JSON (parsed)")
            raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
            logs_box = gr.Textbox(label="Step-by-step logs", lines=12, interactive=False)

    def _metadata_to_tmpfile(metadata):
        # Serialize metadata to a temp .json so gr.File can offer a download.
        tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
        with open(tmpf.name, "w", encoding="utf8") as f:
            json.dump(metadata, f, indent=2, ensure_ascii=False)
        return tmpf.name

    def _hidden_file():
        # Keep the download component hidden (error / no-result cases).
        return gr.update(value=None, visible=False)

    def _visible_file(path):
        # BUGFIX: the download component is created with visible=False; it must
        # be explicitly shown on success, otherwise the user can never download.
        return gr.update(value=path, visible=True)

    def on_process(file_obj):
        if not file_obj:
            return {}, "No file uploaded", _hidden_file(), "", ""
        try:
            result = process_file(file_obj)
        except Exception as e:
            return {}, f"Failed: {e}", _hidden_file(), "", f"Exception: {e}"
        logs = result.get("log", [])
        raw_response = result.get("raw_response", "")
        if result.get("error"):
            # Surface the raw output so the user can manually repair it.
            raw_out = result.get("raw_output", raw_response) or ""
            parsed_partial = result.get("parsed_partial")
            display = {"error": result.get("error")}
            if parsed_partial is not None:
                display["parsed_partial"] = parsed_partial
            logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
            return display, f"Error: {result.get('error')}", _hidden_file(), raw_out, logs_text
        metadata = result.get("metadata")
        return metadata, "Done", _visible_file(_metadata_to_tmpfile(metadata)), raw_response or "", "\n".join(logs)

    def on_repair_from_paste(manual_text):
        if not manual_text:
            return {}, "No pasted raw output provided.", _hidden_file(), "", "No pasted raw output provided."
        ui_log = ["Repair-from-paste initiated"]
        repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
        logs_text = "\n".join(repaired.get("log", ui_log))
        if repaired.get("_api_error"):
            return {}, f"Repair API error: {repaired.get('error')}", _hidden_file(), repaired.get("raw_response", manual_text), logs_text
        if repaired.get("_parsing_error"):
            display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
            return display, "Repair failed", _hidden_file(), repaired.get("raw_response", manual_text), logs_text
        metadata = repaired.get("metadata")
        return metadata, "Repair succeeded", _visible_file(_metadata_to_tmpfile(metadata)), repaired.get("raw_response", manual_text), logs_text

    def on_repair_auto(raw_response_text):
        if not raw_response_text:
            return {}, "No raw_response available for auto repair. Run process or paste raw output.", _hidden_file(), "", "No raw_response available."
        ui_log = ["Auto repair initiated"]
        repaired = repair_raw_output(raw_output=raw_response_text, manual_pasted_json=None, log=ui_log, max_attempts=2)
        logs_text = "\n".join(repaired.get("log", ui_log))
        if repaired.get("_api_error"):
            return {}, f"Repair API error: {repaired.get('error')}", _hidden_file(), repaired.get("raw_response", raw_response_text), logs_text
        if repaired.get("_parsing_error"):
            display = {"error": "Auto-repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
            return display, "Auto-repair failed", _hidden_file(), repaired.get("raw_response", raw_response_text), logs_text
        metadata = repaired.get("metadata")
        return metadata, "Auto-repair succeeded", _visible_file(_metadata_to_tmpfile(metadata)), repaired.get("raw_response", raw_response_text), logs_text

    run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button, raw_output_box, logs_box])
    repair_from_paste_btn.click(on_repair_from_paste, inputs=[manual_raw_input], outputs=[output_json, status, download_button, raw_output_box, logs_box])
    repair_auto_btn.click(on_repair_auto, inputs=[raw_output_box], outputs=[output_json, status, download_button, raw_output_box, logs_box])
594
 
 
595
# Entry point: launch the Gradio app when executed directly (e.g. `python app.py`).
if __name__ == "__main__":
    demo.launch()