Toulik committed on
Commit
4515495
·
verified ·
1 Parent(s): 55e1313

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +96 -22
app.py CHANGED
@@ -219,29 +219,58 @@ def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
219
  return True, parsed, ""
220
 
221
  # -----------------------
222
- # LLM interactions (metadata, repair, autocomplete)
223
  # -----------------------
224
- def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
225
- log.append("Preparing prompt for metadata generation")
 
 
 
 
 
 
226
  system_msg = (
227
- "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
 
228
  "Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
229
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
230
  prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
231
  prompt_chunks = ""
232
  for i, c in enumerate(top_chunks[:6]):
233
  chunk_text_clean = c[:800].replace("\n", " ")
234
  prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
 
235
  prompt_end = (
236
  "Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
237
- "tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp. "
238
- "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>."
239
  )
240
- messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt_intro + prompt_chunks + prompt_end}]
 
 
 
 
 
241
  last_raw = None
242
 
243
  for attempt in range(1, max_attempts + 1):
244
- log.append(f"Calling OpenAI (attempt {attempt})")
245
  try:
246
  resp = client.chat.completions.create(
247
  model=LLM_MODEL,
@@ -249,10 +278,10 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
249
  max_completion_tokens=MAX_COMPLETION_TOKENS,
250
  )
251
  except Exception as e:
252
- log.append(f"OpenAI API call failed: {e}")
253
  return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
254
 
255
- # extract full model response text for UI logs
256
  try:
257
  full_text = resp.choices[0].message["content"].strip()
258
  except Exception:
@@ -261,39 +290,87 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
261
  except Exception:
262
  full_text = str(resp)
263
  last_raw = full_text
264
- log.append("OpenAI response received (raw length: " + str(len(full_text)) + ")")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
265
 
266
- # attempt to extract JSON
267
  json_text = extract_json_from_text(full_text)
268
  if not json_text:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  log.append("No JSON found in response")
270
  if attempt < max_attempts:
271
  messages = [
272
  {"role": "system", "content": system_msg},
273
- {"role": "user", "content": "Previous response lacked JSON markers. Return only JSON between <<BEGIN_JSON>> and <<END_JSON>>."},
274
  ]
275
  continue
276
  else:
277
  return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
278
 
 
279
  ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
280
  if ok:
281
  log.append("JSON parsed and validated successfully")
282
- # attach model raw response as well
283
  return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
284
  else:
285
  log.append(f"JSON parsed but schema validation failed: {parse_err}")
 
 
 
 
 
 
 
 
 
 
 
 
286
  if attempt < max_attempts:
287
  messages = [
288
  {"role": "system", "content": system_msg},
289
- {"role": "user", "content": "The JSON you returned is invalid vs schema. Return corrected JSON only between markers."},
290
  ]
291
  continue
292
  else:
293
  return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
294
 
295
- return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
296
 
 
 
 
297
  def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
298
  log.append("Starting repair flow")
299
  # if manual JSON pasted by user, try parse+validate directly
@@ -450,7 +527,7 @@ def process_file(file_obj):
450
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
451
 
452
  # Primary LLM call
453
- result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=2)
454
 
455
  # If API error
456
  if result.get("_api_error"):
@@ -511,13 +588,13 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
511
  status = gr.Textbox(label="Status", value="", interactive=False)
512
  download_button = gr.File(label="Download metadata JSON", visible=False)
513
  gr.Markdown("### Manual repair (paste raw LLM output if needed)")
514
- manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
515
  repair_from_paste_btn = gr.Button("Repair from pasted raw output")
516
  repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
517
  with gr.Column(scale=1):
518
  output_json = gr.JSON(label="Metadata JSON (parsed)")
519
  raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
520
- logs_box = gr.Textbox(label="Step-by-step logs", lines=12, interactive=False)
521
 
522
  # state holders
523
  last_raw_state = gr.State(value=None) # store last raw model response
@@ -541,7 +618,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
541
  display = {"error": result.get("error")}
542
  if parsed_partial is not None:
543
  display["parsed_partial"] = parsed_partial
544
- # put logs and raw_response into UI
545
  logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
546
  return display, f"Error: {result.get('error')}", None, raw_out, logs_text
547
  # success -> create temp file for download
@@ -555,7 +631,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
555
  def on_repair_from_paste(manual_text):
556
  if not manual_text:
557
  return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
558
- # try repair using model (or direct parse)
559
  ui_log = ["Repair-from-paste initiated"]
560
  repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
561
  logs_text = "\n".join(repaired.get("log", ui_log))
@@ -564,7 +639,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
564
  if repaired.get("_parsing_error"):
565
  display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
566
  return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
567
- # success
568
  metadata = repaired.get("metadata")
569
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
570
  with open(tmpf.name, "w", encoding="utf8") as f:
 
219
  return True, parsed, ""
220
 
221
  # -----------------------
222
+ # Improved call_gpt5_for_metadata (prevents tool invocation; includes example; retries with document_text)
223
  # -----------------------
224
+ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 3):
225
+ """
226
+ Robust metadata generation:
227
+ - Prevents tool invocation by instruction
228
+ - Includes example JSON
229
+ - Retries with explicit document_text if model returns tool-like MISSING_INPUT objects
230
+ - Logs full model response
231
+ """
232
  system_msg = (
233
+ "You are an assistant that must PRODUCE a JSON metadata object for the uploaded document. "
234
+ "Do NOT attempt to call any external APIs or tools. Do NOT return status/error objects from other services. "
235
  "Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
236
  )
237
+
238
+ example_json = {
239
+ "doc_id": "example_001",
240
+ "title": "Example Title",
241
+ "summary": "Short summary of the document in 1-2 sentences.",
242
+ "doc_type": "architecture_comparison",
243
+ "source": "user_upload",
244
+ "tags": ["arch:docai", "topic:ocr-parsing"],
245
+ "tag_confidences": {"arch:docai": 0.95, "topic:ocr-parsing": 0.9},
246
+ "taxonomy_path": ["Technology", "Document Processing", "OCR & Parsing"],
247
+ "extracted_entities": {"platforms": ["GCP", "BigQuery"], "tools": ["DocAI"]},
248
+ "raw_url": "",
249
+ "ingest_timestamp": "2025-09-19T09:13:00+05:30"
250
+ }
251
+ example_block = "Example JSON (use this schema, but fill with values from the document):\n<<BEGIN_JSON>>\n" + json.dumps(example_json, ensure_ascii=False, indent=2) + "\n<<END_JSON>>\n\n"
252
+
253
  prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
254
  prompt_chunks = ""
255
  for i, c in enumerate(top_chunks[:6]):
256
  chunk_text_clean = c[:800].replace("\n", " ")
257
  prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
258
+
259
  prompt_end = (
260
  "Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
261
+ "tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp.\n"
262
+ "Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Do not add any commentary."
263
  )
264
+
265
+ messages = [
266
+ {"role": "system", "content": system_msg},
267
+ {"role": "user", "content": example_block + prompt_intro + prompt_chunks + prompt_end},
268
+ ]
269
+
270
  last_raw = None
271
 
272
  for attempt in range(1, max_attempts + 1):
273
+ log.append(f"OpenAI call attempt {attempt}")
274
  try:
275
  resp = client.chat.completions.create(
276
  model=LLM_MODEL,
 
278
  max_completion_tokens=MAX_COMPLETION_TOKENS,
279
  )
280
  except Exception as e:
281
+ log.append(f"OpenAI API call failed on attempt {attempt}: {e}")
282
  return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
283
 
284
+ # capture full model response text for UI logs
285
  try:
286
  full_text = resp.choices[0].message["content"].strip()
287
  except Exception:
 
290
  except Exception:
291
  full_text = str(resp)
292
  last_raw = full_text
293
+ log.append(f"OpenAI response received (len={len(full_text)})")
294
+ log.append("---- FULL MODEL RESPONSE START ----")
295
+ log.append(full_text)
296
+ log.append("---- FULL MODEL RESPONSE END ----")
297
+
298
+ # If model returned empty, retry with explicit document_text included
299
+ if not full_text:
300
+ log.append("Model returned empty response — will retry with explicit document_text provided.")
301
+ if attempt < max_attempts:
302
+ messages = [
303
+ {"role": "system", "content": system_msg},
304
+ {"role": "user", "content": example_block + "Providing document_text to avoid missing-input errors.\n\ndocument_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
305
+ ]
306
+ continue
307
+ else:
308
+ return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
309
 
310
+ # Try extract JSON
311
  json_text = extract_json_from_text(full_text)
312
  if not json_text:
313
+ # try detect tool-like error in JSON
314
+ try:
315
+ maybe_obj = json.loads(full_text)
316
+ if isinstance(maybe_obj, dict) and any("document" in str(v).lower() or "missing_input" in str(v).lower() for v in maybe_obj.values()):
317
+ log.append("Model returned an error-like dict referencing 'document' or 'missing_input'. Retrying with explicit document_text.")
318
+ if attempt < max_attempts:
319
+ messages = [
320
+ {"role": "system", "content": system_msg},
321
+ {"role": "user", "content": example_block + "The model output looked like an error requiring a 'document_text' parameter. "
322
+ + "Provide the document_text here explicitly and return the metadata JSON.\n\n"
323
+ + "document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
324
+ ]
325
+ continue
326
+ else:
327
+ return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
328
+ except Exception:
329
+ pass
330
+
331
  log.append("No JSON found in response")
332
  if attempt < max_attempts:
333
  messages = [
334
  {"role": "system", "content": system_msg},
335
+ {"role": "user", "content": "Previous response lacked a JSON block. Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Use the example format."}
336
  ]
337
  continue
338
  else:
339
  return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
340
 
341
+ # Validate JSON
342
  ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
343
  if ok:
344
  log.append("JSON parsed and validated successfully")
 
345
  return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
346
  else:
347
  log.append(f"JSON parsed but schema validation failed: {parse_err}")
348
+ # If parsed JSON is a tool-style error, retry with explicit document_text
349
+ if isinstance(parsed_or_partial, dict) and parsed_or_partial.get("status") == "error" and ("MISSING_INPUT" in str(parsed_or_partial.get("error_code", "")).upper() or "document" in str(parsed_or_partial.get("message", "")).lower()):
350
+ log.append("Detected tool-like MISSING_INPUT response inside JSON. Retrying with explicit document_text.")
351
+ if attempt < max_attempts:
352
+ messages = [
353
+ {"role": "system", "content": system_msg},
354
+ {"role": "user", "content": example_block + "The previous response contained an error object asking for document_text. "
355
+ + "Please produce the metadata JSON now. document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
356
+ ]
357
+ continue
358
+ else:
359
+ return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
360
  if attempt < max_attempts:
361
  messages = [
362
  {"role": "system", "content": system_msg},
363
+ {"role": "user", "content": "Your JSON is invalid vs schema. Return corrected JSON only between markers, using the example format."}
364
  ]
365
  continue
366
  else:
367
  return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
368
 
369
+ return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": last_raw}
370
 
371
+ # -----------------------
372
+ # Other LLM helpers: repair + auto-complete (same as before)
373
+ # -----------------------
374
  def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
375
  log.append("Starting repair flow")
376
  # if manual JSON pasted by user, try parse+validate directly
 
527
  short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
528
 
529
  # Primary LLM call
530
+ result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=3)
531
 
532
  # If API error
533
  if result.get("_api_error"):
 
588
  status = gr.Textbox(label="Status", value="", interactive=False)
589
  download_button = gr.File(label="Download metadata JSON", visible=False)
590
  gr.Markdown("### Manual repair (paste raw LLM output if needed)")
591
+ manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response here if you need manual repair")
592
  repair_from_paste_btn = gr.Button("Repair from pasted raw output")
593
  repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
594
  with gr.Column(scale=1):
595
  output_json = gr.JSON(label="Metadata JSON (parsed)")
596
  raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
597
+ logs_box = gr.Textbox(label="Step-by-step logs", lines=18, interactive=False)
598
 
599
  # state holders
600
  last_raw_state = gr.State(value=None) # store last raw model response
 
618
  display = {"error": result.get("error")}
619
  if parsed_partial is not None:
620
  display["parsed_partial"] = parsed_partial
 
621
  logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
622
  return display, f"Error: {result.get('error')}", None, raw_out, logs_text
623
  # success -> create temp file for download
 
631
  def on_repair_from_paste(manual_text):
632
  if not manual_text:
633
  return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
 
634
  ui_log = ["Repair-from-paste initiated"]
635
  repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
636
  logs_text = "\n".join(repaired.get("log", ui_log))
 
639
  if repaired.get("_parsing_error"):
640
  display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
641
  return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
 
642
  metadata = repaired.get("metadata")
643
  tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
644
  with open(tmpf.name, "w", encoding="utf8") as f: