Update app.py
Browse files
app.py
CHANGED
|
@@ -219,29 +219,58 @@ def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
|
|
| 219 |
return True, parsed, ""
|
| 220 |
|
| 221 |
# -----------------------
|
| 222 |
-
#
|
| 223 |
# -----------------------
|
| 224 |
-
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int =
|
| 225 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
system_msg = (
|
| 227 |
-
"You are an
|
|
|
|
| 228 |
"Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
| 229 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 230 |
prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
|
| 231 |
prompt_chunks = ""
|
| 232 |
for i, c in enumerate(top_chunks[:6]):
|
| 233 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 234 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
|
|
|
| 235 |
prompt_end = (
|
| 236 |
"Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
|
| 237 |
-
"tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp.
|
| 238 |
-
"
|
| 239 |
)
|
| 240 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 241 |
last_raw = None
|
| 242 |
|
| 243 |
for attempt in range(1, max_attempts + 1):
|
| 244 |
-
log.append(f"
|
| 245 |
try:
|
| 246 |
resp = client.chat.completions.create(
|
| 247 |
model=LLM_MODEL,
|
|
@@ -249,10 +278,10 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
|
|
| 249 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 250 |
)
|
| 251 |
except Exception as e:
|
| 252 |
-
log.append(f"OpenAI API call failed: {e}")
|
| 253 |
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 254 |
|
| 255 |
-
#
|
| 256 |
try:
|
| 257 |
full_text = resp.choices[0].message["content"].strip()
|
| 258 |
except Exception:
|
|
@@ -261,39 +290,87 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
|
|
| 261 |
except Exception:
|
| 262 |
full_text = str(resp)
|
| 263 |
last_raw = full_text
|
| 264 |
-
log.append("OpenAI response received (
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
|
| 266 |
-
#
|
| 267 |
json_text = extract_json_from_text(full_text)
|
| 268 |
if not json_text:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
log.append("No JSON found in response")
|
| 270 |
if attempt < max_attempts:
|
| 271 |
messages = [
|
| 272 |
{"role": "system", "content": system_msg},
|
| 273 |
-
{"role": "user", "content": "Previous response lacked JSON
|
| 274 |
]
|
| 275 |
continue
|
| 276 |
else:
|
| 277 |
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 278 |
|
|
|
|
| 279 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 280 |
if ok:
|
| 281 |
log.append("JSON parsed and validated successfully")
|
| 282 |
-
# attach model raw response as well
|
| 283 |
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 284 |
else:
|
| 285 |
log.append(f"JSON parsed but schema validation failed: {parse_err}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 286 |
if attempt < max_attempts:
|
| 287 |
messages = [
|
| 288 |
{"role": "system", "content": system_msg},
|
| 289 |
-
{"role": "user", "content": "
|
| 290 |
]
|
| 291 |
continue
|
| 292 |
else:
|
| 293 |
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 294 |
|
| 295 |
-
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response":
|
| 296 |
|
|
|
|
|
|
|
|
|
|
| 297 |
def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
|
| 298 |
log.append("Starting repair flow")
|
| 299 |
# if manual JSON pasted by user, try parse+validate directly
|
|
@@ -450,7 +527,7 @@ def process_file(file_obj):
|
|
| 450 |
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 451 |
|
| 452 |
# Primary LLM call
|
| 453 |
-
result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=
|
| 454 |
|
| 455 |
# If API error
|
| 456 |
if result.get("_api_error"):
|
|
@@ -511,13 +588,13 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
|
| 511 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 512 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 513 |
gr.Markdown("### Manual repair (paste raw LLM output if needed)")
|
| 514 |
-
manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
|
| 515 |
repair_from_paste_btn = gr.Button("Repair from pasted raw output")
|
| 516 |
repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
|
| 517 |
with gr.Column(scale=1):
|
| 518 |
output_json = gr.JSON(label="Metadata JSON (parsed)")
|
| 519 |
raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
|
| 520 |
-
logs_box = gr.Textbox(label="Step-by-step logs", lines=
|
| 521 |
|
| 522 |
# state holders
|
| 523 |
last_raw_state = gr.State(value=None) # store last raw model response
|
|
@@ -541,7 +618,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
|
| 541 |
display = {"error": result.get("error")}
|
| 542 |
if parsed_partial is not None:
|
| 543 |
display["parsed_partial"] = parsed_partial
|
| 544 |
-
# put logs and raw_response into UI
|
| 545 |
logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
|
| 546 |
return display, f"Error: {result.get('error')}", None, raw_out, logs_text
|
| 547 |
# success -> create temp file for download
|
|
@@ -555,7 +631,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
|
| 555 |
def on_repair_from_paste(manual_text):
|
| 556 |
if not manual_text:
|
| 557 |
return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
|
| 558 |
-
# try repair using model (or direct parse)
|
| 559 |
ui_log = ["Repair-from-paste initiated"]
|
| 560 |
repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
|
| 561 |
logs_text = "\n".join(repaired.get("log", ui_log))
|
|
@@ -564,7 +639,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
|
| 564 |
if repaired.get("_parsing_error"):
|
| 565 |
display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 566 |
return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
|
| 567 |
-
# success
|
| 568 |
metadata = repaired.get("metadata")
|
| 569 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 570 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
|
|
|
| 219 |
return True, parsed, ""
|
| 220 |
|
| 221 |
# -----------------------
|
| 222 |
+
# Improved call_gpt5_for_metadata (prevents tool invocation; includes example; retries with document_text)
|
| 223 |
# -----------------------
|
| 224 |
+
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 3):
|
| 225 |
+
"""
|
| 226 |
+
Robust metadata generation:
|
| 227 |
+
- Prevents tool invocation by instruction
|
| 228 |
+
- Includes example JSON
|
| 229 |
+
- Retries with explicit document_text if model returns tool-like MISSING_INPUT objects
|
| 230 |
+
- Logs full model response
|
| 231 |
+
"""
|
| 232 |
system_msg = (
|
| 233 |
+
"You are an assistant that must PRODUCE a JSON metadata object for the uploaded document. "
|
| 234 |
+
"Do NOT attempt to call any external APIs or tools. Do NOT return status/error objects from other services. "
|
| 235 |
"Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
| 236 |
)
|
| 237 |
+
|
| 238 |
+
example_json = {
|
| 239 |
+
"doc_id": "example_001",
|
| 240 |
+
"title": "Example Title",
|
| 241 |
+
"summary": "Short summary of the document in 1-2 sentences.",
|
| 242 |
+
"doc_type": "architecture_comparison",
|
| 243 |
+
"source": "user_upload",
|
| 244 |
+
"tags": ["arch:docai", "topic:ocr-parsing"],
|
| 245 |
+
"tag_confidences": {"arch:docai": 0.95, "topic:ocr-parsing": 0.9},
|
| 246 |
+
"taxonomy_path": ["Technology", "Document Processing", "OCR & Parsing"],
|
| 247 |
+
"extracted_entities": {"platforms": ["GCP", "BigQuery"], "tools": ["DocAI"]},
|
| 248 |
+
"raw_url": "",
|
| 249 |
+
"ingest_timestamp": "2025-09-19T09:13:00+05:30"
|
| 250 |
+
}
|
| 251 |
+
example_block = "Example JSON (use this schema, but fill with values from the document):\n<<BEGIN_JSON>>\n" + json.dumps(example_json, ensure_ascii=False, indent=2) + "\n<<END_JSON>>\n\n"
|
| 252 |
+
|
| 253 |
prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
|
| 254 |
prompt_chunks = ""
|
| 255 |
for i, c in enumerate(top_chunks[:6]):
|
| 256 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 257 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
| 258 |
+
|
| 259 |
prompt_end = (
|
| 260 |
"Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
|
| 261 |
+
"tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp.\n"
|
| 262 |
+
"Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Do not add any commentary."
|
| 263 |
)
|
| 264 |
+
|
| 265 |
+
messages = [
|
| 266 |
+
{"role": "system", "content": system_msg},
|
| 267 |
+
{"role": "user", "content": example_block + prompt_intro + prompt_chunks + prompt_end},
|
| 268 |
+
]
|
| 269 |
+
|
| 270 |
last_raw = None
|
| 271 |
|
| 272 |
for attempt in range(1, max_attempts + 1):
|
| 273 |
+
log.append(f"OpenAI call attempt {attempt}")
|
| 274 |
try:
|
| 275 |
resp = client.chat.completions.create(
|
| 276 |
model=LLM_MODEL,
|
|
|
|
| 278 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 279 |
)
|
| 280 |
except Exception as e:
|
| 281 |
+
log.append(f"OpenAI API call failed on attempt {attempt}: {e}")
|
| 282 |
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 283 |
|
| 284 |
+
# capture full model response text for UI logs
|
| 285 |
try:
|
| 286 |
full_text = resp.choices[0].message["content"].strip()
|
| 287 |
except Exception:
|
|
|
|
| 290 |
except Exception:
|
| 291 |
full_text = str(resp)
|
| 292 |
last_raw = full_text
|
| 293 |
+
log.append(f"OpenAI response received (len={len(full_text)})")
|
| 294 |
+
log.append("---- FULL MODEL RESPONSE START ----")
|
| 295 |
+
log.append(full_text)
|
| 296 |
+
log.append("---- FULL MODEL RESPONSE END ----")
|
| 297 |
+
|
| 298 |
+
# If model returned empty, retry with explicit document_text included
|
| 299 |
+
if not full_text:
|
| 300 |
+
log.append("Model returned empty response — will retry with explicit document_text provided.")
|
| 301 |
+
if attempt < max_attempts:
|
| 302 |
+
messages = [
|
| 303 |
+
{"role": "system", "content": system_msg},
|
| 304 |
+
{"role": "user", "content": example_block + "Providing document_text to avoid missing-input errors.\n\ndocument_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
|
| 305 |
+
]
|
| 306 |
+
continue
|
| 307 |
+
else:
|
| 308 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 309 |
|
| 310 |
+
# Try extract JSON
|
| 311 |
json_text = extract_json_from_text(full_text)
|
| 312 |
if not json_text:
|
| 313 |
+
# try detect tool-like error in JSON
|
| 314 |
+
try:
|
| 315 |
+
maybe_obj = json.loads(full_text)
|
| 316 |
+
if isinstance(maybe_obj, dict) and any("document" in str(v).lower() or "missing_input" in str(v).lower() for v in maybe_obj.values()):
|
| 317 |
+
log.append("Model returned an error-like dict referencing 'document' or 'missing_input'. Retrying with explicit document_text.")
|
| 318 |
+
if attempt < max_attempts:
|
| 319 |
+
messages = [
|
| 320 |
+
{"role": "system", "content": system_msg},
|
| 321 |
+
{"role": "user", "content": example_block + "The model output looked like an error requiring a 'document_text' parameter. "
|
| 322 |
+
+ "Provide the document_text here explicitly and return the metadata JSON.\n\n"
|
| 323 |
+
+ "document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
|
| 324 |
+
]
|
| 325 |
+
continue
|
| 326 |
+
else:
|
| 327 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 328 |
+
except Exception:
|
| 329 |
+
pass
|
| 330 |
+
|
| 331 |
log.append("No JSON found in response")
|
| 332 |
if attempt < max_attempts:
|
| 333 |
messages = [
|
| 334 |
{"role": "system", "content": system_msg},
|
| 335 |
+
{"role": "user", "content": "Previous response lacked a JSON block. Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Use the example format."}
|
| 336 |
]
|
| 337 |
continue
|
| 338 |
else:
|
| 339 |
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 340 |
|
| 341 |
+
# Validate JSON
|
| 342 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 343 |
if ok:
|
| 344 |
log.append("JSON parsed and validated successfully")
|
|
|
|
| 345 |
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 346 |
else:
|
| 347 |
log.append(f"JSON parsed but schema validation failed: {parse_err}")
|
| 348 |
+
# If parsed JSON is a tool-style error, retry with explicit document_text
|
| 349 |
+
if isinstance(parsed_or_partial, dict) and parsed_or_partial.get("status") == "error" and ("MISSING_INPUT" in str(parsed_or_partial.get("error_code", "")).upper() or "document" in str(parsed_or_partial.get("message", "")).lower()):
|
| 350 |
+
log.append("Detected tool-like MISSING_INPUT response inside JSON. Retrying with explicit document_text.")
|
| 351 |
+
if attempt < max_attempts:
|
| 352 |
+
messages = [
|
| 353 |
+
{"role": "system", "content": system_msg},
|
| 354 |
+
{"role": "user", "content": example_block + "The previous response contained an error object asking for document_text. "
|
| 355 |
+
+ "Please produce the metadata JSON now. document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
|
| 356 |
+
]
|
| 357 |
+
continue
|
| 358 |
+
else:
|
| 359 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 360 |
if attempt < max_attempts:
|
| 361 |
messages = [
|
| 362 |
{"role": "system", "content": system_msg},
|
| 363 |
+
{"role": "user", "content": "Your JSON is invalid vs schema. Return corrected JSON only between markers, using the example format."}
|
| 364 |
]
|
| 365 |
continue
|
| 366 |
else:
|
| 367 |
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 368 |
|
| 369 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": last_raw}
|
| 370 |
|
| 371 |
+
# -----------------------
|
| 372 |
+
# Other LLM helpers: repair + auto-complete (same as before)
|
| 373 |
+
# -----------------------
|
| 374 |
def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
|
| 375 |
log.append("Starting repair flow")
|
| 376 |
# if manual JSON pasted by user, try parse+validate directly
|
|
|
|
| 527 |
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 528 |
|
| 529 |
# Primary LLM call
|
| 530 |
+
result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=3)
|
| 531 |
|
| 532 |
# If API error
|
| 533 |
if result.get("_api_error"):
|
|
|
|
| 588 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 589 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 590 |
gr.Markdown("### Manual repair (paste raw LLM output if needed)")
|
| 591 |
+
manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response here if you need manual repair")
|
| 592 |
repair_from_paste_btn = gr.Button("Repair from pasted raw output")
|
| 593 |
repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
|
| 594 |
with gr.Column(scale=1):
|
| 595 |
output_json = gr.JSON(label="Metadata JSON (parsed)")
|
| 596 |
raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
|
| 597 |
+
logs_box = gr.Textbox(label="Step-by-step logs", lines=18, interactive=False)
|
| 598 |
|
| 599 |
# state holders
|
| 600 |
last_raw_state = gr.State(value=None) # store last raw model response
|
|
|
|
| 618 |
display = {"error": result.get("error")}
|
| 619 |
if parsed_partial is not None:
|
| 620 |
display["parsed_partial"] = parsed_partial
|
|
|
|
| 621 |
logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
|
| 622 |
return display, f"Error: {result.get('error')}", None, raw_out, logs_text
|
| 623 |
# success -> create temp file for download
|
|
|
|
| 631 |
def on_repair_from_paste(manual_text):
|
| 632 |
if not manual_text:
|
| 633 |
return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
|
|
|
|
| 634 |
ui_log = ["Repair-from-paste initiated"]
|
| 635 |
repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
|
| 636 |
logs_text = "\n".join(repaired.get("log", ui_log))
|
|
|
|
| 639 |
if repaired.get("_parsing_error"):
|
| 640 |
display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 641 |
return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
|
|
|
|
| 642 |
metadata = repaired.get("metadata")
|
| 643 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 644 |
with open(tmpf.name, "w", encoding="utf8") as f:
|