Spaces:

Toulik
/

MagicFile

Sleeping

App Files Files Community

Toulik committed on Sep 19, 2025

Commit

4515495

verified ·

1 Parent(s): 55e1313

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -22

app.py CHANGED Viewed

@@ -219,29 +219,58 @@ def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
     return True, parsed, ""
 # -----------------------
-# LLM interactions (metadata, repair, autocomplete)
 # -----------------------
-def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
-    log.append("Preparing prompt for metadata generation")
     system_msg = (
-        "You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
         "Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
     )
     prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
     prompt_chunks = ""
     for i, c in enumerate(top_chunks[:6]):
         chunk_text_clean = c[:800].replace("\n", " ")
         prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
     prompt_end = (
         "Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
-        "tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp. "
-        "Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>."
     )
-    messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt_intro + prompt_chunks + prompt_end}]
     last_raw = None
     for attempt in range(1, max_attempts + 1):
-        log.append(f"Calling OpenAI (attempt {attempt})")
         try:
             resp = client.chat.completions.create(
                 model=LLM_MODEL,
@@ -249,10 +278,10 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
                 max_completion_tokens=MAX_COMPLETION_TOKENS,
             )
         except Exception as e:
-            log.append(f"OpenAI API call failed: {e}")
             return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
-        # extract full model response text for UI logs
         try:
             full_text = resp.choices[0].message["content"].strip()
         except Exception:
@@ -261,39 +290,87 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], l
             except Exception:
                 full_text = str(resp)
         last_raw = full_text
-        log.append("OpenAI response received (raw length: " + str(len(full_text)) + ")")
-        # attempt to extract JSON
         json_text = extract_json_from_text(full_text)
         if not json_text:
             log.append("No JSON found in response")
             if attempt < max_attempts:
                 messages = [
                     {"role": "system", "content": system_msg},
-                    {"role": "user", "content": "Previous response lacked JSON markers. Return only JSON between <<BEGIN_JSON>> and <<END_JSON>>."},
                 ]
                 continue
             else:
                 return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
         ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
         if ok:
             log.append("JSON parsed and validated successfully")
-            # attach model raw response as well
             return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
         else:
             log.append(f"JSON parsed but schema validation failed: {parse_err}")
             if attempt < max_attempts:
                 messages = [
                     {"role": "system", "content": system_msg},
-                    {"role": "user", "content": "The JSON you returned is invalid vs schema. Return corrected JSON only between markers."},
                 ]
                 continue
             else:
                 return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
-    return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
 def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
     log.append("Starting repair flow")
     # if manual JSON pasted by user, try parse+validate directly
@@ -450,7 +527,7 @@ def process_file(file_obj):
     short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
     # Primary LLM call
-    result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=2)
     # If API error
     if result.get("_api_error"):
@@ -511,13 +588,13 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
             status = gr.Textbox(label="Status", value="", interactive=False)
             download_button = gr.File(label="Download metadata JSON", visible=False)
             gr.Markdown("### Manual repair (paste raw LLM output if needed)")
-            manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
             repair_from_paste_btn = gr.Button("Repair from pasted raw output")
             repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
         with gr.Column(scale=1):
             output_json = gr.JSON(label="Metadata JSON (parsed)")
             raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
-            logs_box = gr.Textbox(label="Step-by-step logs", lines=12, interactive=False)
     # state holders
     last_raw_state = gr.State(value=None)    # store last raw model response
@@ -541,7 +618,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
             display = {"error": result.get("error")}
             if parsed_partial is not None:
                 display["parsed_partial"] = parsed_partial
-            # put logs and raw_response into UI
             logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
             return display, f"Error: {result.get('error')}", None, raw_out, logs_text
         # success -> create temp file for download
@@ -555,7 +631,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
     def on_repair_from_paste(manual_text):
         if not manual_text:
             return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
-        # try repair using model (or direct parse)
         ui_log = ["Repair-from-paste initiated"]
         repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
         logs_text = "\n".join(repaired.get("log", ui_log))
@@ -564,7 +639,6 @@ with gr.Blocks(title="DocClassify — Final Robust") as demo:
         if repaired.get("_parsing_error"):
             display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
             return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
-        # success
         metadata = repaired.get("metadata")
         tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
         with open(tmpf.name, "w", encoding="utf8") as f:

     return True, parsed, ""
 # -----------------------
+# Improved call_gpt5_for_metadata (prevents tool invocation; includes example; retries with document_text)
 # -----------------------
+def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 3):
+    """
+    Robust metadata generation:
+    - Prevents tool invocation by instruction
+    - Includes example JSON
+    - Retries with explicit document_text if model returns tool-like MISSING_INPUT objects
+    - Logs full model response
+    """
     system_msg = (
+        "You are an assistant that must PRODUCE a JSON metadata object for the uploaded document. "
+        "Do NOT attempt to call any external APIs or tools. Do NOT return status/error objects from other services. "
         "Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
     )
+    example_json = {
+        "doc_id": "example_001",
+        "title": "Example Title",
+        "summary": "Short summary of the document in 1-2 sentences.",
+        "doc_type": "architecture_comparison",
+        "source": "user_upload",
+        "tags": ["arch:docai", "topic:ocr-parsing"],
+        "tag_confidences": {"arch:docai": 0.95, "topic:ocr-parsing": 0.9},
+        "taxonomy_path": ["Technology", "Document Processing", "OCR & Parsing"],
+        "extracted_entities": {"platforms": ["GCP", "BigQuery"], "tools": ["DocAI"]},
+        "raw_url": "",
+        "ingest_timestamp": "2025-09-19T09:13:00+05:30"
+    }
+    example_block = "Example JSON (use this schema, but fill with values from the document):\n<<BEGIN_JSON>>\n" + json.dumps(example_json, ensure_ascii=False, indent=2) + "\n<<END_JSON>>\n\n"
     prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
     prompt_chunks = ""
     for i, c in enumerate(top_chunks[:6]):
         chunk_text_clean = c[:800].replace("\n", " ")
         prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
     prompt_end = (
         "Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
+        "tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp.\n"
+        "Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Do not add any commentary."
     )
+    messages = [
+        {"role": "system", "content": system_msg},
+        {"role": "user", "content": example_block + prompt_intro + prompt_chunks + prompt_end},
+    ]
     last_raw = None
     for attempt in range(1, max_attempts + 1):
+        log.append(f"OpenAI call attempt {attempt}")
         try:
             resp = client.chat.completions.create(
                 model=LLM_MODEL,
                 max_completion_tokens=MAX_COMPLETION_TOKENS,
             )
         except Exception as e:
+            log.append(f"OpenAI API call failed on attempt {attempt}: {e}")
             return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
+        # capture full model response text for UI logs
         try:
             full_text = resp.choices[0].message["content"].strip()
         except Exception:
             except Exception:
                 full_text = str(resp)
         last_raw = full_text
+        log.append(f"OpenAI response received (len={len(full_text)})")
+        log.append("---- FULL MODEL RESPONSE START ----")
+        log.append(full_text)
+        log.append("---- FULL MODEL RESPONSE END ----")
+        # If model returned empty, retry with explicit document_text included
+        if not full_text:
+            log.append("Model returned empty response — will retry with explicit document_text provided.")
+            if attempt < max_attempts:
+                messages = [
+                    {"role": "system", "content": system_msg},
+                    {"role": "user", "content": example_block + "Providing document_text to avoid missing-input errors.\n\ndocument_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
+                ]
+                continue
+            else:
+                return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
+        # Try to extract JSON
         json_text = extract_json_from_text(full_text)
         if not json_text:
+            # try to detect a tool-like error object in the raw response
+            try:
+                maybe_obj = json.loads(full_text)
+                if isinstance(maybe_obj, dict) and any("document" in str(v).lower() or "missing_input" in str(v).lower() for v in maybe_obj.values()):
+                    log.append("Model returned an error-like dict referencing 'document' or 'missing_input'. Retrying with explicit document_text.")
+                    if attempt < max_attempts:
+                        messages = [
+                            {"role": "system", "content": system_msg},
+                            {"role": "user", "content": example_block + "The model output looked like an error requiring a 'document_text' parameter. "
+                                + "Provide the document_text here explicitly and return the metadata JSON.\n\n"
+                                + "document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
+                        ]
+                        continue
+                    else:
+                        return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
+            except Exception:
+                pass
             log.append("No JSON found in response")
             if attempt < max_attempts:
                 messages = [
                     {"role": "system", "content": system_msg},
+                    {"role": "user", "content": "Previous response lacked a JSON block. Return ONLY the JSON between <<BEGIN_JSON>> and <<END_JSON>>. Use the example format."}
                 ]
                 continue
             else:
                 return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
+        # Validate JSON
         ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
         if ok:
             log.append("JSON parsed and validated successfully")
             return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
         else:
             log.append(f"JSON parsed but schema validation failed: {parse_err}")
+            # If parsed JSON is a tool-style error, retry with explicit document_text
+            if isinstance(parsed_or_partial, dict) and parsed_or_partial.get("status") == "error" and ("MISSING_INPUT" in str(parsed_or_partial.get("error_code", "")).upper() or "document" in str(parsed_or_partial.get("message", "")).lower()):
+                log.append("Detected tool-like MISSING_INPUT response inside JSON. Retrying with explicit document_text.")
+                if attempt < max_attempts:
+                    messages = [
+                        {"role": "system", "content": system_msg},
+                        {"role": "user", "content": example_block + "The previous response contained an error object asking for document_text. "
+                            + "Please produce the metadata JSON now. document_text: " + short_text + "\n\n" + prompt_chunks + prompt_end}
+                    ]
+                    continue
+                else:
+                    return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
             if attempt < max_attempts:
                 messages = [
                     {"role": "system", "content": system_msg},
+                    {"role": "user", "content": "Your JSON is invalid vs schema. Return corrected JSON only between markers, using the example format."}
                 ]
                 continue
             else:
                 return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
+    return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": last_raw}
+# -----------------------
+# Other LLM helpers: repair + auto-complete (same as before)
+# -----------------------
 def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
     log.append("Starting repair flow")
     # if manual JSON pasted by user, try parse+validate directly
     short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
     # Primary LLM call
+    result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=3)
     # If API error
     if result.get("_api_error"):
             status = gr.Textbox(label="Status", value="", interactive=False)
             download_button = gr.File(label="Download metadata JSON", visible=False)
             gr.Markdown("### Manual repair (paste raw LLM output if needed)")
+            manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response here if you need manual repair")
             repair_from_paste_btn = gr.Button("Repair from pasted raw output")
             repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
         with gr.Column(scale=1):
             output_json = gr.JSON(label="Metadata JSON (parsed)")
             raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
+            logs_box = gr.Textbox(label="Step-by-step logs", lines=18, interactive=False)
     # state holders
     last_raw_state = gr.State(value=None)    # store last raw model response
             display = {"error": result.get("error")}
             if parsed_partial is not None:
                 display["parsed_partial"] = parsed_partial
             logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
             return display, f"Error: {result.get('error')}", None, raw_out, logs_text
         # success -> create temp file for download
     def on_repair_from_paste(manual_text):
         if not manual_text:
             return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
         ui_log = ["Repair-from-paste initiated"]
         repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
         logs_text = "\n".join(repaired.get("log", ui_log))
         if repaired.get("_parsing_error"):
             display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
             return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
         metadata = repaired.get("metadata")
         tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
         with open(tmpf.name, "w", encoding="utf8") as f: