Update app.py
app.py
CHANGED
|
@@ -1,10 +1,16 @@
  # app.py
  """
- Gradio app
-
-
-
-
  gradio>=3.0
  PyMuPDF
  pytesseract
|
|
@@ -12,11 +18,11 @@ Requirements (add to requirements.txt for HF Space or local venv):
  openai>=1.0.0
  jsonschema

- System packages
  tesseract-ocr
  poppler-utils

- Put OPENAI_API_KEY into
  """

  import os
|
|
@@ -31,20 +37,18 @@ from PIL import Image
  import fitz  # PyMuPDF
  import pytesseract
  from jsonschema import validate as json_validate, ValidationError
-
- # new OpenAI client surface
  from openai import OpenAI

  # -----------------------
- # Config
  # -----------------------
  OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
  if not OPENAI_API_KEY:
-     raise RuntimeError("OPENAI_API_KEY not found in environment. Add to HF Space Secrets or env var.")

  client = OpenAI(api_key=OPENAI_API_KEY)

- LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")
  MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))

  # -----------------------
|
|
@@ -53,17 +57,8 @@ MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
  METADATA_SCHEMA = {
      "type": "object",
      "required": [
-         "doc_id",
-         "title",
-         "summary",
-         "doc_type",
-         "source",
-         "tags",
-         "tag_confidences",
-         "taxonomy_path",
-         "extracted_entities",
-         "raw_url",
-         "ingest_timestamp",
      ],
      "properties": {
          "doc_id": {"type": "string"},
|
|
@@ -82,33 +77,38 @@ METADATA_SCHEMA = {
  }

  # -----------------------
- #
  # -----------------------
- def extract_text_from_pdf(path: str) -> str:
      try:
          doc = fitz.open(path)
      except Exception as e:
          raise RuntimeError(f"Failed to open PDF: {e}")
-
      texts: List[str] = []
      for i in range(len(doc)):
          page = doc.load_page(i)
          txt = page.get_text("text").strip()
          if txt:
              texts.append(txt)
          else:
-
              pix = page.get_pixmap(dpi=200)
              with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
                  pix.save(tmp.name)
                  ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
              texts.append(ocr_text)
      return "\n\n".join(texts).strip()


- def extract_text_from_image(path: str) -> str:
      img = Image.open(path).convert("RGB")
-


  def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
|
|
@@ -127,17 +127,10 @@ def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
      return chunks

  # -----------------------
- #
  # -----------------------
- def save_uploaded_to_tmp(file_obj):
-     ""
-     Accepts common Gradio upload types:
-     - file-like (has .read())
-     - dict-like {"name": ..., "data": b'...'}
-     - path string
-     - objects with .name attribute pointing to a path (NamedString)
-     Returns (tmp_path, original_filename)
-     """
      # file-like
      if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
          try:
|
|
@@ -148,13 +141,13 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 148 |
suffix = os.path.splitext(name)[1] or ""
|
| 149 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 150 |
tmp.write(content)
|
|
|
|
| 151 |
return tmp.name, os.path.basename(name)
|
| 152 |
-
except Exception:
|
| 153 |
-
|
| 154 |
-
|
| 155 |
# dict-like
|
| 156 |
-
if isinstance(file_obj, dict):
|
| 157 |
-
|
| 158 |
data = file_obj["data"]
|
| 159 |
if isinstance(data, str):
|
| 160 |
data = data.encode("utf-8")
|
|
@@ -162,11 +155,14 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 162 |
suffix = os.path.splitext(name)[1] or ""
|
| 163 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 164 |
tmp.write(data)
|
|
|
|
| 165 |
return tmp.name, os.path.basename(name)
|
| 166 |
-
|
|
|
|
| 167 |
# path string
|
| 168 |
if isinstance(file_obj, str):
|
| 169 |
if os.path.exists(file_obj):
|
|
|
|
| 170 |
return file_obj, os.path.basename(file_obj)
|
| 171 |
try:
|
| 172 |
with open(file_obj, "rb") as f:
|
|
@@ -174,11 +170,11 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 174 |
suffix = os.path.splitext(file_obj)[1] or ""
|
| 175 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 176 |
tmp.write(data)
|
|
|
|
| 177 |
return tmp.name, os.path.basename(file_obj)
|
| 178 |
-
except Exception:
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
# object with .name attribute referencing existing path
|
| 182 |
name = getattr(file_obj, "name", None)
|
| 183 |
if name and isinstance(name, str):
|
| 184 |
try:
|
|
@@ -187,21 +183,16 @@ def save_uploaded_to_tmp(file_obj):
|
|
| 187 |
suffix = os.path.splitext(name)[1] or ""
|
| 188 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 189 |
tmp.write(data)
|
|
|
|
| 190 |
return tmp.name, os.path.basename(name)
|
| 191 |
-
except Exception:
|
| 192 |
-
|
| 193 |
-
|
| 194 |
raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
|
| 195 |
|
| 196 |
-
|
| 197 |
# -----------------------
|
| 198 |
-
# JSON extraction & validation
|
| 199 |
# -----------------------
|
| 200 |
def extract_json_from_text(text: str) -> str:
|
| 201 |
-
"""
|
| 202 |
-
Prefer explicit markers <<BEGIN_JSON>> ... <<END_JSON>>.
|
| 203 |
-
Otherwise try to get the last {...} block, then first {...} block.
|
| 204 |
-
"""
|
| 205 |
m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
|
| 206 |
if m:
|
| 207 |
return m.group(1).strip()
|
|
@@ -215,79 +206,42 @@ def extract_json_from_text(text: str) -> str:
|
|
| 215 |
|
| 216 |
|
| 217 |
def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
|
| 218 |
-
"""
|
| 219 |
-
Returns (ok, parsed_dict_or_none, error_message_or_empty)
|
| 220 |
-
"""
|
| 221 |
try:
|
| 222 |
parsed = json.loads(json_text)
|
| 223 |
except Exception as e:
|
| 224 |
return False, None, f"json.loads error: {e}"
|
| 225 |
-
|
| 226 |
try:
|
| 227 |
json_validate(parsed, METADATA_SCHEMA)
|
| 228 |
except ValidationError as e:
|
| 229 |
return False, parsed, f"schema validation error: {e}"
|
| 230 |
except Exception as e:
|
| 231 |
-
# other validation errors
|
| 232 |
return False, parsed, f"schema validation unexpected error: {e}"
|
| 233 |
-
|
| 234 |
return True, parsed, ""
|
| 235 |
|
| 236 |
-
|
| 237 |
# -----------------------
|
| 238 |
-
# LLM
|
| 239 |
# -----------------------
|
| 240 |
-
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], max_attempts: int =
|
| 241 |
-
""
|
| 242 |
-
Robust LLM call:
|
| 243 |
-
- uses system message to enforce JSON-only output between markers
|
| 244 |
-
- retries up to max_attempts
|
| 245 |
-
- if model returns partial/invalid JSON, asks model to repair it
|
| 246 |
-
- validates the JSON against METADATA_SCHEMA
|
| 247 |
-
Returns:
|
| 248 |
-
- valid metadata dict OR dict with keys like _parsing_error/raw_output for UI consumption
|
| 249 |
-
"""
|
| 250 |
system_msg = (
|
| 251 |
"You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
|
| 252 |
-
"
|
| 253 |
-
"Wrap the JSON in explicit markers: <<BEGIN_JSON>> and <<END_JSON>>. "
|
| 254 |
-
"Do not include any commentary, explanation, or text outside those markers."
|
| 255 |
-
)
|
| 256 |
-
|
| 257 |
-
prompt_intro = (
|
| 258 |
-
f"Document title: {title}\n\n"
|
| 259 |
-
f"Short document text (first ~1000 chars): {short_text}\n\n"
|
| 260 |
-
"Top content chunks (short):\n"
|
| 261 |
)
|
| 262 |
-
|
| 263 |
prompt_chunks = ""
|
| 264 |
for i, c in enumerate(top_chunks[:6]):
|
| 265 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 266 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
| 267 |
-
|
| 268 |
prompt_end = (
|
| 269 |
-
"Task: Produce a
|
| 270 |
-
"
|
| 271 |
-
"
|
| 272 |
-
"Guidelines:\n"
|
| 273 |
-
"- summary: 1-2 sentences.\n"
|
| 274 |
-
"- doc_type: short enum-like string (e.g., architecture_comparison).\n"
|
| 275 |
-
"- tags: up to 8 short tags like arch:docai.\n"
|
| 276 |
-
"- tag_confidences: floats 0-1 for each tag.\n"
|
| 277 |
-
"- taxonomy_path: hierarchical list.\n\n"
|
| 278 |
-
"Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>.\n"
|
| 279 |
)
|
| 280 |
-
|
| 281 |
-
user_prompt = prompt_intro + prompt_chunks + prompt_end
|
| 282 |
-
|
| 283 |
-
messages = [
|
| 284 |
-
{"role": "system", "content": system_msg},
|
| 285 |
-
{"role": "user", "content": user_prompt},
|
| 286 |
-
]
|
| 287 |
-
|
| 288 |
last_raw = None
|
| 289 |
|
| 290 |
for attempt in range(1, max_attempts + 1):
|
|
|
|
| 291 |
try:
|
| 292 |
resp = client.chat.completions.create(
|
| 293 |
model=LLM_MODEL,
|
|
@@ -295,147 +249,79 @@ def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], m
|
|
| 295 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 296 |
)
|
| 297 |
except Exception as e:
|
| 298 |
-
|
|
|
|
| 299 |
|
| 300 |
-
# extract text
|
| 301 |
try:
|
| 302 |
-
|
| 303 |
except Exception:
|
| 304 |
try:
|
| 305 |
-
|
| 306 |
except Exception:
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
|
| 311 |
-
#
|
| 312 |
-
json_text = extract_json_from_text(
|
| 313 |
if not json_text:
|
| 314 |
-
|
| 315 |
if attempt < max_attempts:
|
| 316 |
-
fix_prompt = (
|
| 317 |
-
"The previous response did not include a JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> markers, "
|
| 318 |
-
"or returned invalid JSON. Here is the raw output:\n\n"
|
| 319 |
-
f"{text}\n\n"
|
| 320 |
-
"Please return ONLY a valid JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>>. "
|
| 321 |
-
"Do not include anything else."
|
| 322 |
-
)
|
| 323 |
messages = [
|
| 324 |
{"role": "system", "content": system_msg},
|
| 325 |
-
{"role": "user", "content":
|
| 326 |
]
|
| 327 |
continue
|
| 328 |
else:
|
| 329 |
-
return {"_parsing_error": True, "raw_output": last_raw, "
|
| 330 |
|
| 331 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 332 |
if ok:
|
| 333 |
-
|
|
|
|
|
|
|
| 334 |
else:
|
| 335 |
-
|
| 336 |
if attempt < max_attempts:
|
| 337 |
-
repair_prompt = (
|
| 338 |
-
"The JSON you returned is invalid or does not meet the schema. Here is the JSON you returned:\n\n"
|
| 339 |
-
f"{json_text}\n\n"
|
| 340 |
-
"Please return ONLY a corrected JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> that includes the required keys: "
|
| 341 |
-
"doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 342 |
-
"If you must guess missing fields, use reasonable defaults (empty string or empty list/map)."
|
| 343 |
-
)
|
| 344 |
messages = [
|
| 345 |
{"role": "system", "content": system_msg},
|
| 346 |
-
{"role": "user", "content":
|
| 347 |
]
|
| 348 |
continue
|
| 349 |
else:
|
| 350 |
-
return {
|
| 351 |
-
"_parsing_error": True,
|
| 352 |
-
"raw_output": last_raw,
|
| 353 |
-
"parsed_partial": parsed_or_partial,
|
| 354 |
-
"parse_error": parse_err,
|
| 355 |
-
}
|
| 356 |
|
| 357 |
-
return {"_parsing_error": True, "raw_output": last_raw
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
#
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
|
| 368 |
-
|
| 369 |
-
# extract text
|
| 370 |
-
try:
|
| 371 |
-
if orig_name.lower().endswith(".pdf"):
|
| 372 |
-
extracted_text = extract_text_from_pdf(tmp_path)
|
| 373 |
else:
|
| 374 |
-
|
| 375 |
-
|
| 376 |
-
return {"error": f"Text extraction failed: {e}"}
|
| 377 |
-
|
| 378 |
-
if not extracted_text:
|
| 379 |
-
return {"error": "No text found in document after extraction."}
|
| 380 |
-
|
| 381 |
-
chunks = chunk_text(extracted_text)
|
| 382 |
-
sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
|
| 383 |
-
top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
|
| 384 |
-
|
| 385 |
-
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 386 |
-
|
| 387 |
-
metadata = call_gpt5_for_metadata(orig_name, short_text, top_chunks, max_attempts=3)
|
| 388 |
-
|
| 389 |
-
# If API error
|
| 390 |
-
if metadata.get("_api_error"):
|
| 391 |
-
return {"error": metadata.get("error")}
|
| 392 |
-
|
| 393 |
-
# If parsing/validation error, include raw_output so UI can show & repair
|
| 394 |
-
if metadata.get("_parsing_error"):
|
| 395 |
-
return {
|
| 396 |
-
"error": "LLM output parsing failed. See raw_output.",
|
| 397 |
-
"raw_output": metadata.get("raw_output"),
|
| 398 |
-
"parsed_partial": metadata.get("parsed_partial"),
|
| 399 |
-
"parse_error": metadata.get("parse_error"),
|
| 400 |
-
}
|
| 401 |
-
|
| 402 |
-
# Ensure minimal keys and timestamp
|
| 403 |
-
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 404 |
-
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 405 |
-
metadata.setdefault("title", orig_name)
|
| 406 |
-
metadata.setdefault("source", "user_upload")
|
| 407 |
-
metadata.setdefault("raw_url", "")
|
| 408 |
-
metadata.setdefault("ingest_timestamp", now)
|
| 409 |
-
|
| 410 |
-
return metadata
|
| 411 |
|
| 412 |
-
|
| 413 |
-
# -----------------------
|
| 414 |
-
# Repair-only function (user-triggered) - repair raw_output into valid JSON
|
| 415 |
-
# -----------------------
|
| 416 |
-
def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
|
| 417 |
-
"""
|
| 418 |
-
Send the raw output back to the model and ask for corrected JSON between markers.
|
| 419 |
-
This function is useful if the initial parsing failed and you want a manual 'Repair' button in UI.
|
| 420 |
-
"""
|
| 421 |
system_msg = (
|
| 422 |
-
"You are an
|
| 423 |
-
"
|
| 424 |
-
"<<BEGIN_JSON>> and <<END_JSON>>. Do NOT include any other text."
|
| 425 |
)
|
| 426 |
-
|
| 427 |
repair_prompt = (
|
| 428 |
-
"Here is the raw output
|
| 429 |
-
|
| 430 |
-
"
|
| 431 |
-
"Ensure the object contains keys: doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 432 |
-
"If a field is missing, use a reasonable default (empty string, empty list, or empty map)."
|
| 433 |
)
|
| 434 |
-
|
| 435 |
-
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt}]
|
| 436 |
-
|
| 437 |
last_raw = None
|
| 438 |
for attempt in range(1, max_attempts + 1):
|
|
|
|
| 439 |
try:
|
| 440 |
resp = client.chat.completions.create(
|
| 441 |
model=LLM_MODEL,
|
|
@@ -443,116 +329,268 @@ def repair_raw_output(raw_output: str, max_attempts: int = 2) -> Dict[str, Any]:
|
|
| 443 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 444 |
)
|
| 445 |
except Exception as e:
|
| 446 |
-
|
| 447 |
-
|
| 448 |
try:
|
| 449 |
-
|
| 450 |
except Exception:
|
| 451 |
try:
|
| 452 |
-
|
| 453 |
except Exception:
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
json_text = extract_json_from_text(
|
| 458 |
if not json_text:
|
|
|
|
| 459 |
if attempt < max_attempts:
|
| 460 |
-
messages = [
|
| 461 |
-
{"role": "system", "content": system_msg},
|
| 462 |
-
{"role": "user", "content": "Your previous reply did not include a JSON block. Please return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 463 |
-
]
|
| 464 |
continue
|
| 465 |
else:
|
| 466 |
-
return {"_parsing_error": True, "raw_output": last_raw, "
|
| 467 |
-
|
| 468 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 469 |
if ok:
|
| 470 |
-
|
|
|
|
| 471 |
else:
|
|
|
|
| 472 |
if attempt < max_attempts:
|
| 473 |
-
messages = [
|
| 474 |
-
{"role": "system", "content": system_msg},
|
| 475 |
-
{"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 476 |
-
]
|
| 477 |
continue
|
| 478 |
else:
|
| 479 |
-
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err}
|
|
|
|
| 480 |
|
| 481 |
-
|
|
|
|
|
|
|
|
|
|
| 482 |
|
| 483 |
# -----------------------
|
| 484 |
# Gradio UI
|
| 485 |
# -----------------------
|
| 486 |
-
with gr.Blocks(title="DocClassify —
|
| 487 |
-
gr.Markdown("## 📂 Upload
|
| 488 |
with gr.Row():
|
| 489 |
with gr.Column(scale=1):
|
| 490 |
uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
|
| 491 |
run_button = gr.Button("Process document")
|
| 492 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 493 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 494 |
-
|
|
|
|
|
|
|
|
|
|
| 495 |
with gr.Column(scale=1):
|
| 496 |
-
output_json = gr.JSON(label="
|
| 497 |
-
raw_output_box = gr.Textbox(label="
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
|
|
|
| 507 |
try:
|
| 508 |
result = process_file(file_obj)
|
| 509 |
except Exception as e:
|
| 510 |
-
return
|
| 511 |
-
|
|
|
|
|
|
|
| 512 |
if result.get("error"):
|
| 513 |
-
#
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
if
|
| 518 |
-
|
| 519 |
-
#
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
# success
|
|
|
|
| 523 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 524 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 525 |
-
json.dump(
|
| 526 |
-
|
| 527 |
-
return
|
| 528 |
-
|
| 529 |
-
def
|
| 530 |
-
if not
|
| 531 |
-
return {}, "No
|
| 532 |
-
try
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
if repaired.get("_api_error"):
|
| 538 |
-
return {}, f"Repair API error: {repaired.get('error')}", None
|
| 539 |
-
|
| 540 |
if repaired.get("_parsing_error"):
|
| 541 |
-
|
| 542 |
-
display
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
# success -> create download file
|
| 546 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 547 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 548 |
-
json.dump(
|
| 549 |
-
|
| 550 |
-
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
| 553 |
-
|
| 554 |
-
|
| 555 |
|
| 556 |
-
# launch
|
| 557 |
if __name__ == "__main__":
|
| 558 |
demo.launch()
|
|
|
|
  1 |   # app.py
  2 |   """
  3 | + Final Gradio app — robust document tagging + automated taxonomy via GPT-5 (OpenAI new client).
  4 | + Features:
  5 | + - Upload PDF or Image
  6 | + - Extract text (PyMuPDF + Tesseract fallback)
  7 | + - Chunk text, call GPT-5 to produce JSON metadata between markers <<BEGIN_JSON>><<END_JSON>>
  8 | + - Validate JSON with jsonschema
  9 | + - Automatic repair attempts + manual-repair (paste raw output)
 10 | + - Detailed step-by-step logs displayed on the UI and full GPT response shown
 11 | + - Download metadata JSON on success
 12 | +
 13 | + Requirements (requirements.txt):
 14 |   gradio>=3.0
 15 |   PyMuPDF
 16 |   pytesseract
 18 |   openai>=1.0.0
 19 |   jsonschema
 20 |
 21 | + System packages (apt-packages for HF Spaces):
 22 |   tesseract-ocr
 23 |   poppler-utils
 24 |
 25 | + Put OPENAI_API_KEY into HF Space Secrets or environment.
 26 |   """
 27 |
 28 |   import os
 37 |   import fitz  # PyMuPDF
 38 |   import pytesseract
 39 |   from jsonschema import validate as json_validate, ValidationError
 40 |   from openai import OpenAI
 41 |
 42 |   # -----------------------
 43 | + # Config & OpenAI client
 44 |   # -----------------------
 45 |   OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
 46 |   if not OPENAI_API_KEY:
 47 | +     raise RuntimeError("OPENAI_API_KEY not found in environment. Add it to HF Space Secrets or env var.")
 48 |
 49 |   client = OpenAI(api_key=OPENAI_API_KEY)
 50 |
 51 | + LLM_MODEL = os.getenv("OPENAI_MODEL", "gpt-5")  # change if needed
 52 |   MAX_COMPLETION_TOKENS = int(os.getenv("MAX_COMPLETION_TOKENS", "1500"))
 53 |
 54 |   # -----------------------
 57 |   METADATA_SCHEMA = {
 58 |       "type": "object",
 59 |       "required": [
 60 | +         "doc_id", "title", "summary", "doc_type", "source", "tags",
 61 | +         "tag_confidences", "taxonomy_path", "extracted_entities", "raw_url", "ingest_timestamp"
 62 |       ],
 63 |       "properties": {
 64 |           "doc_id": {"type": "string"},
 77 |   }
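For reference, a minimal object that carries every key in the "required" list above might look like the sketch below. The values are made up, and the property types other than doc_id are assumed from how the prompts in this file describe them (tags as a list of strings, tag_confidences as a tag-to-float map, taxonomy_path as a list, extracted_entities as a map), since the full "properties" block is collapsed in the diff.

```python
# Illustrative only: a metadata dict with every required key from METADATA_SCHEMA.
# Values are made up; types beyond doc_id are assumptions drawn from the prompt text.
import datetime
from jsonschema import validate

sample_metadata = {
    "doc_id": "example_doc",
    "title": "example_doc.pdf",
    "summary": "One or two sentences describing the document.",
    "doc_type": "architecture_comparison",
    "source": "user_upload",
    "tags": ["arch:docai"],
    "tag_confidences": {"arch:docai": 0.9},
    "taxonomy_path": ["architecture", "document_ai"],
    "extracted_entities": {},
    "raw_url": "",
    "ingest_timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
}

validate(sample_metadata, METADATA_SCHEMA)  # raises jsonschema.ValidationError on a bad shape
```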
|
| 78 |
|
| 79 |
# -----------------------
|
| 80 |
+
# Helpers: extraction & chunking
|
| 81 |
# -----------------------
|
| 82 |
+
def extract_text_from_pdf(path: str, log: List[str]) -> str:
|
| 83 |
+
log.append(f"Opening PDF: {path}")
|
| 84 |
try:
|
| 85 |
doc = fitz.open(path)
|
| 86 |
except Exception as e:
|
| 87 |
raise RuntimeError(f"Failed to open PDF: {e}")
|
|
|
|
| 88 |
texts: List[str] = []
|
| 89 |
for i in range(len(doc)):
|
| 90 |
page = doc.load_page(i)
|
| 91 |
txt = page.get_text("text").strip()
|
| 92 |
if txt:
|
| 93 |
+
log.append(f"Page {i+1}: text extracted ({len(txt)} chars)")
|
| 94 |
texts.append(txt)
|
| 95 |
else:
|
| 96 |
+
log.append(f"Page {i+1}: no text found, performing OCR fallback")
|
| 97 |
pix = page.get_pixmap(dpi=200)
|
| 98 |
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
|
| 99 |
pix.save(tmp.name)
|
| 100 |
ocr_text = pytesseract.image_to_string(Image.open(tmp.name))
|
| 101 |
+
log.append(f"Page {i+1}: OCR extracted ({len(ocr_text)} chars)")
|
| 102 |
texts.append(ocr_text)
|
| 103 |
return "\n\n".join(texts).strip()
|
| 104 |
|
| 105 |
|
| 106 |
+
def extract_text_from_image(path: str, log: List[str]) -> str:
|
| 107 |
+
log.append(f"OCR on image: {path}")
|
| 108 |
img = Image.open(path).convert("RGB")
|
| 109 |
+
txt = pytesseract.image_to_string(img).strip()
|
| 110 |
+
log.append(f"OCR extracted ({len(txt)} chars)")
|
| 111 |
+
return txt
|
| 112 |
|
| 113 |
|
| 114 |
def chunk_text(text: str, max_chars: int = 3000) -> List[str]:
|
|
|
|
| 127 |
return chunks
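The body of chunk_text is collapsed in this diff, so only its signature and final return are visible. A minimal character-window splitter with the same signature could look like the sketch below; it is an assumption for illustration, not the code from this commit.

```python
# Not the code from this commit: chunk_text's body is collapsed in the diff above.
# A minimal character-window splitter with the same signature might look like this.
from typing import List

def chunk_text_sketch(text: str, max_chars: int = 3000) -> List[str]:
    chunks: List[str] = []
    for start in range(0, len(text), max_chars):
        chunks.append(text[start:start + max_chars])
    return chunks
```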
|
| 128 |
|
| 129 |
# -----------------------
|
| 130 |
+
# Upload handling
|
| 131 |
# -----------------------
|
| 132 |
+
def save_uploaded_to_tmp(file_obj, log: List[str]):
|
| 133 |
+
log.append(f"Saving uploaded object of type {type(file_obj)}")
|
|
|
|
|
| 134 |
# file-like
|
| 135 |
if hasattr(file_obj, "read") and callable(getattr(file_obj, "read")):
|
| 136 |
try:
|
|
|
|
| 141 |
suffix = os.path.splitext(name)[1] or ""
|
| 142 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 143 |
tmp.write(content)
|
| 144 |
+
log.append(f"Saved uploaded file-like as {tmp.name}")
|
| 145 |
return tmp.name, os.path.basename(name)
|
| 146 |
+
except Exception as e:
|
| 147 |
+
log.append(f"file-like save failed: {e}")
|
|
|
|
| 148 |
# dict-like
|
| 149 |
+
if isinstance(file_obj, dict) and "data" in file_obj and "name" in file_obj:
|
| 150 |
+
try:
|
| 151 |
data = file_obj["data"]
|
| 152 |
if isinstance(data, str):
|
| 153 |
data = data.encode("utf-8")
|
|
|
|
| 155 |
suffix = os.path.splitext(name)[1] or ""
|
| 156 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 157 |
tmp.write(data)
|
| 158 |
+
log.append(f"Saved dict-like upload as {tmp.name}")
|
| 159 |
return tmp.name, os.path.basename(name)
|
| 160 |
+
except Exception as e:
|
| 161 |
+
log.append(f"dict-like save failed: {e}")
|
| 162 |
# path string
|
| 163 |
if isinstance(file_obj, str):
|
| 164 |
if os.path.exists(file_obj):
|
| 165 |
+
log.append(f"Upload was path string existing on disk: {file_obj}")
|
| 166 |
return file_obj, os.path.basename(file_obj)
|
| 167 |
try:
|
| 168 |
with open(file_obj, "rb") as f:
|
|
|
|
| 170 |
suffix = os.path.splitext(file_obj)[1] or ""
|
| 171 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 172 |
tmp.write(data)
|
| 173 |
+
log.append(f"Copied path-string file to {tmp.name}")
|
| 174 |
return tmp.name, os.path.basename(file_obj)
|
| 175 |
+
except Exception as e:
|
| 176 |
+
log.append(f"path-string handling failed: {e}")
|
| 177 |
+
# object with .name attr
|
|
|
|
| 178 |
name = getattr(file_obj, "name", None)
|
| 179 |
if name and isinstance(name, str):
|
| 180 |
try:
|
|
|
|
| 183 |
suffix = os.path.splitext(name)[1] or ""
|
| 184 |
with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
|
| 185 |
tmp.write(data)
|
| 186 |
+
log.append(f"Saved file from .name attr to {tmp.name}")
|
| 187 |
return tmp.name, os.path.basename(name)
|
| 188 |
+
except Exception as e:
|
| 189 |
+
log.append(f".name-based save failed: {e}")
|
|
|
|
| 190 |
raise ValueError(f"Unsupported uploaded file object type: {type(file_obj)}. repr: {repr(file_obj)[:400]}")
|
| 191 |
|
|
|
|
| 192 |
# -----------------------
|
| 193 |
+
# JSON extraction & validation
|
| 194 |
# -----------------------
|
| 195 |
def extract_json_from_text(text: str) -> str:
|
|
|
|
|
|
| 196 |
m = re.search(r"<<BEGIN_JSON>>(.*?)<<END_JSON>>", text, re.DOTALL)
|
| 197 |
if m:
|
| 198 |
return m.group(1).strip()
|
|
|
|
| 206 |
|
| 207 |
|
| 208 |
def try_parse_and_validate(json_text: str) -> (bool, Dict[str, Any], str):
|
|
|
|
|
| 209 |
try:
|
| 210 |
parsed = json.loads(json_text)
|
| 211 |
except Exception as e:
|
| 212 |
return False, None, f"json.loads error: {e}"
|
|
|
|
| 213 |
try:
|
| 214 |
json_validate(parsed, METADATA_SCHEMA)
|
| 215 |
except ValidationError as e:
|
| 216 |
return False, parsed, f"schema validation error: {e}"
|
| 217 |
except Exception as e:
|
|
|
|
| 218 |
return False, parsed, f"schema validation unexpected error: {e}"
|
|
|
|
| 219 |
return True, parsed, ""
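As a quick sanity check of the marker-based flow, the snippet below runs a made-up reply through extract_json_from_text and try_parse_and_validate; the payload is deliberately incomplete to show the situation the retry and repair branches are written for.

```python
# Hypothetical model reply, used only to illustrate extraction + validation.
reply = (
    "Some preamble the model should not have written.\n"
    "<<BEGIN_JSON>>\n"
    '{"doc_id": "example_doc", "title": "example_doc.pdf"}\n'
    "<<END_JSON>>"
)

json_text = extract_json_from_text(reply)           # the {...} between the markers
ok, parsed, err = try_parse_and_validate(json_text)
# ok is False here: the fragment parses as JSON but lacks required schema keys,
# which is exactly the case the retry/repair prompts in this file handle.
```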
|
| 220 |
|
|
|
|
| 221 |
# -----------------------
|
| 222 |
+
# LLM interactions (metadata, repair, autocomplete)
|
| 223 |
# -----------------------
|
| 224 |
+
def call_gpt5_for_metadata(title: str, short_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
|
| 225 |
+
log.append("Preparing prompt for metadata generation")
|
|
|
|
|
| 226 |
system_msg = (
|
| 227 |
"You are an automated document taxonomy and tagging assistant for enterprise catalogs. "
|
| 228 |
+
"Return ONLY a JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
|
|
|
|
| 229 |
)
|
| 230 |
+
prompt_intro = f"Document title: {title}\n\nShort document text (first ~1000 chars): {short_text}\n\nTop content chunks:\n"
|
| 231 |
prompt_chunks = ""
|
| 232 |
for i, c in enumerate(top_chunks[:6]):
|
| 233 |
chunk_text_clean = c[:800].replace("\n", " ")
|
| 234 |
prompt_chunks += f"CHUNK_{i+1}: {chunk_text_clean}\n\n"
|
|
|
|
| 235 |
prompt_end = (
|
| 236 |
+
"Task: Produce a JSON object with EXACT keys: doc_id, title, summary, doc_type, source, tags (array of strings), "
|
| 237 |
+
"tag_confidences (map tag->float), taxonomy_path (array of strings), extracted_entities (map), raw_url, ingest_timestamp. "
|
| 238 |
+
"Output MUST be the JSON only, enclosed between <<BEGIN_JSON>> and <<END_JSON>>."
|
|
|
|
|
| 239 |
)
|
| 240 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt_intro + prompt_chunks + prompt_end}]
|
|
|
|
|
| 241 |
last_raw = None
|
| 242 |
|
| 243 |
for attempt in range(1, max_attempts + 1):
|
| 244 |
+
log.append(f"Calling OpenAI (attempt {attempt})")
|
| 245 |
try:
|
| 246 |
resp = client.chat.completions.create(
|
| 247 |
model=LLM_MODEL,
|
|
|
|
| 249 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 250 |
)
|
| 251 |
except Exception as e:
|
| 252 |
+
log.append(f"OpenAI API call failed: {e}")
|
| 253 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 254 |
|
| 255 |
+
# extract full model response text for UI logs
|
| 256 |
try:
|
| 257 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 258 |
except Exception:
|
| 259 |
try:
|
| 260 |
+
full_text = resp.choices[0].message.content.strip()
|
| 261 |
except Exception:
|
| 262 |
+
full_text = str(resp)
|
| 263 |
+
last_raw = full_text
|
| 264 |
+
log.append("OpenAI response received (raw length: " + str(len(full_text)) + ")")
|
| 265 |
|
| 266 |
+
# attempt to extract JSON
|
| 267 |
+
json_text = extract_json_from_text(full_text)
|
| 268 |
if not json_text:
|
| 269 |
+
log.append("No JSON found in response")
|
| 270 |
if attempt < max_attempts:
|
|
|
|
|
| 271 |
messages = [
|
| 272 |
{"role": "system", "content": system_msg},
|
| 273 |
+
{"role": "user", "content": "Previous response lacked JSON markers. Return only JSON between <<BEGIN_JSON>> and <<END_JSON>>."},
|
| 274 |
]
|
| 275 |
continue
|
| 276 |
else:
|
| 277 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 278 |
|
| 279 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 280 |
if ok:
|
| 281 |
+
log.append("JSON parsed and validated successfully")
|
| 282 |
+
# attach model raw response as well
|
| 283 |
+
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 284 |
else:
|
| 285 |
+
log.append(f"JSON parsed but schema validation failed: {parse_err}")
|
| 286 |
if attempt < max_attempts:
|
|
|
|
|
| 287 |
messages = [
|
| 288 |
{"role": "system", "content": system_msg},
|
| 289 |
+
{"role": "user", "content": "The JSON you returned is invalid vs schema. Return corrected JSON only between markers."},
|
| 290 |
]
|
| 291 |
continue
|
| 292 |
else:
|
| 293 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
|
|
|
|
| 294 |
|
| 295 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 296 |
|
| 297 |
+
def repair_raw_output(raw_output: str, manual_pasted_json: str, log: List[str], max_attempts: int = 2):
|
| 298 |
+
log.append("Starting repair flow")
|
| 299 |
+
# if manual JSON pasted by user, try parse+validate directly
|
| 300 |
+
if manual_pasted_json:
|
| 301 |
+
log.append("User provided manual pasted JSON — trying to parse and validate")
|
| 302 |
+
jtxt = extract_json_from_text(manual_pasted_json) or manual_pasted_json
|
| 303 |
+
ok, parsed, err = try_parse_and_validate(jtxt)
|
| 304 |
+
if ok:
|
| 305 |
+
log.append("Manual pasted JSON validated successfully")
|
| 306 |
+
return {"metadata": parsed, "log": log, "raw_response": manual_pasted_json}
|
|
|
|
|
| 307 |
else:
|
| 308 |
+
log.append(f"Manual pasted JSON validation failed: {err}")
|
| 309 |
+
return {"_parsing_error": True, "raw_output": manual_pasted_json, "parsed_partial": parsed, "parse_error": err, "log": log}
|
|
|
|
|
| 310 |
|
| 311 |
+
# otherwise instruct model to repair the raw_output
|
|
|
|
|
| 312 |
system_msg = (
|
| 313 |
+
"You are an assistant that must extract and/or correct a malformed JSON from the user's raw_output. "
|
| 314 |
+
"Return ONLY a corrected JSON object wrapped between <<BEGIN_JSON>> and <<END_JSON>> and nothing else."
|
|
|
|
| 315 |
)
|
|
|
|
| 316 |
repair_prompt = (
|
| 317 |
+
"Here is the raw output (possibly containing a malformed JSON). Extract and return a corrected JSON object "
|
| 318 |
+
"containing keys: doc_id,title,summary,doc_type,source,tags,tag_confidences,taxonomy_path,extracted_entities,raw_url,ingest_timestamp. "
|
| 319 |
+
"If fields are missing, use reasonable defaults (empty string, empty list or empty map)."
|
|
|
|
|
|
|
| 320 |
)
|
| 321 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": repair_prompt + "\n\nRaw output:\n\n" + (raw_output or "")}]
|
|
|
|
|
|
|
| 322 |
last_raw = None
|
| 323 |
for attempt in range(1, max_attempts + 1):
|
| 324 |
+
log.append(f"Repair attempt {attempt}")
|
| 325 |
try:
|
| 326 |
resp = client.chat.completions.create(
|
| 327 |
model=LLM_MODEL,
|
|
|
|
| 329 |
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 330 |
)
|
| 331 |
except Exception as e:
|
| 332 |
+
log.append(f"Repair API call failed: {e}")
|
| 333 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log, "raw_response": None}
|
| 334 |
try:
|
| 335 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 336 |
except Exception:
|
| 337 |
try:
|
| 338 |
+
full_text = resp.choices[0].message.content.strip()
|
| 339 |
except Exception:
|
| 340 |
+
full_text = str(resp)
|
| 341 |
+
last_raw = full_text
|
| 342 |
+
log.append("Repair model response received (raw length: " + str(len(full_text)) + ")")
|
| 343 |
+
json_text = extract_json_from_text(full_text)
|
| 344 |
if not json_text:
|
| 345 |
+
log.append("Repair response contained no JSON")
|
| 346 |
if attempt < max_attempts:
|
| 347 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Your previous reply did not include the JSON. Return ONLY the corrected JSON between markers."}]
|
|
|
|
|
| 348 |
continue
|
| 349 |
else:
|
| 350 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
|
|
|
| 351 |
ok, parsed_or_partial, parse_err = try_parse_and_validate(json_text)
|
| 352 |
if ok:
|
| 353 |
+
log.append("Repair produced valid JSON")
|
| 354 |
+
return {"metadata": parsed_or_partial, "log": log, "raw_response": full_text}
|
| 355 |
else:
|
| 356 |
+
log.append(f"Repair produced JSON but validation failed: {parse_err}")
|
| 357 |
if attempt < max_attempts:
|
| 358 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Your JSON is invalid. Please correct and return ONLY the corrected JSON between markers."}]
|
|
|
|
|
| 359 |
continue
|
| 360 |
else:
|
| 361 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 362 |
+
return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
|
| 363 |
|
| 364 |
+
def auto_complete_partial(parsed_partial: Dict[str, Any], orig_name: str, extracted_text: str, top_chunks: List[str], log: List[str], max_attempts: int = 2):
|
| 365 |
+
log.append("Starting auto-complete for parsed partial")
|
| 366 |
+
system_msg = (
|
| 367 |
+
"You are an assistant that must fill missing metadata fields for a document. "
|
| 368 |
+
"Return ONLY a single JSON object wrapped in <<BEGIN_JSON>> and <<END_JSON>> with the exact keys: "
|
| 369 |
+
"doc_id, title, summary, doc_type, source, tags, tag_confidences, taxonomy_path, extracted_entities, raw_url, ingest_timestamp. "
|
| 370 |
+
"If you cannot infer a value, use reasonable defaults."
|
| 371 |
+
)
|
| 372 |
+
partial_str = json.dumps(parsed_partial, ensure_ascii=False)
|
| 373 |
+
short_text = (extracted_text[:1200] + "...") if len(extracted_text) > 1200 else extracted_text
|
| 374 |
+
prompt = f"Original filename: {orig_name}\n\nPreviously parsed partial JSON:\n{partial_str}\n\nDocument short text:\n{short_text}\n\nTop chunks:\n"
|
| 375 |
+
for i, c in enumerate(top_chunks[:6]):
|
| 376 |
+
prompt += f"CHUNK_{i+1}: {c[:900].replace(chr(10), ' ')}\n\n"
|
| 377 |
+
prompt += ("Task: Fill any missing or empty fields in the JSON above using the document context. "
|
| 378 |
+
"Return ONLY the completed JSON wrapped between <<BEGIN_JSON>> and <<END_JSON>>.")
|
| 379 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": prompt}]
|
| 380 |
+
last_raw = None
|
| 381 |
+
for attempt in range(1, max_attempts + 1):
|
| 382 |
+
log.append(f"Auto-complete attempt {attempt}")
|
| 383 |
+
try:
|
| 384 |
+
resp = client.chat.completions.create(
|
| 385 |
+
model=LLM_MODEL,
|
| 386 |
+
messages=messages,
|
| 387 |
+
max_completion_tokens=MAX_COMPLETION_TOKENS,
|
| 388 |
+
)
|
| 389 |
+
except Exception as e:
|
| 390 |
+
log.append(f"Auto-complete API call failed: {e}")
|
| 391 |
+
return {"_api_error": True, "error": f"OpenAI API call failed: {e}", "log": log}
|
| 392 |
+
try:
|
| 393 |
+
full_text = resp.choices[0].message["content"].strip()
|
| 394 |
+
except Exception:
|
| 395 |
+
try:
|
| 396 |
+
full_text = resp.choices[0].message.content.strip()
|
| 397 |
+
except Exception:
|
| 398 |
+
full_text = str(resp)
|
| 399 |
+
last_raw = full_text
|
| 400 |
+
log.append("Auto-complete model response received")
|
| 401 |
+
json_text = extract_json_from_text(full_text)
|
| 402 |
+
if not json_text:
|
| 403 |
+
log.append("Auto-complete response had no JSON")
|
| 404 |
+
if attempt < max_attempts:
|
| 405 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "Return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."}]
|
| 406 |
+
continue
|
| 407 |
+
else:
|
| 408 |
+
return {"_parsing_error": True, "raw_output": last_raw, "log": log, "raw_response": full_text}
|
| 409 |
+
ok, parsed_or_partial2, parse_err = try_parse_and_validate(json_text)
|
| 410 |
+
if ok:
|
| 411 |
+
log.append("Auto-complete succeeded and validated")
|
| 412 |
+
return {"metadata": parsed_or_partial2, "log": log, "raw_response": full_text}
|
| 413 |
+
else:
|
| 414 |
+
log.append(f"Auto-complete produced JSON but validation failed: {parse_err}")
|
| 415 |
+
if attempt < max_attempts:
|
| 416 |
+
messages = [{"role": "system", "content": system_msg}, {"role": "user", "content": "The JSON you returned is invalid. Please correct and return ONLY the JSON wrapped in <<BEGIN_JSON>> and <<END_JSON>>."}]
|
| 417 |
+
continue
|
| 418 |
+
else:
|
| 419 |
+
return {"_parsing_error": True, "raw_output": last_raw, "parsed_partial": parsed_or_partial2, "parse_error": parse_err, "log": log, "raw_response": full_text}
|
| 420 |
+
return {"_parsing_error": True, "raw_output": last_raw or "", "log": log, "raw_response": last_raw or ""}
|
| 421 |
+
|
| 422 |
+
# -----------------------
|
| 423 |
+
# Orchestration: process file
|
| 424 |
+
# -----------------------
|
| 425 |
+
def process_file(file_obj):
|
| 426 |
+
ui_log: List[str] = []
|
| 427 |
+
try:
|
| 428 |
+
tmp_path, orig_name = save_uploaded_to_tmp(file_obj, ui_log)
|
| 429 |
+
except Exception as e:
|
| 430 |
+
ui_log.append(f"Failed to save upload: {e}")
|
| 431 |
+
return {"error": f"Failed to save uploaded file: {e}", "log": ui_log, "raw_response": ""}
|
| 432 |
+
|
| 433 |
+
try:
|
| 434 |
+
if orig_name.lower().endswith(".pdf"):
|
| 435 |
+
extracted_text = extract_text_from_pdf(tmp_path, ui_log)
|
| 436 |
+
else:
|
| 437 |
+
extracted_text = extract_text_from_image(tmp_path, ui_log)
|
| 438 |
+
except Exception as e:
|
| 439 |
+
ui_log.append(f"Text extraction failed: {e}")
|
| 440 |
+
return {"error": f"Text extraction failed: {e}", "log": ui_log, "raw_response": ""}
|
| 441 |
+
|
| 442 |
+
if not extracted_text:
|
| 443 |
+
ui_log.append("No text found after extraction.")
|
| 444 |
+
return {"error": "No text found in document after extraction.", "log": ui_log, "raw_response": ""}
|
| 445 |
+
|
| 446 |
+
chunks = chunk_text(extracted_text)
|
| 447 |
+
ui_log.append(f"Document split into {len(chunks)} chunks")
|
| 448 |
+
sorted_chunks = sorted(chunks, key=lambda x: len(x), reverse=True)
|
| 449 |
+
top_chunks = sorted_chunks[:6] if sorted_chunks else [extracted_text[:2000]]
|
| 450 |
+
short_text = (extracted_text[:1000] + "...") if len(extracted_text) > 1000 else extracted_text
|
| 451 |
+
|
| 452 |
+
# Primary LLM call
|
| 453 |
+
result = call_gpt5_for_metadata(orig_name, short_text, top_chunks, ui_log, max_attempts=2)
|
| 454 |
+
|
| 455 |
+
# If API error
|
| 456 |
+
if result.get("_api_error"):
|
| 457 |
+
return {"error": result.get("error"), "log": ui_log + result.get("log", []), "raw_response": result.get("raw_response")}
|
| 458 |
+
|
| 459 |
+
# If parsing error, attempt auto-complete if we have parsed_partial
|
| 460 |
+
if result.get("_parsing_error"):
|
| 461 |
+
ui_log += result.get("log", [])
|
| 462 |
+
raw_out = result.get("raw_output", result.get("raw_response", ""))
|
| 463 |
+
parsed_partial = result.get("parsed_partial", {})
|
| 464 |
+
ui_log.append("Initial parse failed; attempting auto-complete if partial available")
|
| 465 |
+
if parsed_partial:
|
| 466 |
+
ac = auto_complete_partial(parsed_partial, orig_name, extracted_text, top_chunks, ui_log, max_attempts=2)
|
| 467 |
+
if ac.get("_api_error"):
|
| 468 |
+
ui_log += ac.get("log", [])
|
| 469 |
+
return {"error": "Auto-complete API error", "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 470 |
+
if ac.get("_parsing_error"):
|
| 471 |
+
ui_log += ac.get("log", [])
|
| 472 |
+
return {"error": "LLM output parsing failed. See raw_output.", "raw_output": ac.get("raw_output", raw_out), "parsed_partial": ac.get("parsed_partial"), "parse_error": ac.get("parse_error"), "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 473 |
+
# success
|
| 474 |
+
metadata = ac.get("metadata")
|
| 475 |
+
ui_log += ac.get("log", [])
|
| 476 |
+
ui_log.append("Auto-complete produced metadata")
|
| 477 |
+
# ensure defaults
|
| 478 |
+
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 479 |
+
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 480 |
+
metadata.setdefault("title", orig_name)
|
| 481 |
+
metadata.setdefault("source", "user_upload")
|
| 482 |
+
metadata.setdefault("raw_url", "")
|
| 483 |
+
metadata.setdefault("ingest_timestamp", now)
|
| 484 |
+
return {"metadata": metadata, "log": ui_log, "raw_response": ac.get("raw_response", raw_out)}
|
| 485 |
+
else:
|
| 486 |
+
ui_log.append("No parsed_partial to auto-complete; returning raw output for manual repair")
|
| 487 |
+
return {"error": "LLM output parsing failed. See raw_output.", "raw_output": raw_out, "parsed_partial": parsed_partial, "parse_error": result.get("parse_error"), "log": ui_log, "raw_response": result.get("raw_response", raw_out)}
|
| 488 |
+
|
| 489 |
+
# success path
|
| 490 |
+
metadata = result.get("metadata")
|
| 491 |
+
ui_log += result.get("log", [])
|
| 492 |
+
raw_model_response = result.get("raw_response")
|
| 493 |
+
now = datetime.datetime.now(datetime.timezone.utc).astimezone().isoformat()
|
| 494 |
+
metadata.setdefault("doc_id", os.path.splitext(orig_name)[0])
|
| 495 |
+
metadata.setdefault("title", orig_name)
|
| 496 |
+
metadata.setdefault("source", "user_upload")
|
| 497 |
+
metadata.setdefault("raw_url", "")
|
| 498 |
+
metadata.setdefault("ingest_timestamp", now)
|
| 499 |
+
ui_log.append("Metadata generation successful")
|
| 500 |
+
return {"metadata": metadata, "log": ui_log, "raw_response": raw_model_response}
|
| 501 |
|
| 502 |
# -----------------------
|
| 503 |
# Gradio UI
|
| 504 |
# -----------------------
|
| 505 |
+
with gr.Blocks(title="DocClassify — Final Robust") as demo:
|
| 506 |
+
gr.Markdown("## 📂 Upload PDF / Image → automated taxonomy & tagging (GPT-5). Logs & GPT response shown below.")
|
| 507 |
with gr.Row():
|
| 508 |
with gr.Column(scale=1):
|
| 509 |
uploader = gr.File(label="Upload PDF / Image", file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff"])
|
| 510 |
run_button = gr.Button("Process document")
|
| 511 |
status = gr.Textbox(label="Status", value="", interactive=False)
|
| 512 |
download_button = gr.File(label="Download metadata JSON", visible=False)
|
| 513 |
+
gr.Markdown("### Manual repair (paste raw LLM output if needed)")
|
| 514 |
+
manual_raw_input = gr.Textbox(label="Paste raw LLM output here (optional)", lines=8, placeholder="Paste the malformed raw response if you need manual repair")
|
| 515 |
+
repair_from_paste_btn = gr.Button("Repair from pasted raw output")
|
| 516 |
+
repair_auto_btn = gr.Button("Attempt automatic repair of last raw output")
|
| 517 |
with gr.Column(scale=1):
|
| 518 |
+
output_json = gr.JSON(label="Metadata JSON (parsed)")
|
| 519 |
+
raw_output_box = gr.Textbox(label="Full GPT model raw response", lines=12, interactive=False)
|
| 520 |
+
logs_box = gr.Textbox(label="Step-by-step logs", lines=12, interactive=False)
|
| 521 |
+
|
| 522 |
+
# state holders
|
| 523 |
+
last_raw_state = gr.State(value=None) # store last raw model response
|
| 524 |
+
last_metadata_file = gr.State(value=None) # path to downloadable json
|
| 525 |
+
|
| 526 |
+
def on_process(file_obj):
|
| 527 |
+
if not file_obj:
|
| 528 |
+
return {}, "No file uploaded", None, "", ""
|
| 529 |
+
status_msg = "Processing..."
|
| 530 |
try:
|
| 531 |
result = process_file(file_obj)
|
| 532 |
except Exception as e:
|
| 533 |
+
return {}, f"Failed: {e}", None, "", "\n".join([f"Exception: {e}"])
|
| 534 |
+
# handle errors and success
|
| 535 |
+
logs = result.get("log", [])
|
| 536 |
+
raw_response = result.get("raw_response", "")
|
| 537 |
if result.get("error"):
|
| 538 |
+
# show raw_output for manual repair if present
|
| 539 |
+
raw_out = result.get("raw_output", raw_response) or ""
|
| 540 |
+
parsed_partial = result.get("parsed_partial")
|
| 541 |
+
display = {"error": result.get("error")}
|
| 542 |
+
if parsed_partial is not None:
|
| 543 |
+
display["parsed_partial"] = parsed_partial
|
| 544 |
+
# put logs and raw_response into UI
|
| 545 |
+
logs_text = "\n".join(logs + [f"Error: {result.get('error')}"])
|
| 546 |
+
return display, f"Error: {result.get('error')}", None, raw_out, logs_text
|
| 547 |
+
# success -> create temp file for download
|
| 548 |
+
metadata = result.get("metadata")
|
| 549 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 550 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 551 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 552 |
+
logs_text = "\n".join(logs)
|
| 553 |
+
return metadata, "Done", tmpf.name, raw_response or "", logs_text
|
| 554 |
+
|
| 555 |
+
def on_repair_from_paste(manual_text):
|
| 556 |
+
if not manual_text:
|
| 557 |
+
return {}, "No pasted raw output provided.", None, "", "No pasted raw output provided."
|
| 558 |
+
# try repair using model (or direct parse)
|
| 559 |
+
ui_log = ["Repair-from-paste initiated"]
|
| 560 |
+
repaired = repair_raw_output(raw_output=None, manual_pasted_json=manual_text, log=ui_log, max_attempts=2)
|
| 561 |
+
logs_text = "\n".join(repaired.get("log", ui_log))
|
|
|
|
| 562 |
if repaired.get("_api_error"):
|
| 563 |
+
return {}, f"Repair API error: {repaired.get('error')}", None, repaired.get("raw_response", manual_text), logs_text
|
|
|
|
| 564 |
if repaired.get("_parsing_error"):
|
| 565 |
+
display = {"error": "Repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 566 |
+
return display, "Repair failed", None, repaired.get("raw_response", manual_text), logs_text
|
| 567 |
+
# success
|
| 568 |
+
metadata = repaired.get("metadata")
|
|
|
|
| 569 |
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 570 |
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 571 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 572 |
+
return metadata, "Repair succeeded", tmpf.name, repaired.get("raw_response", manual_text), logs_text
|
| 573 |
+
|
| 574 |
+
def on_repair_auto(raw_response_text):
|
| 575 |
+
if not raw_response_text:
|
| 576 |
+
return {}, "No raw_response available for auto repair. Run process or paste raw output.", None, "", "No raw_response available."
|
| 577 |
+
ui_log = ["Auto repair initiated"]
|
| 578 |
+
repaired = repair_raw_output(raw_output=raw_response_text, manual_pasted_json=None, log=ui_log, max_attempts=2)
|
| 579 |
+
logs_text = "\n".join(repaired.get("log", ui_log))
|
| 580 |
+
if repaired.get("_api_error"):
|
| 581 |
+
return {}, f"Repair API error: {repaired.get('error')}", None, repaired.get("raw_response", raw_response_text), logs_text
|
| 582 |
+
if repaired.get("_parsing_error"):
|
| 583 |
+
display = {"error": "Auto-repair failed to produce valid JSON", "parsed_partial": repaired.get("parsed_partial"), "parse_error": repaired.get("parse_error")}
|
| 584 |
+
return display, "Auto-repair failed", None, repaired.get("raw_response", raw_response_text), logs_text
|
| 585 |
+
metadata = repaired.get("metadata")
|
| 586 |
+
tmpf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
|
| 587 |
+
with open(tmpf.name, "w", encoding="utf8") as f:
|
| 588 |
+
json.dump(metadata, f, indent=2, ensure_ascii=False)
|
| 589 |
+
return metadata, "Auto-repair succeeded", tmpf.name, repaired.get("raw_response", raw_response_text), logs_text
|
| 590 |
|
| 591 |
+
run_button.click(on_process, inputs=[uploader], outputs=[output_json, status, download_button, raw_output_box, logs_box])
|
| 592 |
+
repair_from_paste_btn.click(on_repair_from_paste, inputs=[manual_raw_input], outputs=[output_json, status, download_button, raw_output_box, logs_box])
|
| 593 |
+
repair_auto_btn.click(on_repair_auto, inputs=[raw_output_box], outputs=[output_json, status, download_button, raw_output_box, logs_box])
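Note the contract shared by the three handlers wired here: each returns a 5-tuple (parsed JSON dict, status string, downloadable file path or None, raw model text, newline-joined log), in the same order as the outputs list, so the same components display results from processing, manual repair, and automatic repair.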
|
| 594 |
|
|
|
|
| 595 |
if __name__ == "__main__":
|
| 596 |
demo.launch()
|