Spaces:

Shami96
/

PDF-Data_Extractor

Running

App Files Files Community

Shami96 committed on Aug 22

Commit

3edd648

verified ·

1 Parent(s): 7a2fc08

Update update_docx_with_pdf.py

Browse files

Files changed (1) hide show

update_docx_with_pdf.py +239 -48

update_docx_with_pdf.py CHANGED Viewed

@@ -1,80 +1,271 @@
-# update_docx_with_pdf.py
-from openai import OpenAI
-import json
 import os
 import time
-def read_any(f):
-    if hasattr(f, "read"):
-        f.seek(0)
-        content = f.read()
         if isinstance(content, bytes):
             content = content.decode("utf-8")
         return content
     else:
-        with open(f, "r", encoding="utf-8") as fh:
             return fh.read()
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
-    word_json = read_any(word_json_file)
     pdf_txt = read_any(pdf_txt_file)
     user_prompt = f"""
 Here is a JSON template. It contains only the fields that need updating:
-{word_json}
 Here is the extracted text from a PDF:
 {pdf_txt}
 Instructions:
 - ONLY update the fields present in the JSON template, using information from the PDF text.
 - DO NOT add any extra fields, and do not change the JSON structure.
 - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
-- If a field cannot be populated, keep its original value.
 """
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
-        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
     client = OpenAI(api_key=api_key)
-    # Try a small number of attempts if the model returns text instead of JSON
-    for attempt in range(3):
-        response = client.chat.completions.create(
-            model="gpt-4o",
-            messages=[
-                {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
-                {"role":"user","content":user_prompt}
-            ],
-            max_tokens=4096,
-            temperature=0
-        )
-        updated_json_str = response.choices[0].message.content.strip()
         try:
-            parsed = json.loads(updated_json_str)
-            template_keys = set(json.loads(word_json).keys())
-            parsed_keys = set(parsed.keys())
-            added = parsed_keys - template_keys
-            if added:
-                print("⚠️ Model returned extra top-level keys; pruning:", added)
-                for ak in added:
-                    parsed.pop(ak, None)
-            if hasattr(output_file, "write"):
-                json.dump(parsed, output_file, indent=2, ensure_ascii=False)
-                output_file.flush()
-            else:
-                with open(output_file, "w", encoding="utf-8") as f:
-                    json.dump(parsed, f, indent=2, ensure_ascii=False)
-            print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
-            return
-        except json.JSONDecodeError:
-            print("⚠️ Model output was not valid JSON. Raw output (truncated):")
-            print(updated_json_str[:2000])
-            time.sleep(1)
-    raise RuntimeError("Model failed to return valid JSON after retries.")
 if __name__ == "__main__":
-    import sys
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
-        exit(1)
-    update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])

+#!/usr/bin/env python3
 import os
+import sys
+import json
 import time
+import re
+from typing import Optional
+# Try to import OpenAI client in the style you used previously
+try:
+    from openai import OpenAI
+except Exception as e:
+    OpenAI = None
+# Number of LLM calls attempted by update_json_with_pdf before it gives up
+# and writes the unmodified template to the output.
+RETRIES = 3
+# Pause applied by update_json_with_pdf between consecutive attempts.
+RETRY_DELAY = 1.0  # seconds between retries
+def read_any(path_or_file):
+    """Return the full text content of a file path or a file-like object.
+
+    Args:
+        path_or_file: Either an object exposing ``read()`` (text or binary
+            mode) or a path accepted by ``open()``.
+
+    Returns:
+        The content as ``str``. Bytes returned by ``read()`` are decoded as
+        UTF-8; paths are opened in text mode with UTF-8 encoding.
+    """
+    if hasattr(path_or_file, "read"):
+        # Rewind so an already-consumed handle is read from the start.
+        # Objects without seek() and non-seekable streams (pipes, stdin) are
+        # read from their current position instead of raising.
+        seek = getattr(path_or_file, "seek", None)
+        if callable(seek):
+            try:
+                seek(0)
+            except (OSError, ValueError):
+                # io.UnsupportedOperation derives from both; nothing to rewind.
+                pass
+        content = path_or_file.read()
         if isinstance(content, bytes):
             content = content.decode("utf-8")
         return content
     else:
+        with open(path_or_file, "r", encoding="utf-8") as fh:
             return fh.read()
+def extract_json_substring(s: str) -> Optional[str]:
+    """Return the first brace-balanced ``{...}`` substring of *s*, or None.
+
+    Every ``{`` is tried in order as a potential start, so a stray unbalanced
+    brace in leading prose does not hide a complete object that follows it.
+    Braces inside double-quoted strings (with backslash escapes honoured) are
+    ignored while balancing.
+
+    The result is only brace-balanced, not validated: callers must still run
+    it through ``json.loads``.
+
+    Args:
+        s: Arbitrary text, typically a raw model reply.
+
+    Returns:
+        The candidate substring, or None when *s* is empty or contains no
+        balanced object.
+    """
+    if not s:
+        return None
+    start = s.find("{")
+    # Each failed start rescans to the end of the text; acceptable because
+    # model replies are short and stray braces are rare.
+    while start != -1:
+        candidate = _balanced_object_at(s, start)
+        if candidate is not None:
+            return candidate
+        start = s.find("{", start + 1)
+    return None
+def _balanced_object_at(s: str, start: int) -> Optional[str]:
+    """Return the balanced object beginning at ``s[start] == "{"``, or None."""
+    depth = 0
+    in_string = False
+    escaped = False
+    for i in range(start, len(s)):
+        ch = s[i]
+        if in_string:
+            if escaped:
+                # Character after a backslash is literal, whatever it is.
+                escaped = False
+            elif ch == "\\":
+                escaped = True
+            elif ch == '"':
+                in_string = False
+            continue
+        if ch == '"':
+            in_string = True
+        elif ch == "{":
+            depth += 1
+        elif ch == "}":
+            depth -= 1
+            if depth == 0:
+                return s[start:i + 1]
+    return None
+def try_parse_json(s: str):
+    """Decode *s* with ``json.loads``; decoding errors propagate to the caller."""
+    decoded = json.loads(s)
+    return decoded
+def safe_write(path: str, data):
+    """Serialise *data* to *path* as 2-space-indented UTF-8 JSON.
+
+    The file is opened in "w" mode, so existing content is replaced, and
+    non-ASCII characters are written as-is rather than escaped.
+    """
+    with open(path, mode="w", encoding="utf-8") as handle:
+        json.dump(data, handle, ensure_ascii=False, indent=2)
 def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
+    """Fill a JSON template from PDF text using an OpenAI chat model.
+
+    The function is deliberately best-effort: it never raises for model or
+    parsing problems and always tries to leave *something* in the output so
+    the calling pipeline keeps running.
+
+    Args:
+        word_json_file: Path or file-like object holding the JSON template.
+        pdf_txt_file: Path or file-like object holding the extracted PDF text.
+        output_file: Path or writable file-like object receiving the result.
+
+    Returns:
+        The parsed JSON produced by the model on success. None when the
+        template is not valid JSON (raw input is copied to the output), when
+        OPENAI_API_KEY or the OpenAI SDK is missing, or when every attempt
+        fails (in the last three cases the unmodified template is written).
+
+    Side effects:
+        Writes ``<output>.model_raw.txt`` when JSON had to be extracted from a
+        noisy reply, and ``<output>.model_raw.txt`` plus
+        ``<output>.salvage.json`` when all attempts fail.
+    """
+    # --- load inputs ---
+    word_json_text = read_any(word_json_file)
     pdf_txt = read_any(pdf_txt_file)
+    try:
+        word_json = json.loads(word_json_text)
+    except Exception:
+        # If the input word_json isn't valid JSON, abort early but write original to output
+        print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
+        if hasattr(output_file, "write"):
+            output_file.write(word_json_text)
+            output_file.flush()
+        else:
+            with open(output_file, "w", encoding="utf-8") as f:
+                f.write(word_json_text)
+        return
+    # --- build prompt ---
     user_prompt = f"""
 Here is a JSON template. It contains only the fields that need updating:
+{json.dumps(word_json, ensure_ascii=False)}
 Here is the extracted text from a PDF:
 {pdf_txt}
 Instructions:
 - ONLY update the fields present in the JSON template, using information from the PDF text.
 - DO NOT add any extra fields, and do not change the JSON structure.
 - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
+- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
 """
     api_key = os.environ.get("OPENAI_API_KEY")
     if not api_key:
+        print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
+        _emit_json(output_file, word_json)
+        return
+    if OpenAI is None:
+        print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
+        _emit_json(output_file, word_json)
+        return
     client = OpenAI(api_key=api_key)
+    system_msg = "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."
+    # Progressive user prompts: first attempt standard, later attempts stricter guidance
+    prompt_variants = [
+        user_prompt,
+        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
+        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
+    ]
+    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o")  # keep same default you used
+    raw_outputs = []
+    for attempt in range(RETRIES):
+        # Attempts beyond the number of variants reuse the strictest prompt.
+        user_content = prompt_variants[min(attempt, len(prompt_variants) - 1)]
         try:
+            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
+            response = client.chat.completions.create(
+                model=model_name,
+                messages=[
+                    {"role": "system", "content": system_msg},
+                    {"role": "user", "content": user_content},
+                ],
+                max_tokens=4096,
+                temperature=0.0,
+            )
+            raw_text = _response_text(response)
+            raw_outputs.append(raw_text)
+            found = False
+            from_substring = False
+            parsed = None
+            try:
+                parsed = json.loads(raw_text)
+                found = True
+                print("✅ Model returned valid JSON.")
+            except ValueError:
+                print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")
+                candidate = extract_json_substring(raw_text)
+                if candidate:
+                    try:
+                        parsed = json.loads(candidate)
+                        found = from_substring = True
+                        print("✅ Successfully extracted and parsed JSON substring from model output.")
+                    except ValueError:
+                        print("⚠️ Extracted substring still not valid JSON.")
+                else:
+                    print("⚠️ Could not find a balanced JSON substring in the model output.")
+            # Valid JSON of the wrong shape (e.g. `null` or a list for an
+            # object template) must not overwrite the output.
+            if found and not isinstance(parsed, type(word_json)):
+                print("⚠️ Model JSON has a different top-level type than the template; discarding.")
+                found = False
+            if found:
+                _emit_json(output_file, parsed)
+                if from_substring:
+                    # Keep the noisy reply for debugging. This is best-effort:
+                    # a failure here must not undo an already-written result.
+                    try:
+                        with open(_sidecar_path(output_file, ".model_raw.txt"), "w", encoding="utf-8") as rf:
+                            rf.write(raw_text)
+                    except Exception as e:
+                        print(f"⚠️ Failed to save raw model output: {e}")
+                return parsed
+        except Exception as e:
+            # Boundary catch: network/SDK/write errors trigger another attempt.
+            print(f"⚠️ Exception while calling model: {e}")
+        if attempt < RETRIES - 1:
+            time.sleep(RETRY_DELAY)
+    # If we've reached here, all attempts failed
+    print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")
+    # Save raw outputs next to output_file for debugging
+    raw_path = None
+    try:
+        raw_path = _sidecar_path(output_file, ".model_raw.txt")
+        with open(raw_path, "w", encoding="utf-8") as rf:
+            rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
+            for i, out in enumerate(raw_outputs):
+                rf.write(f"--- ATTEMPT {i+1} ---\n")
+                rf.write(out + "\n\n")
+            rf.write("\n=== END ===\n\n")
+            rf.write("\n\n=== PDF TEXT USED ===\n\n")
+            rf.write(pdf_txt or "")
+        print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
+    except Exception as e:
+        print(f"⚠️ Failed to save raw model output: {e}")
+    # Also create a salvage file for manual inspection
+    try:
+        salvage_path = _sidecar_path(output_file, ".salvage.json")
+        pdf_sample = pdf_txt or ""
+        if len(pdf_sample) > 2000:
+            # Mark the sample as truncated only when it really was.
+            pdf_sample = pdf_sample[:2000] + "..."
+        salvage_bundle = {
+            "original_word_json": word_json,
+            "pdf_text_sample": pdf_sample,
+            "raw_outputs_path": raw_path,
+        }
+        with open(salvage_path, "w", encoding="utf-8") as sf:
+            json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
+        print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
+    except Exception as e:
+        print(f"⚠️ Failed to save salvage bundle: {e}")
+    # Write original JSON to output to avoid failing the calling process
+    try:
+        _emit_json(output_file, word_json)
+        print("✅ Original JSON template written to output (no updates applied).")
+    except Exception as e:
+        print(f"⚠️ Failed to write original JSON to output: {e}")
+    return None
+def _emit_json(output_file, data):
+    """Write *data* as indented JSON to a writable handle or to a path."""
+    if hasattr(output_file, "write"):
+        json.dump(data, output_file, indent=2, ensure_ascii=False)
+        output_file.flush()
+    else:
+        safe_write(output_file, data)
+def _sidecar_path(output_file, suffix):
+    """Build the diagnostics path ``<output><suffix>`` for a path or handle."""
+    if isinstance(output_file, (str, os.PathLike)):
+        # fspath keeps the directory of pathlib paths; `.name` would drop it.
+        base = os.fspath(output_file)
+    else:
+        base = getattr(output_file, "name", "output")
+    return f"{base}{suffix}"
+def _response_text(response):
+    """Extract the reply text from a completion response as a stripped str."""
+    try:
+        # preferred: choices[0].message.content
+        raw_text = response.choices[0].message.content
+    except Exception:
+        try:
+            raw_text = response.choices[0].text
+        except Exception:
+            raw_text = str(response)
+    if isinstance(raw_text, bytes):
+        raw_text = raw_text.decode("utf-8", errors="replace")
+    # content may be None; treat it as an empty reply.
+    return (raw_text or "").strip()
 if __name__ == "__main__":
     if len(sys.argv) != 4:
         print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
+        # NOTE(review): exits 0 even on bad usage; kept as-is because the
+        # script is written to never fail its caller. Confirm this is intended.
+        sys.exit(0)
+    try:
+        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
+    except Exception as e:
+        # This top-level catch ensures the script exits successfully while logging the issue.
+        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
+        # Attempt to copy original json to output before exiting
+        try:
+            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
+                outf.write(inf.read())
+            # Report success only after both files are closed and flushed.
+            print("Wrote original input JSON to output due to exception.")
+        except Exception as copy_err:
+            # Still best-effort, but leave a trace instead of failing silently.
+            print(f"⚠️ Could not copy original input JSON to output: {copy_err}")
+        # exit with status 0 so calling process doesn't crash (preserve pipeline behavior)
+        sys.exit(0)