Spaces:
Running
Running
Update update_docx_with_pdf.py
Browse files- update_docx_with_pdf.py +48 -47
update_docx_with_pdf.py
CHANGED
|
@@ -1,75 +1,76 @@
|
|
|
|
|
| 1 |
from openai import OpenAI
|
| 2 |
import json
|
| 3 |
import os
|
|
|
|
| 4 |
|
| 5 |
-
def
|
| 6 |
-
""
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
content = f.read()
|
| 16 |
-
if isinstance(content, bytes):
|
| 17 |
-
content = content.decode("utf-8")
|
| 18 |
-
return content
|
| 19 |
-
else:
|
| 20 |
-
with open(f, "r", encoding="utf-8") as fh:
|
| 21 |
-
return fh.read()
|
| 22 |
|
|
|
|
| 23 |
word_json = read_any(word_json_file)
|
| 24 |
pdf_txt = read_any(pdf_txt_file)
|
| 25 |
|
| 26 |
-
# --- Build prompt ---
|
| 27 |
user_prompt = f"""
|
| 28 |
Here is a JSON template. It contains only the fields that need updating:
|
| 29 |
{word_json}
|
| 30 |
-
|
| 31 |
Here is the extracted text from a PDF:
|
| 32 |
{pdf_txt}
|
| 33 |
-
|
| 34 |
Instructions:
|
| 35 |
- ONLY update the fields present in the JSON template, using information from the PDF text.
|
| 36 |
- DO NOT add any extra fields, and do not change the JSON structure.
|
| 37 |
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
|
| 38 |
-
-
|
| 39 |
"""
|
| 40 |
|
| 41 |
-
# --- Call OpenAI API ---
|
| 42 |
api_key = os.environ.get("OPENAI_API_KEY")
|
| 43 |
if not api_key:
|
| 44 |
raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
|
| 45 |
client = OpenAI(api_key=api_key)
|
| 46 |
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
|
|
|
| 58 |
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
|
| 74 |
if __name__ == "__main__":
|
| 75 |
import sys
|
|
|
|
| 1 |
+
# update_docx_with_pdf.py
|
| 2 |
from openai import OpenAI
|
| 3 |
import json
|
| 4 |
import os
|
| 5 |
+
import time
|
| 6 |
|
| 7 |
+
def read_any(f):
    """Return the full text content of *f* as a ``str``.

    Args:
        f: Either an object with a ``read()`` method (binary or text mode)
            or a filesystem path accepted by ``open()``.

    Returns:
        The decoded text. Bytes are decoded as UTF-8, and a leading UTF-8
        byte-order mark is removed so the result can be passed straight to
        ``json.loads`` (which rejects a BOM).
    """
    if not hasattr(f, "read"):
        # "utf-8-sig" reads plain UTF-8 unchanged and drops a BOM if present.
        with open(f, "r", encoding="utf-8-sig") as fh:
            return fh.read()

    # Rewind only when the stream supports it: pipes, stdin and minimal
    # read-only wrappers either lack seek() or raise when it is called.
    seekable = getattr(f, "seekable", None)
    if callable(seekable):
        can_rewind = seekable()
    else:
        can_rewind = hasattr(f, "seek")
    if can_rewind:
        f.seek(0)

    content = f.read()
    if isinstance(content, (bytes, bytearray)):
        content = content.decode("utf-8")
    if isinstance(content, str) and content.startswith("\ufeff"):
        content = content[1:]
    return content
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
+
def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Fill a JSON template from PDF text using the OpenAI chat API.

    The template and the PDF text are embedded in a prompt, the model is
    asked to return the updated JSON, and the result is written to
    *output_file*. Top-level keys that are not in the template are removed
    from the model's answer before it is saved.

    Args:
        word_json_file: Path or readable file-like object holding the JSON
            template; its top level must be a JSON object.
        pdf_txt_file: Path or readable file-like object holding the text
            extracted from the PDF.
        output_file: Path or writable file-like object that receives the
            updated JSON (indented by 2, non-ASCII characters preserved).

    Raises:
        RuntimeError: If the template is not a valid JSON object, if
            ``OPENAI_API_KEY`` is not set, or if the model does not return
            a JSON object within the allowed number of attempts.
    """
    word_json = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    # Validate the template once, before any API call: a broken template
    # must not be reported as a model failure, and its key set does not
    # change between attempts.
    try:
        template = json.loads(word_json)
    except json.JSONDecodeError as err:
        raise RuntimeError(f"Template is not valid JSON: {err}") from err
    if not isinstance(template, dict):
        raise RuntimeError("Template JSON must be an object at the top level.")
    template_keys = set(template)

    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{word_json}
Here is the extracted text from a PDF:
{pdf_txt}
Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- If a field cannot be populated, keep its original value.
"""

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
    client = OpenAI(api_key=api_key)

    # Try a small number of attempts if the model returns text instead of JSON
    max_attempts = 3
    for attempt in range(max_attempts):
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
                {"role":"user","content":user_prompt}
            ],
            max_tokens=4096,
            temperature=0
        )
        # The message content can be None; treat that as an empty reply so
        # it goes through the normal invalid-output retry path.
        updated_json_str = (response.choices[0].message.content or "").strip()

        usable = False
        try:
            parsed = json.loads(updated_json_str)
        except json.JSONDecodeError:
            print("⚠️ Model output was not valid JSON. Raw output (truncated):")
            print(updated_json_str[:2000])
        else:
            if isinstance(parsed, dict):
                usable = True
            else:
                # Valid JSON but e.g. a list or string: cannot be matched
                # against the template's keys, so ask again.
                print("⚠️ Model output was valid JSON but not an object. Raw output (truncated):")
                print(updated_json_str[:2000])

        if not usable:
            # Pause between attempts only; no point sleeping after the last.
            if attempt < max_attempts - 1:
                time.sleep(1)
            continue

        added = set(parsed) - template_keys
        if added:
            print("⚠️ Model returned extra top-level keys; pruning:", added)
            for ak in added:
                parsed.pop(ak, None)

        if hasattr(output_file, "write"):
            json.dump(parsed, output_file, indent=2, ensure_ascii=False)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(parsed, f, indent=2, ensure_ascii=False)
        print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
        return

    raise RuntimeError("Model failed to return valid JSON after retries.")
|
| 74 |
|
| 75 |
if __name__ == "__main__":
|
| 76 |
import sys
|