Shami96 committed on
Commit
8001b1f
·
verified ·
1 Parent(s): 102bd04

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +48 -47
update_docx_with_pdf.py CHANGED
@@ -1,75 +1,76 @@
 
1
  from openai import OpenAI
2
  import json
3
  import os
 
4
 
5
- def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
6
- """
7
- word_json_file: file-like object or file path (docx extraction JSON)
8
- pdf_txt_file: file-like object or file path (PDF plain text)
9
- output_file: file-like object (opened for writing) or file path
10
- """
11
- # --- Load files ---
12
- def read_any(f):
13
- if hasattr(f, "read"):
14
- f.seek(0)
15
- content = f.read()
16
- if isinstance(content, bytes):
17
- content = content.decode("utf-8")
18
- return content
19
- else:
20
- with open(f, "r", encoding="utf-8") as fh:
21
- return fh.read()
22
 
 
23
  word_json = read_any(word_json_file)
24
  pdf_txt = read_any(pdf_txt_file)
25
 
26
- # --- Build prompt ---
27
  user_prompt = f"""
28
  Here is a JSON template. It contains only the fields that need updating:
29
  {word_json}
30
-
31
  Here is the extracted text from a PDF:
32
  {pdf_txt}
33
-
34
  Instructions:
35
  - ONLY update the fields present in the JSON template, using information from the PDF text.
36
  - DO NOT add any extra fields, and do not change the JSON structure.
37
  - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
38
- - Make sure the JSON is valid and ready to use.
39
  """
40
 
41
- # --- Call OpenAI API ---
42
  api_key = os.environ.get("OPENAI_API_KEY")
43
  if not api_key:
44
  raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
45
  client = OpenAI(api_key=api_key)
46
 
47
- response = client.chat.completions.create(
48
- model="gpt-4o",
49
- messages=[
50
- {"role": "system", "content": "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."},
51
- {"role": "user", "content": user_prompt}
52
- ],
53
- max_tokens=4096,
54
- temperature=0
55
- )
56
-
57
- updated_json_str = response.choices[0].message.content.strip()
 
58
 
59
- # --- Try to parse as JSON ---
60
- try:
61
- parsed = json.loads(updated_json_str)
62
- if hasattr(output_file, "write"):
63
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
64
- output_file.flush()
65
- else:
66
- with open(output_file, "w", encoding="utf-8") as f:
67
- json.dump(parsed, f, indent=2, ensure_ascii=False)
68
- print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
69
- except Exception as e:
70
- print("⚠️ Model did not return valid JSON. Raw output below:\n")
71
- print(updated_json_str)
72
- print("\n❌ Failed to parse updated JSON:", e)
 
 
 
 
 
 
 
 
73
 
74
  if __name__ == "__main__":
75
  import sys
 
1
+ # update_docx_with_pdf.py
2
  from openai import OpenAI
3
  import json
4
  import os
5
+ import time
6
 
7
+ def read_any(f):
8
+ if hasattr(f, "read"):
9
+ f.seek(0)
10
+ content = f.read()
11
+ if isinstance(content, bytes):
12
+ content = content.decode("utf-8")
13
+ return content
14
+ else:
15
+ with open(f, "r", encoding="utf-8") as fh:
16
+ return fh.read()
 
 
 
 
 
 
 
17
 
18
+ def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
19
  word_json = read_any(word_json_file)
20
  pdf_txt = read_any(pdf_txt_file)
21
 
 
22
  user_prompt = f"""
23
  Here is a JSON template. It contains only the fields that need updating:
24
  {word_json}
 
25
  Here is the extracted text from a PDF:
26
  {pdf_txt}
 
27
  Instructions:
28
  - ONLY update the fields present in the JSON template, using information from the PDF text.
29
  - DO NOT add any extra fields, and do not change the JSON structure.
30
  - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
31
+ - If a field cannot be populated, keep its original value.
32
  """
33
 
 
34
  api_key = os.environ.get("OPENAI_API_KEY")
35
  if not api_key:
36
  raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
37
  client = OpenAI(api_key=api_key)
38
 
39
+ # Try a small number of attempts if the model returns text instead of JSON
40
+ for attempt in range(3):
41
+ response = client.chat.completions.create(
42
+ model="gpt-4o",
43
+ messages=[
44
+ {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
45
+ {"role":"user","content":user_prompt}
46
+ ],
47
+ max_tokens=4096,
48
+ temperature=0
49
+ )
50
+ updated_json_str = response.choices[0].message.content.strip()
51
 
52
+ try:
53
+ parsed = json.loads(updated_json_str)
54
+ template_keys = set(json.loads(word_json).keys())
55
+ parsed_keys = set(parsed.keys())
56
+ added = parsed_keys - template_keys
57
+ if added:
58
+ print("⚠️ Model returned extra top-level keys; pruning:", added)
59
+ for ak in added:
60
+ parsed.pop(ak, None)
61
+ if hasattr(output_file, "write"):
62
+ json.dump(parsed, output_file, indent=2, ensure_ascii=False)
63
+ output_file.flush()
64
+ else:
65
+ with open(output_file, "w", encoding="utf-8") as f:
66
+ json.dump(parsed, f, indent=2, ensure_ascii=False)
67
+ print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
68
+ return
69
+ except json.JSONDecodeError:
70
+ print("⚠️ Model output was not valid JSON. Raw output (truncated):")
71
+ print(updated_json_str[:2000])
72
+ time.sleep(1)
73
+ raise RuntimeError("Model failed to return valid JSON after retries.")
74
 
75
  if __name__ == "__main__":
76
  import sys