Shami96 committed on
Commit
3edd648
·
verified ·
1 Parent(s): 7a2fc08

Update update_docx_with_pdf.py

Browse files
Files changed (1) hide show
  1. update_docx_with_pdf.py +239 -48
update_docx_with_pdf.py CHANGED
@@ -1,80 +1,271 @@
1
- # update_docx_with_pdf.py
2
- from openai import OpenAI
3
- import json
4
  import os
 
 
5
  import time
 
 
 
 
 
 
 
 
6
 
7
- def read_any(f):
8
- if hasattr(f, "read"):
9
- f.seek(0)
10
- content = f.read()
 
 
 
 
11
  if isinstance(content, bytes):
12
  content = content.decode("utf-8")
13
  return content
14
  else:
15
- with open(f, "r", encoding="utf-8") as fh:
16
  return fh.read()
17
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
18
  def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
19
- word_json = read_any(word_json_file)
 
20
  pdf_txt = read_any(pdf_txt_file)
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  user_prompt = f"""
23
  Here is a JSON template. It contains only the fields that need updating:
24
- {word_json}
 
25
  Here is the extracted text from a PDF:
26
  {pdf_txt}
 
27
  Instructions:
28
  - ONLY update the fields present in the JSON template, using information from the PDF text.
29
  - DO NOT add any extra fields, and do not change the JSON structure.
30
  - Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
31
- - If a field cannot be populated, keep its original value.
32
  """
33
 
34
  api_key = os.environ.get("OPENAI_API_KEY")
35
  if not api_key:
36
- raise RuntimeError("OPENAI_API_KEY not found in environment variables!")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
  client = OpenAI(api_key=api_key)
38
 
39
- # Try a small number of attempts if the model returns text instead of JSON
40
- for attempt in range(3):
41
- response = client.chat.completions.create(
42
- model="gpt-4o",
43
- messages=[
44
- {"role":"system","content":"You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting."},
45
- {"role":"user","content":user_prompt}
46
- ],
47
- max_tokens=4096,
48
- temperature=0
49
- )
50
- updated_json_str = response.choices[0].message.content.strip()
51
 
 
 
 
 
 
52
  try:
53
- parsed = json.loads(updated_json_str)
54
- template_keys = set(json.loads(word_json).keys())
55
- parsed_keys = set(parsed.keys())
56
- added = parsed_keys - template_keys
57
- if added:
58
- print("⚠️ Model returned extra top-level keys; pruning:", added)
59
- for ak in added:
60
- parsed.pop(ak, None)
61
- if hasattr(output_file, "write"):
62
- json.dump(parsed, output_file, indent=2, ensure_ascii=False)
63
- output_file.flush()
64
- else:
65
- with open(output_file, "w", encoding="utf-8") as f:
66
- json.dump(parsed, f, indent=2, ensure_ascii=False)
67
- print("✅ JSON updated and saved to", getattr(output_file, "name", output_file))
68
- return
69
- except json.JSONDecodeError:
70
- print("⚠️ Model output was not valid JSON. Raw output (truncated):")
71
- print(updated_json_str[:2000])
72
- time.sleep(1)
73
- raise RuntimeError("Model failed to return valid JSON after retries.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  if __name__ == "__main__":
76
- import sys
77
  if len(sys.argv) != 4:
78
  print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
79
- exit(1)
80
- update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
 
 
2
  import os
3
+ import sys
4
+ import json
5
  import time
6
+ import re
7
+ from typing import Optional
8
+
9
+ # Try to import OpenAI client in the style you used previously
10
+ try:
11
+ from openai import OpenAI
12
+ except Exception as e:
13
+ OpenAI = None
14
 
15
+ RETRIES = 3
16
+ RETRY_DELAY = 1.0 # seconds between retries
17
+
18
def read_any(path_or_file):
    """Return the full text content of a file path or a file-like object.

    Args:
        path_or_file: Either a filesystem path, which is opened as UTF-8
            text, or any object exposing ``read()``.

    Returns:
        The content as ``str``. Bytes returned by a binary stream are
        decoded as UTF-8.
    """
    if not hasattr(path_or_file, "read"):
        with open(path_or_file, "r", encoding="utf-8") as fh:
            return fh.read()

    # Rewind only when the stream supports it. Pipes and minimal reader
    # objects have no usable seek(); calling it unconditionally crashed
    # before any data was read.
    seekable = getattr(path_or_file, "seekable", None)
    if callable(seekable):
        can_seek = seekable()
    else:
        can_seek = hasattr(path_or_file, "seek")
    if can_seek:
        path_or_file.seek(0)

    content = path_or_file.read()
    if isinstance(content, bytes):
        content = content.decode("utf-8")
    return content
29
 
30
def extract_json_substring(s: str) -> Optional[str]:
    """Find a JSON object embedded in free-form text.

    Top-level brace-balanced ``{...}`` groups are examined left to right
    (braces inside double-quoted strings are ignored). The first group that
    actually parses as JSON is returned. If no group parses, the first
    balanced group is returned so the caller can still report on it.

    Args:
        s: Text that may contain a JSON object, e.g. a chatty model reply.

    Returns:
        The matching substring, or ``None`` when *s* is empty, contains no
        ``{``, or the first ``{`` is never closed (e.g. truncated output).
    """
    if not s:
        return None

    def balanced_end(start: int) -> int:
        """Return the index just past the '}' matching s[start], or -1."""
        depth = 0
        in_string = False
        escaped = False
        for pos in range(start, len(s)):
            ch = s[pos]
            if in_string:
                if escaped:
                    escaped = False
                elif ch == "\\":
                    escaped = True
                elif ch == '"':
                    in_string = False
                continue
            if ch == '"':
                in_string = True
            elif ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
                if depth == 0:
                    return pos + 1
        return -1

    first_candidate = None
    start = s.find("{")
    while start != -1:
        end = balanced_end(start)
        if end == -1:
            # Unclosed group: never descend into it, otherwise a truncated
            # reply could yield a nested fragment instead of the full object.
            break
        candidate = s[start:end]
        if first_candidate is None:
            first_candidate = candidate
        try:
            json.loads(candidate)
        except (ValueError, RecursionError):
            # Prose such as "{see below}" — move on to the next group.
            start = s.find("{", end)
            continue
        return candidate
    return first_candidate
62
+
63
def try_parse_json(s: str):
    """Decode *s* with the standard JSON parser.

    Returns the decoded value. Any error raised by ``json.loads``
    propagates to the caller unchanged.
    """
    decoded = json.loads(s)
    return decoded
66
+
67
def safe_write(path: str, data):
    """Write *data* to *path* as 2-space-indented UTF-8 JSON.

    Non-ASCII characters are written as-is rather than escaped. An existing
    file at *path* is overwritten.
    """
    dump_options = {"indent": 2, "ensure_ascii": False}
    with open(path, mode="w", encoding="utf-8") as handle:
        json.dump(data, handle, **dump_options)
70
+
71
def _write_json_output(output_file, data):
    """Write *data* as indented UTF-8 JSON to a path or an open text stream."""
    if hasattr(output_file, "write"):
        json.dump(data, output_file, indent=2, ensure_ascii=False)
        output_file.flush()
    else:
        safe_write(output_file, data)


def _sidecar_path(output_file, suffix):
    """Return the diagnostics path built from *output_file* plus *suffix*."""
    if isinstance(output_file, (str, os.PathLike)):
        # os.fspath keeps the directory part of pathlib paths; using .name
        # would drop it and put the sidecar in the working directory.
        base = os.fspath(output_file)
    else:
        base = getattr(output_file, "name", "output")
    return f"{base}{suffix}"


def _response_text(response):
    """Pull the reply text out of a chat-completions response, stripped."""
    try:
        # preferred: choices[0].message.content
        text = response.choices[0].message.content
    except Exception:
        try:
            text = response.choices[0].text
        except Exception:
            text = str(response)
    if isinstance(text, bytes):
        text = text.decode("utf-8", errors="replace")
    # content can be None; treat that as an empty reply instead of letting
    # .strip() raise.
    return (text or "").strip()


def _parse_model_json(raw_text, require_object):
    """Decode a model reply, falling back to an embedded JSON object.

    Returns ``(value, was_extracted)`` on success and ``None`` when nothing
    usable was found. With *require_object* set, a reply that is valid JSON
    but not an object (e.g. ``null`` or a list) is rejected.
    """
    try:
        value = json.loads(raw_text)
    except (ValueError, RecursionError):
        pass
    else:
        if isinstance(value, dict) or not require_object:
            print("✅ Model returned valid JSON.")
            return value, False

    print("⚠️ Model output was not valid JSON. Will attempt to extract JSON substring.")
    candidate = extract_json_substring(raw_text)
    if not candidate:
        print("⚠️ Could not find a balanced JSON substring in the model output.")
        return None
    try:
        value = json.loads(candidate)
    except (ValueError, RecursionError):
        print("⚠️ Extracted substring still not valid JSON.")
        return None
    print("✅ Successfully extracted and parsed JSON substring from model output.")
    return value, True


def update_json_with_pdf(word_json_file, pdf_txt_file, output_file):
    """Ask an LLM to fill a JSON template from PDF text and write the result.

    The function is deliberately best-effort: on any failure it writes the
    original template (or the raw input, if it is not valid JSON) to
    *output_file* instead of raising, so a calling pipeline keeps going.

    Args:
        word_json_file: Path or file-like object holding the JSON template.
        pdf_txt_file: Path or file-like object holding the extracted PDF text.
        output_file: Path or writable text stream that receives the JSON.

    Returns:
        The parsed JSON returned by the model on success; ``None`` when the
        input was invalid, the API key or SDK is missing, or every attempt
        failed (in which case the original template is written out).
    """
    # --- load inputs ---
    word_json_text = read_any(word_json_file)
    pdf_txt = read_any(pdf_txt_file)

    try:
        word_json = json.loads(word_json_text)
    except Exception:
        # If the input word_json isn't valid JSON, abort early but write original to output
        print("⚠️ Input word_json is not valid JSON. Writing raw input to output and exiting.")
        if hasattr(output_file, "write"):
            output_file.write(word_json_text)
            output_file.flush()
        else:
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(word_json_text)
        return

    # --- build prompt ---
    user_prompt = f"""
Here is a JSON template. It contains only the fields that need updating:
{json.dumps(word_json, ensure_ascii=False)}

Here is the extracted text from a PDF:
{pdf_txt}

Instructions:
- ONLY update the fields present in the JSON template, using information from the PDF text.
- DO NOT add any extra fields, and do not change the JSON structure.
- Output ONLY the updated JSON, as raw JSON (no markdown, no extra text, no greetings).
- Ensure your output is valid JSON. If you cannot find data for a field, keep its existing value in the template.
"""

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        print("⚠️ OPENAI_API_KEY not found in environment variables! Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return

    if OpenAI is None:
        print("⚠️ OpenAI SDK not available (could not import OpenAI). Writing original JSON to output and exiting.")
        _write_json_output(output_file, word_json)
        return

    client = OpenAI(api_key=api_key)

    system_msg = "You are a data extraction assistant. Only reply with valid JSON. Do not add any extra text or formatting. Do NOT use markdown/code blocks, just output JSON."

    # Progressive user prompts: first attempt standard, later attempts stricter guidance
    prompt_variants = [
        user_prompt,
        user_prompt + "\nIf you must, you may output only a minimal JSON by keeping unspecified fields unchanged.",
        user_prompt + "\nIMPORTANT: Output must be exactly and only valid JSON. If you append anything else, I will ignore it.",
    ]

    model_name = os.environ.get("OPENAI_MODEL", "gpt-4o")
    require_object = isinstance(word_json, dict)
    raw_outputs = []

    for attempt in range(RETRIES):
        user_content = prompt_variants[min(attempt, len(prompt_variants) - 1)]
        result = None
        raw_text = ""
        try:
            print(f"🛰️ Calling LLM (attempt {attempt+1}/{RETRIES}) with model {model_name}...")
            response = client.chat.completions.create(
                model=model_name,
                messages=[
                    {"role": "system", "content": system_msg},
                    {"role": "user", "content": user_content},
                ],
                max_tokens=4096,
                temperature=0.0,
            )
            raw_text = _response_text(response)
            raw_outputs.append(raw_text)
            result = _parse_model_json(raw_text, require_object)
        except Exception as e:
            # Boundary catch: network/API errors must not abort the retries.
            print(f"⚠️ Exception while calling model: {e}")

        if result is not None:
            parsed, was_extracted = result
            try:
                _write_json_output(output_file, parsed)
            except Exception as e:
                # Another model call cannot fix an output I/O problem, so
                # stop retrying and fall through to the fallback path.
                print(f"⚠️ Failed to write updated JSON to output: {e}")
                break
            if was_extracted:
                # save raw for debugging too; kept separate from the output
                # write so a sidecar failure cannot trigger a retry that
                # would write the output a second time.
                try:
                    with open(_sidecar_path(output_file, ".model_raw.txt"), "w", encoding="utf-8") as rf:
                        rf.write(raw_text)
                except Exception as e:
                    print(f"⚠️ Failed to save raw model output: {e}")
            return parsed

        # Wait between attempts only; sleeping after the last one is wasted time.
        if attempt < RETRIES - 1:
            time.sleep(RETRY_DELAY)

    # If we've reached here, all attempts failed
    print("❗ All LLM attempts failed to produce valid JSON. Saving diagnostics and returning original JSON (no crash).")

    # Save raw outputs next to output_file for debugging
    raw_path = None
    try:
        raw_path = _sidecar_path(output_file, ".model_raw.txt")
        with open(raw_path, "w", encoding="utf-8") as rf:
            rf.write("=== RAW MODEL OUTPUTS (attempts) ===\n\n")
            for i, out in enumerate(raw_outputs):
                rf.write(f"--- ATTEMPT {i+1} ---\n")
                rf.write(out + "\n\n")
            rf.write("\n=== END ===\n\n")
            rf.write("\n\n=== PDF TEXT USED ===\n\n")
            rf.write(pdf_txt or "")
        print(f"ℹ️ Raw model outputs and pdf text saved to: {raw_path}")
    except Exception as e:
        print(f"⚠️ Failed to save raw model output: {e}")

    # Also create a salvage file for manual inspection
    try:
        salvage_path = _sidecar_path(output_file, ".salvage.json")
        salvage_bundle = {
            "original_word_json": word_json,
            "pdf_text_sample": (pdf_txt[:2000] + "...") if pdf_txt else "",
            "raw_outputs_path": raw_path,
        }
        with open(salvage_path, "w", encoding="utf-8") as sf:
            json.dump(salvage_bundle, sf, indent=2, ensure_ascii=False)
        print(f"ℹ️ Salvage bundle saved to: {salvage_path}")
    except Exception as e:
        print(f"⚠️ Failed to save salvage bundle: {e}")

    # Write original JSON to output to avoid failing the calling process
    try:
        _write_json_output(output_file, word_json)
        print("✅ Original JSON template written to output (no updates applied).")
    except Exception as e:
        print(f"⚠️ Failed to write original JSON to output: {e}")

    return None
252
 
253
if __name__ == "__main__":
    # Command-line entry point:
    #   python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>
    if len(sys.argv) != 4:
        print("Usage: python update_docx_with_pdf.py <word_json_file> <pdf_txt_file> <output_json_file>")
        # NOTE(review): exit status 0 on a usage error looks like part of this
        # script's "never fail the calling pipeline" policy — confirm.
        sys.exit(0)

    try:
        update_json_with_pdf(sys.argv[1], sys.argv[2], sys.argv[3])
    except Exception as e:
        # This top-level catch ensures the script exits successfully while logging the issue.
        print(f"Unexpected exception in update_docx_with_pdf.py: {e}")
        # Attempt to copy original json to output before exiting
        try:
            with open(sys.argv[1], "r", encoding="utf-8") as inf, open(sys.argv[3], "w", encoding="utf-8") as outf:
                outf.write(inf.read())
            print("Wrote original input JSON to output due to exception.")
        except Exception as copy_err:
            # Still best-effort, but say so: a silent failure here leaves the
            # pipeline with a missing or stale output and no hint why.
            print(f"Could not copy original input JSON to output: {copy_err}")
        # exit with status 0 so calling process doesn't crash (preserve pipeline behavior)
        sys.exit(0)