jadenngraham amzand committed on
Commit
013d153
·
verified ·
1 Parent(s): 02660e3

Upload OpenAI_tools.py (#5)

Browse files

- Upload OpenAI_tools.py (ccc38587e1470803cd5e6c2b857f00ea4e616fdd)


Co-authored-by: Aria Zand <amzand@users.noreply.huggingface.co>

Files changed (1) hide show
  1. OpenAI_tools.py +341 -0
OpenAI_tools.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import time
4
+ import base64
5
+ import tempfile
6
+ import random
7
+ import requests
8
+ import pandas as pd
9
+ from PyPDF2 import PdfReader
10
+ from docx import Document
11
+ from bs4 import BeautifulSoup
12
+ from selenium import webdriver
13
+ from selenium.webdriver.edge.options import Options
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.webdriver.edge.service import Service as EdgeService
16
+ from selenium.webdriver.support.ui import WebDriverWait
17
+ from selenium.webdriver.support import expected_conditions as EC
18
+ from urllib.parse import urlparse
19
+ from API_tools import full_agency_search, map_search_term
20
+
21
from openai import OpenAI

# SECURITY: never commit API keys in source. The key that previously lived
# here was published in a public commit and must be revoked/rotated.
# Set OPENAI_API_KEY in the environment before running.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
23
+
24
def extract_docx_text(binary_data):
    """Extract plain text from the raw bytes of a .docx file.

    The bytes are spilled to a temporary file because python-docx wants a
    path, every paragraph is read, and the temp file is always deleted —
    even when parsing raises.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as handle:
        handle.write(binary_data)
        docx_path = handle.name
    try:
        document = Document(docx_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    finally:
        os.remove(docx_path)
33
+
34
def clean_html(html_content):
    """Strip all markup from an HTML string, returning newline-separated text."""
    parsed = BeautifulSoup(html_content, "html.parser")
    return parsed.get_text(separator="\n")
37
+
38
def extract_with_selenium(url):
    """Render *url* in headless Edge and return up to 5000 chars of page text.

    Last-resort fallback when a plain HTTP request returns little or no
    content. Returns "" on any failure instead of raising.
    """
    driver = None
    try:
        print(f"🌐 Falling back to Selenium for: {url}")
        options = Options()
        options.add_argument("--headless=new")
        driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
        driver.get(url)
        print("⏳ Waiting in browser for full PDF to load...")
        WebDriverWait(driver, 6).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        return clean_html(driver.page_source)[:5000]
    except Exception as e:
        print(f"⚠️ Selenium fallback failed: {e}")
        return ""
    finally:
        # Bug fix: the original called driver.quit() only on the success
        # path, leaking an Edge process whenever get() or the wait raised.
        if driver is not None:
            driver.quit()
53
+
54
+
55
def download_pdf_with_selenium(url):
    """Download a PDF by rendering it in headless Edge and pulling bytes via CDP.

    Returns the path of a temporary .pdf file on success, or None on any
    failure. The caller is responsible for deleting the returned file.
    """
    print("🚀 Launching browser to fetch PDF directly...")

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
    try:
        # Bug fix: originally driver.get()/sleep ran before the try, so a
        # navigation failure skipped the finally and leaked the browser.
        driver.get(url)
        time.sleep(5)  # Let the PDF fully render

        print("📥 Attempting CDP-based download...")
        frame_id = driver.execute_cdp_cmd("Page.getFrameTree", {})["frameTree"]["frame"]["id"]
        result = driver.execute_cdp_cmd("Page.getResourceContent", {
            "frameId": frame_id,
            "url": url
        })
        content = base64.b64decode(result['content']) if result.get('base64Encoded') else result['content']
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(content)
            return tmp.name
    except Exception as e:
        print(f"❌ Failed to grab PDF via CDP: {e}")
        return None
    finally:
        driver.quit()
83
+
84
+
85
def download_snippet(url):
    """Download a document from *url* and return a text snippet (≤ 5000 chars).

    Strategy, in order:
      1. Plain HTTP GET with browser-like headers, parsed by Content-Type
         (PDF via PyPDF2, DOCX via python-docx, anything else as HTML).
      2. If that fails, fetch the PDF bytes inside a headless Edge session
         (in-page fetch() -> base64) and parse with PyPDF2.
      3. As a last resort, scrape the rendered page text with Selenium.

    Returns "" when every strategy fails.
    """
    # Send a same-host Referer; some agency sites reject bare requests.
    domain = urlparse(url).scheme + "://" + urlparse(url).netloc + "/"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": domain
    }

    def try_read_pdf(path):
        # Concatenate page text, skipping near-empty / placeholder pages,
        # until we have > 1000 chars; cap the snippet at 5000 chars.
        reader = PdfReader(path)
        snippet = ""
        for page in reader.pages:
            text = page.extract_text() or ""
            if len(text.strip()) < 50 or "intentionally left blank" in text.lower():
                continue
            snippet += text
            if len(snippet) > 1000:
                break
        return snippet.strip()[:5000]

    try:
        response = requests.get(url, headers=headers, timeout=60)
        content_type = response.headers.get("Content-Type", "")
        # A tiny "PDF" response is almost always a bot-block page, not a document.
        if response.status_code != 200 or ("application/pdf" in content_type and len(response.content) < 500):
            raise Exception(f"Bypassing requests — status: {response.status_code}, type: {content_type}")

        if "application/pdf" in content_type or url.lower().endswith(".pdf"):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(response.content)
                temp_path = tmp.name

            try:
                snippet = try_read_pdf(temp_path)
                if len(snippet.strip()) < 300:
                    raise Exception("PyPDF2 text too short")
                print(f"📄 PyPDF2 snippet length: {len(snippet.strip())}")
                return snippet
            except Exception as e:
                print(f"⚠️ PyPDF2 failed: {e}")
                raise  # fall through to the Selenium fallback below
            finally:
                # Single cleanup point. The original also removed the file in
                # the except branch, so the remove ran twice on failure.
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type or url.lower().endswith(".docx"):
            try:
                return extract_docx_text(response.content)[:5000]
            except Exception as e:
                print(f"⚠️ DOCX extraction failed: {e}")
                return ""

        else:
            print(f"📝 Attempting HTML fallback for: {url}")
            cleaned = clean_html(response.text)
            if len(cleaned.strip()) < 200:
                return extract_with_selenium(url)
            return cleaned[:5000]

    except Exception as e:
        print(f"⚠️ Final fallback to Selenium for {url} due to: {e}")

        # Try in-browser fetch-based PDF capture
        try:
            options = Options()
            options.add_argument("--headless=new")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
            try:
                driver.get(url)
                print("⏳ Waiting in browser for full PDF to load...")
                # `time` is already imported at module level; the original
                # re-imported it here redundantly.
                time.sleep(10)

                base64_data = driver.execute_async_script("""
                    const url = window.location.href;
                    const done = arguments[0];
                    fetch(url)
                        .then(resp => resp.blob())
                        .then(blob => {
                            const reader = new FileReader();
                            reader.onloadend = () => {
                                const base64 = reader.result.split(',')[1];
                                done(base64);
                            };
                            reader.readAsDataURL(blob);
                        })
                        .catch(err => done(null));
                """)
            finally:
                # Bug fix: the original only quit on success, leaking the
                # browser when navigation or the async script raised.
                driver.quit()

            if base64_data:
                decoded = base64.b64decode(base64_data)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                    tmp.write(decoded)
                    temp_path = tmp.name

                try:
                    snippet = try_read_pdf(temp_path)
                    os.remove(temp_path)
                    return snippet
                except Exception as e:
                    print(f"⚠️ In-browser fetch + PyPDF2 failed: {e}")
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
        except Exception as e:
            print(f"⚠️ Full Selenium fallback failed: {e}")

    return extract_with_selenium(url)
199
+
200
def ask_openai(prompt, temperature=0):
    """Send *prompt* to GPT-4o and return the stripped reply text.

    Uses a fixed system message tailored for government-report extraction,
    and prints token usage plus an estimated request cost.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are an expert data extractor and document classifier for government reports."},
            {"role": "user", "content": prompt}
        ]
    )
    usage = response.usage  # contains prompt_tokens and completion_tokens
    print(f"🧮 Tokens used: {usage.total_tokens}")

    # GPT-4o pricing per 1K tokens (as of mid-2024): $0.005 input, $0.015 output.
    # Bug fix: the original billed ALL tokens at the input rate, which
    # underestimates cost by up to 3x on completion-heavy calls.
    cost = (usage.prompt_tokens / 1000) * 0.005 + (usage.completion_tokens / 1000) * 0.015
    print(f"💰 Estimated cost: ${cost:.4f}")

    return response.choices[0].message.content.strip()
218
+
219
def classify_report(url, text):
    """Classify a government-document snippet via GPT-4o.

    *url* is accepted for interface compatibility but is not referenced in
    the prompt; only *text* drives the classification. Returns the model's
    raw multi-line answer ("Report Type: ...", "Agency: ...", "Year: ...",
    "Report Title: ...").
    """
    classification_prompt = f"""Given the following text snippet from a government document, classify it as one of the following report types:
- Congressional Justification (CJ)
- Agency Financial Report (AFR)
- Performance and Accountability Report (PAR)
- Congressional Research Service Report (CRS)

Text:
{text}

Return:
Report Type: <type>
Agency: <agency>
Year: <year>
Report Title: <title>"""
    return ask_openai(classification_prompt)
235
+
236
def save_csv(rows, filename):
    """Write classification result rows to *filename* as UTF-8 CSV.

    *rows* is a list of dicts keyed by the fixed column names below; a
    header row is written first.
    """
    with open(filename, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "Agency", "Year", "Report Type", "Report Name",
            "Report Hosting Web Page", "Report PDF Link"
        ])
        writer.writeheader()
        writer.writerows(rows)
    # Bug fix: the original f-string printed the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"✅ Saved results to {filename}")
245
+
246
def run_report_classifier(
    agency_df,
    search_term="budget",
    fiscal_year=2024,
    start_index=0,
    end_index=0,
    max_results=10,
    output_filename="pia_full_project_results.csv",
    brave_api_key=None,
    google_api_key=None,
    google_cse_id=None
):
    """Search, download, and classify reports for a slice of agencies.

    For each agency row in agency_df[start_index..end_index] (inclusive),
    runs full_agency_search, downloads a snippet of each hit, classifies it
    with GPT-4o, and writes the parsed results to *output_filename* as CSV.

    agency_df must have 'agency_name' and 'agency_url' columns. Returns
    None; results are persisted via save_csv.
    """
    output_rows = []

    if start_index > end_index or end_index >= len(agency_df):
        print(f"⚠️ Invalid index range: {start_index}–{end_index} (max index = {len(agency_df)-1})")
        return

    for i in range(start_index, end_index + 1):
        agency_name = agency_df.iloc[i]['agency_name']
        agency_url = agency_df.iloc[i]['agency_url']

        print(f"\n🔍 Starting search for {agency_name} ({agency_url})")

        try:
            mapped_search_term = map_search_term(search_term)
            found_links = full_agency_search(
                agency_name=agency_name,
                agency_url=agency_url,
                search_term=mapped_search_term,
                fiscal_year=fiscal_year,
                max_results=max_results,
                brave_api_key=brave_api_key,
                google_api_key=google_api_key,
                google_cse_id=google_cse_id,
                use_google=False  # Brave-only mode
            )
        except Exception as e:
            print(f"⚠️ Error searching {agency_name}: {e}")
            continue

        for idx, doc in enumerate(found_links, 1):
            url = doc.get("Link")
            if not url:
                continue
            print(f"\n📄 Analyzing document {idx} of {len(found_links)}: {url}")
            snippet = download_snippet(url)
            if snippet:
                try:
                    # Using OpenAI API for classification:
                    classification = classify_report(url, snippet)
                    print(f"✅ Classification:\n{classification}\n")
                    # Bug fix: split(":", 1) instead of split(":") — report
                    # titles often contain colons, and taking [1] of an
                    # unlimited split silently truncated those values.
                    parsed = {}
                    for line in classification.split("\n"):
                        if ":" in line:
                            key, value = line.split(":", 1)
                            parsed[key.strip()] = value.strip()
                    output_rows.append({
                        "Agency": parsed.get("Agency", agency_name),
                        "Year": parsed.get("Year", ""),
                        "Report Type": parsed.get("Report Type", ""),
                        "Report Name": parsed.get("Report Title", ""),
                        "Report Hosting Web Page": agency_url,
                        "Report PDF Link": url
                    })
                    # To avoid burning OpenAI tokens while editing the search
                    # APIs, comment out the classify_report block above and
                    # append a stub row instead:
                    # output_rows.append({
                    #     "Agency": agency_name, "Year": "", "Report Type": "",
                    #     "Report Name": "", "Report Hosting Web Page": agency_url,
                    #     "Report PDF Link": url,
                    # })
                except Exception as e:
                    print(f"⚠️ Classification error: {e}")
            # Jittered delay between documents to stay polite to the hosts.
            time.sleep(random.uniform(0.3, 1.0))

    save_csv(output_rows, output_filename)
324
+
325
+ '''
326
+ # If testing locally, uncomment the following lines:
327
+ if __name__ == "__main__":
328
+ df = pd.read_csv("agency_directory.csv")
329
+ run_report_classifier(
330
+ agency_df=df,
331
+ search_term="budget",
332
+ fiscal_year=2024,
333
+ start_index=3,
334
+ end_index=3,
335
+ max_results=15,
336
+ output_filename="pia_full_project_results.csv",
337
+ brave_api_key="BSAnrtOGAioqFKfAPoKPl1tjiNZMyLW",
338
+ google_api_key="AIzaSyBf8FTeYbZWclDiDnf4eFudlWPQAhOybVY",
339
+ google_cse_id="f3d82263565884717"
340
+ )
341
+ '''