jadenngraham amzand committed on
Commit
013d153
·
verified ·
1 Parent(s): 02660e3

Upload OpenAI_tools.py (#5)

Browse files

- Upload OpenAI_tools.py (ccc38587e1470803cd5e6c2b857f00ea4e616fdd)


Co-authored-by: Aria Zand <amzand@users.noreply.huggingface.co>

Files changed (1) hide show
  1. OpenAI_tools.py +341 -0
OpenAI_tools.py ADDED
@@ -0,0 +1,341 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import csv
3
+ import time
4
+ import base64
5
+ import tempfile
6
+ import random
7
+ import requests
8
+ import pandas as pd
9
+ from PyPDF2 import PdfReader
10
+ from docx import Document
11
+ from bs4 import BeautifulSoup
12
+ from selenium import webdriver
13
+ from selenium.webdriver.edge.options import Options
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.webdriver.edge.service import Service as EdgeService
16
+ from selenium.webdriver.support.ui import WebDriverWait
17
+ from selenium.webdriver.support import expected_conditions as EC
18
+ from urllib.parse import urlparse
19
+ from API_tools import full_agency_search, map_search_term
20
+
21
from openai import OpenAI

# SECURITY: never commit API keys in source. The key that previously lived
# here was published in a public commit and must be revoked/rotated.
# Set OPENAI_API_KEY in the environment before running.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
23
+
24
def extract_docx_text(binary_data):
    """Extract plain text from the raw bytes of a .docx file.

    The bytes are spilled to a temporary file because python-docx wants a
    path, every paragraph is read, and the temp file is always deleted —
    even when parsing raises.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as handle:
        handle.write(binary_data)
        docx_path = handle.name
    try:
        document = Document(docx_path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    finally:
        os.remove(docx_path)
33
+
34
def clean_html(html_content):
    """Strip all markup from an HTML string, returning newline-separated text."""
    parsed = BeautifulSoup(html_content, "html.parser")
    return parsed.get_text(separator="\n")
37
+
38
def extract_with_selenium(url):
    """Render *url* in headless Edge and return up to 5000 chars of page text.

    Last-resort fallback when a plain HTTP request returns little or no
    content. Returns "" on any failure instead of raising.
    """
    driver = None
    try:
        print(f"🌐 Falling back to Selenium for: {url}")
        options = Options()
        options.add_argument("--headless=new")
        driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
        driver.get(url)
        print("⏳ Waiting in browser for full PDF to load...")
        WebDriverWait(driver, 6).until(
            lambda d: d.execute_script("return document.readyState") == "complete"
        )
        return clean_html(driver.page_source)[:5000]
    except Exception as e:
        print(f"⚠️ Selenium fallback failed: {e}")
        return ""
    finally:
        # Bug fix: the original called driver.quit() only on the success
        # path, leaking an Edge process whenever get() or the wait raised.
        if driver is not None:
            driver.quit()
53
+
54
+
55
def download_pdf_with_selenium(url):
    """Download a PDF by rendering it in headless Edge and pulling bytes via CDP.

    Returns the path of a temporary .pdf file on success, or None on any
    failure. The caller is responsible for deleting the returned file.
    """
    print("🚀 Launching browser to fetch PDF directly...")

    options = Options()
    options.add_argument("--headless=new")
    options.add_argument("--disable-blink-features=AutomationControlled")
    options.add_argument("--disable-gpu")
    options.add_argument("--disable-dev-shm-usage")

    driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
    try:
        # Bug fix: originally driver.get()/sleep ran before the try, so a
        # navigation failure skipped the finally and leaked the browser.
        driver.get(url)
        time.sleep(5)  # Let the PDF fully render

        print("📥 Attempting CDP-based download...")
        frame_id = driver.execute_cdp_cmd("Page.getFrameTree", {})["frameTree"]["frame"]["id"]
        result = driver.execute_cdp_cmd("Page.getResourceContent", {
            "frameId": frame_id,
            "url": url
        })
        content = base64.b64decode(result['content']) if result.get('base64Encoded') else result['content']
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
            tmp.write(content)
            return tmp.name
    except Exception as e:
        print(f"❌ Failed to grab PDF via CDP: {e}")
        return None
    finally:
        driver.quit()
83
+
84
+
85
def download_snippet(url):
    """Download a document from *url* and return a text snippet (≤ 5000 chars).

    Strategy, in order:
      1. Plain HTTP GET with browser-like headers, parsed by Content-Type
         (PDF via PyPDF2, DOCX via python-docx, anything else as HTML).
      2. If that fails, fetch the PDF bytes inside a headless Edge session
         (in-page fetch() -> base64) and parse with PyPDF2.
      3. As a last resort, scrape the rendered page text with Selenium.

    Returns "" when every strategy fails.
    """
    # Send a same-host Referer; some agency sites reject bare requests.
    domain = urlparse(url).scheme + "://" + urlparse(url).netloc + "/"

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
        "Accept": "application/pdf,application/vnd.openxmlformats-officedocument.wordprocessingml.document,text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Referer": domain
    }

    def try_read_pdf(path):
        # Concatenate page text, skipping near-empty / placeholder pages,
        # until we have > 1000 chars; cap the snippet at 5000 chars.
        reader = PdfReader(path)
        snippet = ""
        for page in reader.pages:
            text = page.extract_text() or ""
            if len(text.strip()) < 50 or "intentionally left blank" in text.lower():
                continue
            snippet += text
            if len(snippet) > 1000:
                break
        return snippet.strip()[:5000]

    try:
        response = requests.get(url, headers=headers, timeout=60)
        content_type = response.headers.get("Content-Type", "")
        # A tiny "PDF" response is almost always a bot-block page, not a document.
        if response.status_code != 200 or ("application/pdf" in content_type and len(response.content) < 500):
            raise Exception(f"Bypassing requests — status: {response.status_code}, type: {content_type}")

        if "application/pdf" in content_type or url.lower().endswith(".pdf"):
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                tmp.write(response.content)
                temp_path = tmp.name

            try:
                snippet = try_read_pdf(temp_path)
                if len(snippet.strip()) < 300:
                    raise Exception("PyPDF2 text too short")
                print(f"📄 PyPDF2 snippet length: {len(snippet.strip())}")
                return snippet
            except Exception as e:
                print(f"⚠️ PyPDF2 failed: {e}")
                raise  # fall through to the Selenium fallback below
            finally:
                # Single cleanup point. The original also removed the file in
                # the except branch, so the remove ran twice on failure.
                if os.path.exists(temp_path):
                    os.remove(temp_path)

        elif "application/vnd.openxmlformats-officedocument.wordprocessingml.document" in content_type or url.lower().endswith(".docx"):
            try:
                return extract_docx_text(response.content)[:5000]
            except Exception as e:
                print(f"⚠️ DOCX extraction failed: {e}")
                return ""

        else:
            print(f"📝 Attempting HTML fallback for: {url}")
            cleaned = clean_html(response.text)
            if len(cleaned.strip()) < 200:
                return extract_with_selenium(url)
            return cleaned[:5000]

    except Exception as e:
        print(f"⚠️ Final fallback to Selenium for {url} due to: {e}")

        # Try in-browser fetch-based PDF capture
        try:
            options = Options()
            options.add_argument("--headless=new")
            options.add_argument("--disable-blink-features=AutomationControlled")
            options.add_argument("--disable-gpu")
            options.add_argument("--disable-dev-shm-usage")
            driver = webdriver.Edge(service=EdgeService("C:/msedgedriver.exe"), options=options)
            try:
                driver.get(url)
                print("⏳ Waiting in browser for full PDF to load...")
                # `time` is already imported at module level; the original
                # re-imported it here redundantly.
                time.sleep(10)

                base64_data = driver.execute_async_script("""
                    const url = window.location.href;
                    const done = arguments[0];
                    fetch(url)
                        .then(resp => resp.blob())
                        .then(blob => {
                            const reader = new FileReader();
                            reader.onloadend = () => {
                                const base64 = reader.result.split(',')[1];
                                done(base64);
                            };
                            reader.readAsDataURL(blob);
                        })
                        .catch(err => done(null));
                """)
            finally:
                # Bug fix: the original only quit on success, leaking the
                # browser when navigation or the async script raised.
                driver.quit()

            if base64_data:
                decoded = base64.b64decode(base64_data)
                with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
                    tmp.write(decoded)
                    temp_path = tmp.name

                try:
                    snippet = try_read_pdf(temp_path)
                    os.remove(temp_path)
                    return snippet
                except Exception as e:
                    print(f"⚠️ In-browser fetch + PyPDF2 failed: {e}")
                    if os.path.exists(temp_path):
                        os.remove(temp_path)
        except Exception as e:
            print(f"⚠️ Full Selenium fallback failed: {e}")

    return extract_with_selenium(url)
199
+
200
def ask_openai(prompt, temperature=0):
    """Send *prompt* to GPT-4o and return the stripped reply text.

    Uses a fixed system message tailored for government-report extraction,
    and prints token usage plus an estimated request cost.
    """
    response = client.chat.completions.create(
        model="gpt-4o",
        temperature=temperature,
        messages=[
            {"role": "system", "content": "You are an expert data extractor and document classifier for government reports."},
            {"role": "user", "content": prompt}
        ]
    )
    usage = response.usage  # contains prompt_tokens and completion_tokens
    print(f"🧮 Tokens used: {usage.total_tokens}")

    # GPT-4o pricing per 1K tokens (as of mid-2024): $0.005 input, $0.015 output.
    # Bug fix: the original billed ALL tokens at the input rate, which
    # underestimates cost by up to 3x on completion-heavy calls.
    cost = (usage.prompt_tokens / 1000) * 0.005 + (usage.completion_tokens / 1000) * 0.015
    print(f"💰 Estimated cost: ${cost:.4f}")

    return response.choices[0].message.content.strip()
218
+
219
def classify_report(url, text):
    """Classify a government-document snippet via GPT-4o.

    *url* is accepted for interface compatibility but is not referenced in
    the prompt; only *text* drives the classification. Returns the model's
    raw multi-line answer ("Report Type: ...", "Agency: ...", "Year: ...",
    "Report Title: ...").
    """
    classification_prompt = f"""Given the following text snippet from a government document, classify it as one of the following report types:
- Congressional Justification (CJ)
- Agency Financial Report (AFR)
- Performance and Accountability Report (PAR)
- Congressional Research Service Report (CRS)

Text:
{text}

Return:
Report Type: <type>
Agency: <agency>
Year: <year>
Report Title: <title>"""
    return ask_openai(classification_prompt)
235
+
236
def save_csv(rows, filename):
    """Write classification result rows to *filename* as UTF-8 CSV.

    *rows* is a list of dicts keyed by the fixed column names below; a
    header row is written first.
    """
    with open(filename, "w", newline='', encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=[
            "Agency", "Year", "Report Type", "Report Name",
            "Report Hosting Web Page", "Report PDF Link"
        ])
        writer.writeheader()
        writer.writerows(rows)
    # Bug fix: the original f-string printed the literal text "(unknown)"
    # instead of interpolating the output path.
    print(f"✅ Saved results to {filename}")
245
+
246
def run_report_classifier(
    agency_df,
    search_term="budget",
    fiscal_year=2024,
    start_index=0,
    end_index=0,
    max_results=10,
    output_filename="pia_full_project_results.csv",
    brave_api_key=None,
    google_api_key=None,
    google_cse_id=None
):
    """Search, download, and classify reports for a slice of agencies.

    For each agency row in agency_df[start_index..end_index] (inclusive),
    runs full_agency_search, downloads a snippet of each hit, classifies it
    with GPT-4o, and writes the parsed results to *output_filename* as CSV.

    agency_df must have 'agency_name' and 'agency_url' columns. Returns
    None; results are persisted via save_csv.
    """
    output_rows = []

    if start_index > end_index or end_index >= len(agency_df):
        print(f"⚠️ Invalid index range: {start_index}–{end_index} (max index = {len(agency_df)-1})")
        return

    for i in range(start_index, end_index + 1):
        agency_name = agency_df.iloc[i]['agency_name']
        agency_url = agency_df.iloc[i]['agency_url']

        print(f"\n🔍 Starting search for {agency_name} ({agency_url})")

        try:
            mapped_search_term = map_search_term(search_term)
            found_links = full_agency_search(
                agency_name=agency_name,
                agency_url=agency_url,
                search_term=mapped_search_term,
                fiscal_year=fiscal_year,
                max_results=max_results,
                brave_api_key=brave_api_key,
                google_api_key=google_api_key,
                google_cse_id=google_cse_id,
                use_google=False  # Brave-only mode
            )
        except Exception as e:
            print(f"⚠️ Error searching {agency_name}: {e}")
            continue

        for idx, doc in enumerate(found_links, 1):
            url = doc.get("Link")
            if not url:
                continue
            print(f"\n📄 Analyzing document {idx} of {len(found_links)}: {url}")
            snippet = download_snippet(url)
            if snippet:
                try:
                    # Using OpenAI API for classification:
                    classification = classify_report(url, snippet)
                    print(f"✅ Classification:\n{classification}\n")
                    # Bug fix: split(":", 1) instead of split(":") — report
                    # titles often contain colons, and taking [1] of an
                    # unlimited split silently truncated those values.
                    parsed = {}
                    for line in classification.split("\n"):
                        if ":" in line:
                            key, value = line.split(":", 1)
                            parsed[key.strip()] = value.strip()
                    output_rows.append({
                        "Agency": parsed.get("Agency", agency_name),
                        "Year": parsed.get("Year", ""),
                        "Report Type": parsed.get("Report Type", ""),
                        "Report Name": parsed.get("Report Title", ""),
                        "Report Hosting Web Page": agency_url,
                        "Report PDF Link": url
                    })
                    # To avoid burning OpenAI tokens while editing the search
                    # APIs, comment out the classify_report block above and
                    # append a stub row instead:
                    # output_rows.append({
                    #     "Agency": agency_name, "Year": "", "Report Type": "",
                    #     "Report Name": "", "Report Hosting Web Page": agency_url,
                    #     "Report PDF Link": url,
                    # })
                except Exception as e:
                    print(f"⚠️ Classification error: {e}")
            # Jittered delay between documents to stay polite to the hosts.
            time.sleep(random.uniform(0.3, 1.0))

    save_csv(output_rows, output_filename)
324
+
325
+ '''
326
+ # If testing locally, uncomment the following lines:
327
+ if __name__ == "__main__":
328
+ df = pd.read_csv("agency_directory.csv")
329
+ run_report_classifier(
330
+ agency_df=df,
331
+ search_term="budget",
332
+ fiscal_year=2024,
333
+ start_index=3,
334
+ end_index=3,
335
+ max_results=15,
336
+ output_filename="pia_full_project_results.csv",
337
+ brave_api_key="BSAnrtOGAioqFKfAPoKPl1tjiNZMyLW",
338
+ google_api_key="AIzaSyBf8FTeYbZWclDiDnf4eFudlWPQAhOybVY",
339
+ google_cse_id="f3d82263565884717"
340
+ )
341
+ '''