Spaces:

pia-capstone
/

pia-ui

No application file

App Files Files Community

amzand committed on May 2

Commit

83251ae

verified ·

1 Parent(s): 98a25af

Upload OpenAI_interface.py

Browse files

Files changed (1) hide show

OpenAI_interface.py +180 -0

OpenAI_interface.py ADDED Viewed

	@@ -0,0 +1,180 @@

+from openai import OpenAI
+import pandas as pd
+import os
+import re
+import tiktoken
+import numpy as np
+from sklearn.metrics.pairwise import cosine_similarity
+from sentence_transformers import SentenceTransformer
+from OpenAI_tools import run_report_classifier
+# ──────────────────────────────────────────────────────────────
# 🔐 OpenAI setup
# SECURITY: the API key is read from the OPENAI_API_KEY environment variable
# instead of being embedded in the source. A key that has ever been committed
# to a repository must be considered leaked and should be revoked/rotated.
# If the variable is unset, os.getenv returns None and the client constructor
# is expected to reject the missing credential — confirm against the installed
# openai package version.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# ──────────────────────────────────────────────────────────────
# 📄 Load high-priority agency directory
# The CSV is resolved relative to the current working directory and must
# contain an "agency_name" column (read below).
AGENCY_CSV = "high_priority_agencies.csv"
df = pd.read_csv(AGENCY_CSV)
# ──────────────────────────────────────────────────────────────
# 🤖 Load embedding model and precompute agency embeddings
# Embeddings are computed once at import time so that each lookup only has to
# encode the query string; row i of agency_embeddings corresponds to row i of df.
model = SentenceTransformer("all-MiniLM-L6-v2")
agency_names = df["agency_name"].tolist()
agency_embeddings = model.encode(agency_names)
+# ──────────────────────────────────────────────────────────────
+# 🧠 Cosine similarity matcher for agency name
def resolve_agency_index(agency_name, threshold=0.7, top_k=3):
    """Map a free-text agency name to the closest row of the agency directory.

    The name is embedded with the module-level sentence-transformer model and
    compared against the precomputed agency embeddings by cosine similarity.

    Args:
        agency_name: Agency name to look up (typically extracted from the LLM
            reply).
        threshold: Minimum cosine similarity the best candidate must reach to
            be accepted. Defaults to 0.7.
        top_k: Number of best candidates printed for diagnostics. Values below
            1 are treated as 1. Defaults to 3.

    Returns:
        A ``(index, name)`` tuple where ``index`` is the positional row index
        into ``df`` (a plain ``int``) and ``name`` is the matched
        ``agency_name`` value, or ``(None, None)`` when the input is blank,
        the directory is empty, or no candidate reaches ``threshold``.
    """
    # Guard: embedding a blank/None name is meaningless and would only yield
    # an arbitrary low-confidence candidate (or an error for non-strings).
    if not isinstance(agency_name, str) or not agency_name.strip():
        print("❌ No agency name provided; cannot resolve agency.")
        return None, None

    # Guard: similarity against an empty directory cannot produce a match.
    if not agency_names:
        print("❌ Agency directory is empty; cannot resolve agency.")
        return None, None

    input_vec = model.encode([agency_name])
    sims = cosine_similarity(input_vec, agency_embeddings)[0]

    # argsort is ascending: take the last top_k entries and reverse them so
    # the best candidate comes first. A non-positive top_k would otherwise
    # slice the whole array ([-0:]), so clamp it to at least one candidate.
    top_k = max(1, top_k)
    top_indices = sims.argsort()[-top_k:][::-1]

    print("🔍 Top cosine matches:")
    for idx in top_indices:
        print(f" • {df.iloc[idx]['agency_name']} (score: {sims[idx]:.2f})")

    # Convert to a built-in int so callers do not receive a numpy scalar.
    best_idx = int(top_indices[0])
    best_score = sims[best_idx]
    best_name = df.iloc[best_idx]["agency_name"]

    if best_score >= threshold:
        print(f"🧠 Cosine match for '{agency_name}' ➝ '{best_name}' (score={best_score:.2f})")
        return best_idx, best_name

    print(f"❌ No confident match found for agency: '{agency_name}' (score={best_score:.2f})")
    return None, None
+# ──────────────────────────────────────────────────────────────
+# 📊 Token counting utility
def count_tokens(messages, model="gpt-3.5-turbo"):
    """Estimate how many tokens a list of chat messages will consume.

    The estimate charges a fixed overhead of 4 tokens per message and 2 tokens
    for the reply, plus the encoded length of every value in each message.

    Args:
        messages: Iterable of chat message dicts (for example
            ``{"role": ..., "content": ...}``). ``None`` or an empty
            collection is treated as "no messages".
        model: Model name used to select the tiktoken encoding. Unknown model
            names fall back to the ``cl100k_base`` encoding.

    Returns:
        The estimated token count as an ``int``.
    """
    num_tokens = 2  # reply overhead

    # Nothing to encode: skip loading the tokenizer altogether.
    if not messages:
        return num_tokens

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")

    for message in messages:
        num_tokens += 4  # message overhead
        for value in message.values():
            if value is None:
                # e.g. a message whose content is absent; contributes no text.
                continue
            if not isinstance(value, str):
                # encode() only accepts str; counting the string form of other
                # values is an approximation but avoids crashing the estimate.
                value = str(value)
            num_tokens += len(encoding.encode(value))
    return num_tokens
+# ──────────────────────────────────────────────────────────────
+# 📬 Query OpenAI for structured extraction or conversation
def ask_openai(prompt, chatbot_mode=False, model="gpt-3.5-turbo"):
    """Send a single-turn prompt to the chat completions API.

    Args:
        prompt: The user's text.
        chatbot_mode: When true, use a casual conversational system prompt;
            otherwise instruct the model to reply in the fixed
            ``Agency: / Keyword: / Year:`` extraction format.
        model: Chat model to query. The same name is used for the token
            estimate so the two cannot drift apart. Defaults to
            ``"gpt-3.5-turbo"``.

    Returns:
        The text of the first completion choice, or an empty string when the
        API returns no text content, so callers can always treat the result
        as a ``str``.
    """
    if chatbot_mode:
        system_prompt = (
            "You are a helpful assistant that responds casually and explains things clearly."
        )
    else:
        system_prompt = (
            "You are an extraction agent. Extract the following from the user’s prompt. "
            "Respond only in the format:\n"
            "Agency: [agency name]\nKeyword: [keyword]\nYear: [4-digit year or None]"
        )

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]

    # Pre-request estimate: covers the prompt messages only (not the reply)
    # and uses a fixed per-1K-token rate that does not vary with `model`.
    num_tokens = count_tokens(messages, model=model)
    cost = num_tokens / 1000 * 0.0015

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0.2,
    )
    print(f"🧮 Tokens used: {num_tokens}")
    print(f"💰 Estimated cost: ${cost:.4f}")

    # The message content is optional in the API response; normalise a
    # missing value to "" instead of handing None to string-handling callers.
    content = response.choices[0].message.content
    return content if content is not None else ""
+# ──────────────────────────────────────────────────────────────
+# 📤 Extract structured values from model response
def extract_fields(text):
    """Parse the ``Agency: / Keyword: / Year:`` reply produced by the LLM.

    Only lines of the form ``label: value`` are considered, and the field is
    chosen from the label (the part before the first colon), so a value that
    happens to contain the word "agency", "keyword" or "year" can no longer be
    mistaken for a different field. The text is lower-cased before parsing, so
    returned strings are lower-case.

    Args:
        text: Raw model reply. ``None`` or an empty string yields the defaults.

    Returns:
        A dict with keys ``"agency"`` (default ``"unknown"``), ``"keyword"``
        (default ``"budget"``) and ``"year"`` (``int`` or ``None``). A field
        whose value is empty keeps its default.
    """
    fields = {"agency": "unknown", "keyword": "budget", "year": None}
    if not text:
        return fields

    for line in text.lower().splitlines():
        label, sep, value = line.partition(":")
        if not sep:
            # No "label: value" structure — ignore free-form commentary.
            continue
        value = value.strip()

        if "agency" in label:
            if value:
                fields["agency"] = value
        elif "keyword" in label:
            if value:
                fields["keyword"] = value
        elif "year" in label:
            # Exactly four consecutive digits; the look-arounds reject longer
            # digit runs while still accepting forms such as "fy2023".
            match = re.search(r"(?<!\d)\d{4}(?!\d)", value)
            if match:
                fields["year"] = int(match.group())
    return fields
+# ──────────────────────────────────────────────────────────────
+# 🧾 Main CLI loop
def main():
    """Run the interactive command-line loop.

    In extraction/search mode (the default) each input is sent to the LLM, the
    reply is parsed into agency / keyword / year, the agency is resolved
    against the directory and a report search is launched for that single
    agency. In chatbot mode the input is simply answered conversationally.

    Search credentials are read from the ``BRAVE_API_KEY``, ``GOOGLE_API_KEY``
    and ``GOOGLE_CSE_ID`` environment variables rather than being embedded in
    the source; any credential that was previously committed should be
    revoked. The loop ends on ``exit`` / ``quit`` or on EOF / Ctrl-C.
    """
    print("🤖 OpenAI Agent Online. Ask about agency budgets or reports.")
    print("Say 'let's talk' to switch to chatbot mode.")
    print("Say 'let's search' to return to extraction/search mode.")
    print("Say 'exit' or 'quit' to finish.\n")

    # Keyword argument of run_report_classifier -> environment variable name.
    credential_env = {
        "brave_api_key": "BRAVE_API_KEY",
        "google_api_key": "GOOGLE_API_KEY",
        "google_cse_id": "GOOGLE_CSE_ID",
    }
    # Unset variables are passed through as empty strings; how the search
    # helper treats a missing credential is not visible here, so only warn.
    credentials = {arg: os.getenv(env, "") for arg, env in credential_env.items()}
    missing = [env for arg, env in credential_env.items() if not credentials[arg]]
    if missing:
        print(f"⚠️ Missing environment variables: {', '.join(missing)}")

    chatbot_mode = False
    while True:
        try:
            user_input = input("You > ").strip()
        except (EOFError, KeyboardInterrupt):
            # Closed stdin or Ctrl-C at the prompt: leave cleanly.
            print("\n👋 Goodbye!")
            break

        if not user_input:
            print("⚠️ Please enter a valid question.")
            continue

        lowered = user_input.lower()
        if lowered in ("exit", "quit"):
            print("👋 Goodbye!")
            break
        if lowered == "let's talk":
            chatbot_mode = True
            print("🗣️ Switched to chatbot mode.")
            continue
        if lowered == "let's search":
            chatbot_mode = False
            print("🔍 Switched to extraction/search mode.")
            continue

        # Top-level boundary: report any failure and keep the session alive.
        try:
            if chatbot_mode:
                response = ask_openai(user_input, chatbot_mode=True)
                print("\n💬 Chatbot Response:\n" + response + "\n")
                continue

            response = ask_openai(user_input, chatbot_mode=False)
            print("\n🧠 LLM Response:\n" + response + "\n")

            parsed = extract_fields(response)
            agency, keyword, year = parsed["agency"], parsed["keyword"], parsed["year"]
            print(f"🧾 Parsed → Agency: {agency} | Keyword: {keyword} | Year: {year}")

            index, resolved_agency = resolve_agency_index(agency)
            if index is None:
                print(f"⚠️ Could not resolve agency: {agency}")
                continue

            print(f"🚀 Launching search for '{resolved_agency}' (index {index}) with keyword '{keyword}' and FY {year}\n")
            # start_index == end_index restricts the run to the resolved agency.
            run_report_classifier(
                agency_df=df,
                search_term=keyword,
                fiscal_year=year if year else "",
                start_index=index,
                end_index=index,
                max_results=15,
                output_filename="openAI_bot_output.csv",
                **credentials,
            )
        except Exception as e:
            print(f"❌ Error: {e}")
# ──────────────────────────────────────────────────────────────
if __name__ == "__main__":
    main()