# Hugging Face Space page residue (status banner: "Spaces: Sleeping") — kept as a comment.
import os

# === CRITICAL: cache directories must be configured BEFORE any other imports ===
# Hugging Face libraries read these variables at import time, so they are set
# first to redirect model/dataset caches onto the writable /tmp filesystem.
for _cache_var, _cache_path in {
    'HF_HOME': '/tmp/huggingface_cache',
    'TRANSFORMERS_CACHE': '/tmp/transformers_cache',
    'HF_DATASETS_CACHE': '/tmp/datasets_cache',
}.items():
    os.environ[_cache_var] = _cache_path
# Now import everything else
import json
import datetime

import requests
import gspread
from dotenv import load_dotenv
from huggingface_hub import login as hf_login
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings
from sentence_transformers import SentenceTransformer
from langchain_tavily import TavilySearch
from google.adk.tools import FunctionTool

# === LOAD ENV ===
load_dotenv()

HF_TOKEN = os.getenv("HUGGINGFACE_TOKEN")
GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")
SERVICE_ACCOUNT_JSON = os.getenv("GOOGLE_SERVICE_ACCOUNT_JSON")
SHEET_KEY = os.getenv("SHEET_KEY")
PREDICTOR_API_URL = os.getenv("PREDICTOR_API_URL")
PREDICTOR_API_KEY = os.getenv("PREDICTOR_API_KEY")

# Only authenticate when a token is actually configured: hf_login(token=None)
# falls back to an interactive prompt, which hangs in a headless deployment.
if HF_TOKEN:
    hf_login(token=HF_TOKEN)
# === GOOGLE SHEET LOGGING ===
# The service-account credentials may arrive either as a raw JSON string (from
# the env var) or as an already-parsed mapping; normalize to a dict once here.
if isinstance(SERVICE_ACCOUNT_JSON, str):
    service_account_dict = json.loads(SERVICE_ACCOUNT_JSON)
else:
    service_account_dict = SERVICE_ACCOUNT_JSON

def add_query_to_sheet(user_id: str, query: str, response: str):
    """Append one (user_id, timestamp, query, response) row to the log sheet."""
    client = gspread.service_account_from_dict(service_account_dict)
    spreadsheet = client.open_by_key(SHEET_KEY)
    worksheet = spreadsheet.worksheet("Sheet1")
    stamp = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    worksheet.append_row([user_id, stamp, query, response])
# === VECTOR STORE ===
def load_vector_store(data_dir: str):
    """Build an in-memory FAISS index from every .md file in *data_dir*.

    Files are decoded as UTF-8 first, falling back to latin-1 for files
    that are not valid UTF-8 (latin-1 accepts any byte sequence).
    """
    texts = []
    for fname in os.listdir(data_dir):
        if not fname.lower().endswith(".md"):
            continue
        path = os.path.join(data_dir, fname)
        try:
            with open(path, "r", encoding="utf-8") as f:
                content = f.read()
        except UnicodeDecodeError:
            with open(path, "r", encoding="latin-1") as f:
                content = f.read()
        texts.append(content)

    # Sentence-transformer model wrapped in the minimal LangChain Embeddings
    # interface that FAISS.from_texts expects.
    st_model = SentenceTransformer("all-MiniLM-L6-v2")

    class LocalEmbeddings(Embeddings):
        def embed_documents(self, docs):
            return st_model.encode(docs).tolist()

        def embed_query(self, q):
            return st_model.encode([q])[0].tolist()

    return FAISS.from_texts(texts, LocalEmbeddings())

vector_store = load_vector_store("College_markdown")
# === TOOL DEFINITIONS ===
def db_search(query: str) -> dict:
    """Return up to 6 locally-indexed document chunks relevant to *query*."""
    matches = vector_store.similarity_search(query, k=6)
    # An empty match list naturally yields {"results": []}.
    return {"results": [doc.page_content for doc in matches]}
def tavily_search(query: str) -> dict:
    """Run a Tavily web search for *query* and return up to 6 result snippets."""
    searcher = TavilySearch(max_results=6, topic="general", include_raw_content=True)
    payload = searcher.invoke({"query": query})
    snippets = []
    for item in payload.get('results', []):
        snippets.append(item.get('content'))
    return {"results": snippets or []}
def college_predictor(
    userCrl: int,
    userCategory: str,
    userGender: str,
    userHomeState: str,
    limit: int = 6,
    counsellingName: str = "josaa",
    collegeName: str = "national institute of technology",
    branchName: str = "computer science and engineering"
) -> str:
    """Query the external predictor API and format college predictions.

    Args:
        userCrl: Candidate's common rank list (CRL) rank.
        userCategory: Reservation category string as the API expects it.
        userGender: Candidate gender string as the API expects it.
        userHomeState: Home state, used by the API for state-quota matching.
        limit: Maximum number of colleges to include in the output.
        counsellingName: Counselling identifier (default "josaa").
        branchName: Optional branch filter (sent as "branchQuery").
        collegeName: Optional college filter (sent as "collegeQuery").

    Returns:
        A numbered, human-readable prediction list, or an error/empty
        message string — this tool never raises to the caller.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PREDICTOR_API_KEY}"
    }
    payload = {
        "userCrl": userCrl,
        "userCategory": userCategory,
        "userGender": userGender,
        "userHomeState": userHomeState,
        "limit": limit,
        "counsellingName": counsellingName,
    }
    if collegeName:
        payload["collegeQuery"] = collegeName
    if branchName:
        payload["branchQuery"] = branchName
    try:
        response = requests.post(PREDICTOR_API_URL, json=payload, headers=headers, timeout=30)
        response.raise_for_status()
        # .json() raises ValueError/JSONDecodeError on a malformed body; the
        # original only caught RequestException, so a broken upstream response
        # crashed the tool. Catch both here.
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        return f"Error fetching college predictions: {str(e)}"

    if not data or 'data' not in data or 'colleges' not in data['data']:
        return "No college predictions found with the given criteria."
    colleges = data['data']['colleges']
    if not colleges:
        return "No college predictions found with the given criteria."

    results = []
    for i, college in enumerate(colleges[:limit], start=1):
        parts = [f"{i}. College: {college.get('Institute', 'N/A')}"]
        if college.get('Academic_Program_Name'):
            parts.append(f"Branch: {college['Academic_Program_Name']}")
        if college.get('Seat_Type'):
            parts.append(f"Category: {college['Seat_Type']}")
        if college.get('Max_ClosingRank'):
            parts.append(f"Closing Rank: {college['Max_ClosingRank']}")
        results.append(", ".join(parts))
    return f"Based on your rank {userCrl}, here are college predictions:\n\n" + "\n".join(results)
def mentor_search(college_query: str) -> str:
    """Search mentors by college name and return formatted links.

    Args:
        college_query: Free-text college name to search mentors for.

    Returns:
        Newline-separated "name: profile_url" lines, or an error/empty
        message string — this tool never raises to the caller.
    """
    url = "https://test.api.precollege.in/api/v1/mentor/search"
    try:
        # Pass the query via params= so requests percent-encodes it; the old
        # f-string interpolation produced a malformed URL whenever the college
        # name contained spaces, '&', or '#'.
        response = requests.get(url, params={"q": college_query}, timeout=10)
        response.raise_for_status()
        # ValueError covers a malformed (non-JSON) response body as well.
        data = response.json()
    except (requests.exceptions.RequestException, ValueError) as e:
        return f"Failed to fetch mentors: {str(e)}"

    if not data or "data" not in data or not data["data"]:
        return f"No mentors found for '{college_query}'."

    lines = []
    for mentor in data["data"]:
        name = mentor.get("name", "Unknown")
        username = mentor.get("username", "")
        profile_url = f"https://www.precollege.in/mentor/{username}" if username else "No profile link"
        lines.append(f"{name}: {profile_url}")
    return f"Mentors for '{college_query}':\n\n" + "\n".join(lines)
# === FUNCTION TOOL WRAPPERS ===
# Adapt the plain Python functions above into ADK FunctionTool objects so the
# agent runtime can expose them as callable tools.
mentor_tool = FunctionTool(mentor_search)
predictor_tool = FunctionTool(college_predictor)
tavily_tool = FunctionTool(tavily_search)
db_tool = FunctionTool(db_search)