Spaces:
Sleeping
Sleeping
File size: 4,181 Bytes
b467f2f ade1780 b467f2f 6dd717f ade1780 1bdac65 ade1780 b467f2f 1bdac65 b467f2f 1bdac65 ade1780 1bdac65 ade1780 b467f2f 1bdac65 6dd717f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 ade1780 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 1c1b54f 1bdac65 f8b9b57 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 b467f2f 1bdac65 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 |
import json
import os
import re
from datetime import datetime
from typing import Optional

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from dotenv import load_dotenv
from groq import Groq
from sentence_transformers import SentenceTransformer, util
# Load environment variables
load_dotenv()

# Configuration
# Hub dataset repository that receives the unmatched user queries.
HF_DATASET_REPO = "midrees2806/unmatched_queries"
# Hub access token, read from the environment (Space secrets); None if unset.
HF_TOKEN = os.environ.get("HF_TOKEN")

# Initialize clients (module-level singletons shared by the functions below)
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
similarity_model = SentenceTransformer("paraphrase-MiniLM-L6-v2")
# --- Dataset Loading ---
# Read the question/answer pairs from dataset.json. Any I/O, decoding, parsing
# or validation failure is reported and leaves the module with an empty dataset.
try:
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open('dataset.json', 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    # The file must hold a list of {"input": ..., "response": ...} objects.
    if not isinstance(dataset, list) or not all(
        isinstance(item, dict) and 'input' in item and 'response' in item
        for item in dataset
    ):
        raise ValueError("Invalid dataset structure")
except (OSError, ValueError) as e:
    # OSError: missing/unreadable file. ValueError: bad UTF-8, malformed JSON
    # (json.JSONDecodeError is a ValueError subclass) or the structure check above.
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute embeddings
# Questions are normalised (lower-cased, stripped) before being embedded;
# dataset_answers keeps the responses at the same positions.
dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
dataset_answers = [item.get("response", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)
# --- Unmatched Queries Handler ---
def manage_unmatched_queries(query: str) -> None:
    """Record an unmatched user query in the Hugging Face dataset repo.

    The existing ``train`` split of ``HF_DATASET_REPO`` is loaded, the query is
    appended with a timestamp and ``Processed=False`` unless it is already
    present, and the result is pushed back to the Hub. This is best-effort:
    every failure is printed and swallowed so a logging problem never breaks
    the chat flow.

    Args:
        query: The user question that had no sufficiently similar dataset
            entry. Blank or ``None`` values are ignored.
    """
    query = (query or "").strip()
    if not query:
        # Nothing worth storing.
        return

    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Load existing dataset or create new
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        except Exception as load_err:
            # NOTE(review): this branch also runs on transient failures, and the
            # push below may then replace the stored data with a single row.
            # Confirm which exceptions mean "repo is empty/missing" and narrow this.
            print(f"Could not load existing queries, starting fresh: {load_err}")
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        # Avoid duplicates; skipping here also avoids a pointless upload.
        if query in df["Query"].values:
            return

        new_row = pd.DataFrame(
            [{"Query": query, "Timestamp": timestamp, "Processed": False}]
        )
        # Concatenating onto an empty frame is deprecated in recent pandas.
        df = new_row if df.empty else pd.concat([df, new_row], ignore_index=True)

        # Push to Hub
        updated_ds = Dataset.from_pandas(df)
        updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")
# --- Enhanced LLM Query ---
def query_llm(prompt: str, model: str = "llama3-70b-8192") -> Optional[str]:
    """Send a single-turn prompt to the Groq chat completions API.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Groq model identifier to query.

    Returns:
        The stripped text of the first completion choice, or ``None`` when the
        request fails or the response carries no message content. Errors are
        printed, never raised, so callers can fall back with ``result or ...``.
    """
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.7,
            max_tokens=1024,
            top_p=0.9,
        )
        content = response.choices[0].message.content
    except Exception as e:
        # Boundary with an external service: report and degrade to None.
        print(f"LLM Error: {e}")
        return None

    # Content can be absent; handle it explicitly instead of relying on an
    # AttributeError from calling .strip() on None.
    return content.strip() if content is not None else None
# --- Main Chat Function ---
def get_best_answer(user_input: str) -> str:
    """Produce the assistant's reply to a user question.

    Flow:
      1. Blank input returns the static contact message.
      2. Questions containing the words fee, fees or tuition return a fixed
         pointer to the fee-structure page.
      3. Otherwise the question is embedded and compared by cosine similarity
         with the precomputed dataset question embeddings.
      4. If the best score is at least 0.65, the matching dataset answer is
         passed to the LLM as context. Otherwise the query is recorded through
         ``manage_unmatched_queries`` and a generic prompt is used.

    Args:
        user_input: Raw question text from the user.

    Returns:
        The LLM response, or the static contact message when the input is
        blank or the LLM yields nothing.
    """
    match_threshold = 0.65

    # \U0001F4DE is the telephone-receiver emoji and \u2709\ufe0f the envelope;
    # escapes keep the literals safe from source re-encoding.
    contact_fallback = (
        "For official assistance:\n"
        "\U0001F4DE +92-42-99262231-33\n"
        "\u2709\ufe0f info@ue.edu.pk"
    )

    user_input = (user_input or "").strip()
    if not user_input:
        return contact_fallback
    lower_input = user_input.lower()

    # 1. Handle special cases
    # Whole-word match so "feedback" or "coffee" do not count as fee questions.
    if re.search(r"\b(?:fees?|tuition)\b", lower_input):
        # \U0001F4B0 is the money-bag emoji.
        return ("\U0001F4B0 Fee information:\n"
                "Please visit: https://ue.edu.pk/allfeestructure.php\n"
                "For personalized help, contact accounts@ue.edu.pk")

    # 2. Semantic similarity search
    # With an empty dataset there is nothing to compare against (argmax on an
    # empty score tensor would raise), so the query is treated as unmatched.
    best_idx = -1
    best_score = 0.0
    if dataset_answers:
        query_embedding = similarity_model.encode(lower_input, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_embedding, dataset_embeddings)[0]
        best_idx = scores.argmax().item()
        best_score = scores[best_idx].item()

    # 3./4. Build the prompt; unmatched queries are saved for later review
    if best_score >= match_threshold:
        context = dataset_answers[best_idx]
        prompt = (
            "University Assistant Task:\n"
            f"Question: {user_input}\n"
            f"Context: {context}\n"
            "Generate a helpful, accurate response using the context. "
            'If unsure, say "Please contact info@ue.edu.pk" '
        )
    else:
        manage_unmatched_queries(user_input)
        prompt = (
            "As an official University of Education assistant, answer:\n"
            f"Question: {user_input}\n"
            "Guidelines:\n"
            "- Be polite and professional\n"
            "- Direct to official channels if uncertain\n"
            "- Keep responses under 3 sentences"
        )

    response = query_llm(prompt)
    return response or contact_fallback