File size: 4,181 Bytes
b467f2f
ade1780
b467f2f
 
6dd717f
ade1780
1bdac65
ade1780
b467f2f
 
 
 
1bdac65
b467f2f
 
 
1bdac65
 
 
 
 
ade1780
 
 
 
 
1bdac65
ade1780
 
b467f2f
 
 
 
 
 
1bdac65
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6dd717f
1bdac65
 
b467f2f
1bdac65
 
 
b467f2f
1bdac65
 
b467f2f
1bdac65
b467f2f
1bdac65
ade1780
b467f2f
1bdac65
 
 
 
b467f2f
1bdac65
 
 
 
 
b467f2f
1bdac65
 
 
 
 
b467f2f
1bdac65
1c1b54f
1bdac65
f8b9b57
1bdac65
b467f2f
1bdac65
 
b467f2f
1bdac65
 
b467f2f
1bdac65
b467f2f
1bdac65
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import json
import os
from datetime import datetime
from typing import Optional

import pandas as pd
from datasets import load_dataset, Dataset, DatasetDict
from dotenv import load_dotenv
from groq import Groq
from sentence_transformers import SentenceTransformer, util

# Load environment variables
load_dotenv()

# Initialize clients
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))
similarity_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Configuration
HF_DATASET_REPO = "midrees2806/unmatched_queries"  # Your dataset repo
HF_TOKEN = os.getenv("HF_TOKEN")  # From Space secrets

# --- Dataset Loading ---
# Load the local Q/A dataset. Best-effort: on any failure we fall back to an
# empty list so the module still imports and the bot can answer LLM-only.
try:
    # Explicit encoding: the dataset may contain non-ASCII text.
    with open('dataset.json', 'r', encoding='utf-8') as f:
        dataset = json.load(f)
    # The file must be a LIST of {"input": ..., "response": ...} records.
    # The isinstance(list) check also rejects a dict/str root, which the
    # original all(...) scan could misjudge by iterating keys/characters.
    if not isinstance(dataset, list) or not all(
        isinstance(item, dict) and 'input' in item and 'response' in item
        for item in dataset
    ):
        raise ValueError("Invalid dataset structure")
except Exception as e:
    print(f"Error loading dataset: {e}")
    dataset = []

# Precompute question embeddings once at import time so each incoming query
# only needs a single encode + cosine-similarity pass.
dataset_questions = [item.get("input", "").lower().strip() for item in dataset]
dataset_answers = [item.get("response", "") for item in dataset]
dataset_embeddings = similarity_model.encode(dataset_questions, convert_to_tensor=True)

# --- Unmatched Queries Handler ---
def manage_unmatched_queries(query: str) -> None:
    """Persist an unmatched user query to the HF dataset repo.

    Best-effort: any failure (network, auth, missing repo) is logged and
    swallowed so the chat flow is never interrupted.

    Args:
        query: Raw user question that fell below the similarity threshold.
    """
    try:
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Load the existing dataset; start a fresh frame if the repo does
        # not exist yet (first run) or cannot be reached.
        try:
            ds = load_dataset(HF_DATASET_REPO, token=HF_TOKEN)
            df = ds["train"].to_pandas()
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # are not silently swallowed.
        except Exception:
            df = pd.DataFrame(columns=["Query", "Timestamp", "Processed"])

        # Append only if this exact query has not already been recorded.
        if query not in df["Query"].values:
            new_entry = {"Query": query, "Timestamp": timestamp, "Processed": False}
            df = pd.concat([df, pd.DataFrame([new_entry])], ignore_index=True)

            # Push the updated table back to the Hub.
            updated_ds = Dataset.from_pandas(df)
            updated_ds.push_to_hub(HF_DATASET_REPO, token=HF_TOKEN)
    except Exception as e:
        print(f"Failed to save query: {e}")

# --- Enhanced LLM Query ---
def query_llm(prompt: str, model: str = "llama3-70b-8192") -> Optional[str]:
    """Send a single-turn prompt to Groq and return the reply text.

    Args:
        prompt: Content of the single user-role message.
        model: Groq model identifier.

    Returns:
        The stripped completion text, or None on any API failure — the
        annotation was `-> str`, but callers rely on the None sentinel
        (`response or fallback`), so Optional[str] is the true contract.
    """
    try:
        response = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.7,
            max_tokens=1024,
            top_p=0.9
        )
        return response.choices[0].message.content.strip()
    except Exception as e:
        print(f"LLM Error: {e}")
        return None

# --- Main Chat Function ---
def get_best_answer(user_input: str) -> str:
    """Answer a user question.

    Pipeline: keyword shortcuts (fees) -> semantic match against the local
    dataset -> LLM generation (grounded in matched context when available).

    Args:
        user_input: Raw user question.

    Returns:
        A response string; a static contact-info message if the LLM fails.
    """
    # Single source of truth for the match/no-match decision (the original
    # duplicated 0.65 in two comparisons).
    SIMILARITY_THRESHOLD = 0.65

    user_input = user_input.strip()
    lower_input = user_input.lower()

    # 1. Handle special cases: fee questions always get the official link.
    if any(kw in lower_input for kw in ["fee", "fees", "tuition"]):
        return ("πŸ’° Fee information:\n"
                "Please visit: https://ue.edu.pk/allfeestructure.php\n"
                "For personalized help, contact accounts@ue.edu.pk")

    # 2. Semantic similarity search. Guard against an empty/unloaded
    # dataset, where argmax over zero rows would raise at runtime.
    best_idx, best_score = -1, 0.0
    if dataset_questions:
        query_embedding = similarity_model.encode(lower_input, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_embedding, dataset_embeddings)[0]
        best_idx = scores.argmax().item()
        best_score = scores[best_idx].item()

    # 3. Record queries we could not match so the dataset can be improved.
    if best_score < SIMILARITY_THRESHOLD:
        manage_unmatched_queries(user_input)

    # 4. Generate the response: grounded in dataset context when matched,
    # otherwise a guided open-ended answer.
    if best_score >= SIMILARITY_THRESHOLD:
        context = dataset_answers[best_idx]
        prompt = f"""University Assistant Task:
        Question: {user_input}
        Context: {context}
        Generate a helpful, accurate response using the context. If unsure, say "Please contact info@ue.edu.pk" """
    else:
        prompt = f"""As an official University of Education assistant, answer:
        Question: {user_input}
        Guidelines:
        - Be polite and professional
        - Direct to official channels if uncertain
        - Keep responses under 3 sentences"""

    response = query_llm(prompt)
    return response or """For official assistance:
    πŸ“ž +92-42-99262231-33
    βœ‰οΈ info@ue.edu.pk"""