flaskbot / ai_chatbot.py
markobinario's picture
Upload 3 files
d1e5d7d verified
raw
history blame
8.91 kB
from sentence_transformers import SentenceTransformer
import numpy as np
from typing import List, Dict, Tuple
import re
class AIChatbot:
    """FAQ chatbot that matches user questions against a static FAQ list.

    Matching combines three signals:

    * cosine similarity between sentence embeddings (embeddings are
      normalized, so a dot product is the cosine similarity),
    * keyword overlap between the user question and each FAQ question,
    * a WH-word ("who", "when", ...) consistency check that penalizes and
      finally rejects candidates whose question word differs from the user's.
    """

    # Question words recognized by the WH-intent heuristic; the order matters
    # for the "present anywhere" fallback in _wh_class (first hit in this
    # order wins).
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')

    # Lower-case ASCII alphanumeric runs; used on already lower-cased text.
    _TOKEN_RE = re.compile(r"[a-z0-9]+")

    # Reply returned when no FAQ is a confident match.
    _FALLBACK_ANSWER = (
        "Sorry, I don’t have the knowledge to answer that yet.\n"
        "I’ll notify an admin about your question and we’ll add the answer soon.\n"
        "Please come back in a while."
    )

    def __init__(self):
        """Load the embedding model, warm it up and index the static FAQs."""
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer('all-MiniLM-L6-v2')
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        self.faq_embeddings = None
        self.faqs = None
        self.load_faqs()

    def _embed_questions(self, faqs: List[dict]) -> np.ndarray:
        """Return normalized embeddings for the 'question' field of each FAQ."""
        questions = [faq['question'] for faq in faqs]
        return np.array(self.model.encode(questions, normalize_embeddings=True))

    def load_faqs(self):
        """Load static FAQs and compute their normalized embeddings"""
        # Static FAQ data
        self.faqs = [
            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."},
        ]
        if self.faqs:
            # Compute and normalize embeddings for all questions
            self.faq_embeddings = self._embed_questions(self.faqs)

    def save_unanswered_question(self, question):
        """Log unanswered questions to console (can be extended to save to file)"""
        print(f"Unanswered question logged: {question}")
        # In a real implementation, you could save this to a file or send to an admin

    def _tokenize(self, text: str):
        """Return the lower-cased alphanumeric tokens of *text* longer than 2 chars."""
        if not text:
            return []
        return [tok for tok in self._TOKEN_RE.findall(text.lower()) if len(tok) > 2]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct query tokens that also occur in the FAQ.

        Returns 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        query_terms = set(q_tokens)
        shared = query_terms & set(faq_tokens)
        # query_terms is non-empty here, so the division is safe.
        return len(shared) / len(query_terms)

    def _wh_class(self, text: str) -> str:
        """Classify *text* by its question word, or return '' if none is found.

        A leading WH-word (followed by a space or '?') takes priority; otherwise
        the first entry of ``_WH_WORDS`` that appears as a space-delimited word
        anywhere in the text is returned.
        """
        if not text:
            return ''
        s = text.strip().lower()
        # simple heuristic classification by leading wh-word
        for key in self._WH_WORDS:
            if s.startswith((key + ' ', key + '?')):
                return key
        # also check presence if not leading
        padded = f' {s} '
        for key in self._WH_WORDS:
            if f' {key} ' in padded:
                return key
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the best FAQ answer for *question* and its combined score.

        Args:
            question: The user's question.
            threshold: Minimum score for acceptance. The semantic similarity
                must reach ``max(0.7, threshold)``, or the combined score must
                reach ``threshold`` with a keyword overlap of at least 0.3.

        Returns:
            ``(answer, combined_score)`` when a FAQ is accepted; otherwise a
            fallback message with the best combined score. A blank or
            non-string question yields the fallback message with score 0.0.
        """
        print(f"find_best_match called with: {question}")  # Debug print
        if not self.faqs or self.faq_embeddings is None:
            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0
        # Nothing meaningful to embed or log for a blank / non-text question.
        if not isinstance(question, str) or not question.strip():
            return self._FALLBACK_ANSWER, 0.0

        # Compute and normalize embedding for the input question
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
        similarities = np.array(np.dot(self.faq_embeddings, question_embedding))

        # Compute keyword overlap with each FAQ question
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])

        # Combined score to reduce false positives
        combined = 0.7 * similarities + 0.3 * overlap_scores

        # Apply WH-word intent consistency penalty
        q_wh = self._wh_class(question)
        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
        if q_wh:
            for i, f_wh in enumerate(faq_wh):
                if f_wh and f_wh != q_wh:
                    combined[i] *= 0.6  # penalize mismatched intent significantly

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = faq_wh[best_idx]

        # Acceptance criteria: require good semantic OR strong combined with overlap
        accept = (
            best_semantic >= max(0.7, threshold)
            or (best_combined >= threshold and best_overlap >= 0.3)
        )
        # Enforce WH intent match when present
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Log as unanswered so admins can curate. Logging stays best-effort
        # (a failure must not break the reply) but is no longer silent.
        try:
            self.save_unanswered_question(question)
        except Exception as e:
            print(f"Error logging unanswered question: {e}")
        return (self._FALLBACK_ANSWER, best_combined)

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Get suggested questions based on the input question

        Returns at most *num_suggestions* FAQ questions, most similar first,
        keeping only those whose similarity exceeds 0.3. Returns an empty list
        when there are no FAQs, when *num_suggestions* is not positive, or when
        the question is blank or not a string.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []
        # A count of 0 would turn the slice below into [-0:], i.e. *all* FAQs.
        if num_suggestions <= 0:
            return []
        if not isinstance(question, str) or not question.strip():
            return []
        # Compute and normalize embedding for the input question
        question_embedding = self.model.encode([question], normalize_embeddings=True)[0]
        # Calculate cosine similarity
        similarities = np.dot(self.faq_embeddings, question_embedding)
        # Get top N similar questions
        top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
        return [self.faqs[idx]['question'] for idx in top_indices if similarities[idx] > 0.3]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the static list (for demonstration purposes)

        The embeddings are computed before anything is stored, so a failure
        leaves ``self.faqs`` and ``self.faq_embeddings`` unchanged and in sync.

        Returns:
            True if the FAQ was added, False if an error occurred.
        """
        try:
            existing = self.faqs or []
            new_id = max((faq['id'] for faq in existing), default=0) + 1
            new_faq = {"id": new_id, "question": question, "answer": answer}
            # Recompute embeddings for the would-be list before committing
            embeddings = self._embed_questions(existing + [new_faq])
        except Exception as e:
            print(f"Error adding FAQ: {e}")
            return False
        # Commit both pieces of state together.
        if self.faqs is None:
            self.faqs = []
        self.faqs.append(new_faq)
        self.faq_embeddings = embeddings
        print(f"FAQ added: {question}")
        return True