Spaces:

markobinario
/

flaskbot

Sleeping

App Files Files Community

flaskbot / ai_chatbot.py

markobinario

Upload 3 files

d1e5d7d verified about 2 months ago

raw

history blame

8.91 kB

	from sentence_transformers import SentenceTransformer
	import numpy as np
	from typing import List, Dict, Tuple
	import re

	class AIChatbot:
	    """FAQ chatbot that matches a user question against a static FAQ list.

	    Matching blends three signals: cosine similarity between normalized
	    sentence embeddings, keyword overlap between the user question and each
	    FAQ question, and a penalty when the leading WH-word (who/what/when/...)
	    of the two questions disagrees.

	    Attributes:
	        model: The sentence-embedding model used to encode questions.
	        faqs: List of ``{"id", "question", "answer"}`` dicts, or None before
	            ``load_faqs`` has run.
	        faq_embeddings: Array with one normalized embedding row per FAQ, in
	            the same order as ``faqs``; None before ``load_faqs`` has run.
	    """

	    # WH-words recognised by _wh_class, in priority order.
	    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')
	    # Lowercase ASCII alphanumeric runs; compiled once for all instances.
	    _TOKEN_RE = re.compile(r"[a-z0-9]+")
	    # Answer returned when no FAQ is a confident match.
	    _FALLBACK_ANSWER = (
	        "Sorry, I don’t have the knowledge to answer that yet.\n"
	        "I’ll notify an admin about your question and we’ll add the answer soon.\n"
	        "Please come back in a while."
	    )

	    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
	        """Load the embedding model, warm it up, and index the static FAQs.

	        Args:
	            model_name: Name of the pre-trained sentence-transformers model
	                to load. Defaults to the model the chatbot originally
	                hard-coded.
	        """
	        # Load the pre-trained model (can use a smaller model for more speed)
	        self.model = SentenceTransformer(model_name)
	        # Warm up the model to avoid first-request slowness
	        _ = self.model.encode(["Hello, world!"])
	        self.faq_embeddings = None
	        self.faqs = None
	        self.load_faqs()

	    def _encode_questions(self, questions):
	        """Return normalized embeddings for ``questions`` as a NumPy array."""
	        return np.array(self.model.encode(questions, normalize_embeddings=True))

	    def load_faqs(self):
	        """Load static FAQs and compute their normalized embeddings"""
	        # Static FAQ data
	        self.faqs = [
	            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
	            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
	            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
	            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
	            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
	            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
	            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
	            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
	            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
	            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."}
	        ]

	        if self.faqs:
	            # Compute and normalize embeddings for all questions
	            self.faq_embeddings = self._encode_questions(
	                [faq['question'] for faq in self.faqs]
	            )

	    def save_unanswered_question(self, question):
	        """Log unanswered questions to console (can be extended to save to file)"""
	        print(f"Unanswered question logged: {question}")
	        # In a real implementation, you could save this to a file or send to an admin

	    def _tokenize(self, text: str):
	        """Split ``text`` into lowercase alphanumeric tokens longer than two characters.

	        Returns an empty list for empty or None input.
	        """
	        if not text:
	            return []
	        return [tok for tok in self._TOKEN_RE.findall(text.lower()) if len(tok) > 2]

	    def _overlap_ratio(self, q_tokens, faq_tokens):
	        """Return the fraction of distinct query tokens that also occur in the FAQ tokens.

	        Returns 0.0 when either token list is empty.
	        """
	        if not q_tokens or not faq_tokens:
	            return 0.0
	        q_set = set(q_tokens)
	        # q_set is non-empty here, so the division is safe.
	        return len(q_set & set(faq_tokens)) / len(q_set)

	    def _wh_class(self, text: str) -> str:
	        """Classify ``text`` by its WH-word, or return '' when none is found.

	        A WH-word that starts the text (followed by a space or '?') wins;
	        otherwise the first entry of ``_WH_WORDS`` that appears anywhere in
	        the text as a space-delimited word is returned.
	        """
	        if not text:
	            return ''
	        s = text.strip().lower()
	        # simple heuristic classification by leading wh-word
	        for key in self._WH_WORDS:
	            if s.startswith((key + ' ', key + '?')):
	                return key
	        # also check presence if not leading
	        padded = f' {s} '
	        for key in self._WH_WORDS:
	            if f' {key} ' in padded:
	                return key
	        return ''

	    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
	        """Return the best FAQ answer for ``question`` and its combined score.

	        Args:
	            question: The user's question.
	            threshold: Minimum combined score (together with at least 0.3
	                keyword overlap) for a match to be accepted. A match is also
	                accepted when the raw semantic similarity alone reaches
	                ``max(0.7, threshold)``.

	        Returns:
	            ``(answer, score)``. When no FAQ is accepted, ``answer`` is a
	            fallback message and the question is logged through
	            ``save_unanswered_question``. When no FAQs are loaded, or the
	            question is blank, the score is 0.0.
	        """
	        print(f"find_best_match called with: {question}")  # Debug print
	        if not self.faqs or self.faq_embeddings is None:
	            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0

	        # A blank question cannot match anything; skip the encoder and do not
	        # log it as an unanswered question since there is nothing to curate.
	        if not question or not question.strip():
	            return self._FALLBACK_ANSWER, 0.0

	        # Compute and normalize embedding for the input question
	        question_embedding = self._encode_questions([question])[0]
	        # Embeddings are normalized, so the dot product is the cosine similarity.
	        similarities = np.array(np.dot(self.faq_embeddings, question_embedding))

	        # Compute keyword overlap with each FAQ question
	        q_tokens = self._tokenize(question)
	        overlap_scores = np.array([
	            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
	            for faq in self.faqs
	        ])

	        # Combined score to reduce false positives
	        combined = 0.7 * similarities + 0.3 * overlap_scores

	        # Apply WH-word intent consistency penalty
	        q_wh = self._wh_class(question)
	        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
	        if q_wh:
	            mismatched = np.array([bool(wh) and wh != q_wh for wh in faq_wh])
	            combined[mismatched] *= 0.6  # penalize mismatched intent significantly

	        best_idx = int(np.argmax(combined))
	        best_semantic = float(similarities[best_idx])
	        best_overlap = float(overlap_scores[best_idx])
	        best_combined = float(combined[best_idx])
	        best_wh = faq_wh[best_idx]

	        # Acceptance criteria: require good semantic OR strong combined with overlap
	        accept = (
	            best_semantic >= max(0.7, threshold)
	            or (best_combined >= threshold and best_overlap >= 0.3)
	        )
	        # Enforce WH intent match when present
	        if accept and q_wh and best_wh and q_wh != best_wh:
	            accept = False

	        if accept:
	            return self.faqs[best_idx]['answer'], best_combined

	        # Log as unanswered so admins can curate. Logging is best-effort: a
	        # failure here must not break the reply, but it should be visible.
	        try:
	            self.save_unanswered_question(question)
	        except Exception as e:
	            print(f"Failed to log unanswered question: {e}")
	        return (self._FALLBACK_ANSWER, best_combined)

	    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
	        """Get suggested questions based on the input question

	        Args:
	            question: The user's question.
	            num_suggestions: Maximum number of suggestions to return.

	        Returns:
	            Up to ``num_suggestions`` FAQ questions, most similar first, each
	            with cosine similarity above 0.3. Empty when no FAQs are loaded,
	            the question is blank, or ``num_suggestions`` is not positive.
	        """
	        if not self.faqs or self.faq_embeddings is None:
	            return []
	        # Guard explicitly: slicing with [-0:] would select every FAQ.
	        if num_suggestions <= 0:
	            return []
	        if not question or not question.strip():
	            return []

	        # Compute and normalize embedding for the input question
	        question_embedding = self._encode_questions([question])[0]

	        # Calculate cosine similarity
	        similarities = np.dot(self.faq_embeddings, question_embedding)

	        # Get top N similar questions
	        top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
	        return [self.faqs[idx]['question'] for idx in top_indices if similarities[idx] > 0.3]

	    def add_faq(self, question: str, answer: str) -> bool:
	        """Add a new FAQ to the static list (for demonstration purposes)

	        The FAQ list and the embedding matrix are updated together only after
	        the embeddings have been computed, so a failure leaves both unchanged.

	        Args:
	            question: The FAQ question text; must not be blank.
	            answer: The FAQ answer text; must not be blank.

	        Returns:
	            True when the FAQ was added, False otherwise.
	        """
	        try:
	            if not question or not question.strip() or not answer or not answer.strip():
	                print("Error adding FAQ: question and answer must be non-empty")
	                return False

	            existing = self.faqs if self.faqs is not None else []
	            new_id = max(faq['id'] for faq in existing) + 1 if existing else 1
	            new_faq = {"id": new_id, "question": question, "answer": answer}

	            # Recompute embeddings before touching any state
	            questions = [faq['question'] for faq in existing] + [question]
	            embeddings = self._encode_questions(questions)

	            # Commit both pieces of state together
	            if self.faqs is None:
	                self.faqs = existing
	            self.faqs.append(new_faq)
	            self.faq_embeddings = embeddings

	            print(f"FAQ added: {question}")
	            return True
	        except Exception as e:
	            print(f"Error adding FAQ: {e}")
	            return False