Spaces:
Sleeping
Sleeping
| from sentence_transformers import SentenceTransformer | |
| import numpy as np | |
| from typing import List, Dict, Tuple | |
| import re | |
class AIChatbot:
    """FAQ chatbot that answers questions by matching them to a static FAQ list.

    Matching blends three signals: the dot product of normalized
    sentence-transformer embeddings, keyword overlap between the user question
    and each FAQ question, and agreement of the WH-word ("who", "when", ...)
    used by both questions.
    """

    # Weights of the semantic and keyword signals in the combined score.
    _SEMANTIC_WEIGHT = 0.7
    _OVERLAP_WEIGHT = 0.3
    # Multiplier applied to a candidate whose WH-word differs from the query's.
    _WH_MISMATCH_PENALTY = 0.6
    # Floors used by the acceptance rule in find_best_match.
    _MIN_SEMANTIC_ACCEPT = 0.7
    _MIN_OVERLAP_ACCEPT = 0.3
    # Suggestions must be strictly more similar than this to be returned.
    _MIN_SUGGESTION_SIMILARITY = 0.3
    # Order matters: the first WH-word that matches wins.
    _WH_WORDS = ('who', 'where', 'when', 'what', 'how', 'why', 'which')
    _TOKEN_PATTERN = re.compile(r"[a-z0-9]+")
    _FALLBACK_MESSAGE = (
        "Sorry, I don’t have the knowledge to answer that yet.\n"
        "I’ll notify an admin about your question and we’ll add the answer soon.\n"
        "Please come back in a while."
    )

    def __init__(self, model_name: str = 'all-MiniLM-L6-v2'):
        """Load the embedding model, warm it up and index the static FAQs.

        Args:
            model_name: Name passed to ``SentenceTransformer``. Defaults to the
                model the chatbot has always used.
        """
        # Load the pre-trained model (can use a smaller model for more speed)
        self.model = SentenceTransformer(model_name)
        # Warm up the model to avoid first-request slowness
        _ = self.model.encode(["Hello, world!"])
        self.faq_embeddings = None
        self.faqs = None
        self.load_faqs()

    def _encode_normalized(self, texts: List[str]) -> np.ndarray:
        """Return one normalized embedding row per entry of ``texts``."""
        return np.array(self.model.encode(texts, normalize_embeddings=True))

    def load_faqs(self):
        """Load static FAQs and compute their normalized embeddings"""
        # Static FAQ data
        self.faqs = [
            {"id": 1, "question": "What are the admission requirements?", "answer": "To apply for admission, you need to submit your high school diploma, transcript of records, 2x2 ID photo, and completed application form. You also need to take the entrance examination."},
            {"id": 2, "question": "When is the application deadline?", "answer": "The application deadline is usually in March for the first semester and October for the second semester. Please check our website for the exact dates."},
            {"id": 3, "question": "What courses are available?", "answer": "We offer various courses including BS Computer Science, BS Information Technology, BS Business Administration, BS Education, BS Nursing, BS Architecture, and more. Check our course catalog for the complete list."},
            {"id": 4, "question": "How much is the tuition fee?", "answer": "Tuition fees vary by program. For undergraduate programs, it ranges from ₱15,000 to ₱25,000 per semester. Please contact the registrar's office for specific program fees."},
            {"id": 5, "question": "Do you offer scholarships?", "answer": "Yes, we offer various scholarships including academic scholarships, athletic scholarships, and need-based financial aid. Applications are available at the student affairs office."},
            {"id": 6, "question": "What is the minimum GWA requirement?", "answer": "The minimum GWA requirement is 80% for most programs. Some programs may have higher requirements. Please check the specific requirements for your chosen program."},
            {"id": 7, "question": "How can I contact the admissions office?", "answer": "You can contact the admissions office at (02) 123-4567 or email admissions@psau.edu.ph. Office hours are Monday to Friday, 8:00 AM to 5:00 PM."},
            {"id": 8, "question": "Is there a dormitory available?", "answer": "Yes, we have dormitory facilities for both male and female students. Dormitory fees are separate from tuition. Please contact the housing office for availability and rates."},
            {"id": 9, "question": "What documents do I need for enrollment?", "answer": "For enrollment, you need your admission letter, original and photocopy of birth certificate, original and photocopy of high school diploma, 2x2 ID photos, and medical certificate."},
            {"id": 10, "question": "Can I transfer from another school?", "answer": "Yes, we accept transferees. You need to submit your transcript of records, honorable dismissal, and other required documents. Some credits may be credited depending on the program."}
        ]
        if self.faqs:
            # Compute and normalize embeddings for all questions
            self.faq_embeddings = self._encode_normalized(
                [faq['question'] for faq in self.faqs]
            )

    def save_unanswered_question(self, question):
        """Log unanswered questions to console (can be extended to save to file)"""
        print(f"Unanswered question logged: {question}")
        # In a real implementation, you could save this to a file or send to an admin

    def _tokenize(self, text: str):
        """Return lowercase alphanumeric tokens of ``text`` longer than 2 chars."""
        if not text:
            return []
        return [
            token
            for token in self._TOKEN_PATTERN.findall(text.lower())
            if len(token) > 2
        ]

    def _overlap_ratio(self, q_tokens, faq_tokens):
        """Return the fraction of distinct query tokens also found in the FAQ.

        Returns 0.0 when either token list is empty.
        """
        if not q_tokens or not faq_tokens:
            return 0.0
        query_vocab = set(q_tokens)
        return len(query_vocab & set(faq_tokens)) / len(query_vocab)

    def _wh_class(self, text: str) -> str:
        """Classify ``text`` by its WH-word, or return '' when it has none.

        A leading WH-word takes priority; otherwise the first entry of
        ``_WH_WORDS`` that appears as a space-delimited word is used.
        """
        if not text:
            return ''
        normalized = text.strip().lower()
        # simple heuristic classification by leading wh-word
        for key in self._WH_WORDS:
            if normalized.startswith((key + ' ', key + '?')):
                return key
        # also check presence if not leading
        padded = f' {normalized} '
        for key in self._WH_WORDS:
            if f' {key} ' in padded:
                return key
        return ''

    def find_best_match(self, question: str, threshold: float = 0.7) -> Tuple[str, float]:
        """Return the best FAQ answer for ``question`` and its combined score.

        Args:
            question: The user's question.
            threshold: Minimum combined score for the keyword-backed acceptance
                path. The purely semantic path never accepts below 0.7, so
                lowering ``threshold`` under 0.7 only loosens the combined path.

        Returns:
            ``(answer, score)``. When nothing is accepted the answer is a
            fallback message and the question is logged as unanswered. A blank
            question returns the fallback with score 0.0 and is not logged.
        """
        print(f"find_best_match called with: {question}")  # Debug print
        if not self.faqs or self.faq_embeddings is None:
            return "I'm sorry, I couldn't find any FAQs in the database.", 0.0
        # A blank question cannot match anything; skip the model call and do
        # not pollute the unanswered-question log with empty entries.
        if not question or not question.strip():
            return self._FALLBACK_MESSAGE, 0.0

        # Embeddings are normalized, so the dot product is the cosine similarity.
        question_embedding = self._encode_normalized([question])[0]
        similarities = np.asarray(np.dot(self.faq_embeddings, question_embedding))

        # Compute keyword overlap with each FAQ question
        q_tokens = self._tokenize(question)
        overlap_scores = np.array([
            self._overlap_ratio(q_tokens, self._tokenize(faq['question']))
            for faq in self.faqs
        ])

        # Combined score to reduce false positives
        combined = (
            self._SEMANTIC_WEIGHT * similarities
            + self._OVERLAP_WEIGHT * overlap_scores
        )

        # Apply WH-word intent consistency penalty
        q_wh = self._wh_class(question)
        faq_wh = [self._wh_class(faq['question']) for faq in self.faqs]
        if q_wh:
            for i, f_wh in enumerate(faq_wh):
                if f_wh and f_wh != q_wh:
                    # penalize mismatched intent significantly
                    combined[i] *= self._WH_MISMATCH_PENALTY

        best_idx = int(np.argmax(combined))
        best_semantic = float(similarities[best_idx])
        best_overlap = float(overlap_scores[best_idx])
        best_combined = float(combined[best_idx])
        best_wh = faq_wh[best_idx]

        # Acceptance criteria: require good semantic OR strong combined with overlap
        accept = (
            best_semantic >= max(self._MIN_SEMANTIC_ACCEPT, threshold)
            or (best_combined >= threshold and best_overlap >= self._MIN_OVERLAP_ACCEPT)
        )
        # Enforce WH intent match when present
        if accept and q_wh and best_wh and q_wh != best_wh:
            accept = False

        if accept:
            return self.faqs[best_idx]['answer'], best_combined

        # Log as unanswered so admins can curate. Logging is best-effort: a
        # failure is reported but must not prevent the user getting a reply.
        try:
            self.save_unanswered_question(question)
        except Exception as exc:
            print(f"Error logging unanswered question: {exc}")
        return self._FALLBACK_MESSAGE, best_combined

    def get_suggested_questions(self, question: str, num_suggestions: int = 3) -> List[str]:
        """Get suggested questions based on the input question

        Args:
            question: The user's question.
            num_suggestions: Maximum number of suggestions; values <= 0 yield
                an empty list.

        Returns:
            Up to ``num_suggestions`` FAQ questions, most similar first, keeping
            only those whose similarity exceeds 0.3.
        """
        if not self.faqs or self.faq_embeddings is None:
            return []
        # Guard explicitly: a slice of [-0:] would select every FAQ, not none.
        if num_suggestions <= 0:
            return []
        # Compute and normalize embedding for the input question
        question_embedding = self._encode_normalized([question])[0]
        # Calculate cosine similarity
        similarities = np.dot(self.faq_embeddings, question_embedding)
        # Get top N similar questions
        top_indices = np.argsort(similarities)[-num_suggestions:][::-1]
        return [
            self.faqs[idx]['question']
            for idx in top_indices
            if similarities[idx] > self._MIN_SUGGESTION_SIMILARITY
        ]

    def add_faq(self, question: str, answer: str) -> bool:
        """Add a new FAQ to the static list (for demonstration purposes)

        The new embeddings are computed before any state is changed, so a
        failure leaves ``faqs`` and ``faq_embeddings`` untouched and in sync.

        Returns:
            True when the FAQ was added, False when an error occurred.
        """
        try:
            existing = self.faqs if self.faqs is not None else []
            new_id = max((faq['id'] for faq in existing), default=0) + 1
            new_faq = {"id": new_id, "question": question, "answer": answer}
            # Recompute embeddings (before mutating, so errors change nothing)
            questions = [faq['question'] for faq in existing] + [question]
            embeddings = self._encode_normalized(questions)
        except Exception as e:
            print(f"Error adding FAQ: {e}")
            return False
        # Commit both pieces of state together.
        if self.faqs is None:
            self.faqs = []
        self.faqs.append(new_faq)
        self.faq_embeddings = embeddings
        print(f"FAQ added: {question}")
        return True