import openai
import numpy as np
import re
from typing import List, Tuple

from config import EMBED_MODEL


def get_embedding(text: str) -> List[float]:
    """Generate embedding for a given text."""
    text_strip = text.replace("\n", " ").strip()
    response = openai.embeddings.create(input=[text_strip], model=EMBED_MODEL)
    return response.data[0].embedding


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Calculate cosine similarity between two vectors."""
    a = np.array(a)
    b = np.array(b)
    if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
        return 0.0
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))


def clean_time(time_str: str) -> str:
    """Normalize a time string to the form "H:MM AM/PM"."""
    if not time_str:
        return ""
    time_match = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
    if time_match:
        hour = time_match.group(1)
        # Zero-pad the minutes so inputs like "7pm" or "7:5 PM" normalize cleanly.
        minute = (time_match.group(2) or "00").zfill(2)
        ampm = time_match.group(3).upper()
        return f"{hour}:{minute} {ampm}"
    return time_str.strip()
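
# Illustrative examples (assumed inputs, checked against the regex above):
#   clean_time("Doors open at 7pm")    -> "7:00 PM"
#   clean_time("starts 7:30 PM sharp") -> "7:30 PM"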


def find_top_k_matches(user_embedding, dataset, k=3):
    """Find the top k matching entries from a dataset of (id, text, embedding) tuples."""
    scored = []
    for entry_id, text, emb in dataset:
        score = cosine_similarity(user_embedding, emb)
        scored.append((score, entry_id, text))
    # Sort by similarity score only, highest first (avoids comparing ids/text on ties).
    scored.sort(key=lambda item: item[0], reverse=True)
    return scored[:k]
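

# --- Illustrative sketch, not part of the original module -------------------
# Shows how get_embedding() and find_top_k_matches() fit together against FAQ
# rows whose embeddings are stored as JSON strings (the format written by
# recalculate_all_embeddings() below). The helper name and query are assumptions.
def match_faq_entries(user_question: str, k: int = 3):
    """Embed a user question and return the top-k matching FAQ entries."""
    from database import get_db_connection
    import json

    with get_db_connection() as conn:
        cur = conn.cursor()
        cur.execute("SELECT id, question, embedding FROM faq_entries WHERE embedding IS NOT NULL")
        dataset = [(rid, text, json.loads(emb)) for rid, text, emb in cur.fetchall()]
    return find_top_k_matches(get_embedding(user_question), dataset, k=k)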


def classify_intent(question: str) -> str:
    """
    Classify the user's intent into:
    Mode A: Recommendation Mode (Workshops, Dates, Availability, Recommendations)
    Mode B: Front Desk Mode (Default - Everything else)
    """
    prompt = f"""Classify the following user question into one of two modes:
1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.
User Question: "{question}"
Response must be exactly "Mode A" or "Mode B"."""
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=5
        )
        prediction = response.choices[0].message.content.strip()
        if "Mode A" in prediction:
            return "Mode A"
        return "Mode B"
    except Exception as e:
        print(f"Error in intent classification: {e}")
        return "Mode B"  # Default to Front Desk Mode


def should_include_email(question: str) -> bool:
    """
    Determine if the contact email should be shown based on user intent.
    Allowed for: Payments, Refunds, Attendance issues, Account problems.
    """
    from config import EMAIL_ONLY_KEYWORDS

    question_lower = question.lower()
    for word in EMAIL_ONLY_KEYWORDS:
        pattern = rf'\b{re.escape(word)}\b'
        if re.search(pattern, question_lower):
            return True
    return False
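
# Illustrative only: the real keyword list lives in config.EMAIL_ONLY_KEYWORDS.
# With a hypothetical list like ["refund", "payment", "billing", "account"],
# the word-boundary pattern gives:
#   should_include_email("Can I get a refund for last month?") -> True
#   should_include_email("Do you cover refundamentals?")       -> False (no whole-word match)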


def classify_user_type(question: str, history: List[dict] = None) -> str:
    """
    Classify the user type into:
    - new_actor
    - experienced_actor
    - parent
    - current_student
    - unknown
    """
    history_str = ""
    if history:
        history_str = "\nConversation context:\n" + "\n".join(
            [f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]]
        )
    prompt = f"""Classify the user into exactly one of these categories based on their question and context:
1. "new_actor": Just starting out, has no experience, or is asking how to begin.
2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
5. "unknown": Not enough information yet.
User Question: "{question}"{history_str}
Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""
    try:
        response = openai.chat.completions.create(
            model="gpt-4o-mini",
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=10
        )
        prediction = response.choices[0].message.content.strip().lower()
        valid_types = ["new_actor", "experienced_actor", "parent", "current_student", "unknown"]
        for t in valid_types:
            if t in prediction:
                return t
        return "unknown"
    except Exception as e:
        print(f"Error in user type classification: {e}")
        return "unknown"


def recalculate_all_embeddings():
    """Recalculate embeddings for all entries in faq_entries and podcast_episodes that are missing embeddings."""
    from database import get_db_connection
    import json

    with get_db_connection() as conn:
        cur = conn.cursor()

        # 1. Update FAQs
        print("Starting FAQ embedding recalculation...")
        cur.execute("SELECT id, question FROM faq_entries WHERE embedding IS NULL")
        faqs = cur.fetchall()
        for faq_id, question in faqs:
            try:
                emb = get_embedding(question)
                cur.execute("UPDATE faq_entries SET embedding = ? WHERE id = ?", (json.dumps(emb), faq_id))
                print(f" ✓ Updated FAQ ID {faq_id}")
            except Exception as e:
                print(f" ✗ Error updating FAQ ID {faq_id}: {e}")

        # 2. Update Podcasts
        print("Starting Podcast embedding recalculation...")
        cur.execute("SELECT id, full_text FROM podcast_episodes WHERE embedding IS NULL")
        podcasts = cur.fetchall()
        for pod_id, full_text in podcasts:
            try:
                emb = get_embedding(full_text)
                cur.execute("UPDATE podcast_episodes SET embedding = ? WHERE id = ?", (json.dumps(emb), pod_id))
                print(f" ✓ Updated Podcast ID {pod_id}")
            except Exception as e:
                print(f" ✗ Error updating Podcast ID {pod_id}: {e}")

        conn.commit()
    print("Embedding recalculation complete.")