saim1309 committed
Commit 42db8ec · verified · 1 Parent(s): 9b9e546

Upload 4 files

Files changed (4)
  1. admin_app.py +122 -0
  2. config.py +68 -0
  3. database.py +253 -0
  4. utils.py +163 -0
admin_app.py ADDED
@@ -0,0 +1,122 @@
+ import gradio as gr
+ import pandas as pd
+ import os
+ from database import (
+     fetch_all_faq_metadata, fetch_all_podcast_metadata,
+     add_faq_entry, update_faq_entry, delete_faq_entry,
+     bulk_update_faqs, bulk_update_podcasts
+ )
+ from utils import recalculate_all_embeddings
+ from config import OPENAI_API_KEY
+
+ # Basic Admin Credentials
+
+ from dotenv import load_dotenv
+
+ # Load environment variables
+ load_dotenv()
+
+ # Basic Admin Credentials (MUST be set in Hugging Face Secrets or .env)
+ ADMIN_USER = os.environ.get("ADMIN_USER", "admin")
+ ADMIN_PASS = os.environ.get("ADMIN_PASS")
+
+ if not ADMIN_PASS:
+     raise ValueError("CRITICAL SECURITY ERROR: ADMIN_PASS environment variable is not set. Please add it to your Hugging Face Secrets.")
+
+ def get_faqs():
+     data = fetch_all_faq_metadata()
+     return pd.DataFrame(data)
+
+ def get_podcasts():
+     data = fetch_all_podcast_metadata()
+     return pd.DataFrame(data)
+
+ def handle_faq_upload(file):
+     if file is None:
+         return "No file uploaded."
+     try:
+         df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)
+         bulk_update_faqs(df.to_dict('records'))
+         return f"Successfully uploaded {len(df)} FAQs. Don't forget to Sync & Embed!"
+     except Exception as e:
+         return f"Error: {e}"
+
+ def handle_podcast_upload(file):
+     if file is None:
+         return "No file uploaded."
+     try:
+         df = pd.read_csv(file.name) if file.name.endswith('.csv') else pd.read_excel(file.name)
+         bulk_update_podcasts(df.to_dict('records'))
+         return f"Successfully uploaded {len(df)} Podcasts. Don't forget to Sync & Embed!"
+     except Exception as e:
+         return f"Error: {e}"
+
+ def run_sync():
+     try:
+         recalculate_all_embeddings()
+         return "Sync Complete! All missing embeddings have been generated."
+     except Exception as e:
+         return f"Sync Failed: {e}"
+
+ with gr.Blocks(title="Get Scene Admin Dashboard") as demo:
+     gr.Markdown("# 🎭 Get Scene Admin Dashboard")
+     gr.Markdown("Manage FAQs, Podcasts, and Knowledge Embeddings.")
+
+     with gr.Tabs():
+         # Tab 1: FAQs
+         with gr.TabItem("Manage FAQs"):
+             with gr.Row():
+                 faq_df = gr.Dataframe(
+                     value=get_faqs(),
+                     headers=["id", "question", "answer"],
+                     datatype=["number", "str", "str"],
+                     interactive=True,
+                     label="FAQ Database"
+                 )
+
+             with gr.Row():
+                 with gr.Column():
+                     gr.Markdown("### Add New FAQ")
+                     new_q = gr.Textbox(label="Question")
+                     new_a = gr.TextArea(label="Answer")
+                     add_btn = gr.Button("Add Entry", variant="primary")
+
+                 with gr.Column():
+                     gr.Markdown("### Bulk Upload")
+                     faq_file = gr.File(label="Upload CSV/Excel (Columns: Question, Answer)")
+                     upload_faq_btn = gr.Button("Bulk Upload FAQs")
+                     faq_upload_status = gr.Textbox(label="Status", interactive=False)
+
+             def add_and_refresh(q, a):
+                 add_faq_entry(q, a)
+                 return get_faqs(), "", ""
+
+             add_btn.click(add_and_refresh, [new_q, new_a], [faq_df, new_q, new_a])
+             upload_faq_btn.click(handle_faq_upload, [faq_file], [faq_upload_status])
+
+         # Tab 2: Podcasts
+         with gr.TabItem("Podcasts"):
+             pod_df = gr.Dataframe(
+                 value=get_podcasts(),
+                 headers=["id", "guest_name", "youtube_url", "summary"],
+                 datatype=["number", "str", "str", "str"],
+                 label="Podcast Episodes"
+             )
+             gr.Markdown("### Bulk Upload Podcasts")
+             pod_file = gr.File(label="Upload CSV/Excel (Columns: Guest Name, YouTube URL, Summary)")
+             upload_pod_btn = gr.Button("Bulk Upload Podcasts")
+             pod_upload_status = gr.Textbox(label="Status", interactive=False)
+
+             upload_pod_btn.click(handle_podcast_upload, [pod_file], [pod_upload_status])
+
+         # Tab 3: Sync
+         with gr.TabItem("Sync & Embed"):
+             gr.Markdown("### Recalculate Embeddings")
+             gr.Markdown("When you change text or upload new data, the 'embeddings' (AI understanding) must be recalculated for the chatbot to recognize the new information.")
+             sync_btn = gr.Button("🔄 Sync & Recalculate Embeddings", variant="primary", scale=2)
+             sync_status = gr.Textbox(label="Sync Status", interactive=False)
+
+             sync_btn.click(run_sync, None, [sync_status])
+
+ if __name__ == "__main__":
+     demo.launch(auth=(ADMIN_USER, ADMIN_PASS), server_name="0.0.0.0")
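
For reference, a minimal sketch of an upload file that handle_faq_upload above would accept; the column names mirror the gr.File label and the keys bulk_update_faqs looks up, while the file name and rows are purely illustrative placeholders.

import pandas as pd

# Hypothetical example: two placeholder rows written to a CSV that the
# "Bulk Upload FAQs" button could ingest. Real rows would hold the studio's FAQs.
pd.DataFrame([
    {"Question": "Example question one?", "Answer": "Example answer one."},
    {"Question": "Example question two?", "Answer": "Example answer two."},
]).to_csv("faq_upload.csv", index=False)

# Podcast uploads follow the same pattern with columns: Guest Name, YouTube URL, Summary.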
config.py ADDED
@@ -0,0 +1,68 @@
+ import os
+ from datetime import timedelta
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ # API Keys
+ OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
+
+ # Database
+ DB_PATH = "getscene_ai.sqlite"
+
+ # Models
+ EMBED_MODEL = "text-embedding-3-small"
+ GEN_MODEL = "gpt-4o"
+ FAST_MODEL = "gpt-4o-mini"
+
+ # Caching
+ CACHE_DURATION = timedelta(hours=24)
+
+ # Keyword Lists
+ EMOTIONAL_KEYWORDS = [
+     'stuck', 'frustrated', 'discouraged', 'overwhelmed', 'scared',
+     'nervous', 'anxious', 'worried', 'fear', 'doubt', 'confidence',
+     'insecure', 'lost', 'confused', 'struggling', 'hard time',
+     'giving up', 'burnout', 'rejection', 'failed', 'can\'t',
+     'feeling', 'feel', 'emotional', 'depressed', 'sad', 'unmotivated',
+     'hopeless', 'stressed', 'pressure', 'imposter'
+ ]
+
+ ACTION_KEYWORDS = [
+     'get an agent', 'find agent', 'need agent', 'want agent', 'sign with agent',
+     'more auditions', 'book', 'booking', 'callbacks', 'improve',
+     'better', 'self-tape', 'materials', 'headshots', 'reel',
+     'network', 'connections', 'industry', 'career', 'strategy',
+     'agent prep', 'total agent prep', 'workshop', 'class', 'training',
+     'results', 'success', 'grow', 'advance', 'level up'
+ ]
+
+ POLICY_KEYWORDS = [
+     'refund', 'refunds', 'money back',
+     'attend', 'attendance', 'miss', 'missed', 'missing', 'absent',
+     'late', 'lateness', 'tardy',
+     'reschedule', 'change date', 'move class',
+     'credit', 'credits',
+     'cancel', 'cancellation', 'canceling',
+     'policy', 'policies'
+ ]
+
+ EMAIL_ONLY_KEYWORDS = [
+     'payment', 'pay', 'billing', 'charge', 'refund', 'money back',
+     'attend', 'attendance', 'miss', 'missed', 'late', 'reschedule',
+     'account', 'login', 'password', 'sign in', 'membership'
+ ]
+
+ DETAIL_SYNONYMS = [
+     'detail', 'details', 'explain', 'elaborate', 'tell me more',
+     'more info', 'describe', 'thorough', 'comprehensive'
+ ]
+
+ PERSONA_INSTRUCTION = """
+ You are a warm, encouraging mentor at Get Scene Studios. Your goal is to help actors navigate their careers with confidence.
+ - User Context: The user is already on getscenestudios.com. Behave as if you are a guide right there with them.
+ - Negative Constraint: NEVER use the phrase "Visit the website" or "Check our site". Instead, use "You can see here..." or "Click this link below..." or similar language that implies current presence.
+ - Sound natural and human, not scripted or robotic. Use conversational transitions like "I'd suggest starting with..." or "A great way to approach this is..."
+ - Be encouraging but practical. Acknowledge that the acting journey is a marathon, not a sprint.
+ - Help the user THINK: Instead of just giving an answer, add a brief "mentorship flourish" that explains the value of a recommendation (e.g., "This workshop is great because it gets you comfortable with the pressure of a real callback.")
+ """
database.py ADDED
@@ -0,0 +1,253 @@
+ import sqlite3
+ import json
+ from contextlib import contextmanager
+ from typing import List, Dict, Any, Tuple
+ from config import DB_PATH
+
+ @contextmanager
+ def get_db_connection():
+     """Context manager for database connections."""
+     conn = sqlite3.connect(DB_PATH)
+     conn.row_factory = sqlite3.Row
+     try:
+         yield conn
+     finally:
+         conn.close()
+
+ def fetch_all_embeddings(table: str) -> List[Tuple[int, str, List[float]]]:
+     """Fetch all embeddings from a table."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute(f"SELECT id, full_text, embedding FROM {table}")
+         rows = cur.fetchall()
+
+         parsed = []
+         for row in rows:
+             try:
+                 parsed.append((row['id'], row['full_text'], json.loads(row['embedding'])))
+             except (json.JSONDecodeError, TypeError):
+                 continue
+         return parsed
+
+ def fetch_row_by_id(table: str, row_id: int) -> Dict[str, Any]:
+     """Fetch a single row by ID."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute(f"SELECT * FROM {table} WHERE id = ?", (row_id,))
+         row = cur.fetchone()
+         return dict(row) if row else {}
+
+ def fetch_all_faq_embeddings() -> List[Tuple[int, str, str, List[float]]]:
+     """Fetch all FAQ embeddings."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute("SELECT id, question, answer, embedding FROM faq_entries")
+         rows = cur.fetchall()
+
+         parsed = []
+         for row in rows:
+             try:
+                 parsed.append((row['id'], row['question'], row['answer'], json.loads(row['embedding'])))
+             except (json.JSONDecodeError, TypeError):
+                 continue
+         return parsed
+
+ def log_question(
+     question: str,
+     session_id: str = None,
+     category: str = None,
+     answer: str = None,
+     detected_mode: str = None,
+     routing_question: str = None,
+     rule_triggered: str = None,
+     link_provided: bool = False
+ ):
+     """Log a user question to the database with comprehensive observability metadata.
+
+     Args:
+         question: The user's question
+         session_id: Session identifier
+         category: Question category (e.g., 'faq_match', 'llm_generated', 'policy_violation')
+         answer: The bot's response
+         detected_mode: Operating mode ('Mode A' or 'Mode B')
+         routing_question: The routing question asked (if any)
+         rule_triggered: Business rule that was triggered (e.g., 'audit_rule', 'free_class_first')
+         link_provided: Whether a direct link was included in the response
+     """
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+
+         try:
+             cur.execute("""
+                 INSERT INTO question_logs (
+                     session_id, question, category, answer,
+                     detected_mode, routing_question, rule_triggered, link_provided
+                 )
+                 VALUES (?, ?, ?, ?, ?, ?, ?, ?)
+             """, (
+                 session_id, question, category, answer,
+                 detected_mode, routing_question, rule_triggered,
+                 1 if link_provided else 0
+             ))
+         except sqlite3.OperationalError as e:
+             # Fallback for older schema versions (shouldn't happen after migration)
+             print(f"⚠️ Logging error: {e}. Falling back to basic logging.")
+             cur.execute("INSERT INTO question_logs (question) VALUES (?)", (question,))
+
+         conn.commit()
+
+ def get_session_state(session_id: str) -> Dict[str, Any]:
+     """Get session state from DB"""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute("SELECT * FROM user_sessions WHERE session_id = ?", (session_id,))
+         row = cur.fetchone()
+         if row:
+             return dict(row)
+         return {"preference": None, "msg_count": 0, "clarification_count": 0, "knowledge_context": "{}"}
+
+ def update_session_state(session_id: str, preference: str = None, increment_count: bool = True, increment_clarification: bool = False, reset_clarification: bool = False, knowledge_update: Dict = None):
+     """Update session state with Knowledge Dictionary support"""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+
+         # Check if exists
+         cur.execute("SELECT preference, msg_count, clarification_count, knowledge_context FROM user_sessions WHERE session_id = ?", (session_id,))
+         row = cur.fetchone()
+
+         current_knowledge = {}
+         if row:
+             curr_pref, curr_count, curr_clarification, curr_knowledge_json = row
+             try:
+                 current_knowledge = json.loads(curr_knowledge_json)
+             except:
+                 current_knowledge = {}
+
+             new_pref = preference if preference else curr_pref
+             new_count = curr_count + 1 if increment_count else curr_count
+
+             # 10-Message Memory Rule: Reset if we hit the limit
+             if new_count > 10:
+                 print(f"🔄 Session {session_id} reached 10 messages. Resetting memory context.")
+                 new_count = 1
+                 new_pref = None
+                 current_knowledge = {}
+                 new_clarification = 0
+             else:
+                 new_clarification = curr_clarification
+                 if reset_clarification:
+                     new_clarification = 0
+                 elif increment_clarification:
+                     new_clarification = curr_clarification + 1
+
+             # Merge knowledge updates
+             if knowledge_update:
+                 current_knowledge.update(knowledge_update)
+
+             new_knowledge_json = json.dumps(current_knowledge)
+
+             cur.execute("""
+                 UPDATE user_sessions
+                 SET preference = ?, msg_count = ?, clarification_count = ?, knowledge_context = ?, last_updated = CURRENT_TIMESTAMP
+                 WHERE session_id = ?
+             """, (new_pref, new_count, new_clarification, new_knowledge_json, session_id))
+         else:
+             new_pref = preference
+             new_count = 1 if increment_count else 0
+             new_clarification = 1 if increment_clarification else 0
+
+             if knowledge_update:
+                 current_knowledge.update(knowledge_update)
+             new_knowledge_json = json.dumps(current_knowledge)
+
+             cur.execute("""
+                 INSERT INTO user_sessions (session_id, preference, msg_count, clarification_count, knowledge_context)
+                 VALUES (?, ?, ?, ?, ?)
+             """, (session_id, new_pref, new_count, new_clarification, new_knowledge_json))
+
+         conn.commit()
+
+ def update_faq_entry(faq_id: int, question: str, answer: str):
+     """Update an existing FAQ entry."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute(
+             "UPDATE faq_entries SET question = ?, answer = ?, embedding = NULL WHERE id = ?",
+             (question, answer, faq_id)
+         )
+         conn.commit()
+
+ def delete_faq_entry(faq_id: int):
+     """Delete an FAQ entry."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute("DELETE FROM faq_entries WHERE id = ?", (faq_id,))
+         conn.commit()
+
+ def add_faq_entry(question: str, answer: str):
+     """Add a new FAQ entry."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute(
+             "INSERT INTO faq_entries (question, answer) VALUES (?, ?)",
+             (question, answer)
+         )
+         conn.commit()
+
+ def bulk_update_faqs(entries: List[Dict[str, str]]):
+     """Bulk update FAQs from a list of dictionaries."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         for entry in entries:
+             question = entry.get('Question') or entry.get('question')
+             answer = entry.get('Answer') or entry.get('answer')
+             if question and answer:
+                 cur.execute(
+                     "INSERT INTO faq_entries (question, answer) VALUES (?, ?)",
+                     (question, answer)
+                 )
+         conn.commit()
+
+ def bulk_update_podcasts(entries: List[Dict[str, str]]):
+     """Bulk update Podcasts from a list of dictionaries."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         for entry in entries:
+             guest = entry.get('Guest Name') or entry.get('guest_name')
+             url = entry.get('YouTube URL') or entry.get('youtube_url')
+             summary = entry.get('Summary') or entry.get('summary')
+             if guest and url and summary:
+                 # Format full_text as required by the existing logic
+                 full_text = f"Guest: {guest}. Summary: {summary}"
+                 # Store summary in highlight_json as a simple list for compatibility
+                 h_json = json.dumps([{"summary": summary}])
+                 cur.execute(
+                     "INSERT INTO podcast_episodes (guest_name, youtube_url, highlight_json, full_text) VALUES (?, ?, ?, ?)",
+                     (guest, url, h_json, full_text)
+                 )
+         conn.commit()
+
+ def fetch_all_podcast_metadata() -> List[Dict[str, Any]]:
+     """Fetch all podcast metadata for the admin table."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute("SELECT id, guest_name, youtube_url, highlight_json FROM podcast_episodes")
+         rows = cur.fetchall()
+         results = []
+         for row in rows:
+             d = dict(row)
+             # Try to extract plain summary from JSON for the table
+             try:
+                 h = json.loads(d['highlight_json'])
+                 d['summary'] = h[0]['summary'] if h and isinstance(h, list) else d['highlight_json']
+             except:
+                 d['summary'] = d['highlight_json']
+             results.append(d)
+         return results
+
+ def fetch_all_faq_metadata() -> List[Dict[str, Any]]:
+     """Fetch all FAQ metadata for the admin table."""
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+         cur.execute("SELECT id, question, answer FROM faq_entries")
+         return [dict(row) for row in cur.fetchall()]
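
Note that the table definitions themselves are not part of this commit; the queries above only assume they exist. A rough sketch of the minimal SQLite schema they imply follows, where column types and defaults are assumptions and the real migration may differ.

import sqlite3
from config import DB_PATH

# Assumed minimal schema inferred from the SELECT/INSERT/UPDATE statements in
# database.py; embeddings are stored as JSON text, matching json.loads/json.dumps above.
with sqlite3.connect(DB_PATH) as conn:
    conn.executescript("""
        CREATE TABLE IF NOT EXISTS faq_entries (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            question TEXT, answer TEXT, embedding TEXT
        );
        CREATE TABLE IF NOT EXISTS podcast_episodes (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            guest_name TEXT, youtube_url TEXT, highlight_json TEXT,
            full_text TEXT, embedding TEXT
        );
        CREATE TABLE IF NOT EXISTS question_logs (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            session_id TEXT, question TEXT, category TEXT, answer TEXT,
            detected_mode TEXT, routing_question TEXT, rule_triggered TEXT,
            link_provided INTEGER DEFAULT 0
        );
        CREATE TABLE IF NOT EXISTS user_sessions (
            session_id TEXT PRIMARY KEY,
            preference TEXT, msg_count INTEGER DEFAULT 0,
            clarification_count INTEGER DEFAULT 0,
            knowledge_context TEXT DEFAULT '{}',
            last_updated TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        );
    """)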
utils.py ADDED
@@ -0,0 +1,163 @@
+ import openai
+ import numpy as np
+ import re
+ from typing import List, Tuple
+ from config import EMBED_MODEL
+
+ def get_embedding(text: str) -> List[float]:
+     """Generate embedding for a given text."""
+     text_strip = text.replace("\n", " ").strip()
+     response = openai.embeddings.create(input=[text_strip], model=EMBED_MODEL)
+     return response.data[0].embedding
+
+ def cosine_similarity(a: List[float], b: List[float]) -> float:
+     """Calculate cosine similarity between two vectors."""
+     a = np.array(a)
+     b = np.array(b)
+     if np.linalg.norm(a) == 0 or np.linalg.norm(b) == 0:
+         return 0.0
+     return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
+
+ def clean_time(time_str: str) -> str:
+     """Clean up time string."""
+     if not time_str:
+         return ""
+
+     time_match = re.search(r'(\d{1,2}):?(\d{0,2})\s*(AM|PM)', time_str, re.IGNORECASE)
+     if time_match:
+         hour = time_match.group(1)
+         minute = time_match.group(2) or "00"
+         ampm = time_match.group(3).upper()
+         return f"{hour}:{minute} {ampm}"
+
+     return time_str.strip()
+
+ def find_top_k_matches(user_embedding, dataset, k=3):
+     """Find top k matching entries from a dataset."""
+     scored = []
+     for entry_id, text, emb in dataset:
+         score = cosine_similarity(user_embedding, emb)
+         scored.append((score, entry_id, text))
+     scored.sort(reverse=True)
+     return scored[:k]
+
+ def classify_intent(question: str) -> str:
+     """
+     Classify the user's intent into:
+     Mode A: Recommendation Mode (Workshops, Dates, Availability, Recommendations)
+     Mode B: Front Desk Mode (Default - Everything else)
+     """
+     prompt = f"""Classify the following user question into one of two modes:
+     1. "Mode A - Recommendation Mode": Use this if the user is asking about workshops, specific dates, what's available this month, asking for recommendations, or career goals (like getting an agent).
+     2. "Mode B - Front Desk Mode": Use this for broad introductory questions, kids classes, signing up, summit, instructor roles, auditing, online vs in-studio, general policies, or specific questions about existing classes.
+
+     User Question: "{question}"
+
+     Response must be exactly "Mode A" or "Mode B"."""
+
+     try:
+         response = openai.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0,
+             max_tokens=5
+         )
+         prediction = response.choices[0].message.content.strip()
+         if "Mode A" in prediction:
+             return "Mode A"
+         return "Mode B"
+     except Exception as e:
+         print(f"Error in intent classification: {e}")
+         return "Mode B"  # Default to Front Desk Mode
+
+ def should_include_email(question: str) -> bool:
+     """
+     Determine if the contact email should be shown based on user intent.
+     Allowed for: Payments, Refunds, Attendance issues, Account problems.
+     """
+     from config import EMAIL_ONLY_KEYWORDS
+     import re
+
+     question_lower = question.lower()
+     for word in EMAIL_ONLY_KEYWORDS:
+         pattern = rf'\b{re.escape(word)}\b'
+         if re.search(pattern, question_lower):
+             return True
+
+     return False
+
+ def classify_user_type(question: str, history: List[dict] = None) -> str:
+     """
+     Classify the user type into:
+     - new_actor
+     - experienced_actor
+     - parent
+     - current_student
+     - unknown
+     """
+     history_str = ""
+     if history:
+         history_str = "\nConversation context:\n" + "\n".join([f"{m['role']}: {m['content'][:100]}..." for m in history[-3:]])
+
+     prompt = f"""Classify the user into exactly one of these categories based on their question and context:
+     1. "new_actor": Just starting out, has no experience, or is asking how to begin.
+     2. "experienced_actor": Already has credits, mentions agents, looking for advanced workshops, or refers to their career progress.
+     3. "parent": Asking on behalf of their child, mentions "my kid", "my son", "my daughter", "teens".
+     4. "current_student": Refers to past/current classes at Get Scene, mentions a specific GSP membership, or asks about recurring student workshops.
+     5. "unknown": Not enough information yet.
+
+     User Question: "{question}"{history_str}
+
+     Response must be exactly one of: new_actor, experienced_actor, parent, current_student, unknown."""
+
+     try:
+         response = openai.chat.completions.create(
+             model="gpt-4o-mini",
+             messages=[{"role": "user", "content": prompt}],
+             temperature=0,
+             max_tokens=10
+         )
+         prediction = response.choices[0].message.content.strip().lower()
+         valid_types = ["new_actor", "experienced_actor", "parent", "current_student", "unknown"]
+         for t in valid_types:
+             if t in prediction:
+                 return t
+         return "unknown"
+     except Exception as e:
+         print(f"Error in user type classification: {e}")
+         return "unknown"
+
+ def recalculate_all_embeddings():
+     """Recalculate embeddings for all entries in faq_entries and podcast_episodes that are missing embeddings."""
+     from database import get_db_connection
+     import json
+
+     with get_db_connection() as conn:
+         cur = conn.cursor()
+
+         # 1. Update FAQs
+         print("Starting FAQ embedding recalculation...")
+         cur.execute("SELECT id, question FROM faq_entries WHERE embedding IS NULL")
+         faqs = cur.fetchall()
+         for faq_id, question in faqs:
+             try:
+                 emb = get_embedding(question)
+                 cur.execute("UPDATE faq_entries SET embedding = ? WHERE id = ?", (json.dumps(emb), faq_id))
+                 print(f" ✓ Updated FAQ ID {faq_id}")
+             except Exception as e:
+                 print(f" ✗ Error updating FAQ ID {faq_id}: {e}")
+
+         # 2. Update Podcasts
+         print("Starting Podcast embedding recalculation...")
+         cur.execute("SELECT id, full_text FROM podcast_episodes WHERE embedding IS NULL")
+         podcasts = cur.fetchall()
+         for pod_id, full_text in podcasts:
+             try:
+                 emb = get_embedding(full_text)
+                 cur.execute("UPDATE podcast_episodes SET embedding = ? WHERE id = ?", (json.dumps(emb), pod_id))
+                 print(f" ✓ Updated Podcast ID {pod_id}")
+             except Exception as e:
+                 print(f" ✗ Error updating Podcast ID {pod_id}: {e}")
+
+         conn.commit()
+         print("Embedding recalculation complete.")
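
As a quick illustration of how these helpers compose with database.py (not part of the commit; it assumes OPENAI_API_KEY is configured, the FAQ table already has embeddings, and the example question is made up):

from database import fetch_all_faq_embeddings
from utils import get_embedding, find_top_k_matches

# Illustrative top-3 FAQ lookup: embed the user question, then score it against
# the stored FAQ question embeddings with cosine similarity.
user_question = "How do I sign up for a workshop?"
dataset = [(faq_id, q, emb) for faq_id, q, a, emb in fetch_all_faq_embeddings()]
for score, faq_id, text in find_top_k_matches(get_embedding(user_question), dataset, k=3):
    print(f"{score:.3f}  [{faq_id}] {text}")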