Spaces:

agnixcode
/

YoutubeTranscribevideochatbot

Sleeping

App Files Files Community

agnixcode committed 21 days ago

Commit

f93afb8

verified ·

1 Parent(s): 6782e6b

Create app.py

Browse files

Files changed (1) hide show

app.py +552 -0

app.py ADDED Viewed

	@@ -0,0 +1,552 @@

+# ============================================================
+# YouTube RAG Q&A System — Production-Quality Colab Notebook
+# Author  : Your Name
+# Model   : Groq LLaMA-3.3-70B-Versatile (128K context)
+# Embedder: all-MiniLM-L6-v2  (Sentence-Transformers, free)
+# Vector DB: FAISS (Facebook AI, free, CPU)
+# UI      : Gradio 4.x
+# ============================================================
+# ─────────────────────────────────────────────────────────────
+# MODULE 0 ❯  INSTALLATION
+# Run this cell once.  Restart runtime after it finishes.
+# ─────────────────────────────────────────────────────────────
+# !pip install -q \
+#   gradio \
+#   youtube-transcript-api \
+#   sentence-transformers \
+#   faiss-cpu \
+#   groq \
+#   langchain-text-splitters \
+#   python-dotenv
+# ─────────────────────────────────────────────────────────────
+# MODULE 1 ❯  IMPORTS & CONFIGURATION
+# All third-party imports live here.
+# API key is read from Colab Secrets (preferred) or env var.
+# ─────────────────────────────────────────────────────────────
+import os
+import re
+import logging
+from typing import Optional
+# ── UI framework ─────────────────────────────────────────────
+import gradio as gr
+# ── YouTube transcript (free, no API key required) ───────────
+from youtube_transcript_api import YouTubeTranscriptApi
+from youtube_transcript_api._errors import (
+    TranscriptsDisabled,
+    NoTranscriptFound,
+    VideoUnavailable,
+)
+# ── Embedding model (local, runs on CPU) ─────────────────────
+from sentence_transformers import SentenceTransformer
+# ── Text splitting ────────────────────────────────────────────
+from langchain_text_splitters import RecursiveCharacterTextSplitter
+# ── Numerical / vector DB ─────────────────────────────────────
+import numpy as np
+import faiss
+# ── Groq LLM client ───────────────────────────────────────────
+from groq import Groq
+# ── Logging — shows clean status in Colab output ──────────────
+# Configure the root logger once; every module-level and callback message
+# goes through the named "rag" logger below.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s | %(levelname)s | %(message)s",
+    datefmt="%H:%M:%S",
+)
+log = logging.getLogger("rag")
+# ── API key ────────────────────────────────────────────────────
+# Option A (recommended in Colab): use Secrets panel (🔑 left sidebar)
+#   key name → GROQ_API_KEY
+# Option B (anywhere else): export GROQ_API_KEY as an environment variable.
+try:
+    from google.colab import userdata  # type: ignore
+    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
+except Exception:
+    # Any failure on the Colab path (import or secret lookup) falls back to
+    # the environment variable.
+    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")
+# Fail fast at import time: nothing below can work without a key.
+if not GROQ_API_KEY:
+    raise EnvironmentError(
+        "⚠️  GROQ_API_KEY not found. "
+        "Add it via Colab Secrets (🔑) or set os.environ['GROQ_API_KEY']."
+    )
+# ── Model identifiers ──────────────────────────────────────────
+GROQ_MODEL      = "llama-3.3-70b-versatile"   # 128K context, best OSS on Groq 2025
+EMBED_MODEL     = "all-MiniLM-L6-v2"          # 384-dim, fast, free, CPU-friendly
+CHUNK_SIZE      = 500                          # characters per chunk (the splitter measures with len)
+CHUNK_OVERLAP   = 50                           # characters shared by neighbouring chunks to preserve context
+TOP_K           = 4                            # how many chunks to retrieve per query
+MAX_NEW_TOKENS  = 1024                         # LLM answer budget (passed as max_tokens)
+# ─────────────────────────────────────────────────────────────
+# MODULE 2 ❯  MODEL INITIALISATION
+# Load embedding model once at startup so every call is fast.
+# Groq client is stateless — one instance is enough.
+# ─────────────────────────────────────────────────────────────
+# Load the sentence-transformer once at import time; it is reused for both
+# chunk and query embeddings.
+log.info("Loading embedding model …")
+embedding_model = SentenceTransformer(EMBED_MODEL)
+log.info("Embedding model ready ✓")
+groq_client = Groq(api_key=GROQ_API_KEY)
+# ── Global vector store ────────────────────────────────────────
+# These are module-level globals so every Gradio callback
+# can read/write them without passing objects around.
+# build_vector_store() assigns vector_store and chunks_store together.
+vector_store: Optional[faiss.IndexFlatIP] = None   # FAISS inner-product index (None until a video is processed)
+chunks_store: list[str] = []                        # parallel list of text chunks (row i of the index ↔ chunks_store[i])
+current_video_title: str = ""                       # placeholder for a video title; never assigned in the visible code
+# ─────────────────────────────────────────────────────────────
+# MODULE 3 ❯  YOUTUBE TRANSCRIPT FETCHER
+# ─────────────────────────────────────────────────────────────
+def extract_video_id(url: str) -> str:
+    """
+    Extract the YouTube video ID from any common URL format.
+    Handles:
+      https://www.youtube.com/watch?v=VIDEO_ID
+      https://youtu.be/VIDEO_ID
+      https://youtube.com/shorts/VIDEO_ID
+      https://www.youtube.com/embed/VIDEO_ID
+    """
+    patterns = [
+        r"(?:v=)([A-Za-z0-9_-]{11})",
+        r"youtu\.be/([A-Za-z0-9_-]{11})",
+        r"shorts/([A-Za-z0-9_-]{11})",
+        r"embed/([A-Za-z0-9_-]{11})",
+    ]
+    for pattern in patterns:
+        match = re.search(pattern, url)
+        if match:
+            return match.group(1)
+    raise ValueError(f"Could not extract video ID from URL: {url}")
+def get_transcript(url: str) -> tuple[str, str]:
+    """
+    Fetch the transcript for a YouTube video.
+    Returns
+    -------
+    (transcript_text, status_message)
+    On error: (empty string, error description)
+    """
+    try:
+        video_id = extract_video_id(url)
+        log.info(f"Fetching transcript for video ID: {video_id}")
+        api = YouTubeTranscriptApi()
+        # .fetch() returns a FetchedTranscript object (updated API)
+        transcript_data = api.fetch(video_id)
+        # Join all text segments into one continuous string
+        full_text = " ".join(
+            segment.text.strip()
+            for segment in transcript_data
+            if segment.text.strip()
+        )
+        word_count = len(full_text.split())
+        log.info(f"Transcript fetched — {word_count:,} words")
+        return full_text, f"✅ Transcript fetched ({word_count:,} words)"
+    except VideoUnavailable:
+        return "", "❌ Video is unavailable or private."
+    except TranscriptsDisabled:
+        return "", "❌ Transcripts are disabled for this video."
+    except NoTranscriptFound:
+        return "", "❌ No transcript found. Try a video with auto-generated captions."
+    except ValueError as e:
+        return "", f"❌ Invalid URL — {e}"
+    except Exception as e:
+        log.exception("Unexpected error fetching transcript")
+        return "", f"❌ Unexpected error: {e}"
+# ─────────────────────────────────────────────────────────────
+# MODULE 4 ❯  VECTOR DATABASE BUILDER
+# Splits transcript → chunks → embeddings → FAISS index
+# ─────────────────────────────────────────────────────────────
+def build_vector_store(transcript: str) -> str:
+    """
+    Convert a raw transcript into a FAISS vector index.
+    Steps
+    -----
+    1. Split text into overlapping chunks via RecursiveCharacterTextSplitter
+    2. Encode each chunk with the embedding model
+    3. Build a FAISS IndexFlatL2 and add the vectors
+    4. Store everything in module-level globals
+    Returns
+    -------
+    Status message string.
+    """
+    global vector_store, chunks_store
+    # ── Step 1: Chunk ──────────────────────────────────────────
+    splitter = RecursiveCharacterTextSplitter(
+        chunk_size=CHUNK_SIZE,
+        chunk_overlap=CHUNK_OVERLAP,
+        length_function=len,           # character-based length
+        separators=["\n\n", "\n", ". ", " ", ""],
+    )
+    chunks = splitter.split_text(transcript)
+    log.info(f"Created {len(chunks)} chunks")
+    if not chunks:
+        return "❌ No chunks created — transcript may be too short."
+    # ── Step 2: Embed ──────────────────────────────────────────
+    log.info("Encoding chunks …")
+    embeddings = embedding_model.encode(
+        chunks,
+        show_progress_bar=False,
+        batch_size=64,
+        normalize_embeddings=True,     # cosine similarity via inner product
+    )
+    # ── Step 3: Index ─────────────────────────────────���────────
+    dimension = embeddings.shape[1]
+    index = faiss.IndexFlatIP(dimension)   # Inner Product → cosine on normalised vecs
+    index.add(np.array(embeddings, dtype=np.float32))
+    # ── Step 4: Persist to globals ─────────────────────────────
+    vector_store = index
+    chunks_store = chunks
+    log.info(f"FAISS index built — {index.ntotal} vectors, dim={dimension}")
+    return f"✅ Indexed {len(chunks)} chunks into FAISS (dim={dimension})"
+# ─────────────────────────────────────────────────────────────
+# MODULE 5 ❯  RETRIEVER
+# Similarity search: query → top-k relevant chunks
+# ─────────────────────────────────────────────────────────────
+def retrieve_context(query: str, top_k: int = TOP_K) -> str:
+    """
+    Retrieve the most semantically relevant chunks for a given query.
+    Parameters
+    ----------
+    query  : user's natural-language question
+    top_k  : number of chunks to return
+    Returns
+    -------
+    String of concatenated retrieved chunks, separated by blank lines.
+    """
+    if vector_store is None or not chunks_store:
+        return ""
+    # Embed and normalise the query (same preprocessing as the chunks)
+    query_vec = embedding_model.encode(
+        [query],
+        normalize_embeddings=True,
+    )
+    # FAISS inner-product search (cosine on normalised vectors)
+    scores, indices = vector_store.search(
+        np.array(query_vec, dtype=np.float32), top_k
+    )
+    retrieved = []
+    for score, idx in zip(scores[0], indices[0]):
+        if idx == -1:   # FAISS returns -1 for empty slots
+            continue
+        retrieved.append(f"[Relevance: {score:.3f}]\n{chunks_store[idx]}")
+    log.info(f"Retrieved {len(retrieved)} chunks for query: '{query[:60]}…'")
+    return "\n\n---\n\n".join(retrieved)
+# ─────────────────────────────────────────────────────────────
+# MODULE 6 ❯  LLM — GROQ LLAMA 3.3-70B
+# Augment + Generate step of RAG
+# ─────────────────────────────────────────────────────────────
+# System message sent with every Groq request (see generate_answer). It tells
+# the model to answer only from the transcript context supplied in the user turn.
+SYSTEM_PROMPT = """\
+You are a precise, helpful AI assistant that answers questions about YouTube videos \
+based strictly on the provided transcript context.
+Rules:
+- Answer ONLY from the context provided.
+- If the context does not contain enough information, say so clearly.
+- Be concise but complete.
+- Use bullet points for lists or steps.
+- Never fabricate information not present in the context.
+"""
+def generate_answer(query: str) -> str:
+    """
+    Full RAG generate step:
+      1. Retrieve relevant context from FAISS
+      2. Build an augmented prompt
+      3. Send to Groq LLaMA-3.3-70B
+      4. Return the model's response
+    Parameters
+    ----------
+    query : user's question
+    Returns
+    -------
+    The model's answer as a string.
+    """
+    context = retrieve_context(query)
+    if not context:
+        return "⚠️ No relevant context found in the transcript for your question."
+    user_message = f"""\
+Context from the video transcript:
+{context}
+---
+Question: {query}
+Answer:"""
+    try:
+        response = groq_client.chat.completions.create(
+            model=GROQ_MODEL,
+            messages=[
+                {"role": "system", "content": SYSTEM_PROMPT},
+                {"role": "user",   "content": user_message},
+            ],
+            max_tokens=MAX_NEW_TOKENS,
+            temperature=0.2,       # low temp → factual, grounded answers
+            top_p=0.9,
+        )
+        answer = response.choices[0].message.content.strip()
+        log.info("LLM response received")
+        return answer
+    except Exception as e:
+        log.exception("Groq API error")
+        return f"❌ LLM error: {e}"
+# ─────────────────────────────────────────────────────────────
+# MODULE 7 ❯  ORCHESTRATION PIPELINE
+# Ties transcript fetch + vector store build together.
+# Called by the Gradio "Process Video" button.
+# ─────────────────────────────────────────────────────────────
+def process_video(url: str) -> tuple[str, str, str]:
+    """
+    Full ingestion pipeline triggered by the UI.
+    Returns
+    -------
+    (transcript_preview, index_status, combined_status)
+    suitable for Gradio outputs.
+    """
+    global current_video_title
+    if not url or not url.strip():
+        return "", "", "⚠️ Please enter a YouTube URL."
+    # ── Phase 1: Fetch transcript ──────────────────────────────
+    transcript, fetch_status = get_transcript(url.strip())
+    if not transcript:
+        return "", "", fetch_status
+    # ── Phase 2: Build vector store ───────────────────────────
+    index_status = build_vector_store(transcript)
+    # ── Phase 3: Summary line for UI ──────────────────────────
+    combined = f"{fetch_status}\n{index_status}\n\n💬 Video is ready — switch to the Chat tab!"
+    # Show first 2000 chars in the transcript preview box
+    preview = transcript[:2000] + (" …[truncated]" if len(transcript) > 2000 else "")
+    return preview, index_status, combined
+# ─────────────────────────────────────────────────────────────
+# MODULE 8 ❯  CHAT HANDLER
+# Called on every user message in the Chat tab.
+# ─────────────────────────────────────────────────────────────
+def chat_with_video(
+    user_query: str,
+    history: list[tuple[str, str]],
+) -> tuple[list[tuple[str, str]], str]:
+    """
+    Handle a single chat turn.
+    Parameters
+    ----------
+    user_query : the question typed by the user
+    history    : Gradio chat history (list of (user, assistant) pairs)
+    Returns
+    -------
+    Updated history, empty string (clears the input box)
+    """
+    if not user_query.strip():
+        return history, ""
+    if vector_store is None:
+        history.append((user_query, "⚠️ Please process a video first on the **Process Video** tab."))
+        return history, ""
+    answer = generate_answer(user_query)
+    history.append((user_query, answer))
+    return history, ""
+# ─────────────────────────────────────────────────────────────
+# MODULE 9 ❯  GRADIO USER INTERFACE
+# Professional two-tab layout:
+#   Tab 1 — Process Video (URL input, status, transcript preview)
+#   Tab 2 — Chat         (conversation window + input)
+# ─────────────────────────────────────────────────────────────
+# Custom stylesheet handed to gr.Blocks(css=...). The selectors target the
+# elem_id values assigned to the components in the layout below.
+CSS = """
+/* ── Global ── */
+#app-header { text-align: center; margin-bottom: 0.5rem; }
+#status-box textarea {
+    font-size: 0.85rem;
+    color: var(--body-text-color);
+    background: var(--input-background-fill);
+}
+#transcript-box textarea { font-size: 0.8rem; }
+#chat-window { height: 480px; }
+/* ── Send on Enter ── */
+#chat-input textarea { resize: none; }
+"""
+# Build the interface declaratively. Components are created in layout order
+# inside the context managers; `app` is launched by the __main__ guard below.
+with gr.Blocks(
+    title="YouTube RAG Q&A",
+    theme=gr.themes.Soft(
+        primary_hue="indigo",
+        neutral_hue="slate",
+        font=gr.themes.GoogleFont("Inter"),
+    ),
+    css=CSS,
+) as app:
+    # ── Header ─────────────────────────────────────────────────
+    gr.Markdown(
+        """
+        # 🎥 YouTube RAG Q&A
+        **Paste any YouTube URL → transcribe → chat with the video using AI**
+        *Powered by [Groq](https://groq.com) · LLaMA 3.3-70B · FAISS · Sentence-Transformers*
+        """,
+        elem_id="app-header",
+    )
+    # ── Tab 1: Process Video ────────────────────────────────────
+    with gr.Tab("📥  Process Video", id="tab-process"):
+        with gr.Row():
+            url_input = gr.Textbox(
+                label="YouTube URL",
+                placeholder="https://www.youtube.com/watch?v=...",
+                scale=4,
+            )
+            process_btn = gr.Button(
+                "▶  Transcribe & Index",
+                variant="primary",
+                scale=1,
+                min_width=180,
+            )
+        status_output = gr.Textbox(
+            label="Pipeline Status",
+            interactive=False,
+            lines=4,
+            elem_id="status-box",
+        )
+        with gr.Accordion("📄  Transcript Preview (first 2000 chars)", open=False):
+            transcript_output = gr.Textbox(
+                label="Raw transcript",
+                interactive=False,
+                lines=12,
+                elem_id="transcript-box",
+            )
+        # ── Wiring ────────────────────────────────────────────
+        # process_video returns (preview, index_status, combined_status).
+        # The middle value is routed to a hidden Textbox created inline,
+        # because combined_status already contains the index status.
+        process_btn.click(
+            fn=process_video,
+            inputs=url_input,
+            outputs=[transcript_output, gr.Textbox(visible=False), status_output],
+        )
+    # ── Tab 2: Chat ─────────────────────────────────────────────
+    with gr.Tab("💬  Chat with Video", id="tab-chat"):
+        chatbot = gr.Chatbot(
+            label="Conversation",
+            bubble_full_width=False,
+            height=480,
+            elem_id="chat-window",
+        )
+        with gr.Row():
+            chat_input = gr.Textbox(
+                placeholder="Ask anything about the video…",
+                label="",
+                scale=5,
+                elem_id="chat-input",
+                autofocus=True,
+            )
+            send_btn = gr.Button("Send  ➤", variant="primary", scale=1, min_width=100)
+        clear_btn = gr.Button("🗑  Clear conversation", variant="secondary", size="sm")
+        # ── Wiring ────────────────────────────────────────────
+        # Submit on button click or Enter key
+        # Both events call the same handler, which returns the updated
+        # history and an empty string that clears the input box.
+        send_btn.click(
+            fn=chat_with_video,
+            inputs=[chat_input, chatbot],
+            outputs=[chatbot, chat_input],
+        )
+        chat_input.submit(
+            fn=chat_with_video,
+            inputs=[chat_input, chatbot],
+            outputs=[chatbot, chat_input],
+        )
+        # Returning an empty list resets the chatbot to no messages
+        clear_btn.click(fn=lambda: [], outputs=chatbot)
+    # ── Footer ──────────────────────────────────────────────────
+    gr.Markdown(
+        "<center style='font-size:0.75rem; color:#888;'>"
+        "Open-source · No data stored · Transcript processed locally"
+        "</center>"
+    )
+# ─────────────────────────────────────────────────────────────
+# MODULE 10 ❯  LAUNCH
+# ─────────────────────────────────────────────────────────────
+# Start the Gradio server only when the file is executed as a script,
+# not when it is imported.
+# NOTE(review): share=True asks Gradio for a public link; presumably not
+# needed when the app is already served by a hosting platform — confirm.
+if __name__ == "__main__":
+    app.launch(
+        debug=True,          # shows tracebacks in output
+        share=True,          # creates a public gradio.live link (great for demos)
+        show_error=True,
+    )