agnixcode committed on
Commit
f93afb8
·
verified ·
1 Parent(s): 6782e6b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +552 -0
app.py ADDED
@@ -0,0 +1,552 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ============================================================
2
+ # YouTube RAG Q&A System — Production-Quality Colab Notebook
3
+ # Author : Your Name
4
+ # Model : Groq LLaMA-3.3-70B-Versatile (128K context)
5
+ # Embedder: all-MiniLM-L6-v2 (Sentence-Transformers, free)
6
+ # Vector DB: FAISS (Facebook AI, free, CPU)
7
+ # UI : Gradio 4.x
8
+ # ============================================================
9
+
10
+
11
+ # ─────────────────────────────────────────────────────────────
12
+ # MODULE 0 ❯ INSTALLATION
13
+ # Run this cell once. Restart runtime after it finishes.
14
+ # ─────────────────────────────────────────────────────────────
15
+
16
+ # !pip install -q \
17
+ # gradio \
18
+ # youtube-transcript-api \
19
+ # sentence-transformers \
20
+ # faiss-cpu \
21
+ # groq \
22
+ # langchain-text-splitters \
23
+ # python-dotenv
24
+
25
+
26
+ # ─────────────────────────────────────────────────────────────
27
+ # MODULE 1 ❯ IMPORTS & CONFIGURATION
28
+ # All third-party imports live here.
29
+ # API key is read from Colab Secrets (preferred) or env var.
30
+ # ─────────────────────────────────────────────────────────────
31
+
32
+ import os
33
+ import re
34
+ import logging
35
+ from typing import Optional
36
+
37
+ # ── UI framework ─────────────────────────────────────────────
38
+ import gradio as gr
39
+
40
+ # ── YouTube transcript (free, no API key required) ───────────
41
+ from youtube_transcript_api import YouTubeTranscriptApi
42
+ from youtube_transcript_api._errors import (
43
+ TranscriptsDisabled,
44
+ NoTranscriptFound,
45
+ VideoUnavailable,
46
+ )
47
+
48
+ # ── Embedding model (local, runs on CPU) ─────────────────────
49
+ from sentence_transformers import SentenceTransformer
50
+
51
+ # ── Text splitting ────────────────────────────────────────────
52
+ from langchain_text_splitters import RecursiveCharacterTextSplitter
53
+
54
+ # ── Numerical / vector DB ─────────────────────────────────────
55
+ import numpy as np
56
+ import faiss
57
+
58
+ # ── Groq LLM client ───────────────────────────────────────────
59
+ from groq import Groq
60
+
61
+ # ── Logging — shows clean status in Colab output ──────────────
62
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger("rag")

# ── API key ────────────────────────────────────────────────────
# Option A (recommended in Colab): use the Secrets panel (🔑 left sidebar),
#   key name → GROQ_API_KEY
# Option B: set the GROQ_API_KEY environment variable.
try:
    from google.colab import userdata  # type: ignore
    GROQ_API_KEY = userdata.get("GROQ_API_KEY")
except Exception:
    # Not running in Colab, or the secret could not be read.
    GROQ_API_KEY = None

# Fall back to the environment whenever Colab did not yield a usable key,
# including the case where the lookup succeeded but returned nothing.
if not GROQ_API_KEY:
    GROQ_API_KEY = os.getenv("GROQ_API_KEY", "")

if not GROQ_API_KEY:
    raise EnvironmentError(
        "⚠️ GROQ_API_KEY not found. "
        "Add it via Colab Secrets (🔑) or set os.environ['GROQ_API_KEY']."
    )

# ── Model identifiers ──────────────────────────────────────────
GROQ_MODEL = "llama-3.3-70b-versatile"  # 128K context, best OSS on Groq 2025
EMBED_MODEL = "all-MiniLM-L6-v2"  # 384-dim, fast, free, CPU-friendly
CHUNK_SIZE = 500  # characters per chunk (the splitter measures length with len)
CHUNK_OVERLAP = 50  # characters of overlap to preserve context across chunks
TOP_K = 4  # how many chunks to retrieve per query
MAX_NEW_TOKENS = 1024  # LLM answer budget
91
+
92
+
93
+ # ─────────────────────────────────────────────────────────────
94
+ # MODULE 2 ❯ MODEL INITIALISATION
95
+ # Load embedding model once at startup so every call is fast.
96
+ # Groq client is stateless — one instance is enough.
97
+ # ─────────────────────────────────────────────────────────────
98
+
99
log.info("Loading embedding model …")
embedding_model = SentenceTransformer(EMBED_MODEL)
log.info("Embedding model ready ✓")

groq_client = Groq(api_key=GROQ_API_KEY)

# ── Global vector store ────────────────────────────────────────
# These are module-level globals so every Gradio callback
# can read/write them without passing objects around.
# The index is built with faiss.IndexFlatIP (inner product), so the
# annotation names that type.
vector_store: Optional[faiss.IndexFlatIP] = None  # FAISS index, None until a video is processed
chunks_store: list[str] = []  # chunk texts; position i corresponds to vector i in the index
current_video_title: str = ""  # reserved for the UI; never assigned in the visible code
111
+
112
+
113
+ # ─────────────────────────────────────────────────────────────
114
+ # MODULE 3 ❯ YOUTUBE TRANSCRIPT FETCHER
115
+ # ─────────────────────────────────────────────────────────────
116
+
117
def extract_video_id(url: str) -> str:
    """
    Extract the 11-character YouTube video ID from a URL or bare ID.

    Handles:
        https://www.youtube.com/watch?v=VIDEO_ID
        https://youtu.be/VIDEO_ID
        https://youtube.com/shorts/VIDEO_ID
        https://www.youtube.com/embed/VIDEO_ID
        https://www.youtube.com/live/VIDEO_ID
        https://www.youtube.com/v/VIDEO_ID
        VIDEO_ID (a bare 11-character ID)

    Parameters
    ----------
    url : URL (or bare video ID) supplied by the user

    Returns
    -------
    The 11-character video ID.

    Raises
    ------
    ValueError
        If no video ID can be found in the input.
    """
    candidate = url.strip()

    # A bare ID is accepted as-is so users can paste just the identifier.
    if re.fullmatch(r"[A-Za-z0-9_-]{11}", candidate):
        return candidate

    patterns = (
        # Anchor on '?' or '&' so parameters that merely end in "v"
        # (e.g. "rev=...") are not mistaken for the video parameter.
        r"[?&]v=([A-Za-z0-9_-]{11})",
        r"youtu\.be/([A-Za-z0-9_-]{11})",
        r"/(?:shorts|embed|live|v)/([A-Za-z0-9_-]{11})",
    )
    for pattern in patterns:
        match = re.search(pattern, candidate)
        if match:
            return match.group(1)
    raise ValueError(f"Could not extract video ID from URL: {url}")
138
+
139
+
140
def get_transcript(url: str) -> tuple[str, str]:
    """
    Fetch the transcript for a YouTube video.

    Parameters
    ----------
    url : YouTube URL in any format accepted by ``extract_video_id``

    Returns
    -------
    (transcript_text, status_message)
    On error, or when the transcript contains no text:
    (empty string, error description)
    """
    try:
        video_id = extract_video_id(url)
        log.info(f"Fetching transcript for video ID: {video_id}")

        # Instance-based .fetch(); the result is iterated for segments
        # that expose their caption text as ``.text``.
        transcript_data = YouTubeTranscriptApi().fetch(video_id)

        # Strip each segment once, drop the blank ones, and join the rest
        # into one continuous string.
        pieces = (segment.text.strip() for segment in transcript_data)
        full_text = " ".join(piece for piece in pieces if piece)

        if not full_text:
            # Callers treat an empty transcript as failure, so the status
            # message must not claim success.
            return "", "❌ Transcript is empty — nothing to index."

        word_count = len(full_text.split())
        log.info(f"Transcript fetched — {word_count:,} words")
        return full_text, f"✅ Transcript fetched ({word_count:,} words)"

    except VideoUnavailable:
        return "", "❌ Video is unavailable or private."
    except TranscriptsDisabled:
        return "", "❌ Transcripts are disabled for this video."
    except NoTranscriptFound:
        return "", "❌ No transcript found. Try a video with auto-generated captions."
    except ValueError as e:
        return "", f"❌ Invalid URL — {e}"
    except Exception as e:
        # UI boundary: log the traceback, hand the user a readable message.
        log.exception("Unexpected error fetching transcript")
        return "", f"❌ Unexpected error: {e}"
179
+
180
+
181
+ # ─────────────────────────────────────────────────────────────
182
+ # MODULE 4 ❯ VECTOR DATABASE BUILDER
183
+ # Splits transcript → chunks → embeddings → FAISS index
184
+ # ─────────────────────────────────────────────────────────────
185
+
186
def build_vector_store(transcript: str) -> str:
    """
    Convert a raw transcript into a FAISS vector index.

    Steps
    -----
    1. Split text into overlapping chunks via RecursiveCharacterTextSplitter
       (chunk size and overlap are measured in characters)
    2. Encode each chunk with the embedding model (L2-normalised)
    3. Build a FAISS IndexFlatIP and add the vectors; inner product on
       normalised vectors is cosine similarity
    4. Store everything in module-level globals

    Parameters
    ----------
    transcript : full transcript text of the video

    Returns
    -------
    Status message string. On failure the message starts with "❌" and the
    previously stored index is cleared.
    """
    global vector_store, chunks_store

    # ── Step 1: Chunk ──────────────────────────────────────────
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        length_function=len,  # character-based length
        separators=["\n\n", "\n", ". ", " ", ""],
    )
    chunks = splitter.split_text(transcript)
    log.info(f"Created {len(chunks)} chunks")

    if not chunks:
        # Drop any index left over from a previous video so the chat
        # cannot silently answer from stale content.
        vector_store = None
        chunks_store = []
        return "❌ No chunks created — transcript may be too short."

    # ── Step 2: Embed ──────────────────────────────────────────
    log.info("Encoding chunks …")
    embeddings = embedding_model.encode(
        chunks,
        show_progress_bar=False,
        batch_size=64,
        normalize_embeddings=True,  # cosine similarity via inner product
    )
    vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

    # ── Step 3: Index ──────────────────────────────────────────
    dimension = vectors.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner Product → cosine on normalised vecs
    index.add(vectors)

    # ── Step 4: Persist to globals ─────────────────────────────
    vector_store = index
    chunks_store = chunks

    log.info(f"FAISS index built — {index.ntotal} vectors, dim={dimension}")
    return f"✅ Indexed {len(chunks)} chunks into FAISS (dim={dimension})"
236
+
237
+
238
+ # ─────────────────────────────────────────────────────────────
239
+ # MODULE 5 ❯ RETRIEVER
240
+ # Similarity search: query → top-k relevant chunks
241
+ # ─────────────────────────────────────────────────────────────
242
+
243
def retrieve_context(query: str, top_k: int = TOP_K) -> str:
    """
    Retrieve the most semantically relevant chunks for a given query.

    Parameters
    ----------
    query : user's natural-language question
    top_k : maximum number of chunks to return; clamped to the number of
            indexed vectors

    Returns
    -------
    String of retrieved chunks, each prefixed with its relevance score and
    separated by "---" dividers. Empty string when no index is loaded or
    ``top_k`` is not positive.
    """
    if vector_store is None or not chunks_store:
        return ""

    # Never ask FAISS for more neighbours than exist, and never for a
    # non-positive count.
    k = min(top_k, vector_store.ntotal)
    if k <= 0:
        return ""

    # Embed and normalise the query (same preprocessing as the chunks)
    query_vec = embedding_model.encode(
        [query],
        normalize_embeddings=True,
    )

    # FAISS inner-product search (cosine on normalised vectors)
    scores, indices = vector_store.search(
        np.asarray(query_vec, dtype=np.float32), k
    )

    # FAISS pads missing results with -1; the range check also skips
    # anything outside the chunk list.
    retrieved = [
        f"[Relevance: {score:.3f}]\n{chunks_store[idx]}"
        for score, idx in zip(scores[0], indices[0])
        if 0 <= idx < len(chunks_store)
    ]

    # Only mark the logged query as truncated when it actually was.
    shown_query = query[:60] + ("…" if len(query) > 60 else "")
    log.info(f"Retrieved {len(retrieved)} chunks for query: '{shown_query}'")
    return "\n\n---\n\n".join(retrieved)
278
+
279
+
280
+ # ─────────────────────────────────────────────────────────────
281
+ # MODULE 6 ❯ LLM — GROQ LLAMA 3.3-70B
282
+ # Augment + Generate step of RAG
283
+ # ─────────────────────────────────────────────────────────────
284
+
285
# System instructions sent with every request; they restrict the model to
# the retrieved transcript context.
SYSTEM_PROMPT = """\
You are a precise, helpful AI assistant that answers questions about YouTube videos \
based strictly on the provided transcript context.

Rules:
- Answer ONLY from the context provided.
- If the context does not contain enough information, say so clearly.
- Be concise but complete.
- Use bullet points for lists or steps.
- Never fabricate information not present in the context.
"""


def generate_answer(query: str) -> str:
    """
    Full RAG generate step:
    1. Retrieve relevant context from FAISS
    2. Build an augmented prompt
    3. Send to Groq LLaMA-3.3-70B
    4. Return the model's response

    Parameters
    ----------
    query : user's question

    Returns
    -------
    The model's answer as a string, or a user-facing warning/error message
    when no context is available, the model returns no text, or the API
    call fails.
    """
    context = retrieve_context(query)

    if not context:
        return "⚠️ No relevant context found in the transcript for your question."

    # Built from explicit pieces so the prompt text does not depend on how
    # this function body happens to be indented.
    user_message = (
        "Context from the video transcript:\n\n"
        f"{context}\n\n"
        "---\n\n"
        f"Question: {query}\n\n"
        "Answer:"
    )

    try:
        response = groq_client.chat.completions.create(
            model=GROQ_MODEL,
            messages=[
                {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": user_message},
            ],
            max_tokens=MAX_NEW_TOKENS,
            temperature=0.2,  # low temp → factual, grounded answers
            top_p=0.9,
        )
        # The message content can be None; treat that as an empty answer
        # instead of letting .strip() raise.
        content = response.choices[0].message.content
        answer = (content or "").strip()
        log.info("LLM response received")
        if not answer:
            return "⚠️ The model returned an empty response. Please try rephrasing your question."
        return answer

    except Exception as e:
        # UI boundary: log the traceback, hand the user a readable message.
        log.exception("Groq API error")
        return f"❌ LLM error: {e}"
347
+
348
+
349
+ # ─────────────────────────────────────────────────────────────
350
+ # MODULE 7 ❯ ORCHESTRATION PIPELINE
351
+ # Ties transcript fetch + vector store build together.
352
+ # Called by the Gradio "Process Video" button.
353
+ # ─────────────────────────────────────────────────────────────
354
+
355
def process_video(url: str) -> tuple[str, str, str]:
    """
    Full ingestion pipeline triggered by the UI.

    Parameters
    ----------
    url : YouTube URL typed by the user

    Returns
    -------
    (transcript_preview, index_status, combined_status)
    suitable for Gradio outputs. The "ready" line is appended to the
    combined status only when indexing succeeded.
    """
    if not url or not url.strip():
        return "", "", "⚠️ Please enter a YouTube URL."

    # ── Phase 1: Fetch transcript ──────────────────────────────
    transcript, fetch_status = get_transcript(url.strip())
    if not transcript:
        return "", "", fetch_status

    # ── Phase 2: Build vector store ───────────────────────────
    index_status = build_vector_store(transcript)

    # Show first 2000 chars in the transcript preview box
    preview = transcript[:2000] + (" …[truncated]" if len(transcript) > 2000 else "")

    # ── Phase 3: Summary line for UI ──────────────────────────
    combined = f"{fetch_status}\n{index_status}"
    if not index_status.startswith("❌"):
        # Announce readiness only when the index was actually built.
        combined += "\n\n💬 Video is ready — switch to the Chat tab!"

    return preview, index_status, combined
384
+
385
+
386
+ # ─────────────────────────────────────────────────────────────
387
+ # MODULE 8 ❯ CHAT HANDLER
388
+ # Called on every user message in the Chat tab.
389
+ # ─────────────────────────────────────────────────────────────
390
+
391
def chat_with_video(
    user_query: str,
    history: list[tuple[str, str]],
) -> tuple[list[tuple[str, str]], str]:
    """
    Handle a single chat turn.

    Parameters
    ----------
    user_query : the question typed by the user
    history    : Gradio chat history (list of (user, assistant) pairs);
                 ``None`` is treated as an empty history

    Returns
    -------
    Updated history (a new list; the input is not mutated),
    empty string (clears the input box)
    """
    # Work on a copy so the caller's list is never modified in place.
    turns = list(history or [])

    if not user_query or not user_query.strip():
        return turns, ""

    if vector_store is None:
        reply = "⚠️ Please process a video first on the **Process Video** tab."
    else:
        reply = generate_answer(user_query)

    turns.append((user_query, reply))
    return turns, ""
417
+
418
+
419
+ # ─────────────────────────────────────────────────────────────
420
+ # MODULE 9 ❯ GRADIO USER INTERFACE
421
+ # Professional two-tab layout:
422
+ # Tab 1 — Process Video (URL input, status, transcript preview)
423
+ # Tab 2 — Chat (conversation window + input)
424
+ # ─────────────────────────────────────────────────────────────
425
+
426
# Custom styles, matched to components through their elem_id values.
CSS = """
/* ── Global ── */
#app-header { text-align: center; margin-bottom: 0.5rem; }
#status-box textarea {
    font-size: 0.85rem;
    color: var(--body-text-color);
    background: var(--input-background-fill);
}
#transcript-box textarea { font-size: 0.8rem; }
#chat-window { height: 480px; }
/* ── Send on Enter ── */
#chat-input textarea { resize: none; }
"""

with gr.Blocks(
    title="YouTube RAG Q&A",
    theme=gr.themes.Soft(
        primary_hue="indigo",
        neutral_hue="slate",
        font=gr.themes.GoogleFont("Inter"),
    ),
    css=CSS,
) as app:

    # ── Header ─────────────────────────────────────────────────
    gr.Markdown(
        """
        # 🎥 YouTube RAG Q&A
        **Paste any YouTube URL → transcribe → chat with the video using AI**

        *Powered by [Groq](https://groq.com) · LLaMA 3.3-70B · FAISS · Sentence-Transformers*
        """,
        elem_id="app-header",
    )

    # ── Tab 1: Process Video ────────────────────────────────────
    with gr.Tab("📥 Process Video", id="tab-process"):

        with gr.Row():
            url_input = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                scale=4,
            )
            process_btn = gr.Button(
                "▶ Transcribe & Index",
                variant="primary",
                scale=1,
                min_width=180,
            )

        status_output = gr.Textbox(
            label="Pipeline Status",
            interactive=False,
            lines=4,
            elem_id="status-box",
        )

        with gr.Accordion("📄 Transcript Preview (first 2000 chars)", open=False):
            transcript_output = gr.Textbox(
                label="Raw transcript",
                interactive=False,
                lines=12,
                elem_id="transcript-box",
            )

        # process_video returns three values; the middle one (index status)
        # is already part of the combined status, so it goes to a hidden
        # sink that keeps the output count aligned.
        index_status_sink = gr.Textbox(visible=False)

        # ── Wiring ────────────────────────────────────────────
        process_btn.click(
            fn=process_video,
            inputs=url_input,
            outputs=[transcript_output, index_status_sink, status_output],
        )

    # ── Tab 2: Chat ─────────────────────────────────────────────
    with gr.Tab("💬 Chat with Video", id="tab-chat"):

        chatbot = gr.Chatbot(
            label="Conversation",
            bubble_full_width=False,
            height=480,
            elem_id="chat-window",
        )

        with gr.Row():
            chat_input = gr.Textbox(
                placeholder="Ask anything about the video…",
                label="",
                scale=5,
                elem_id="chat-input",
                autofocus=True,
            )
            send_btn = gr.Button("Send ➤", variant="primary", scale=1, min_width=100)

        clear_btn = gr.Button("🗑 Clear conversation", variant="secondary", size="sm")

        # ── Wiring ────────────────────────────────────────────
        # Submit on button click or Enter key: both triggers share one handler
        for register in (send_btn.click, chat_input.submit):
            register(
                fn=chat_with_video,
                inputs=[chat_input, chatbot],
                outputs=[chatbot, chat_input],
            )
        clear_btn.click(fn=lambda: [], outputs=chatbot)

    # ── Footer ──────────────────────────────────────────────────
    gr.Markdown(
        "<center style='font-size:0.75rem; color:#888;'>"
        "Open-source · No data stored · Transcript processed locally"
        "</center>"
    )
541
+
542
+
543
+ # ─────────────────────────────────────────────────────────────
544
+ # MODULE 10 ❯ LAUNCH
545
+ # ─────────────────────────────────────────────────────────────
546
+
547
if __name__ == "__main__":
    # Launch settings gathered in one mapping, then handed to Gradio.
    launch_options = {
        "debug": True,       # shows tracebacks in output
        "share": True,       # creates a public gradio.live link (great for demos)
        "show_error": True,
    }
    app.launch(**launch_options)