DEVJHAWAR11 committed
Commit 54bef2f Β· 1 Parent(s): c079c93

Deploy Klypse backend

.env.example ADDED
@@ -0,0 +1,15 @@
+
+ LLM_PROVIDER=groq
+
+ # Groq Settings (Get key from console.groq.com)
+ GROQ_API_KEY='your_api_key'
+ GROQ_MODEL=llama-3.3-70b-versatile
+
+ # Storage Paths
+ CHROMA_DB_PATH=./data/faiss
+ CACHE_PATH=./data/cache
+
+ # Server Configuration
+ APP_HOST=0.0.0.0
+ APP_PORT=8000
+ LOG_LEVEL=INFO
.gitignore ADDED
@@ -0,0 +1,16 @@
+ venv/
+ __pycache__/
+ .env
+
+ # Logs
+ app.log
+ *.log
+
+ # Caches and outputs
+ /data/cache/*
+ /data/audio/*
+ /data/faiss/*
+ *.db
+
+ # Whisper temp files and downloads
+ ~/.cache/whisper/*
README.md CHANGED
@@ -1,11 +1,26 @@
  ---
- title: KLypse
- emoji: πŸ”₯
- colorFrom: pink
- colorTo: red
+
+ title: Klypse AI
+
+ emoji: πŸŽ₯
+
+ colorFrom: indigo
+
+ colorTo: purple
+
  sdk: docker
- pinned: false
- short_description: A YouTube Video ChatBot
+
+ app_port: 7860
+
  ---
 
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+
+
+ # Klypse - AI Video Assistant
+
+
+
+ AI-powered Chrome extension for YouTube video Q&A.
+
+
+
app/__init__.py ADDED
File without changes
app/api/auth.py ADDED
@@ -0,0 +1,16 @@
+ from fastapi import Security
+ from fastapi.security import APIKeyHeader
+
+ api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
+ VALID_API_KEYS = ["dev-key-123", "prod-key-456"]
+
+ def verify_api_key(api_key: str = Security(api_key_header)):
+     # If no key is provided, just allow access (optional auth)
+     if api_key is None:
+         return None
+     # If a key is provided, check whether it is valid
+     if api_key not in VALID_API_KEYS:
+         # Optionally, log or track invalid attempts here
+         return None
+     return api_key
+
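A minimal sketch (not part of this commit) of how `verify_api_key` could guard a route. Because the dependency returns None instead of raising, enforcement has to happen in the endpoint itself; the route name here is illustrative:

from fastapi import APIRouter, Depends, HTTPException
from app.api.auth import verify_api_key

router = APIRouter()

@router.get("/protected")
def protected_route(api_key: str = Depends(verify_api_key)):
    # verify_api_key returns None for missing or invalid keys, so reject here
    if api_key is None:
        raise HTTPException(status_code=401, detail="Invalid or missing API key")
    return {"authenticated_with": api_key}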
app/api/deps.py ADDED
@@ -0,0 +1,19 @@
+ from langchain_groq import ChatGroq
+ from app.config import config
+ from app.storage.vector_store import get_vectorstore
+ from app.services.qa_chain import create_qa_chain
+
+ def get_llm():
+     """Return LLM based on provider setting."""
+     if config.LLM_PROVIDER == "groq":
+         return ChatGroq(
+             groq_api_key=config.GROQ_API_KEY,
+             model_name=config.GROQ_MODEL,
+             temperature=0.3,  # Lower temperature for more focused responses
+             max_tokens=1024,
+         )
+
+ # Initialize once when app starts
+ llm = get_llm()
+ vectorstore = get_vectorstore()
+ qa_chain = create_qa_chain(llm, vectorstore)
app/api/endpoints.py ADDED
@@ -0,0 +1,190 @@
+ # app/api/endpoints.py
+
+ import asyncio
+ import os
+ import re
+ from fastapi import APIRouter
+ from fastapi.responses import StreamingResponse
+ from app.models.schemas import AskRequest
+ from app.storage.vector_store import load_vectorstore_for_video, create_vectorstore_for_video
+ from app.services.qa_chain import create_qa_chain
+ from app.api.deps import llm
+ from app.storage.cache import load_transcript
+ from app.services.transcripts import get_transcript
+
+ router = APIRouter()
+
+ @router.get('/check/{video_id}')
+ def check_transcript_status(video_id: str):
+     transcript = load_transcript(video_id)
+     if transcript:
+         return {"status": "available"}
+
+     vectorstore_path = f"./data/faiss/{video_id}/"
+     if os.path.exists(vectorstore_path):
+         return {"status": "available"}
+
+     try:
+         transcript = get_transcript(video_id)
+         if transcript:
+             return {"status": "available"}
+     except Exception:
+         pass
+
+     return {"status": "unavailable"}
+
+ import uuid
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ def remove_consecutive_duplicates(text: str) -> str:
+     """
+     Remove consecutive duplicate words from text.
+     Example: "AWS AWS caused" -> "AWS caused"
+     Example: "economy, economy," -> "economy,"
+     """
+     # Pattern 1: Remove word-level duplicates
+     # Matches: word followed by space(s) and the same word
+     text = re.sub(r'\b(\w+)\s+\1\b', r'\1', text, flags=re.IGNORECASE)
+
+     # Pattern 2: Remove duplicates with punctuation
+     # Matches: word with punctuation followed by space and same word with punctuation
+     text = re.sub(r'\b(\w+)([.,;:!?]?)\s+\1\2\b', r'\1\2', text, flags=re.IGNORECASE)
+
+     # Pattern 3: Clean up any remaining multiple consecutive duplicates
+     words = text.split()
+     cleaned = []
+     prev_word = None
+
+     for word in words:
+         # Normalize for comparison (remove punctuation)
+         word_normalized = re.sub(r'[^\w]', '', word).lower()
+         if word_normalized != prev_word or word_normalized == '':
+             cleaned.append(word)
+         prev_word = word_normalized
+
+     return ' '.join(cleaned)
+
+ @router.post('/ask/stream')
+ async def ask_question_stream(body: AskRequest):
+     video_id = body.video_id
+     question = body.question
+
+     logger.info(f"REQ {uuid.uuid4()}: incoming QA request: video_id={video_id}, question_len={len(question)}")
+
+     # CRITICAL: Validate inputs
+     if not video_id or not question:
+         async def error_stream():
+             yield "data: ❌ Missing video ID or question\n\n"
+             yield "data: [END]\n\n"
+         return StreamingResponse(error_stream(), media_type="text/event-stream")
+
+     # CRITICAL: Ensure question is a clean string
+     question = str(question).strip()
+     if not question:
+         async def error_stream():
+             yield "data: ❌ Question cannot be empty\n\n"
+             yield "data: [END]\n\n"
+         return StreamingResponse(error_stream(), media_type="text/event-stream")
+
+     try:
+         vectorstore = load_vectorstore_for_video(video_id)
+     except FileNotFoundError:
+         async def processing_stream():
+             yield "data: πŸ”„ Processing video...\n\n"
+             await asyncio.sleep(0.2)
+
+             transcript = load_transcript(video_id)
+             if not transcript:
+                 try:
+                     transcript = get_transcript(video_id)
+                 except Exception as e:
+                     yield f"data: ❌ Could not fetch transcript: {str(e)}\n\n"
+                     yield "data: [END]\n\n"
+                     return
+
+             yield "data: 🧠 Creating embeddings...\n\n"
+             await asyncio.sleep(0.2)
+
+             try:
+                 create_vectorstore_for_video(video_id, transcript)
+                 vectorstore = load_vectorstore_for_video(video_id)
+             except Exception as e:
+                 yield f"data: ❌ Error creating embeddings: {str(e)}\n\n"
+                 yield "data: [END]\n\n"
+                 return
+
+             yield "data: βœ… Ready!\n\n\n"
+             await asyncio.sleep(0.2)
+
+             try:
+                 qa_chain = create_qa_chain(llm, vectorstore)
+                 result = qa_chain.invoke({"query": question})
+                 answer = result.get('result', result.get('answer', str(result)))
+
+                 # Ensure answer is string and clean
+                 answer = str(answer).strip()
+
+                 # CRITICAL: Apply aggressive deduplication before streaming
+                 answer = remove_consecutive_duplicates(answer)
+
+                 # Log cleaned answer
+                 logger.info(f"Cleaned answer (first 200 chars): {answer[:200]}")
+
+                 # Stream word by word with deduplication check
+                 words = answer.split()
+                 prev_word = None
+                 for word in words:
+                     word_clean = word.strip()
+                     # Additional check: don't send if same as previous
+                     word_normalized = re.sub(r'[^\w]', '', word_clean).lower()
+                     if word_normalized != prev_word or word_normalized == '':
+                         yield f"data: {word_clean}\n\n"
+                         await asyncio.sleep(0.04)
+                     prev_word = word_normalized
+
+             except Exception as e:
+                 logger.error(f"Error generating answer: {str(e)}")
+                 yield f"data: ❌ Error generating answer: {str(e)}\n\n"
+
+             yield "data: [END]\n\n"
+
+         return StreamingResponse(processing_stream(), media_type="text/event-stream")
+
+     # Vectorstore exists
+     qa_chain = create_qa_chain(llm, vectorstore)
+
+     async def event_stream():
+         try:
+             result = qa_chain.invoke({"query": question})
+             answer = result.get('result', result.get('answer', str(result)))
+
+             # Ensure answer is string and clean
+             answer = str(answer).strip()
+
+             # CRITICAL: Apply aggressive deduplication before streaming
+             answer = remove_consecutive_duplicates(answer)
+
+             # Log cleaned answer
+             logger.info(f"Cleaned answer (first 200 chars): {answer[:200]}")
+
+             # Stream word by word with deduplication check
+             words = answer.split()
+             prev_word = None
+             for word in words:
+                 word_clean = word.strip()
+                 # Additional check: don't send if same as previous
+                 word_normalized = re.sub(r'[^\w]', '', word_clean).lower()
+                 if word_normalized != prev_word or word_normalized == '':
+                     yield f"data: {word_clean}\n\n"
+                     await asyncio.sleep(0.04)
+                 prev_word = word_normalized
+
+         except Exception as e:
+             logger.error(f"Error: {str(e)}")
+             yield f"data: ❌ Error: {str(e)}\n\n"
+
+         yield "data: [END]\n\n"
+
+     return StreamingResponse(event_stream(), media_type="text/event-stream")
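A small client sketch (not part of this commit) that consumes the /ask/stream SSE response and reassembles the words; it assumes the server is running locally on port 8000, and the video ID is a sample. Status messages (πŸ”„, 🧠, βœ…) arrive on the same channel and would need filtering in a real client:

import requests

resp = requests.post(
    "http://localhost:8000/api/v1/ask/stream",
    json={"video_id": "dQw4w9WgXcQ", "question": "What happens in this video?"},
    stream=True,
)
words = []
for raw in resp.iter_lines(decode_unicode=True):
    if not raw or not raw.startswith("data: "):
        continue  # skip the blank separator lines between SSE events
    payload = raw[len("data: "):]
    if payload == "[END]":
        break
    words.append(payload)
print(" ".join(words))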
app/config.py ADDED
@@ -0,0 +1,32 @@
+ from pydantic_settings import BaseSettings
+
+ class Settings(BaseSettings):
+     # LLM Configuration
+     LLM_PROVIDER: str = "groq"  # Default to Groq
+
+     # Groq Settings (Best free option)
+     GROQ_API_KEY: str
+     GROQ_MODEL: str = "llama-3.3-70b-versatile"  # GPT-4 level quality
+
+     # OpenAI (Backup - if you add credits later)
+     OPENAI_API_KEY: str = ""
+     OPENAI_MODEL: str = "gpt-4o-mini"
+     OPENAI_EMBEDDING_MODEL: str = "text-embedding-3-small"
+
+     # Storage Paths
+     CHROMA_DB_PATH: str
+     CACHE_PATH: str
+
+     # Server Configuration
+     APP_HOST: str = "0.0.0.0"
+     APP_PORT: int = 8000
+     LOG_LEVEL: str = "INFO"
+
+     class Config:
+         env_file = '.env'
+
+ config = Settings()
+
+ # Validation
+ if config.LLM_PROVIDER == 'groq' and not config.GROQ_API_KEY:
+     raise ValueError("GROQ_API_KEY is required when using Groq")
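A quick sketch (not part of this commit) of the settings precedence: pydantic-settings reads real environment variables first, then the .env file. The values below are dummies so Settings() can construct without a real .env:

import os

os.environ.setdefault("GROQ_API_KEY", "gsk_dummy")       # required field, dummy value
os.environ.setdefault("CHROMA_DB_PATH", "./data/faiss")  # required field
os.environ.setdefault("CACHE_PATH", "./data/cache")      # required field
os.environ["APP_PORT"] = "7860"  # overrides any APP_PORT in .env

from app.config import config
print(config.APP_PORT)  # -> 7860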
app/database/db.py ADDED
@@ -0,0 +1,52 @@
+ import sqlite3
+ from contextlib import contextmanager
+
+ DATABASE_PATH = "./data/conversations.db"
+
+ # To keep memory of past conversations
+ @contextmanager
+ def get_db():
+     conn = sqlite3.connect(DATABASE_PATH)
+     conn.row_factory = sqlite3.Row
+     try:
+         yield conn
+     finally:
+         conn.close()
+
+ def init_db():
+     with get_db() as conn:
+         conn.execute("""
+             CREATE TABLE IF NOT EXISTS conversations (
+                 id INTEGER PRIMARY KEY AUTOINCREMENT,
+                 session_id TEXT NOT NULL,
+                 video_id TEXT NOT NULL,
+                 question TEXT NOT NULL,
+                 answer TEXT NOT NULL,
+                 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+             )
+         """)
+         conn.commit()
+
+ def save_conversation(session_id, video_id, question, answer):
+     with get_db() as conn:
+         conn.execute(
+             "INSERT INTO conversations (session_id, video_id, question, answer) VALUES (?, ?, ?, ?)",
+             (session_id, video_id, question, answer)
+         )
+         conn.commit()
+
+ def get_conversation_history(session_id, limit=10):
+     with get_db() as conn:
+         cursor = conn.execute(
+             "SELECT question, answer, created_at FROM conversations WHERE session_id = ? ORDER BY created_at DESC LIMIT ?",
+             (session_id, limit)
+         )
+         return [dict(row) for row in cursor.fetchall()]
+
+ def clear_session(session_id):
+     with get_db() as conn:
+         conn.execute("DELETE FROM conversations WHERE session_id = ?", (session_id,))
+         conn.commit()
+
+ # Initialize the database when this file is imported
+ init_db()
app/main.py ADDED
@@ -0,0 +1,35 @@
+ # app/main.py
+ from fastapi import FastAPI
+ from fastapi.middleware.cors import CORSMiddleware
+ from app.api import endpoints
+ from app.config import config
+
+ app = FastAPI(
+     title="KLYPSE API",
+     description="YouTube Video Q&A with AI",
+     version="1.0.0"
+ )
+
+ # Add CORS middleware
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=[
+         "chrome-extension://*",
+         "http://localhost:*",
+         "https://www.youtube.com",
+     ],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # Include API routes
+ app.include_router(endpoints.router, prefix="/api/v1", tags=["videos"])
+
+ @app.get("/")
+ def root():
+     return {"message": "VidIQAI API", "version": "1.0.0"}
+
+ @app.get("/health")
+ def health():
+     return {"status": "healthy"}
app/models/schemas.py ADDED
@@ -0,0 +1,53 @@
+ from pydantic import BaseModel, Field, field_validator
+ from typing import Optional
+ import re
+ class AskRequest(BaseModel):
+     video_id: str
+     question: str
+
+ class ProcessVideoRequest(BaseModel):
+     """Request model for processing a video"""
+     video_url: str = Field(..., description="YouTube video URL or video ID")
+
+     @field_validator('video_url')
+     def validate_video_url(cls, v):
+         """Ensure it's a valid YouTube URL or video ID"""
+         if not v:
+             raise ValueError("video_url cannot be empty")
+         return v
+
+ class ProcessVideoResponse(BaseModel):
+     """Response after processing a video"""
+     status: str
+     video_id: str
+     video_url: str
+     message: str
+     chunks_created: int
+     transcript_length: int
+
+ class AskQuestionRequest(BaseModel):
+     """Request model for asking a question"""
+     video_id: str = Field(..., description="YouTube video ID")
+     question: str = Field(..., min_length=3, description="User's question")
+
+ class AskQuestionResponse(BaseModel):
+     """Response with answer to user's question"""
+     answer: str
+     video_id: str
+     question: str
+     sources_used: int
+
+ class SummaryRequest(BaseModel):
+     """Request model for video summary"""
+     video_id: str = Field(..., description="YouTube video ID")
+
+ class SummaryResponse(BaseModel):
+     """Response with video summary"""
+     summary: str
+     video_id: str
+     transcript_length: int
+
+ class ErrorResponse(BaseModel):
+     """Standard error response"""
+     error: str
+     detail: Optional[str] = None
app/services/audio_utils.py ADDED
@@ -0,0 +1,19 @@
+ import yt_dlp
+ import os
+
+ def download_audio(video_url, output_dir="./data/audio"):
+     os.makedirs(output_dir, exist_ok=True)
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': f'{output_dir}/%(id)s.%(ext)s',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '192',
+         }],
+         'quiet': True,
+     }
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info = ydl.extract_info(video_url, download=True)
+         audio_path = os.path.join(output_dir, f"{info['id']}.mp3")
+         return audio_path
app/services/embeddings.py ADDED
@@ -0,0 +1,12 @@
+ from app.config import config
+ from langchain_huggingface import HuggingFaceEmbeddings
+
+ def get_embeddings():
+     """Return embeddings model based on provider."""
+     if config.LLM_PROVIDER == "groq":
+         # Use free local embeddings (no API key needed)
+         return HuggingFaceEmbeddings(
+             model_name="sentence-transformers/all-MiniLM-L6-v2",
+             model_kwargs={'device': 'cpu'},
+             encode_kwargs={'normalize_embeddings': True}
+         )
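Because normalize_embeddings=True yields unit-length vectors, cosine similarity reduces to a plain dot product. A small sanity sketch (not part of this commit; the query strings are made up):

from app.services.embeddings import get_embeddings

emb = get_embeddings()
v1 = emb.embed_query("youtube transcript")
v2 = emb.embed_query("video subtitles")
# With unit-length vectors, the dot product is the cosine similarity
print(sum(a * b for a, b in zip(v1, v2)))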
app/services/processing.py ADDED
@@ -0,0 +1,75 @@
+ # app/services/processing.py
+ import re
+
+ def clean_text(text: str) -> str:
+     """
+     Clean transcript text by removing:
+     - Timestamp markers like {ts:123}
+     - Extra whitespace, line breaks
+     - Special characters and formatting artifacts
+     - Music/sound effect markers like [ΰ€Έΰ€‚ΰ€—ΰ₯€ΰ€€], [Music]
+     """
+     if not text:
+         return ""
+
+     # Remove timestamp markers: {ts:123}, {ts:0}, etc.
+     text = re.sub(r'\{ts:\d+\}', '', text)
+
+     # Remove sound effect markers: [ΰ€Έΰ€‚ΰ€—ΰ₯€ΰ€€], [Music], [Applause], etc.
+     text = re.sub(r'\[.*?\]', '', text)
+
+     # Remove parentheses with metadata: (music), (laughing), etc.
+     text = re.sub(r'\(.*?\)', '', text)
+
+     # Remove URLs
+     text = re.sub(r'http[s]?://\S+', '', text)
+
+     # Replace line breaks with spaces
+     text = text.replace('\n', ' ')
+
+     # Remove extra whitespace (multiple spaces to single space)
+     text = re.sub(r'\s+', ' ', text)
+
+     # Remove leading/trailing whitespace
+     text = text.strip()
+
+     return text
+
+ def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+     """
+     Split text into chunks with overlap for better context preservation.
+
+     Args:
+         text: Cleaned text to chunk
+         chunk_size: Number of words per chunk (default: 500)
+         overlap: Number of overlapping words between chunks (default: 50)
+
+     Returns:
+         List of text chunks with overlap
+     """
+     if not text:
+         return []
+
+     words = text.split()
+
+     # If text is smaller than chunk_size, return it as a single chunk
+     if len(words) <= chunk_size:
+         return [text]
+
+     chunks = []
+     start = 0
+
+     while start < len(words):
+         # Get chunk of words
+         end = start + chunk_size
+         chunk = " ".join(words[start:end])
+         chunks.append(chunk)
+
+         # Move start position with overlap
+         start = end - overlap
+
+         # Prevent infinite loop if we're at the end
+         if end >= len(words):
+             break
+
+     return chunks
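How the two helpers compose, with the overlap visible (a sketch, not part of this commit; the sample string and tiny chunk sizes are for illustration only):

from app.services.processing import clean_text, chunk_text

raw = "{ts:0} Welcome back [Music] to the channel\n{ts:5} today we talk about FAISS"
cleaned = clean_text(raw)
# -> "Welcome back to the channel today we talk about FAISS"
chunks = chunk_text(cleaned, chunk_size=6, overlap=2)
# chunks[1] starts with the last 2 words of chunks[0], so context
# spanning a chunk boundary is never lost
print(chunks)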
app/services/qa_chain.py ADDED
@@ -0,0 +1,44 @@
+ # app/services/qa_chain.py
+
+ from langchain.chains import RetrievalQA
+ from langchain.prompts import PromptTemplate
+ import logging
+
+ logger = logging.getLogger(__name__)
+
+ def create_qa_chain(llm, vectorstore):
+     # ENHANCED: Better prompt to prevent repetition
+     prompt_template = """You are an AI assistant analyzing a YouTube video transcript. Use the context below to answer the question accurately and concisely.
+
+ Context from video transcript:
+ {context}
+
+ User Question: {question}
+
+ IMPORTANT INSTRUCTIONS:
+ 1. Provide a clear, well-structured answer based ONLY on the transcript context
+ 2. Write naturally without repeating words or phrases
+ 3. Use proper formatting (bullet points, numbers) when appropriate
+ 4. Be concise - avoid unnecessary elaboration
+ 5. If the information is not in the transcript, say "This information is not covered in the video"
+ 6. Do NOT duplicate or repeat sentences
+
+ Your Answer:"""
+
+     PROMPT = PromptTemplate(
+         template=prompt_template,
+         input_variables=["context", "question"]
+     )
+
+     return RetrievalQA.from_chain_type(
+         llm=llm,
+         chain_type="stuff",
+         retriever=vectorstore.as_retriever(
+             search_kwargs={
+                 "k": 3,  # Retrieve top 3 most relevant chunks
+                 "fetch_k": 10  # Extra candidates (only used by MMR-style retrieval)
+             }
+         ),
+         return_source_documents=False,
+         chain_type_kwargs={"prompt": PROMPT}
+     )
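Standalone usage sketch (not part of this commit; assumes a FAISS index already exists for the sample video ID):

from app.api.deps import llm
from app.services.qa_chain import create_qa_chain
from app.storage.vector_store import load_vectorstore_for_video

vs = load_vectorstore_for_video("dQw4w9WgXcQ")  # sample ID
chain = create_qa_chain(llm, vs)
# RetrievalQA returns a dict; the answer is under "result"
print(chain.invoke({"query": "What is the video about?"})["result"])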
app/services/transcript_audio.py ADDED
@@ -0,0 +1,6 @@
+ import whisper
+
+ def transcribe_audio(audio_path, model_size="base"):
+     model = whisper.load_model(model_size)
+     result = model.transcribe(audio_path)
+     return result["text"]
app/services/transcripts.py ADDED
@@ -0,0 +1,141 @@
+ import os
+ from youtube_transcript_api import YouTubeTranscriptApi, _errors
+ from app.storage.cache import save_transcript, load_transcript
+ from app.storage.vector_store import add_to_vectorstore
+ from app.services.processing import chunk_text, clean_text
+ from app.utils.logger import get_logger
+ import yt_dlp
+ from groq import Groq
+ from app.config import config
+ import whisper
+
+ logger = get_logger(__name__)
+
+ class TranscriptError(Exception):
+     """Custom exception for transcript errors"""
+     pass
+
+ def download_audio(video_url: str, output_dir: str = "./data/audio") -> str:
+     os.makedirs(output_dir, exist_ok=True)
+     ydl_opts = {
+         'format': 'bestaudio/best',
+         'outtmpl': f'{output_dir}/%(id)s.%(ext)s',
+         'postprocessors': [{
+             'key': 'FFmpegExtractAudio',
+             'preferredcodec': 'mp3',
+             'preferredquality': '128',
+         }],
+         'quiet': True,
+         'no_warnings': True,
+     }
+
+     with yt_dlp.YoutubeDL(ydl_opts) as ydl:
+         info = ydl.extract_info(video_url, download=True)
+         audio_path = os.path.join(output_dir, f"{info['id']}.mp3")
+         logger.info(f"βœ“ Downloaded audio: {audio_path}")
+         return audio_path
+
+ def transcribe_with_groq(audio_path: str) -> str:
+     client = Groq(api_key=config.GROQ_API_KEY)
+     with open(audio_path, "rb") as file:
+         transcription = client.audio.transcriptions.create(
+             file=(os.path.basename(audio_path), file.read()),
+             model="whisper-large-v3",
+             response_format="text",
+             temperature=0.0,
+         )
+     logger.info("βœ“ Groq transcription complete")
+     return transcription
+
+ def transcribe_with_local_whisper(audio_path, model_size="base"):
+     model = whisper.load_model(model_size)
+     # Force English translation for non-English audio
+     result = model.transcribe(audio_path, task="translate")
+     logger.debug("Whisper transcript after translation: %s", result["text"][:200])
+     return result["text"]
+
+ def get_transcript(video_id: str, video_url: str = None):
+     # Step 1: Try the transcript cache
+     cached = load_transcript(video_id)
+     if cached:
+         logger.info(f"βœ“ Using cached transcript for: {video_id}")
+         return cached
+
+     # Step 2: Try all likely transcript languages
+     languages = [
+         'en', 'hi', 'es', 'fr', 'de', 'ru', 'ar', 'bn', 'id', 'auto'
+     ]
+
+     for lang in languages:
+         try:
+             logger.info(f"Trying transcript for language: {lang}")
+             transcript_data = YouTubeTranscriptApi().fetch(video_id, languages=[lang])
+             transcript_data = transcript_data.to_raw_data()
+             transcript_text = " ".join([entry['text'] for entry in transcript_data])
+
+             # FIXED: Clean transcript immediately after fetching
+             transcript_text = clean_text(transcript_text)
+
+             save_transcript(video_id, transcript_text)
+             logger.info(f"βœ“ Got transcript ({lang}, {len(transcript_text)} chars)")
+             return transcript_text
+
+         except _errors.NoTranscriptFound as e:
+             logger.info(f"βœ— No transcript in {lang}: {str(e)}")
+         except Exception as e:
+             logger.info(f"βœ— Other error for lang {lang}: {str(e)}")
+             continue
+
+     # Step 3: Groq fallback for short videos only (<25MB audio)
+     logger.info("No transcript found for any language. Trying Groq Whisper API...")
+     try:
+         if not video_url:
+             video_url = f"https://www.youtube.com/watch?v={video_id}"
+
+         audio_path = download_audio(video_url)
+         file_size_mb = os.path.getsize(audio_path) / (1024 * 1024)
+         logger.info(f"Audio file size: {file_size_mb:.2f} MB")
+
+         if file_size_mb <= 24:
+             try:
+                 grq_txt = transcribe_with_groq(audio_path)
+                 # FIXED: Clean after Groq transcription
+                 grq_txt = clean_text(grq_txt)
+                 save_transcript(video_id, grq_txt)
+                 os.remove(audio_path)
+                 return grq_txt
+             except Exception as groq_error:
+                 logger.warning(f"Groq failed: {str(groq_error)}")
+         else:
+             logger.warning("Audio file too large for Groq fallback; trying local Whisper")
+
+         # Step 4: Local Whisper fallback (any file size)
+         w_txt = transcribe_with_local_whisper(audio_path)
+         # FIXED: Clean after Whisper transcription
+         w_txt = clean_text(w_txt)
+         save_transcript(video_id, w_txt)
+         os.remove(audio_path)
+         return w_txt
+
+     except Exception as whisper_error:
+         logger.error(f"All approaches failed: {str(whisper_error)}")
+         raise TranscriptError(
+             "No transcript could be retrieved for this video (even with local Whisper fallback). "
+             "This may be a platform restriction or severe audio download error. Contact admin if this is unexpected."
+         )
+
+ def process_video(video_id: str, video_url: str = None) -> dict:
+     logger.info(f"Starting video processing for: {video_id}")
+     transcript = get_transcript(video_id, video_url)
+     cleaned = clean_text(transcript)
+     chunks = chunk_text(cleaned, chunk_size=500)
+     add_to_vectorstore(chunks, video_id=video_id)
+     logger.info(f"βœ“ Processed {len(chunks)} chunks into video-specific vector store")
+
+     return {
+         "video_id": video_id,
+         "video_url": video_url or f"https://www.youtube.com/watch?v={video_id}",
+         "transcript_length": len(transcript),
+         "chunks_created": len(chunks),
+         "status": "success"
+     }
app/services/video_utils.py ADDED
@@ -0,0 +1,26 @@
+ import re
+ from typing import Optional
+
+ def extract_video_id(video_input: str) -> Optional[str]:
+     """
+     Extract YouTube video ID from a URL or accept a direct video ID.
+     """
+     cleaned = video_input.strip()
+     # 1. Already just a valid video ID?
+     if re.fullmatch(r'[A-Za-z0-9_-]{11}', cleaned):
+         return cleaned
+
+     # 2. Try to pull the canonical ID from any supported format (robust)
+     # Order matters: check for v= or /ID in any URL form
+     patterns = [
+         r"(?:v=|/)([A-Za-z0-9_-]{11})(?=\b|[&?/])",
+     ]
+     for pattern in patterns:
+         match = re.search(pattern, cleaned)
+         if match:
+             return match.group(1)
+
+     return None
+
+ def is_valid_video_id(video_id: str) -> bool:
+     return bool(re.fullmatch(r'[A-Za-z0-9_-]{11}', video_id))
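The URL forms the single pattern covers, as a doctest-style sketch (not part of this commit; the ID is a well-known sample):

from app.services.video_utils import extract_video_id

assert extract_video_id("dQw4w9WgXcQ") == "dQw4w9WgXcQ"                                  # bare ID
assert extract_video_id("https://www.youtube.com/watch?v=dQw4w9WgXcQ") == "dQw4w9WgXcQ"  # watch URL
assert extract_video_id("https://youtu.be/dQw4w9WgXcQ?t=42") == "dQw4w9WgXcQ"            # short URL
assert extract_video_id("not a video") is None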
app/storage/cache.py ADDED
@@ -0,0 +1,19 @@
+ import os
+ from app.config import config
+
+ CACHE_DIR = config.CACHE_PATH
+ os.makedirs(CACHE_DIR, exist_ok=True)
+
+ def save_transcript(video_id: str, transcript: str):
+     """Save transcript locally."""
+     file_path = os.path.join(CACHE_DIR, f"{video_id}.txt")
+     with open(file_path, "w", encoding="utf-8") as f:
+         f.write(transcript)
+
+ def load_transcript(video_id: str) -> str | None:
+     """Load transcript if it exists."""
+     file_path = os.path.join(CACHE_DIR, f"{video_id}.txt")
+     if os.path.exists(file_path):
+         with open(file_path, "r", encoding="utf-8") as f:
+             return f.read()
+     return None
app/storage/vector_store.py ADDED
@@ -0,0 +1,330 @@
+ # app/storage/vector_store.py
+
+ from langchain_community.vectorstores import FAISS
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from app.services.embeddings import get_embeddings
+ from app.config import config
+ import os
+ import re
+
+ # ---- CLEAN TRANSCRIPT UTILS ----
+
+ # ...existing code...
+ import logging
+ from typing import Any, Dict, List, Optional, Sequence
+
+ logger = logging.getLogger(__name__)
+
+
+ class VectorStore:
+     """
+     Generic wrapper around an underlying vector DB client.
+     - Call `add_embeddings` to persist vectors.
+     - Call `search` to retrieve nearest neighbors.
+     This wrapper ensures results are deduplicated (preserving order).
+     Adapt client initialization to your project's real client in __init__.
+     """
+
+     def __init__(self, client: Optional[Any] = None, namespace: Optional[str] = None):
+         """
+         If `client` is provided, this wrapper will delegate to it.
+         Otherwise you must set `self._client` later to an object exposing compatible methods.
+         """
+         self._client = client
+         self.namespace = namespace
+
+     # -- Helper: dedupe results preserving order --------------------------------
+     @staticmethod
+     def _dedupe_results(results: Sequence[Dict], key_fields: Optional[Sequence[str]] = None, top_k: Optional[int] = None) -> List[Dict]:
+         """
+         Deduplicate a sequence of result dicts preserving order.
+         Default dedupe key: result['id'] if present, else result.get('meta', {}).get('chunk_id'), else result.get('text')
+         Returns at most top_k items if top_k provided.
+         """
+         seen = set()
+         out = []
+         for r in results:
+             # Compose primary key candidates
+             key = None
+             if isinstance(r, dict):
+                 key = r.get("id")
+                 if not key:
+                     meta = r.get("meta") or {}
+                     key = meta.get("chunk_id")
+                 if not key:
+                     key = r.get("text")
+             else:
+                 key = str(r)
+
+             if key in seen:
+                 continue
+             seen.add(key)
+             out.append(r)
+             if top_k and len(out) >= top_k:
+                 break
+         return out
+
+     # -- Add embeddings ----------------------------------------------------------
+     def add_embeddings(self, ids: Sequence[str], vectors: Sequence[Sequence[float]], metadatas: Optional[Sequence[Dict]] = None):
+         """
+         Persist embeddings into the underlying client.
+         Expects:
+             ids: list of string ids (e.g. chunk ids)
+             vectors: list of numeric vectors aligned with ids
+             metadatas: optional list of metadata dicts aligned with ids
+         Adapt to your client's API: this generic implementation will attempt common method names.
+         """
+         if self._client is None:
+             raise RuntimeError("VectorStore client not configured")
+
+         try:
+             # Common client API: add / upsert
+             if hasattr(self._client, "upsert"):
+                 # chroma-like / vectordb clients
+                 self._client.upsert(ids=ids, embeddings=vectors, metadatas=metadatas, namespace=self.namespace)
+                 return
+             if hasattr(self._client, "add"):
+                 # faiss/persisted-store wrappers
+                 self._client.add(ids, vectors, metadatas)
+                 return
+             # Fallback: try generic attribute names
+             if hasattr(self._client, "persist"):
+                 self._client.persist(ids=ids, vectors=vectors, metadatas=metadatas)
+                 return
+         except Exception:
+             logger.exception("Failed to add embeddings to vector store")
+             raise
+
+         raise RuntimeError("Underlying client does not expose a supported add/upsert API")
+
+     # -- Search / similarity retrieval -------------------------------------------
+     def search(self, query_vector: Sequence[float], top_k: int = 10, filter: Optional[Dict] = None) -> List[Dict]:
+         """
+         Search the vector DB. Returns a list of result dicts:
+             [{"id": <id>, "score": <score>, "text": <text>, "meta": {...}}, ...]
+         This wrapper will deduplicate the returned results (by id/text) preserving order.
+         """
+         if self._client is None:
+             raise RuntimeError("VectorStore client not configured")
+
+         raw_results = None
+         try:
+             # Try a few common client search signatures:
+             if hasattr(self._client, "query") and callable(getattr(self._client, "query")):
+                 # chroma-like: client.query(query_embeddings=[query_vector], n_results=top_k, where=filter)
+                 try:
+                     resp = self._client.query(query_embeddings=[query_vector], n_results=top_k, where=filter, namespace=self.namespace)
+                     # Normalize response into list of dicts
+                     raw_results = self._normalize_query_response(resp)
+                 except TypeError:
+                     # fallback signature
+                     resp = self._client.query(query_vector, top_k)
+                     raw_results = self._normalize_query_response(resp)
+             elif hasattr(self._client, "search") and callable(getattr(self._client, "search")):
+                 # FAISS/other wrappers: client.search(query_vector, top_k, filter=...)
+                 resp = self._client.search(query_vector, top_k, filter=filter, namespace=self.namespace)
+                 raw_results = self._normalize_query_response(resp)
+             else:
+                 # Try generic method names
+                 if hasattr(self._client, "get_nearest_neighbors"):
+                     resp = self._client.get_nearest_neighbors(query_vector, top_k)
+                     raw_results = self._normalize_query_response(resp)
+                 else:
+                     raise RuntimeError("Underlying client does not expose a supported search/query API")
+         except Exception:
+             logger.exception("Vector store search failed")
+             raise
+
+         # Ensure raw_results is a list of dict-like results
+         if not isinstance(raw_results, list):
+             logger.debug("Normalizing single search response to list")
+             raw_results = list(raw_results) if raw_results is not None else []
+
+         # Deduplicate results preserving order and cap to top_k
+         deduped = self._dedupe_results(raw_results, top_k=top_k)
+         return deduped
+
+     # -- Response normalization -------------------------------------------------
+     @staticmethod
+     def _normalize_query_response(resp: Any) -> List[Dict]:
+         """
+         Convert common response formats into a list of dicts with keys:
+             'id', 'score', 'text', 'meta'
+         The exact structure depends on the client; this helper attempts reasonable mappings.
+         """
+         out = []
+
+         if resp is None:
+             return out
+
+         # chroma-like: resp['ids'], resp['distances'], resp['metadatas'], resp['documents']
+         try:
+             if isinstance(resp, dict):
+                 # chroma-python query format
+                 if "ids" in resp and isinstance(resp["ids"], list):
+                     # chroma returns lists of lists when multiple queries provided
+                     ids_list = resp["ids"]
+                     docs_list = resp.get("documents") or []
+                     metas_list = resp.get("metadatas") or []
+                     dists_list = resp.get("distances") or []
+                     # take first query's results if nested
+                     ids = ids_list[0] if ids_list and isinstance(ids_list[0], list) else ids_list
+                     docs = docs_list[0] if docs_list and isinstance(docs_list[0], list) else docs_list
+                     metas = metas_list[0] if metas_list and isinstance(metas_list[0], list) else metas_list
+                     dists = dists_list[0] if dists_list and isinstance(dists_list[0], list) else dists_list
+
+                     for i, idv in enumerate(ids):
+                         out.append({"id": idv, "score": None if not dists else dists[i], "text": (docs[i] if docs and i < len(docs) else None), "meta": (metas[i] if metas and i < len(metas) else {})})
+                     return out
+
+                 # If resp contains a 'results' key that is a list
+                 if "results" in resp and isinstance(resp["results"], list):
+                     for r in resp["results"]:
+                         # try to extract known fields
+                         out.append(
+                             {
+                                 "id": r.get("id"),
+                                 "score": r.get("score") or r.get("distance"),
+                                 "text": r.get("document") or r.get("text") or r.get("content"),
+                                 "meta": r.get("metadata") or r.get("meta") or {},
+                             }
+                         )
+                     return out
+         except Exception:
+             logger.debug("Chroma-like normalization failed, trying other formats", exc_info=True)
+
+         # If resp is an iterable of tuples (id, score, text, meta)
+         try:
+             if isinstance(resp, (list, tuple)):
+                 for item in resp:
+                     if isinstance(item, dict):
+                         out.append({"id": item.get("id"), "score": item.get("score") or item.get("distance"), "text": item.get("text") or item.get("document") or item.get("content"), "meta": item.get("meta") or item.get("metadata") or {}})
+                     elif isinstance(item, (list, tuple)) and len(item) >= 2:
+                         # (id, score) or (id, score, text)
+                         idv = item[0]
+                         score = item[1]
+                         text = item[2] if len(item) > 2 else None
+                         meta = item[3] if len(item) > 3 else {}
+                         out.append({"id": idv, "score": score, "text": text, "meta": meta})
+                     else:
+                         out.append({"id": None, "score": None, "text": str(item), "meta": {}})
+                 return out
+         except Exception:
+             logger.debug("Iterable normalization failed", exc_info=True)
+
+         # Last resort: wrap the resp as a single result with its text representation
+         try:
+             out.append({"id": None, "score": None, "text": str(resp), "meta": {}})
+         except Exception:
+             out = []
+
+         return out
+ # ...existing code...
+
+ def remove_double_words(text):
+     # FIXED: Correct regex to remove consecutive repeated words
+     return re.sub(r'\b(\w+)\s+\1\b', r'\1', text, flags=re.IGNORECASE)
+
+ def clean_transcript(text):
+     # Remove duplicate lines, strip, and remove doubled words
+     lines = text.split('\n')
+     unique_lines = []
+     prev_line = None
+
+     for line in lines:
+         line = line.strip()
+         if not line or line == prev_line:
+             continue
+
+         cleaned = remove_double_words(line)
+         if cleaned != prev_line:
+             unique_lines.append(cleaned)
+         prev_line = cleaned
+
+     return ' '.join(unique_lines)
+
+ # ---- VECTORSTORE FUNCTIONS ----
+
+ _embeddings = get_embeddings()
+ FAISS_INDEX_PATH = config.CHROMA_DB_PATH.replace("chroma", "faiss")
+ os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
+
+ _vectorstore = None
+
+ def get_vectorstore():
+     global _vectorstore
+     if _vectorstore is None:
+         index_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")
+         if os.path.exists(index_file):
+             try:
+                 _vectorstore = FAISS.load_local(
+                     FAISS_INDEX_PATH,
+                     _embeddings,
+                     allow_dangerous_deserialization=True
+                 )
+                 print(f"βœ“ Loaded existing FAISS index from {FAISS_INDEX_PATH}")
+             except Exception as e:
+                 print(f"⚠ Could not load existing index: {e}")
+                 _vectorstore = FAISS.from_texts(["initialization"], _embeddings)
+         else:
+             _vectorstore = FAISS.from_texts(["initialization"], _embeddings)
+             print(f"βœ“ Created new FAISS index at {FAISS_INDEX_PATH}")
+
+     return _vectorstore
+
+ def add_to_vectorstore(texts, video_id=None):  # video_id accepted (currently unused) so process_video() can pass it
+     vectorstore = get_vectorstore()
+     vectorstore.add_texts(texts)
+     vectorstore.save_local(FAISS_INDEX_PATH)
+     print(f"βœ“ Added {len(texts)} texts to FAISS and saved to disk")
+
+ def clear_vectorstore():
+     global _vectorstore
+     index_file = os.path.join(FAISS_INDEX_PATH, "index.faiss")
+     pkl_file = os.path.join(FAISS_INDEX_PATH, "index.pkl")
+
+     if os.path.exists(index_file):
+         os.remove(index_file)
+     if os.path.exists(pkl_file):
+         os.remove(pkl_file)
+
+     _vectorstore = None
+     print("βœ“ Cleared FAISS vectorstore")
+
+ def load_vectorstore_for_video(video_id: str):
+     path = f"./data/faiss/{video_id}/"
+     if not os.path.exists(path):
+         raise FileNotFoundError(f"No vectorstore found for video ID: {video_id}")
+
+     return FAISS.load_local(
+         path,
+         _embeddings,
+         allow_dangerous_deserialization=True
+     )
+
+ def create_vectorstore_for_video(video_id: str, transcript: str):
+     # FIXED: Clean the transcript before processing
+     transcript = clean_transcript(transcript)
+
+     # Split transcript into chunks
+     text_splitter = RecursiveCharacterTextSplitter(
+         chunk_size=1000,
+         chunk_overlap=200,
+         length_function=len
+     )
+
+     chunks = text_splitter.split_text(transcript)
+
+     # Create vectorstore from chunks
+     vectorstore = FAISS.from_texts(
+         texts=chunks,
+         embedding=_embeddings
+     )
+
+     # Save to disk
+     path = f"./data/faiss/{video_id}/"
+     os.makedirs(path, exist_ok=True)
+     vectorstore.save_local(path)
+
+     print(f"βœ“ Created and saved vectorstore for video {video_id} with {len(chunks)} chunks (cleaned)")
+     return vectorstore
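What `_dedupe_results` does with its key fallback chain (id β†’ meta.chunk_id β†’ text), as a sketch (not part of this commit; the result dicts are made up):

from app.storage.vector_store import VectorStore

hits = [
    {"id": "c1", "score": 0.10, "text": "alpha", "meta": {}},
    {"id": "c1", "score": 0.20, "text": "alpha", "meta": {}},                 # duplicate id -> dropped
    {"id": None, "score": 0.30, "text": "beta", "meta": {"chunk_id": "c2"}},  # falls back to chunk_id
]
print(VectorStore._dedupe_results(hits, top_k=5))
# -> keeps the first "c1" hit and the "c2" hit, in order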
app/utils/logger.py ADDED
@@ -0,0 +1,18 @@
+ import logging
+ import os
+ import sys
+
+ # Ensure the log directory exists before attaching the file handler
+ os.makedirs('./data', exist_ok=True)
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+     handlers=[
+         logging.StreamHandler(sys.stdout),
+         logging.FileHandler('./data/app.log')
+     ]
+ )
+
+ def get_logger(name):
+     return logging.getLogger(name)
docker-compose.yml ADDED
@@ -0,0 +1,21 @@
+ version: '3.8'
+
+ services:
+   klypse-backend:
+     build:
+       context: .
+       dockerfile: docker/Dockerfile
+     container_name: klypse-backend
+     ports:
+       - "8000:8000"
+     environment:
+       - GROQ_API_KEY=${GROQ_API_KEY}
+     volumes:
+       - ./data:/app/data
+     restart: unless-stopped
+     networks:
+       - klypse-network
+
+ networks:
+   klypse-network:
+     driver: bridge
docker/.dockerignore ADDED
@@ -0,0 +1,25 @@
+ __pycache__/
+ *.pyc
+ *.pyo
+ *.pyd
+ .Python
+ *.so
+ *.egg
+ *.egg-info
+ dist/
+ build/
+ *.log
+ .git/
+ .gitignore
+ .env
+ .venv
+ env/
+ venv/
+ data/
+ .idea/
+ .vscode/
+ *.db
+ *.sqlite
+ .DS_Store
+ tests/
+ docker/
docker/Dockerfile ADDED
@@ -0,0 +1,28 @@
+ # Python base image
+ FROM python:3.10-slim
+
+ # Set a working directory
+ WORKDIR /app
+
+ # Copy requirements first for better caching
+ COPY requirements.txt .
+
+ # Install system dependencies (ffmpeg for audio, etc.)
+ RUN apt-get update && \
+     apt-get install -y ffmpeg git && \
+     pip install --upgrade pip && \
+     pip install -r requirements.txt && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ # Copy the rest of the app
+ COPY . .
+
+ # Environment variables (set these securely in production)
+ ENV PYTHONUNBUFFERED=1
+
+ # Expose FastAPI default port
+ EXPOSE 8000
+
+ # Command to run the backend (edit as needed)
+ CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]
requirements.txt ADDED
@@ -0,0 +1,30 @@
+ # FastAPI
+ fastapi==0.109.0
+ uvicorn[standard]==0.27.0
+ python-multipart==0.0.6
+
+ # YouTube
+ youtube-transcript-api>=1.0.0  # transcripts.py uses the instance fetch()/to_raw_data() API introduced in v1
+ yt-dlp==2024.3.10
+
+ # AI/LLM - Compatible versions
+ groq==0.4.2
+ langchain==0.1.16
+ langchain-groq==0.0.1
+ langchain-community==0.0.36
+ langchain-huggingface==0.0.1
+ openai==1.12.0
+
+ # Embeddings & Vector Store - Updated version
+ faiss-cpu==1.7.4
+ sentence-transformers>=2.6.0
+ chromadb==0.4.22
+
+ # Audio Processing
+ openai-whisper==20231117
+
+ # Utils
+ pydantic==2.6.0
+ pydantic-settings==2.1.0
+ python-dotenv==1.0.0
+ requests==2.31.0
temp.py ADDED
@@ -0,0 +1,13 @@
+ # ...existing code...
+ import re
+ from pathlib import Path
+
+ p = Path("data/cache")
+ for f in p.glob("*.txt"):
+     text = f.read_text(encoding="utf-8")
+     # find repeated adjacent words like "word word" sequences
+     matches = re.findall(r"\b(\w+)(?:\s+\1\b)+", text, flags=re.IGNORECASE)
+     if matches:
+         print(f"{f.name} has repeated words sample: {matches[:10]}")
+     else:
+         print(f"{f.name} looks ok")
test_config.py ADDED
@@ -0,0 +1,60 @@
+ """
+ Test Groq API connection and model quality
+ """
+ from langchain_groq import ChatGroq
+ from langchain.schema import HumanMessage
+ import os
+ from dotenv import load_dotenv
+
+ load_dotenv()
+
+ def test_groq():
+     print("=" * 60)
+     print("Testing Groq API")
+     print("=" * 60)
+
+     api_key = os.getenv('GROQ_API_KEY')
+
+     if not api_key or api_key == 'gsk_your_actual_groq_api_key_here':
+         print("❌ Please add your Groq API key to .env file")
+         return
+
+     try:
+         # Initialize Groq
+         llm = ChatGroq(
+             groq_api_key=api_key,
+             model_name="llama-3.3-70b-versatile",
+             temperature=0
+         )
+
+         print(f"βœ“ Groq API Key: {api_key[:15]}...")
+         print("βœ“ Testing model quality...")
+
+         # Test with a complex question
+         messages = [
+             HumanMessage(content="Explain quantum computing in simple terms, then write a Python function to calculate fibonacci numbers.")
+         ]
+
+         response = llm.invoke(messages)
+
+         print("\n" + "=" * 60)
+         print("GROQ RESPONSE (GPT-4 Level Quality):")
+         print("=" * 60)
+         print(response.content[:500] + "...")
+         print("\n" + "=" * 60)
+         print("βœ… Groq is working perfectly!")
+         print("Quality: GPT-4 level (Llama 3.3 70B)")
+         print("Speed: 10x faster than OpenAI")
+         print("Cost: 100% FREE forever")
+         print("=" * 60)
+
+     except Exception as e:
+         print(f"❌ Error: {e}")
+         print("\nMake sure you:")
+         print("1. Created account at console.groq.com")
+         print("2. Got your API key")
+         print("3. Added it to .env file as GROQ_API_KEY=gsk_...")
+
+ if __name__ == "__main__":
+     test_groq()
+
test_db.py ADDED
@@ -0,0 +1,11 @@
+ from app.database.db import save_conversation, get_conversation_history, clear_session
+
+ # Save a conversation
+ save_conversation("session1", "video123", "What is this video?", "It's a music video.")
+
+ # Retrieve history
+ history = get_conversation_history("session1")
+ print(history)
+
+ # Clear session
+ clear_session("session1")
test_stream.py ADDED
@@ -0,0 +1,8 @@
+ import requests
+
+ url = "http://localhost:8000/api/v1/ask/stream"
+ payload = {"video_id": "U8PZejZ0F-c", "question": "What is the main story in this video?"}
+
+ with requests.post(url, json=payload, stream=True) as r:
+     for line in r.iter_lines():
+         print(line)
tests/test_install.py ADDED
@@ -0,0 +1,54 @@
+ """
+ Quick test to verify all packages are installed correctly
+ """
+
+ def test_imports():
+     """Test if all critical packages can be imported"""
+     try:
+         import faiss
+         print(f"βœ“ FAISS {faiss.__version__}")
+     except ImportError as e:
+         print(f"βœ— FAISS import failed: {e}")
+         return False
+
+     try:
+         import langchain
+         print(f"βœ“ LangChain {langchain.__version__}")
+     except ImportError as e:
+         print(f"βœ— LangChain import failed: {e}")
+         return False
+
+     try:
+         from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+         print("βœ“ LangChain OpenAI integration")
+     except ImportError as e:
+         print(f"βœ— LangChain OpenAI import failed: {e}")
+         return False
+
+     try:
+         from langchain_community.vectorstores import FAISS
+         print("βœ“ LangChain FAISS integration")
+     except ImportError as e:
+         print(f"βœ— FAISS integration failed: {e}")
+         return False
+
+     try:
+         import fastapi
+         print("βœ“ FastAPI")
+     except ImportError as e:
+         print(f"βœ— FastAPI import failed: {e}")
+         return False
+
+     try:
+         from youtube_transcript_api import YouTubeTranscriptApi
+         print("βœ“ YouTube Transcript API")
+     except ImportError as e:
+         print(f"βœ— YouTube Transcript API failed: {e}")
+         return False
+
+     print("\nπŸŽ‰ All packages installed successfully!")
+     print("Ready to run VidIQAI backend!")
+     return True
+
+ if __name__ == "__main__":
+     test_imports()
tests/tests_processing.py ADDED
File without changes
tests/tests_transcript.py ADDED
File without changes
tests_api.py ADDED
@@ -0,0 +1,7 @@
+ # Create test_api.py in your project root
+ from app.config import config
+
+ print(f"Provider: {config.LLM_PROVIDER}")
+ print(f"Model: {config.OPENAI_MODEL}")
+ print(f"API Key (first 10 chars): {config.OPENAI_API_KEY[:10]}...")
+ print("βœ“ Configuration loaded successfully!")