Spaces:

devjhawar
/

KLypse

Sleeping

App Files Files Community

DEVJHAWAR11 committed on Oct 22, 2025

Commit

54bef2f

1 Parent(s): c079c93

Deploy Klypse backend

Browse files

Files changed (33) hide show

.env.example +15 -0
.gitignore +16 -0
README.md +22 -7
app/__init__.py +0 -0
app/api/auth.py +16 -0
app/api/deps.py +19 -0
app/api/endpoints.py +190 -0
app/config.py +32 -0
app/database/db.py +52 -0
app/main.py +35 -0
app/models/schemas.py +53 -0
app/services/audio_utils.py +19 -0
app/services/embeddings.py +12 -0
app/services/processing.py +75 -0
app/services/qa_chain.py +44 -0
app/services/transcript_audio.py +6 -0
app/services/transcripts.py +141 -0
app/services/video_utils.py +26 -0
app/storage/cache.py +19 -0
app/storage/vector_store.py +330 -0
app/utils/logger.py +14 -0
docker-compose.yml +21 -0
docker/.dockerignore +25 -0
docker/Dockerfile +28 -0
requirements.txt +30 -0
temp.py +13 -0
test_config.py +60 -0
test_db.py +11 -0
test_stream.py +8 -0
tests/test_install.py +54 -0
tests/tests_processing.py +0 -0
tests/tests_transcript.py +0 -0
tests_api.py +7 -0

.env.example ADDED Viewed

	@@ -0,0 +1,15 @@

+LLM_PROVIDER=groq
+# Groq Settings (Get key from console.groq.com)
+GROQ_API_KEY='your_api_key'
+GROQ_MODEL=llama-3.3-70b-versatile
+# Storage Paths
+CHROMA_DB_PATH=./data/faiss
+CACHE_PATH=./data/cache
+# Server Configuration
+APP_HOST=0.0.0.0
+APP_PORT=8000
+LOG_LEVEL=INFO

.gitignore ADDED Viewed

	@@ -0,0 +1,16 @@

+venv/
+__pycache__/
+.env
+# Logs
+app.log
+*.log
+# Caches and outputs
+/data/cache/*
+/data/audio/*
+/data/faiss/*
+*.db
+# Whisper temp files and downloads
+~/.cache/whisper/*

README.md CHANGED Viewed

@@ -1,11 +1,26 @@
 ---
-title: KLypse
-emoji: 🔥
-colorFrom: pink
-colorTo: red
 sdk: docker
-pinned: false
-short_description: A YouTube Video ChatBot
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Klypse AI
+emoji: 🎥
+colorFrom: indigo
+colorTo: purple
 sdk: docker
+app_port: 7860
 ---
+# Klypse - AI Video Assistant
+AI-powered Chrome extension for YouTube video Q&A.

app/__init__.py ADDED Viewed

File without changes

app/api/auth.py ADDED Viewed

	@@ -0,0 +1,16 @@

+from fastapi import Security
+from fastapi.security import APIKeyHeader
+api_key_header = APIKeyHeader(name="X-API-Key", auto_error=False)
+VALID_API_KEYS = ["dev-key-123", "prod-key-456"]
+def verify_api_key(api_key: str = Security(api_key_header)):
+    """Optional API-key check: return the key when recognised, else ``None``.
+    A missing header and an unrecognised key are treated alike -- neither
+    raises -- so this dependency never rejects a request by itself; callers
+    that need enforcement must inspect the returned value.
+    """
+    # ``None`` (no header sent) is never a member of the list, so it also maps to None.
+    return api_key if api_key in VALID_API_KEYS else None

app/api/deps.py ADDED Viewed

	@@ -0,0 +1,19 @@

+from langchain_groq import ChatGroq
+from app.config import config
+from app.storage.vector_store import get_vectorstore
+from app.services.qa_chain import create_qa_chain
+def get_llm():
+    """Return the chat model for the configured ``config.LLM_PROVIDER``.
+    Returns:
+        A ``ChatGroq`` client when the provider is ``"groq"``.
+    Raises:
+        ValueError: If the provider is anything else. The function used to
+            fall through and return ``None``, which only surfaced later as an
+            obscure failure when the QA chain was built around it.
+    """
+    if config.LLM_PROVIDER == "groq":
+        return ChatGroq(
+            groq_api_key=config.GROQ_API_KEY,
+            model_name=config.GROQ_MODEL,
+            temperature=0.3,  # Lower temperature for more focused responses
+            max_tokens=1024,
+        )
+    raise ValueError(f"Unsupported LLM_PROVIDER: {config.LLM_PROVIDER!r}")
+# Initialize once when app starts
+llm = get_llm()
+vectorstore = get_vectorstore()
+qa_chain = create_qa_chain(llm, vectorstore)

app/api/endpoints.py ADDED Viewed

	@@ -0,0 +1,190 @@

+# app/api/endpoints.py
+import asyncio
+import os
+import re
+from fastapi import APIRouter
+from fastapi.responses import StreamingResponse
+from app.models.schemas import AskRequest
+from app.storage.vector_store import load_vectorstore_for_video, create_vectorstore_for_video
+from app.services.qa_chain import create_qa_chain
+from app.api.deps import llm
+from app.storage.cache import load_transcript
+from app.services.transcripts import get_transcript
+router = APIRouter()
+@router.get('/check/{video_id}')
+def check_transcript_status(video_id: str):
+    """Report whether a transcript can be served for ``video_id``.
+    Checks, cheapest first: the transcript cache, an existing FAISS index
+    directory, and finally a live fetch through ``get_transcript``.
+    Returns:
+        ``{"status": "available"}`` or ``{"status": "unavailable"}``.
+    """
+    if load_transcript(video_id):
+        return {"status": "available"}
+    vectorstore_path = f"./data/faiss/{video_id}/"
+    if os.path.exists(vectorstore_path):
+        return {"status": "available"}
+    try:
+        if get_transcript(video_id):
+            return {"status": "available"}
+    except Exception:
+        # Best effort: a failed fetch still means "unavailable", but record
+        # why. A bare ``except:`` would also swallow KeyboardInterrupt and
+        # SystemExit and left no trace of the failure.
+        logger.warning("Transcript check failed for video_id=%s", video_id, exc_info=True)
+    return {"status": "unavailable"}
+import uuid
+import logging
+logger = logging.getLogger(__name__)
+def remove_consecutive_duplicates(text: str) -> str:
+    """
+    Collapse immediately repeated words in ``text``.
+    Comparison ignores case and punctuation; the first occurrence is kept.
+    All whitespace runs in the result are reduced to single spaces.
+    Example: "AWS AWS caused" -> "AWS caused"
+    Example: "economy, economy," -> "economy,"
+    """
+    # Regex passes: plain repeats first, then repeats carrying the same
+    # trailing punctuation mark.
+    regex_passes = (
+        (r'\b(\w+)\s+\1\b', r'\1'),
+        (r'\b(\w+)([.,;:!?]?)\s+\1\2\b', r'\1\2'),
+    )
+    for pattern, replacement in regex_passes:
+        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
+    # Final token-level sweep catches repeats that differ only in punctuation.
+    kept = []
+    last_key = None
+    for token in text.split():
+        key = re.sub(r'[^\w]', '', token).lower()
+        # Punctuation-only tokens (empty key) are never treated as repeats.
+        if key and key == last_key:
+            continue
+        kept.append(token)
+        last_key = key
+    return ' '.join(kept)
+# Same shape as the IDs accepted by app/services/video_utils.is_valid_video_id.
+_VIDEO_ID_RE = re.compile(r'[A-Za-z0-9_-]{11}')
+def _one_line(value) -> str:
+    """Flatten ``value`` to a single line so it cannot break SSE framing.
+    A raw newline inside a ``data:`` payload ends the event early, and
+    exception messages are frequently multi-line.
+    """
+    return " ".join(str(value).split())
+async def _error_events(message: str):
+    """Yield a single error event followed by the ``[END]`` terminator."""
+    yield f"data: {message}\n\n"
+    yield "data: [END]\n\n"
+async def _stream_answer(vectorstore, question: str, error_label: str):
+    """Answer ``question`` from ``vectorstore`` and yield it as SSE events.
+    One event is emitted per word, followed by ``[END]``. Any failure is
+    reported in-band as ``❌ <error_label>: <message>``.
+    """
+    try:
+        qa_chain = create_qa_chain(llm, vectorstore)
+        # The chain call is synchronous (retrieval plus an LLM round trip); run
+        # it in a worker thread so other requests are not stalled meanwhile.
+        result = await asyncio.to_thread(qa_chain.invoke, {"query": question})
+        answer = result.get('result', result.get('answer', str(result)))
+        # remove_consecutive_duplicates already drops repeated neighbouring
+        # words, so no second per-word check is needed while streaming.
+        answer = remove_consecutive_duplicates(str(answer).strip())
+        logger.info(f"Cleaned answer (first 200 chars): {answer[:200]}")
+        for word in answer.split():
+            yield f"data: {word}\n\n"
+            await asyncio.sleep(0.04)
+    except Exception as e:
+        logger.error(f"{error_label}: {str(e)}")
+        yield f"data: ❌ {error_label}: {_one_line(e)}\n\n"
+    yield "data: [END]\n\n"
+async def _process_then_answer(video_id: str, question: str):
+    """Build the vector store for a not-yet-indexed video, then stream the answer.
+    Progress is reported as SSE status events. Blocking work (transcript
+    fetch, embedding, index load) runs in worker threads.
+    """
+    yield "data: 🔄 Processing video...\n\n"
+    await asyncio.sleep(0.2)
+    transcript = load_transcript(video_id)
+    if not transcript:
+        try:
+            transcript = await asyncio.to_thread(get_transcript, video_id)
+        except Exception as e:
+            yield f"data: ❌ Could not fetch transcript: {_one_line(e)}\n\n"
+            yield "data: [END]\n\n"
+            return
+    yield "data: 🧠 Creating embeddings...\n\n"
+    await asyncio.sleep(0.2)
+    try:
+        await asyncio.to_thread(create_vectorstore_for_video, video_id, transcript)
+        vectorstore = await asyncio.to_thread(load_vectorstore_for_video, video_id)
+    except Exception as e:
+        yield f"data: ❌ Error creating embeddings: {_one_line(e)}\n\n"
+        yield "data: [END]\n\n"
+        return
+    yield "data: ✅ Ready!\n\n\n"
+    await asyncio.sleep(0.2)
+    async for event in _stream_answer(vectorstore, question, "Error generating answer"):
+        yield event
+@router.post('/ask/stream')
+async def ask_question_stream(body: AskRequest):
+    """Answer a question about a video as a Server-Sent Events stream.
+    The response is always ``text/event-stream``: optional status events, then
+    the answer one word per event, then ``[END]``. Validation and processing
+    errors are reported in-band as ``❌ ...`` events rather than HTTP errors.
+    If no vector store exists for the video yet, it is built on the fly.
+    """
+    video_id = body.video_id
+    question = body.question
+    logger.info(f"REQ {uuid.uuid4()}: incoming QA request: video_id={video_id}, question_len={len(question)}")
+    if not video_id or not question:
+        return StreamingResponse(_error_events("❌ Missing video ID or question"), media_type="text/event-stream")
+    question = str(question).strip()
+    if not question:
+        return StreamingResponse(_error_events("❌ Question cannot be empty"), media_type="text/event-stream")
+    # video_id comes from the request body and is interpolated into cache and
+    # index file paths downstream; refuse anything that is not ID-shaped so a
+    # value such as "../../x" can never leave the data directories.
+    if not _VIDEO_ID_RE.fullmatch(video_id):
+        return StreamingResponse(_error_events("❌ Invalid video ID"), media_type="text/event-stream")
+    try:
+        vectorstore = await asyncio.to_thread(load_vectorstore_for_video, video_id)
+    except FileNotFoundError:
+        return StreamingResponse(_process_then_answer(video_id, question), media_type="text/event-stream")
+    # Vectorstore exists
+    return StreamingResponse(_stream_answer(vectorstore, question, "Error"), media_type="text/event-stream")

app/config.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from pydantic_settings import BaseSettings
+class Settings(BaseSettings):
+    """Application settings loaded by pydantic-settings (env vars / ``.env``).
+    Fields declared without a default (``GROQ_API_KEY``, ``CHROMA_DB_PATH``,
+    ``CACHE_PATH``) have no fallback value here and must be provided.
+    """
+    # LLM Configuration
+    LLM_PROVIDER: str = "groq"  # Default to Groq
+    # Groq Settings (Best free option)
+    GROQ_API_KEY: str
+    GROQ_MODEL: str = "llama-3.3-70b-versatile"  # GPT-4 level quality
+    # OpenAI (Backup - if you add credits later)
+    OPENAI_API_KEY: str = ""
+    OPENAI_MODEL: str = "gpt-4o-mini"
+    OPENAI_EMBEDDING_MODEL: str = "text-embedding-3-small"
+    # Storage Paths
+    # NOTE(review): named after Chroma, but .env.example points it at
+    # ./data/faiss -- presumably a leftover name; confirm before renaming.
+    CHROMA_DB_PATH: str
+    CACHE_PATH: str
+    # Server Configuration
+    APP_HOST: str = "0.0.0.0"
+    APP_PORT: int = 8000
+    LOG_LEVEL: str = "INFO"
+    class Config:
+        # Values are also read from this dotenv file.
+        env_file = '.env'
+config = Settings()
+# Validation
+if config.LLM_PROVIDER == 'groq' and not config.GROQ_API_KEY:
+    raise ValueError("GROQ_API_KEY is required when using Groq")

app/database/db.py ADDED Viewed

	@@ -0,0 +1,52 @@

+import sqlite3
+from contextlib import contextmanager
+DATABASE_PATH = "./data/conversations.db"
+# Keeps a memory of past conversations.
+@contextmanager
+def get_db():
+    """Yield a SQLite connection to ``DATABASE_PATH`` and always close it.
+    Rows are returned as ``sqlite3.Row`` so columns can be read by name.
+    The connection is not committed automatically; callers commit themselves.
+    """
+    import os  # local import: this module does not import os at file level
+    # sqlite3 creates the database file but not its parent directory; without
+    # this the import-time init_db() fails on a fresh checkout where ./data
+    # does not exist yet.
+    os.makedirs(os.path.dirname(DATABASE_PATH) or ".", exist_ok=True)
+    conn = sqlite3.connect(DATABASE_PATH)
+    conn.row_factory = sqlite3.Row
+    try:
+        yield conn
+    finally:
+        conn.close()
+def init_db():
+    """Create the ``conversations`` table if it is not there yet, then commit."""
+    create_table_sql = """
+            CREATE TABLE IF NOT EXISTS conversations (
+                id INTEGER PRIMARY KEY AUTOINCREMENT,
+                session_id TEXT NOT NULL,
+                video_id TEXT NOT NULL,
+                question TEXT NOT NULL,
+                answer TEXT NOT NULL,
+                created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
+            )
+        """
+    with get_db() as db:
+        db.execute(create_table_sql)
+        db.commit()
+def save_conversation(session_id, video_id, question, answer):
+    """Insert one question/answer exchange for a session and commit it."""
+    insert_sql = "INSERT INTO conversations (session_id, video_id, question, answer) VALUES (?, ?, ?, ?)"
+    row = (session_id, video_id, question, answer)
+    with get_db() as db:
+        db.execute(insert_sql, row)
+        db.commit()
+def get_conversation_history(session_id, limit=10):
+    """Return the most recent exchanges for a session, newest first.
+    Args:
+        session_id: Session whose rows are selected.
+        limit: Maximum number of rows to return (default 10).
+    Returns:
+        A list of dicts with the keys ``question``, ``answer`` and
+        ``created_at``.
+    """
+    with get_db() as conn:
+        cursor = conn.execute(
+            # created_at defaults to CURRENT_TIMESTAMP, which has one-second
+            # resolution; id breaks ties so rows written within the same
+            # second still come back in a stable newest-first order.
+            "SELECT question, answer, created_at FROM conversations WHERE session_id = ? ORDER BY created_at DESC, id DESC LIMIT ?",
+            (session_id, limit)
+        )
+        return [dict(row) for row in cursor.fetchall()]
+def clear_session(session_id):
+    """Delete every stored exchange belonging to ``session_id`` and commit."""
+    delete_sql = "DELETE FROM conversations WHERE session_id = ?"
+    with get_db() as db:
+        db.execute(delete_sql, (session_id,))
+        db.commit()
+# Initialize the database when this file is imported
+init_db()

app/main.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# app/main.py
+from fastapi import FastAPI
+from fastapi.middleware.cors import CORSMiddleware
+from app.api import endpoints
+from app.config import config
+app = FastAPI(
+    title="KLYPSE API",
+    description="YouTube Video Q&A with AI",
+    version="1.0.0"
+)
+# Add CORS middleware
+# ``allow_origins`` entries are compared to the Origin header as exact
+# strings, so wildcard patterns such as "chrome-extension://*" or
+# "http://localhost:*" never match a real origin. Pattern-style origins are
+# therefore expressed through ``allow_origin_regex`` (matched against the
+# whole Origin value).
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=[
+        "https://www.youtube.com",
+    ],
+    allow_origin_regex=r"chrome-extension://.+|http://localhost(:\d+)?",
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Include API routes
+app.include_router(endpoints.router, prefix="/api/v1", tags=["videos"])
+@app.get("/")
+def root():
+    """Return the service name and version as a small JSON banner."""
+    # The banner previously said "VidIQAI API", which disagrees with the
+    # FastAPI title ("KLYPSE API") declared for this same application.
+    return {"message": "KLYPSE API", "version": "1.0.0"}
+@app.get("/health")
+def health():
+    return {"status": "healthy"}

app/models/schemas.py ADDED Viewed

	@@ -0,0 +1,53 @@

+from pydantic import BaseModel, Field, field_validator
+from typing import Optional
+import re
+class AskRequest(BaseModel):
+    video_id: str
+    question: str
+class ProcessVideoRequest(BaseModel):
+    """Request model for processing a video"""
+    video_url: str = Field(..., description="YouTube video URL or video ID")
+    @field_validator('video_url')
+    @classmethod
+    def validate_video_url(cls, v):
+        """Reject empty or whitespace-only input and return the value stripped.
+        This only guards against blank input; it does not verify that the
+        value is actually a YouTube URL or ID.
+        """
+        v = v.strip()
+        if not v:
+            raise ValueError("video_url cannot be empty")
+        return v
+class ProcessVideoResponse(BaseModel):
+    """Response after processing a video"""
+    status: str
+    video_id: str
+    video_url: str
+    message: str
+    chunks_created: int
+    transcript_length: int
+class AskQuestionRequest(BaseModel):
+    """Request model for asking a question"""
+    video_id: str = Field(..., description="YouTube video ID")
+    question: str = Field(..., min_length=3, description="User's question")
+class AskQuestionResponse(BaseModel):
+    """Response with answer to user's question"""
+    answer: str
+    video_id: str
+    question: str
+    sources_used: int
+class SummaryRequest(BaseModel):
+    """Request model for video summary"""
+    video_id: str = Field(..., description="YouTube video ID")
+class SummaryResponse(BaseModel):
+    """Response with video summary"""
+    summary: str
+    video_id: str
+    transcript_length: int
+class ErrorResponse(BaseModel):
+    """Standard error response"""
+    error: str
+    detail: Optional[str] = None

app/services/audio_utils.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import yt_dlp
+import os
+def download_audio(video_url, output_dir="./data/audio"):
+    """Download a video's audio with yt-dlp and return the expected MP3 path.
+    The output directory is created if needed. The audio is post-processed
+    to MP3 (quality setting ``'192'``) and named after the video id.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    mp3_extractor = {
+        'key': 'FFmpegExtractAudio',
+        'preferredcodec': 'mp3',
+        'preferredquality': '192',
+    }
+    options = {
+        'format': 'bestaudio/best',
+        'outtmpl': f'{output_dir}/%(id)s.%(ext)s',
+        'postprocessors': [mp3_extractor],
+        'quiet': True,
+    }
+    with yt_dlp.YoutubeDL(options) as downloader:
+        metadata = downloader.extract_info(video_url, download=True)
+        return os.path.join(output_dir, f"{metadata['id']}.mp3")

app/services/embeddings.py ADDED Viewed

	@@ -0,0 +1,12 @@

+from app.config import config
+from langchain_huggingface import HuggingFaceEmbeddings
+def get_embeddings():
+    """Return the embeddings model for the configured provider.
+    Returns:
+        A CPU ``HuggingFaceEmbeddings`` (all-MiniLM-L6-v2, normalised vectors)
+        when ``config.LLM_PROVIDER`` is ``"groq"``.
+    Raises:
+        ValueError: For any other provider. The function used to return
+            ``None`` in that case, which only failed later inside the vector
+            store with a far less helpful error.
+    """
+    if config.LLM_PROVIDER == "groq":
+        # Use free local embeddings (no API key needed)
+        return HuggingFaceEmbeddings(
+            model_name="sentence-transformers/all-MiniLM-L6-v2",
+            model_kwargs={'device': 'cpu'},
+            encode_kwargs={'normalize_embeddings': True}
+        )
+    raise ValueError(f"No embeddings configured for LLM_PROVIDER: {config.LLM_PROVIDER!r}")

app/services/processing.py ADDED Viewed

	@@ -0,0 +1,75 @@

+# app/services/processing.py
+import re
+def clean_text(text: str) -> str:
+    """
+    Normalise raw transcript text.
+    Strips, in this order: ``{ts:N}`` timestamp markers, anything in square
+    brackets (e.g. [Music]), anything in parentheses (e.g. (laughing)) and
+    http/https URLs; then collapses all whitespace to single spaces and
+    trims the ends. Falsy input yields an empty string.
+    """
+    if not text:
+        return ""
+    removal_patterns = (
+        r'\{ts:\d+\}',        # timestamp markers: {ts:123}
+        r'\[.*?\]',           # sound-effect markers: [Music], [Applause]
+        r'\(.*?\)',           # parenthesised metadata: (music), (laughing)
+        r'http[s]?://\S+',    # URLs
+    )
+    for pattern in removal_patterns:
+        text = re.sub(pattern, '', text)
+    flattened = text.replace('\n', ' ')
+    return re.sub(r'\s+', ' ', flattened).strip()
+def chunk_text(text: str, chunk_size: int = 500, overlap: int = 50) -> list[str]:
+    """
+    Split text into word-based chunks that overlap for context preservation.
+    Args:
+        text: Cleaned text to chunk
+        chunk_size: Number of words per chunk (default: 500)
+        overlap: Number of overlapping words between chunks (default: 50)
+    Returns:
+        List of text chunks. Empty text yields ``[]``; text of at most
+        ``chunk_size`` words is returned unchanged as a single chunk.
+    Raises:
+        ValueError: If the text needs splitting and ``chunk_size`` is not
+            positive or ``overlap`` is not in ``[0, chunk_size)``. With such
+            values the window never advances, which used to loop forever.
+    """
+    if not text:
+        return []
+    words = text.split()
+    # If text is smaller than chunk_size, return as single chunk
+    if len(words) <= chunk_size:
+        return [text]
+    if chunk_size <= 0:
+        raise ValueError("chunk_size must be a positive number of words")
+    if not 0 <= overlap < chunk_size:
+        raise ValueError("overlap must satisfy 0 <= overlap < chunk_size")
+    step = chunk_size - overlap
+    chunks = []
+    for start in range(0, len(words), step):
+        end = start + chunk_size
+        chunks.append(" ".join(words[start:end]))
+        # The chunk that reaches the last word is final; stopping here avoids
+        # emitting a trailing chunk made only of already-covered overlap.
+        if end >= len(words):
+            break
+    return chunks

app/services/qa_chain.py ADDED Viewed

	@@ -0,0 +1,44 @@

+# app/services/qa_chain.py
+from langchain.chains import RetrievalQA
+from langchain.prompts import PromptTemplate
+import logging
+logger = logging.getLogger(__name__)
+def create_qa_chain(llm, vectorstore):
+    """Build a ``RetrievalQA`` chain over ``vectorstore`` driven by ``llm``.
+    The chain uses the "stuff" chain type with the custom prompt below, a
+    retriever configured with ``k=3`` / ``fetch_k=10``, and does not return
+    source documents.
+    Args:
+        llm: Language model passed straight to ``RetrievalQA``.
+        vectorstore: Store exposing ``as_retriever``.
+    Returns:
+        The configured ``RetrievalQA`` chain.
+    """
+    # ENHANCED: Better prompt to prevent repetition
+    prompt_template = """You are an AI assistant analyzing a YouTube video transcript. Use the context below to answer the question accurately and concisely.
+Context from video transcript:
+{context}
+User Question: {question}
+IMPORTANT INSTRUCTIONS:
+1. Provide a clear, well-structured answer based ONLY on the transcript context
+2. Write naturally without repeating words or phrases
+3. Use proper formatting (bullet points, numbers) when appropriate
+4. Be concise - avoid unnecessary elaboration
+5. If the information is not in the transcript, say "This information is not covered in the video"
+6. Do NOT duplicate or repeat sentences
+Your Answer:"""
+    # The template placeholders must match the variable names declared here.
+    PROMPT = PromptTemplate(
+        template=prompt_template,
+        input_variables=["context", "question"]
+    )
+    return RetrievalQA.from_chain_type(
+        llm=llm,
+        chain_type="stuff",
+        retriever=vectorstore.as_retriever(
+            search_kwargs={
+                "k": 3,  # Retrieve top 3 most relevant chunks
+                "fetch_k": 10  # Fetch more candidates for better filtering
+            }
+        ),
+        return_source_documents=False,
+        chain_type_kwargs={"prompt": PROMPT}
+    )

app/services/transcript_audio.py ADDED Viewed

	@@ -0,0 +1,6 @@

+import whisper
+def transcribe_audio(audio_path, model_size="base"):
+    """Transcribe ``audio_path`` with a Whisper model and return the text.
+    The model named by ``model_size`` is loaded on every call.
+    """
+    return whisper.load_model(model_size).transcribe(audio_path)["text"]

app/services/transcripts.py ADDED Viewed

	@@ -0,0 +1,141 @@

+import os
+from youtube_transcript_api import YouTubeTranscriptApi, _errors
+from app.storage.cache import save_transcript, load_transcript
+from app.storage.vector_store import add_to_vectorstore
+from app.services.processing import chunk_text, clean_text
+from app.utils.logger import get_logger
+import yt_dlp
+from groq import Groq
+from app.config import config
+import whisper
+logger = get_logger(__name__)
+class TranscriptError(Exception):
+    """Custom exception for transcript errors"""
+    pass
+def download_audio(video_url: str, output_dir: str = "./data/audio") -> str:
+    """Download a video's audio with yt-dlp and return the expected MP3 path.
+    The output directory is created if needed. The audio is post-processed
+    to MP3 (quality setting ``'128'``), named after the video id, and the
+    resulting path is logged.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    mp3_extractor = {
+        'key': 'FFmpegExtractAudio',
+        'preferredcodec': 'mp3',
+        'preferredquality': '128',
+    }
+    options = {
+        'format': 'bestaudio/best',
+        'outtmpl': f'{output_dir}/%(id)s.%(ext)s',
+        'postprocessors': [mp3_extractor],
+        'quiet': True,
+        'no_warnings': True,
+    }
+    with yt_dlp.YoutubeDL(options) as downloader:
+        metadata = downloader.extract_info(video_url, download=True)
+        mp3_path = os.path.join(output_dir, f"{metadata['id']}.mp3")
+        logger.info(f"✓ Downloaded audio: {mp3_path}")
+        return mp3_path
+def transcribe_with_groq(audio_path: str) -> str:
+    """Transcribe an audio file through Groq's hosted ``whisper-large-v3``.
+    The whole file is read into memory and uploaded under its base name;
+    the response is requested in plain-text format and returned as-is.
+    """
+    groq_client = Groq(api_key=config.GROQ_API_KEY)
+    upload_name = os.path.basename(audio_path)
+    with open(audio_path, "rb") as audio_file:
+        upload = (upload_name, audio_file.read())
+        text = groq_client.audio.transcriptions.create(
+            file=upload,
+            model="whisper-large-v3",
+            response_format="text",
+            temperature=0.0,
+        )
+    logger.info("✓ Groq transcription complete")
+    return text
+def transcribe_with_local_whisper(audio_path, model_size="base"):
+    """Transcribe ``audio_path`` with a locally loaded Whisper model.
+    Uses Whisper's ``translate`` task, so non-English speech is returned as
+    English text. The model is loaded on every call.
+    """
+    model = whisper.load_model(model_size)
+    # Force English translation for non-English audio
+    result = model.transcribe(audio_path, task="translate")
+    # Routed through the module logger (was a bare print) so the preview
+    # honours the configured log level instead of always hitting stdout.
+    logger.debug("Whisper transcript after translation: %s", result["text"][:200])
+    return result["text"]
+def get_transcript(video_id: str, video_url: str = None):
+    """Return cleaned transcript text for a video, trying cheap sources first.
+    Order of attempts:
+      1. the local transcript cache;
+      2. YouTube's own transcripts, in a fixed language priority order;
+      3. audio download plus Groq Whisper (only for files up to 24 MB);
+      4. audio download plus local Whisper.
+    Every freshly obtained transcript is cleaned and written to the cache.
+    Args:
+        video_id: YouTube video ID.
+        video_url: Optional full URL; derived from ``video_id`` when omitted.
+    Raises:
+        TranscriptError: If every approach fails (the original error is
+            chained as ``__cause__``).
+    """
+    # Step 1: Try transcript cache
+    cached = load_transcript(video_id)
+    if cached:
+        logger.info(f"✓ Using cached transcript for: {video_id}")
+        return cached
+    # Step 2: Ask YouTube once with the whole priority list. ``fetch`` walks
+    # the languages in order itself, so one request replaces one request per
+    # language. ('auto' was dropped: it is not a language code.)
+    languages = ['en', 'hi', 'es', 'fr', 'de', 'ru', 'ar', 'bn', 'id']
+    try:
+        logger.info(f"Trying transcript for languages: {languages}")
+        transcript_data = YouTubeTranscriptApi().fetch(video_id, languages=languages)
+        transcript_data = transcript_data.to_raw_data()
+        transcript_text = " ".join([entry['text'] for entry in transcript_data])
+        # Clean transcript immediately after fetching
+        transcript_text = clean_text(transcript_text)
+        save_transcript(video_id, transcript_text)
+        logger.info(f"✓ Got transcript ({len(transcript_text)} chars)")
+        return transcript_text
+    except _errors.NoTranscriptFound as e:
+        logger.info(f"✗ No transcript in any of {languages}: {str(e)}")
+    except Exception as e:
+        logger.info(f"✗ Other error while fetching transcript: {str(e)}")
+    # Step 3: Groq fallback for short videos only (<25MB audio)
+    logger.info("No transcript found for any language. Trying Groq Whisper API...")
+    audio_path = None
+    try:
+        if not video_url:
+            video_url = f"https://www.youtube.com/watch?v={video_id}"
+        audio_path = download_audio(video_url)
+        file_size_mb = os.path.getsize(audio_path) / (1024 * 1024)
+        logger.info(f"Audio file size: {file_size_mb:.2f} MB")
+        fallback_text = None
+        if file_size_mb <= 24:
+            try:
+                fallback_text = transcribe_with_groq(audio_path)
+            except Exception as groq_error:
+                logger.warning(f"Groq failed: {str(groq_error)}")
+        else:
+            logger.warning("Audio file too large for Groq fallback; trying local Whisper")
+        # Step 4: Local Whisper fallback (any file size)
+        if fallback_text is None:
+            fallback_text = transcribe_with_local_whisper(audio_path)
+        fallback_text = clean_text(fallback_text)
+        save_transcript(video_id, fallback_text)
+        return fallback_text
+    except Exception as whisper_error:
+        logger.error(f"All approaches failed: {str(whisper_error)}")
+        raise TranscriptError(
+            "No transcript could be retrieved for this video (even with local Whisper fallback). "
+            "This may be a platform restriction or severe audio download error. Contact admin if this is unexpected."
+        ) from whisper_error
+    finally:
+        # Remove the downloaded audio on every path; it used to be left behind
+        # whenever transcription failed.
+        if audio_path and os.path.exists(audio_path):
+            try:
+                os.remove(audio_path)
+            except OSError as cleanup_error:
+                logger.warning(f"Could not remove temporary audio {audio_path}: {cleanup_error}")
+def process_video(video_id: str, video_url: str = None) -> dict:
+    """Fetch, clean, chunk and index a video's transcript.
+    Returns a summary dict with the video id and URL, the length of the
+    transcript as returned by ``get_transcript``, the number of chunks
+    stored, and ``"status": "success"``.
+    """
+    logger.info(f"Starting video processing for: {video_id}")
+    raw_transcript = get_transcript(video_id, video_url)
+    pieces = chunk_text(clean_text(raw_transcript), chunk_size=500)
+    add_to_vectorstore(pieces, video_id=video_id)
+    chunk_count = len(pieces)
+    logger.info(f"✓ Processed {chunk_count} chunks into video-specific vector store")
+    resolved_url = video_url or f"https://www.youtube.com/watch?v={video_id}"
+    return {
+        "video_id": video_id,
+        "video_url": resolved_url,
+        "transcript_length": len(raw_transcript),
+        "chunks_created": chunk_count,
+        "status": "success"
+    }

app/services/video_utils.py ADDED Viewed

	@@ -0,0 +1,26 @@

+import re
+from typing import Optional
+def extract_video_id(video_input: str) -> Optional[str]:
+    """
+    Extract YouTube video ID from a URL or accept a direct video ID.
+    Args:
+        video_input: A bare 11-character ID or a URL containing one after
+            ``v=`` or as a path segment (watch, youtu.be, embed, shorts...).
+    Returns:
+        The 11-character ID, or ``None`` when none is found or the input is
+        empty/``None``.
+    """
+    if not video_input:
+        return None
+    cleaned = video_input.strip()
+    # 1. Already just a valid video ID?
+    if re.fullmatch(r'[A-Za-z0-9_-]{11}', cleaned):
+        return cleaned
+    # 2. ID after "v=" or "/". The candidate must not be followed by another
+    # ID character: the previous ``\b`` lookahead accepted the first 11
+    # characters of longer hyphenated segments ("/hello-world-foo") and
+    # rejected IDs ending in "-" or "_" at the end of the string.
+    match = re.search(r"(?:v=|/)([A-Za-z0-9_-]{11})(?![A-Za-z0-9_-])", cleaned)
+    return match.group(1) if match else None
+def is_valid_video_id(video_id: str) -> bool:
+    """Return True when ``video_id`` is exactly 11 characters of ``A-Z a-z 0-9 _ -``."""
+    id_match = re.fullmatch(r'[A-Za-z0-9_-]{11}', video_id)
+    return id_match is not None

app/storage/cache.py ADDED Viewed

	@@ -0,0 +1,19 @@

+import os
+from app.config import config
+CACHE_DIR = config.CACHE_PATH
+os.makedirs(CACHE_DIR, exist_ok=True)
+def save_transcript(video_id: str, transcript: str):
+    """Write ``transcript`` to ``<CACHE_DIR>/<video_id>.txt`` as UTF-8, replacing any existing file."""
+    target = os.path.join(CACHE_DIR, f"{video_id}.txt")
+    with open(target, "w", encoding="utf-8") as cache_file:
+        cache_file.write(transcript)
+def load_transcript(video_id: str) -> str | None:
+    """Return the cached transcript for ``video_id``, or ``None`` if no cache file exists."""
+    source = os.path.join(CACHE_DIR, f"{video_id}.txt")
+    if not os.path.exists(source):
+        return None
+    with open(source, "r", encoding="utf-8") as cache_file:
+        return cache_file.read()

app/storage/vector_store.py ADDED Viewed

	@@ -0,0 +1,330 @@

+# app/storage/vector_store.py
+from langchain_community.vectorstores import FAISS
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from app.services.embeddings import get_embeddings
+from app.config import config
+import os
+import re
+# ---- CLEAN TRANSCRIPT UTILS ----
+# ...existing code...
+import logging
+from typing import Any, Dict, List, Optional, Sequence
+logger = logging.getLogger(__name__)
+class VectorStore:
+    """
+    Generic wrapper around an underlying vector DB client.
+    - Call `add_embeddings` to persist vectors.
+    - Call `search` to retrieve nearest neighbors.
+    This wrapper ensures results are deduplicated (preserve order).
+    Adapt client initialization to your project's real client in __init__.
+    """
+    def __init__(self, client: Optional[Any] = None, namespace: Optional[str] = None):
+        """
+        If `client` is provided, this wrapper will delegate to it.
+        Otherwise you must set `self._client` later to an object exposing compatible methods.
+        """
+        self._client = client
+        self.namespace = namespace
+    # -- Helper: dedupe results preserving order --------------------------------
+    @staticmethod
+    def _dedupe_results(results: Sequence[Dict], key_fields: Optional[Sequence[str]] = None, top_k: Optional[int] = None) -> List[Dict]:
+        """
+        Deduplicate a sequence of result dicts preserving order.
+        Default dedupe key: result['id'] if present, else result.get('meta', {}).get('chunk_id'), else result.get('text')
+        Returns at most top_k items if top_k provided.
+        """
+        seen = set()
+        out = []
+        for r in results:
+            # Compose primary key candidates
+            key = None
+            if isinstance(r, dict):
+                key = r.get("id")
+                if not key:
+                    meta = r.get("meta") or {}
+                    key = meta.get("chunk_id")
+                if not key:
+                    key = r.get("text")
+            else:
+                key = str(r)
+            if key in seen:
+                continue
+            seen.add(key)
+            out.append(r)
+            if top_k and len(out) >= top_k:
+                break
+        return out
+    # -- Add embeddings ----------------------------------------------------------
+    def add_embeddings(self, ids: Sequence[str], vectors: Sequence[Sequence[float]], metadatas: Optional[Sequence[Dict]] = None):
+        """
+        Persist embeddings into the underlying client.
+        Expects:
+            ids: list of string ids (eg. chunk ids)
+            vectors: list of numeric vectors aligned with ids
+            metadatas: optional list of metadata dicts aligned with ids
+        Adapt to your client's API: this generic implementation will attempt common method names.
+        """
+        if self._client is None:
+            raise RuntimeError("VectorStore client not configured")
+        try:
+            # Common client API: add / upsert
+            if hasattr(self._client, "upsert"):
+                # chroma-like / vectordb clients
+                self._client.upsert(ids=ids, embeddings=vectors, metadatas=metadatas, namespace=self.namespace)
+                return
+            if hasattr(self._client, "add"):
+                # faiss/persisted-store wrappers
+                self._client.add(ids, vectors, metadatas)
+                return
+            # Fallback: try generic attribute names
+            if hasattr(self._client, "persist"):
+                self._client.persist(ids=ids, vectors=vectors, metadatas=metadatas)
+                return
+        except Exception:
+            logger.exception("Failed to add embeddings to vector store")
+            raise
+        raise RuntimeError("Underlying client does not expose a supported add/upsert API")
+    # -- Search / similarity retrieval -------------------------------------------
+    def search(self, query_vector: Sequence[float], top_k: int = 10, filter: Optional[Dict] = None) -> List[Dict]:
+        """
+        Run a similarity search against the underlying client.
+
+        The client is probed for a callable ``query``, then a callable ``search``,
+        then ``get_nearest_neighbors``. Whatever it returns is passed through
+        ``_normalize_query_response`` and then ``_dedupe_results`` (capped at ``top_k``).
+
+        Returns:
+            A list of result dicts:
+              [{"id": <id>, "score": <score>, "text": <text>, "meta": {...}}, ...]
+
+        Raises:
+            RuntimeError: if no client is configured or it exposes no supported method.
+            Exception: client errors are logged and re-raised unchanged.
+        """
+        client = self._client
+        if client is None:
+            raise RuntimeError("VectorStore client not configured")
+        hits = None
+        try:
+            if callable(getattr(client, "query", None)):
+                try:
+                    # chroma-like keyword signature
+                    hits = self._normalize_query_response(
+                        client.query(query_embeddings=[query_vector], n_results=top_k, where=filter, namespace=self.namespace)
+                    )
+                except TypeError:
+                    # the client rejected the keyword form; retry positionally
+                    hits = self._normalize_query_response(client.query(query_vector, top_k))
+            elif callable(getattr(client, "search", None)):
+                # FAISS/other wrappers
+                hits = self._normalize_query_response(
+                    client.search(query_vector, top_k, filter=filter, namespace=self.namespace)
+                )
+            elif hasattr(client, "get_nearest_neighbors"):
+                hits = self._normalize_query_response(client.get_nearest_neighbors(query_vector, top_k))
+            else:
+                raise RuntimeError("Underlying client does not expose a supported search/query API")
+        except Exception:
+            logger.exception("Vector store search failed")
+            raise
+        # Guarantee a list before de-duplicating
+        if not isinstance(hits, list):
+            logger.debug("Normalizing single search response to list")
+            hits = [] if hits is None else list(hits)
+        # Order-preserving de-duplication, truncated to top_k
+        return self._dedupe_results(hits, top_k=top_k)
+    # -- Response normalization -------------------------------------------------
+    @staticmethod
+    def _normalize_query_response(resp: Any) -> List[Dict]:
+        """
+        Convert common client response formats into a list of dicts with keys
+        'id', 'score', 'text', 'meta'.
+
+        Recognised shapes, tried in order:
+          1. dict with an 'ids' list (chroma-style parallel lists under 'ids',
+             'distances', 'documents', 'metadatas'; when the lists are nested per
+             query, only the first query's results are used)
+          2. dict with a 'results' list of dict records
+          3. list/tuple whose items are dicts, (id, score[, text[, meta]]) sequences,
+             or anything else (stringified into 'text')
+          4. any other object: a single result whose 'text' is str(resp)
+
+        Returns an empty list for None. A shape that fails part-way contributes no
+        partial results; normalization simply falls through to the next shape.
+        """
+        if resp is None:
+            return []
+        def _pick_score(record):
+            # Explicit None test: a score/distance of 0 is a valid value, not "missing"
+            score = record.get("score")
+            return score if score is not None else record.get("distance")
+        def _first_query(values):
+            # Unwrap [[...]] (one inner list per query) to the first query's list
+            return values[0] if values and isinstance(values[0], list) else values
+        # chroma-like: resp['ids'], resp['distances'], resp['metadatas'], resp['documents']
+        try:
+            if isinstance(resp, dict):
+                if "ids" in resp and isinstance(resp["ids"], list):
+                    ids = _first_query(resp["ids"])
+                    docs = _first_query(resp.get("documents") or [])
+                    metas = _first_query(resp.get("metadatas") or [])
+                    dists = _first_query(resp.get("distances") or [])
+                    # Every parallel list is bounds-checked so a short or missing list
+                    # degrades to None/{} instead of raising IndexError
+                    return [
+                        {
+                            "id": idv,
+                            "score": dists[i] if dists and i < len(dists) else None,
+                            "text": docs[i] if docs and i < len(docs) else None,
+                            "meta": (metas[i] if metas and i < len(metas) else None) or {},
+                        }
+                        for i, idv in enumerate(ids)
+                    ]
+                # If resp contains 'results' key that is a list
+                if "results" in resp and isinstance(resp["results"], list):
+                    return [
+                        {
+                            "id": r.get("id"),
+                            "score": _pick_score(r),
+                            "text": r.get("document") or r.get("text") or r.get("content"),
+                            "meta": r.get("metadata") or r.get("meta") or {},
+                        }
+                        for r in resp["results"]
+                    ]
+        except Exception:
+            logger.debug("Chroma-like normalization failed, trying other formats", exc_info=True)
+        # If resp is an iterable of dicts or tuples (id, score, text, meta)
+        try:
+            if isinstance(resp, (list, tuple)):
+                items = []
+                for item in resp:
+                    if isinstance(item, dict):
+                        items.append({"id": item.get("id"), "score": _pick_score(item), "text": item.get("text") or item.get("document") or item.get("content"), "meta": item.get("meta") or item.get("metadata") or {}})
+                    elif isinstance(item, (list, tuple)) and len(item) >= 2:
+                        # (id, score) or (id, score, text) or (id, score, text, meta)
+                        items.append({"id": item[0], "score": item[1], "text": item[2] if len(item) > 2 else None, "meta": item[3] if len(item) > 3 else {}})
+                    else:
+                        items.append({"id": None, "score": None, "text": str(item), "meta": {}})
+                return items
+        except Exception:
+            logger.debug("Iterable normalization failed", exc_info=True)
+        # Last resort: wrap the resp as single result with text representation
+        try:
+            return [{"id": None, "score": None, "text": str(resp), "meta": {}}]
+        except Exception:
+            return []
+# ...existing code...
+def remove_double_words(text):
+    """Collapse each run of a consecutively repeated word into one occurrence.
+
+    Matching is case-insensitive and the first occurrence (with its casing) is kept,
+    e.g. "The the the cat" -> "The cat".
+    NOTE: legitimate repetitions such as "had had" are collapsed as well.
+    """
+    # `(?:\s+\1\b)+` consumes the whole run in one match; matching a single repeat
+    # would turn "the the the" into "the the" because re.sub never rescans its output
+    return re.sub(r'\b(\w+)(?:\s+\1\b)+', r'\1', text, flags=re.IGNORECASE)
+def clean_transcript(text):
+    """Flatten a multi-line transcript into one space-joined string.
+
+    Each line is stripped; blank lines are dropped; every kept line has repeated
+    adjacent words collapsed via remove_double_words; a line equal to the previously
+    kept (cleaned) line is skipped.
+    """
+    kept = []
+    last_kept = None
+    for raw_line in text.split('\n'):
+        stripped = raw_line.strip()
+        if stripped and stripped != last_kept:
+            deduped = remove_double_words(stripped)
+            # compare again after cleaning: two lines may only become equal once cleaned
+            if deduped != last_kept:
+                kept.append(deduped)
+                last_kept = deduped
+    return ' '.join(kept)
+# ---- VECTORSTORE FUNCTIONS ----
+# Embedding object shared by every FAISS index built or loaded below; created once at import time.
+_embeddings = get_embeddings()
+# Directory of the global index: CHROMA_DB_PATH with the substring "chroma" replaced by "faiss"
+# (unchanged when the configured path does not contain "chroma").
+FAISS_INDEX_PATH = config.CHROMA_DB_PATH.replace("chroma", "faiss")
+# Created at import time so the save/load helpers can rely on the directory existing.
+os.makedirs(FAISS_INDEX_PATH, exist_ok=True)
+# Lazily initialised module-level store; populated by get_vectorstore() and reset by clear_vectorstore().
+_vectorstore = None
+def get_vectorstore():
+    """Return the module-level FAISS store, loading or creating it on first use.
+
+    If index.faiss exists under FAISS_INDEX_PATH it is loaded; when loading fails, or
+    no index exists yet, a new in-memory index seeded with the single text
+    "initialization" is created instead. Later calls return the cached object.
+    """
+    global _vectorstore
+    if _vectorstore is not None:
+        return _vectorstore
+    if not os.path.exists(os.path.join(FAISS_INDEX_PATH, "index.faiss")):
+        _vectorstore = FAISS.from_texts(["initialization"], _embeddings)
+        print(f"✓ Created new FAISS index at {FAISS_INDEX_PATH}")
+        return _vectorstore
+    try:
+        # NOTE: allow_dangerous_deserialization opts in to unpickling index.pkl —
+        # only safe while the index directory is written solely by this application
+        _vectorstore = FAISS.load_local(
+            FAISS_INDEX_PATH,
+            _embeddings,
+            allow_dangerous_deserialization=True
+        )
+        print(f"✓ Loaded existing FAISS index from {FAISS_INDEX_PATH}")
+    except Exception as load_error:
+        print(f"⚠ Could not load existing index: {load_error}")
+        _vectorstore = FAISS.from_texts(["initialization"], _embeddings)
+    return _vectorstore
+def add_to_vectorstore(texts):
+    """Add texts to the global FAISS store and persist it to FAISS_INDEX_PATH.
+
+    Args:
+        texts: an iterable of strings, or a single string (treated as one text).
+            Empty input is a no-op: nothing is embedded or written to disk.
+    """
+    if isinstance(texts, str):
+        # a bare string is itself an iterable of characters; wrap it as one text
+        texts = [texts]
+    else:
+        # materialise generators so the count printed below is available
+        texts = list(texts)
+    if not texts:
+        return
+    vectorstore = get_vectorstore()
+    vectorstore.add_texts(texts)
+    vectorstore.save_local(FAISS_INDEX_PATH)
+    print(f"✓ Added {len(texts)} texts to FAISS and saved to disk")
+def clear_vectorstore():
+    """Delete the persisted global index files and reset the cached store.
+
+    Removes index.faiss and index.pkl from FAISS_INDEX_PATH when present, so the
+    next get_vectorstore() call builds a new index.
+    """
+    global _vectorstore
+    for filename in ("index.faiss", "index.pkl"):
+        target = os.path.join(FAISS_INDEX_PATH, filename)
+        if os.path.exists(target):
+            os.remove(target)
+    _vectorstore = None
+    print("✓ Cleared FAISS vectorstore")
+def load_vectorstore_for_video(video_id: str):
+    """Load the FAISS index saved for one video from ./data/faiss/<video_id>/.
+
+    Args:
+        video_id: identifier used as the directory name of the saved index.
+
+    Raises:
+        ValueError: if video_id is empty, is "." / "..", or contains a path separator.
+        FileNotFoundError: if no index directory exists for video_id.
+    """
+    # video_id becomes part of a filesystem path and the index is unpickled on load,
+    # so reject ids that would resolve to the base directory or outside of it
+    if not video_id or video_id in (".", "..") or "/" in video_id or "\\" in video_id:
+        raise ValueError(f"Invalid video ID: {video_id!r}")
+    path = f"./data/faiss/{video_id}/"
+    if not os.path.exists(path):
+        raise FileNotFoundError(f"No vectorstore found for video ID: {video_id}")
+    return FAISS.load_local(
+        path,
+        _embeddings,
+        allow_dangerous_deserialization=True
+    )
+def create_vectorstore_for_video(video_id: str, transcript: str, chunk_size: int = 1000, chunk_overlap: int = 200):
+    """Build a FAISS index from a transcript and save it under ./data/faiss/<video_id>/.
+
+    The transcript is cleaned with clean_transcript, split into overlapping
+    character chunks, embedded, and written to disk.
+
+    Args:
+        video_id: identifier used as the directory name of the saved index.
+        transcript: raw transcript text.
+        chunk_size: maximum characters per chunk (default 1000).
+        chunk_overlap: characters shared between consecutive chunks (default 200).
+
+    Returns:
+        The newly created FAISS vectorstore.
+
+    Raises:
+        ValueError: if video_id is empty, is "." / "..", or contains a path separator,
+            or if the cleaned transcript yields no chunks.
+    """
+    # video_id becomes part of a filesystem path; reject ids that would write
+    # into the base directory or outside of it
+    if not video_id or video_id in (".", "..") or "/" in video_id or "\\" in video_id:
+        raise ValueError(f"Invalid video ID: {video_id!r}")
+    # Clean the transcript before processing
+    transcript = clean_transcript(transcript)
+    # Split transcript into chunks
+    text_splitter = RecursiveCharacterTextSplitter(
+        chunk_size=chunk_size,
+        chunk_overlap=chunk_overlap,
+        length_function=len
+    )
+    chunks = text_splitter.split_text(transcript)
+    # An index cannot be built from zero texts; fail with a clear message instead
+    if not chunks:
+        raise ValueError(f"Transcript for video {video_id} is empty after cleaning")
+    # Create vectorstore from chunks
+    vectorstore = FAISS.from_texts(
+        texts=chunks,
+        embedding=_embeddings
+    )
+    # Save to disk
+    path = f"./data/faiss/{video_id}/"
+    os.makedirs(path, exist_ok=True)
+    vectorstore.save_local(path)
+    print(f"✓ Created and saved vectorstore for video {video_id} with {len(chunks)} chunks (cleaned)")
+    return vectorstore

app/utils/logger.py ADDED Viewed

	@@ -0,0 +1,14 @@

+import logging
+import os
+import sys
+# FileHandler opens its file immediately and does not create parent directories,
+# so make sure the log directory exists before logging is configured
+_LOG_DIR = './data'
+os.makedirs(_LOG_DIR, exist_ok=True)
+# Root logger: INFO and above, written both to stdout and to ./data/app.log
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s | %(levelname)s | %(name)s | %(message)s',
+    handlers=[
+        logging.StreamHandler(sys.stdout),
+        logging.FileHandler(os.path.join(_LOG_DIR, 'app.log'), encoding='utf-8')
+    ]
+)
+def get_logger(name):
+    """Return the standard-library logger registered under ``name``."""
+    named_logger = logging.getLogger(name)
+    return named_logger

docker-compose.yml ADDED Viewed

	@@ -0,0 +1,21 @@

+# Compose stack for running the Klypse backend as a single container.
+version: '3.8'
+services:
+  klypse-backend:
+    build:
+      # The build context is the repository root; the Dockerfile lives in docker/.
+      context: .
+      dockerfile: docker/Dockerfile
+    container_name: klypse-backend
+    ports:
+      # host:container — 8000 is the port uvicorn binds in docker/Dockerfile.
+      - "8000:8000"
+    environment:
+      # Only GROQ_API_KEY is passed through; the other settings listed in
+      # .env.example are not set by this file.
+      - GROQ_API_KEY=${GROQ_API_KEY}
+    volumes:
+      # Bind-mounts the host ./data directory to /app/data (indexes, cache, app.log).
+      - ./data:/app/data
+    restart: unless-stopped
+    networks:
+      - klypse-network
+networks:
+  klypse-network:
+    driver: bridge

docker/.dockerignore ADDED Viewed

	@@ -0,0 +1,25 @@

+# NOTE(review): this file lives in docker/, while docker-compose.yml builds with
+# context "."; Docker normally reads .dockerignore from the build-context root (or
+# a <Dockerfile-name>.dockerignore next to the Dockerfile) — confirm these rules apply.
+# Python bytecode and build artefacts
+__pycache__/
+*.pyc
+*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info
+dist/
+build/
+# Logs, VCS metadata and local secrets
+*.log
+.git/
+.gitignore
+.env
+# Virtual environments
+.venv
+env/
+venv/
+# Runtime state (indexes, cache, logs): supplied at run time, e.g. via the compose volume
+data/
+# Editor and OS files
+.idea/
+.vscode/
+*.db
+*.sqlite
+.DS_Store
+# Not needed inside the image
+tests/
+docker/

docker/Dockerfile ADDED Viewed

	@@ -0,0 +1,28 @@

+# Python base image
+FROM python:3.10-slim
+# Set a working directory
+WORKDIR /app
+# Copy requirements first for better caching
+COPY requirements.txt .
+# Install system dependencies (ffmpeg for audio, git) and Python packages in one layer.
+# --no-cache-dir keeps pip's download cache out of the image.
+RUN apt-get update && \
+    apt-get install -y ffmpeg git && \
+    pip install --no-cache-dir --upgrade pip && \
+    pip install --no-cache-dir -r requirements.txt && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
+# Copy rest of your app
+COPY . .
+# Environment variables (set these securely in production).
+# key=value form: the space-separated `ENV key value` syntax is legacy.
+ENV PYTHONUNBUFFERED=1
+# Expose FastAPI default port
+EXPOSE 8000
+# NOTE(review): the README front matter declares app_port 7860 for the Space, while
+# this image listens on 8000 — confirm which port is intended.
+# Command to run your backend (edit as needed)
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "8000", "--workers", "2"]

requirements.txt ADDED Viewed

	@@ -0,0 +1,30 @@

+# Runtime dependencies for the backend. Everything is pinned to an exact version
+# except sentence-transformers, which only has a lower bound.
+# FastAPI
+fastapi==0.109.0
+uvicorn[standard]==0.27.0
+python-multipart==0.0.6
+# YouTube
+youtube-transcript-api==0.6.2
+yt-dlp==2024.3.10
+# AI/LLM - Compatible versions
+groq==0.4.2
+langchain==0.1.16
+langchain-groq==0.0.1
+langchain-community==0.0.36
+langchain-huggingface==0.0.1
+openai==1.12.0
+# Embeddings & Vector Store - Updated version
+# NOTE(review): app/storage/vector_store.py builds its indexes with FAISS — confirm
+# chromadb is still required.
+faiss-cpu==1.7.4
+sentence-transformers>=2.6.0
+chromadb==0.4.22
+# Audio Processing
+openai-whisper==20231117
+# Utils
+pydantic==2.6.0
+pydantic-settings==2.1.0
+python-dotenv==1.0.0
+requests==2.31.0

temp.py ADDED Viewed

	@@ -0,0 +1,13 @@

+# ...existing code...
+import re
+from pathlib import Path
+# Scan every cached transcript (*.txt under data/cache) for adjacent repeated words
+cache_dir = Path("data/cache")
+# one word followed by one or more whitespace-separated repeats of itself
+repeated_word = re.compile(r"\b(\w+)(?:\s+\1\b)+", flags=re.IGNORECASE)
+for transcript_file in cache_dir.glob("*.txt"):
+    repeats = repeated_word.findall(transcript_file.read_text(encoding="utf-8"))
+    if not repeats:
+        print(f"{transcript_file.name} looks ok")
+        continue
+    # show at most the first ten repeated words as a sample
+    print(f"{transcript_file.name} has repeated words sample: {repeats[:10]}")

test_config.py ADDED Viewed

	@@ -0,0 +1,60 @@

+"""
+Test Groq API connection and model quality
+"""
+from langchain_groq import ChatGroq
+from langchain.schema import HumanMessage
+import os
+from dotenv import load_dotenv
+load_dotenv()
+def test_groq():
+    """Smoke-test the Groq chat API using GROQ_API_KEY from the environment.
+
+    Prints progress and the first 500 characters of the model's reply. Returns None
+    in every case: a missing/placeholder key or an API failure is reported on stdout
+    rather than raised.
+    """
+    print("=" * 60)
+    print("Testing Groq API")
+    print("=" * 60)
+    api_key = os.getenv('GROQ_API_KEY')
+    # Both the placeholder from the setup instructions and the one shipped in
+    # .env.example count as "not configured"
+    placeholders = ('gsk_your_actual_groq_api_key_here', 'your_api_key')
+    if not api_key or api_key in placeholders:
+        print("❌ Please add your Groq API key to .env file")
+        return
+    try:
+        # Initialize Groq
+        llm = ChatGroq(
+            groq_api_key=api_key,
+            model_name="llama-3.3-70b-versatile",
+            temperature=0
+        )
+        # Show only the first 4 characters so the secret does not end up in logs
+        print(f"✓ Groq API Key: {api_key[:4]}...")
+        print("✓ Testing model quality...")
+        # Test with a complex question
+        messages = [
+            HumanMessage(content="Explain quantum computing in simple terms, then write a Python function to calculate fibonacci numbers.")
+        ]
+        response = llm.invoke(messages)
+        print("\n" + "=" * 60)
+        print("GROQ RESPONSE (GPT-4 Level Quality):")
+        print("=" * 60)
+        print(response.content[:500] + "...")
+        print("\n" + "=" * 60)
+        print("✅ Groq is working perfectly!")
+        print("Quality: GPT-4 level (Llama 3.3 70B)")
+        print("Speed: 10x faster than OpenAI")
+        print("Cost: 100% FREE forever")
+        print("=" * 60)
+    except Exception as e:
+        # Deliberately broad: this is a diagnostic script, so any failure is reported
+        print(f"❌ Error: {e}")
+        print("\nMake sure you:")
+        print("1. Created account at console.groq.com")
+        print("2. Got your API key")
+        print("3. Added it to .env file as GROQ_API_KEY=gsk_...")
+if __name__ == "__main__":
+    test_groq()

test_db.py ADDED Viewed

	@@ -0,0 +1,11 @@

+# Manual smoke script for the conversation store. Every statement runs at import
+# time against whatever backend app.database.db is configured with; nothing is asserted.
+# NOTE(review): the file name matches pytest's default test_*.py pattern, so a pytest
+# run would import (and therefore execute) it — confirm that is intended.
+from app.database.db import save_conversation, get_conversation_history, clear_session
+# Save a conversation
+save_conversation("session1", "video123", "What is this video?", "It's a music video.")
+# Retrieve history
+history = get_conversation_history("session1")
+print(history)
+# Clear session (removes what was stored above for "session1")
+clear_session("session1")

test_stream.py ADDED Viewed

	@@ -0,0 +1,8 @@

+import requests
+# Manual check of the streaming endpoint; expects the backend on localhost:8000.
+url = "http://localhost:8000/api/v1/ask/stream"
+payload = {"video_id": "U8PZejZ0F-c", "question": "What is the main story in this video?"}
+# (connect, read) timeouts: without them the script hangs indefinitely when the
+# server is down or stalls mid-stream
+with requests.post(url, json=payload, stream=True, timeout=(5, 120)) as r:
+    # Surface HTTP errors instead of printing an error body as if it were the answer
+    r.raise_for_status()
+    for line in r.iter_lines():
+        # iter_lines yields empty values for keep-alive / separator lines; skip them
+        if line:
+            print(line)

tests/test_install.py ADDED Viewed

	@@ -0,0 +1,54 @@

+"""
+Quick test to verify all packages are installed correctly
+"""
+def test_imports():
+    """Check that every critical package can be imported.
+
+    Prints one line per package and keeps going after a failure, so a single run
+    lists everything that is missing.
+
+    Returns:
+        bool: True when every check passed, False otherwise.
+    """
+    import importlib
+    # (module, attributes that must be importable from it, success label,
+    #  failure label, whether to print the module version)
+    checks = [
+        ("faiss", (), "FAISS", "FAISS import", True),
+        ("langchain", (), "LangChain", "LangChain import", True),
+        # requirements.txt ships langchain-groq (not langchain-openai), so that is
+        # the LLM integration to verify
+        ("langchain_groq", ("ChatGroq",), "LangChain Groq integration", "LangChain Groq import", False),
+        ("langchain_community.vectorstores", ("FAISS",), "LangChain FAISS integration", "FAISS integration", False),
+        ("fastapi", (), "FastAPI", "FastAPI import", False),
+        ("youtube_transcript_api", ("YouTubeTranscriptApi",), "YouTube Transcript API", "YouTube Transcript API", False),
+    ]
+    all_ok = True
+    for module_name, attrs, label, failure_label, show_version in checks:
+        try:
+            module = importlib.import_module(module_name)
+            for attr in attrs:
+                # mirrors `from module import attr`, which raises ImportError when absent
+                if not hasattr(module, attr):
+                    raise ImportError(f"cannot import name {attr!r} from {module_name!r}")
+        except ImportError as e:
+            print(f"✗ {failure_label} failed: {e}")
+            all_ok = False
+            continue
+        version = f" {getattr(module, '__version__', 'unknown')}" if show_version else ""
+        print(f"✓ {label}{version}")
+    if not all_ok:
+        return False
+    print("\n🎉 All packages installed successfully!")
+    print("Ready to run VidIQAI backend!")
+    return True
+if __name__ == "__main__":
+    test_imports()

tests/tests_processing.py ADDED Viewed

File without changes

tests/tests_transcript.py ADDED Viewed

File without changes

tests_api.py ADDED Viewed

	@@ -0,0 +1,7 @@

+# Create test_api.py in your project root
+# NOTE(review): this file is named tests_api.py, not test_api.py — confirm the intended name.
+# Manual check that app.config loads: prints the provider, the model and a key prefix.
+from app.config import config
+print(f"Provider: {config.LLM_PROVIDER}")
+# NOTE(review): .env.example only defines GROQ_* settings — confirm config still
+# exposes OPENAI_MODEL and OPENAI_API_KEY, otherwise the next two lines fail.
+print(f"Model: {config.OPENAI_MODEL}")
+# Prints the first 10 characters of the API key; avoid running where output is logged or shared.
+print(f"API Key (first 10 chars): {config.OPENAI_API_KEY[:10]}...")
+print("✓ Configuration loaded successfully!")