Video-Rag / app /services /transcription.py
Hammad712's picture
first commit
e84d389
# app/services/transcription.py
import os
import uuid
from datetime import datetime
from fastapi import BackgroundTasks, HTTPException
from langchain.text_splitter import RecursiveCharacterTextSplitter
from ..services.llm import get_embeddings
from ..config import settings
from ..db.mongodb import mongodb
from ..db.chat_manager import chat_manager
from langchain_community.vectorstores import FAISS
# Ensure the video directory exists as soon as this module is imported
# (import-time side effect; exist_ok=True makes repeated imports harmless).
os.makedirs(settings.VIDEOS_DIR, exist_ok=True)
# Handle to the MongoDB "chunks" collection: process_transcription() writes the
# text splits here and get_retriever() reads them back by session_id.
chunks_collection = mongodb.db.get_collection("chunks")
def process_transcription(transcription: str, user_id: str, title: str, source_type: str,
                          source_url: str = None, file_size: int = None) -> str:
    """
    Split a transcription into chunks, persist it, and open a chat session.

    A fresh UUID4 string is generated and used both as the ``video_id`` of the
    metadata document and as the ``session_id`` of every chunk document.

    Args:
        transcription: Full transcript text; stored verbatim and also split
            into chunks of up to 1024 characters with a 20-character overlap.
        user_id: Stored as-is on the metadata document.
        title: Stored as-is on the metadata document.
        source_type: Stored as-is on the metadata document.
        source_url: Optional; stored as-is (``None`` when omitted).
        file_size: Optional; stored under the ``size`` key.

    Returns:
        The newly generated session ID.
    """
    # Split text
    splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=20)
    splits = splitter.split_text(transcription)

    # Persist session metadata
    session_id = str(uuid.uuid4())
    mongodb.videos.insert_one({
        "video_id": session_id,
        "user_id": user_id,
        "title": title,
        "source_type": source_type,
        "source_url": source_url,
        "created_at": datetime.utcnow(),
        "transcription": transcription,
        "size": file_size,
    })

    # Store chunks for retrieval. PyMongo's insert_many raises TypeError for an
    # empty document list, which previously aborted the call *after* the
    # metadata insert above and left a video record without chat history.
    # An empty/blank transcription therefore simply stores no chunks;
    # get_retriever() reports the missing data with its own 404.
    if splits:
        chunks_collection.insert_many(
            [{"session_id": session_id, "text": chunk} for chunk in splits]
        )

    # Initialize chat history in Mongo
    chat_manager.initialize_chat_history(session_id)
    return session_id
def get_retriever(session_id: str):
    """
    Return a retriever (k=3) over the chunks stored for ``session_id``.

    The chunk texts are read from the "chunks" collection and embedded into a
    freshly built FAISS vector store on every call.

    Raises:
        HTTPException: 404 when no chunk documents exist for the session.
    """
    # Collect the text of every chunk document belonging to this session.
    texts = []
    for record in chunks_collection.find({"session_id": session_id}):
        texts.append(record["text"])

    # Nothing stored means the session was never transcribed (or has no text).
    if len(texts) == 0:
        raise HTTPException(status_code=404, detail="Session data not found. Please transcribe first.")

    # Embed the texts and expose the resulting store as a retriever.
    embedder = get_embeddings()
    store = FAISS.from_texts(texts, embedder)
    return store.as_retriever(search_kwargs={"k": 3})
def save_video_file(video_id: str, file_path: str, contents: bytes) -> None:
    """
    Persist the uploaded video file to disk.

    Args:
        video_id: Not used by this function; kept so existing callers keep
            working.
        file_path: Destination path. Missing parent directories are created.
        contents: Raw bytes to write; an existing file is overwritten.
    """
    parent_dir = os.path.dirname(file_path)
    # A bare filename has no directory component; os.makedirs("") would raise
    # FileNotFoundError, so only create directories when there is one.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    with open(file_path, "wb") as f:
        f.write(contents)