Daksh0505 committed on
Commit
1ab0e96
·
verified ·
1 Parent(s): ea1c842

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +104 -0
app.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ from langchain_huggingface import ChatHuggingFace, HuggingFaceEndpoint, HuggingFaceEmbeddings
3
+ from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
4
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
5
+ from langchain.vectorstores import FAISS
6
+ from langchain.prompts import PromptTemplate
7
+ import os
8
+
9
+ api_key = os.getenv("HF_API_KEY")
10
+
11
+ # πŸ“Ό Transcript Language Options
12
+ @st.cache_data
13
+ def get_available_languages(video_id):
14
+ transcriber = YouTubeTranscriptApi()
15
+ try:
16
+ transcript_info = transcriber.list(video_id)
17
+ return [(t.language_code, t.language) for t in transcript_info]
18
+ except Exception:
19
+ return []
20
+
21
+ # πŸ“Ό Transcript Fetcher
22
+ @st.cache_data
23
+ def get_transcript(video_id, language_code):
24
+ transcriber = YouTubeTranscriptApi()
25
+ try:
26
+ transcript_list = transcriber.fetch(video_id, languages=[language_code])
27
+ return ' '.join([d.text for d in transcript_list])
28
+ except (NoTranscriptFound, TranscriptsDisabled):
29
+ return None
30
+ except Exception:
31
+ return None
32
+
33
# 🧠 Embedding Loader
@st.cache_resource
def load_embeddings():
    """Construct the multilingual E5 embedding model on CPU (cached once per process)."""
    config = {
        "model_name": "intfloat/multilingual-e5-base",
        "model_kwargs": {"device": "cpu"},
    }
    return HuggingFaceEmbeddings(**config)
40
+
41
# 🧱 Vector Store Builder
@st.cache_resource
def create_vector_store(transcript):
    """Split *transcript* into overlapping chunks and index them in FAISS.

    Cached with st.cache_resource rather than st.cache_data: a FAISS vector
    store wraps a native index that st.cache_data would try (and may fail) to
    pickle on every cache write; cache_resource stores the object as-is.
    """
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
    docs = splitter.create_documents([transcript])
    return FAISS.from_documents(docs, load_embeddings())
47
+
48
+ # πŸ€– Model Builder
49
+ def build_model(model_choice, temperature):
50
+ repo_id = "deepseek-ai/DeepSeek-V3.2-Exp" if model_choice == "DeepSeek" else "openai/gpt-oss-20b"
51
+ llm = HuggingFaceEndpoint(
52
+ repo_id=repo_id,
53
+ huggingfacehub_api_token=api_key,
54
+ task="text-generation"
55
+ )
56
+ return ChatHuggingFace(llm=llm, temperature=temperature)
57
+
58
# 🧾 Prompt Template
# Instructs the model to answer from the retrieved transcript context first,
# explicitly flag when the video does not cover the topic, and only then fall
# back to its own knowledge. Filled with {context} (retrieved chunks) and
# {question} (the user query) before being sent to the chat model.
prompt_template = PromptTemplate(
    template=(
        "You are a helpful assistant.\n\n"
        "Answer the question using the context provided below.\n"
        "If the context does not mention the topic, say clearly: 'There is no mention of the topic in the video you provided.'\n"
        "Then, based on your own knowledge, try to answer the question.\n"
        "If both the context and your knowledge are insufficient, say: 'I don't know.'\n\n"
        "Keep the answer format neat, clean, and human-readable.\n\n"
        "Context:\n{context}\n\n"
        "Question:\n{question}"
    ),
    input_variables=["context", "question"]
)
72
+
73
+ # πŸš€ App UI
74
+ st.title("πŸŽ₯ YouTube Transcript Chatbot")
75
+
76
+ video_id = st.text_input("YouTube Video ID", value="lv1_-RER4_I")
77
+ if video_id:
78
+ langs = get_available_languages(video_id)
79
+ lang_options = [f"{name} ({code})" for code, name in langs] if langs else ["No transcript available"]
80
+ selected_lang = st.selectbox("Transcript Language", lang_options)
81
+ language_code = selected_lang.split("(")[-1].strip(")") if langs else None
82
+ else:
83
+ language_code = None
84
+
85
+ query = st.text_area("Your Query", value="What is RAG?")
86
+ model_choice = st.radio("Model to Use", ["DeepSeek", "OpenAI"])
87
+ temperature = st.slider("Temperature", 0, 100, value=50)
88
+
89
+ if st.button("πŸš€ Run Chatbot"):
90
+ if not video_id or not query or not language_code:
91
+ st.warning("Please fill in all fields.")
92
+ else:
93
+ with st.spinner("Fetching transcript and generating response..."):
94
+ transcript = get_transcript(video_id, language_code)
95
+ if not transcript:
96
+ st.error("Transcript not available or disabled.")
97
+ else:
98
+ retriever = create_vector_store(transcript).as_retriever(search_type="mmr", search_kwargs={"k": 5})
99
+ relevant_docs = retriever.invoke(query)
100
+ context_text = "\n\n".join(doc.page_content for doc in relevant_docs)
101
+ prompt = prompt_template.invoke({"context": context_text, "question": query})
102
+ model = build_model(model_choice, temperature / 100.0)
103
+ response = model.invoke(prompt)
104
+ st.text_area("Model Response", value=response.content, height=400)