vilson committed on
Commit
9db894e
1 Parent(s): b09053c
Files changed (9) hide show
  1. README.md +1 -13
  2. app.py +65 -0
  3. qa/chains.py +6 -0
  4. qa/loader.py +7 -0
  5. qa/manager.py +24 -0
  6. qa/model.py +6 -0
  7. qa/split.py +7 -0
  8. qa/vector_store.py +25 -0
  9. requirements.txt +5 -0
README.md CHANGED
@@ -1,13 +1 @@
1
- ---
2
- title: Youtube Retrieval Qa
3
- emoji: 📚
4
- colorFrom: indigo
5
- colorTo: red
6
- sdk: gradio
7
- sdk_version: 3.32.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # youtube-retrieval-qa
 
 
 
 
 
 
 
 
 
 
 
 
app.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import List
3
+ import gradio as gr
4
+ from qa.manager import YoutubeQA
5
+
6
# HTML/Markdown intro text shown at the top of the app (passed to gr.Markdown).
DESCRIPTION = """

<h1> <center> 🤗 Hello. This App will help you do questions on youtube videos.</center> </h1>

<h4>
Follow this steps to use 😉:
</h4>

<ol>
<li>Set your OpenAI Key</li>
<li>Set your Youtube URL</li>
<li>Ask!</li>
</ol>
"""

# Single module-level QA manager shared by all the UI callbacks below.
qa = YoutubeQA()
22
+
23
def set_openai_key(key: str):
    """Store the given OpenAI key in the process environment.

    Args:
        key: The API key typed by the user; written to ``OPENAI_API_KEY``.

    Returns:
        A Gradio update that sets the status field to "Not Ready".
    """
    os.environ["OPENAI_API_KEY"] = key
    # Set status field to Not Ready
    not_ready_status = gr.update(lines=1, value="Not Ready 🥴")
    return not_ready_status
27
+
28
def instanciate_retriver(url: str):
    """Prepare the shared ``qa`` manager for the video at ``url``.

    Runs the three loading steps in order: model, vector store (built from
    the URL), then the retrieval chain.

    Args:
        url: The YouTube URL entered by the user.

    Returns:
        A Gradio update that sets the status field to "Ready".
    """
    qa.load_model()
    qa.load_vector_store(url)
    qa.load_retriever()

    # Set status field to Ready
    ready_status = gr.update(lines=1, value="Ready 😎")
    return ready_status
34
+
35
def respond(message: str, chat_history: List[tuple]):
    """Answer ``message`` with the shared ``qa`` manager and record the turn.

    Args:
        message: The question typed by the user.
        chat_history: Previous turns as ``(user_message, bot_message)``
            pairs. A missing history (``None``) is treated as empty.

    Returns:
        A pair ``("", history)``: the empty string clears the question box,
        and ``history`` is the chat history with the new turn appended.
    """
    # Guard against a missing history so the append below cannot fail.
    history = chat_history if chat_history is not None else []
    bot_message = qa.run(message)
    history.append((message, bot_message))
    return "", history
39
+
40
# UI layout: an intro header followed by three tabs (QA, Youtube URL, OpenAI Key).
with gr.Blocks() as app:

    gr.Markdown(DESCRIPTION)

    # Tab 1: status indicator, chat window and question input.
    with gr.Tab("QA"):
        status = gr.Textbox(label="🤔 Vector DB Status:", interactive=False)
        chatbot = gr.Chatbot(label="🤖 Bot Answer:")
        question = gr.Textbox(label="🕵️‍♀️ Question:", placeholder="Write your question here and press enter")
        clear = gr.Button("Clear")
        # Submitting a question clears the input box and updates the chat.
        question.submit(respond, [question, chatbot], [question, chatbot])
        # The Clear button resets the chat window.
        clear.click(lambda: None, None, chatbot, queue=False)

    # Tab 2: the video to index.
    with gr.Tab("Youtube URL"):
        url = gr.Textbox(label="🎞️ URL:", lines=1, placeholder="Set your Youtube URL here...")
        url_button = gr.Button("Set URL")

    # Tab 3: the user's OpenAI credentials.
    with gr.Tab("OpenAI Key"):
        key = gr.Textbox(label="🔑 Key:", type="password", placeholder="Set your OpenAI Key here...")
        key_button = gr.Button("Set Key")

    #with gr.Accordion("Click me. About this App"):
    #    gr.Markdown("Look at me...")

    # Both buttons report their outcome through the status textbox.
    url_button.click(instanciate_retriver, inputs=url, outputs=status)
    key_button.click(set_openai_key, inputs=key, outputs=status)

app.launch()
qa/chains.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+
3
def retrieval_qa(llm: Callable, retriever: Callable) -> Callable:
    """Build a RetrievalQA chain of type "stuff".

    Args:
        llm: The language model handed to the chain.
        retriever: The retriever handed to the chain.

    Returns:
        The chain created by ``RetrievalQA.from_chain_type``.
    """
    # Function-scope import: langchain is only imported when this is called.
    from langchain.chains import RetrievalQA

    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=retriever,
    )
qa/loader.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
def youtube_doc_loader(url: str) -> List:
    """Load documents for a YouTube video through langchain's YoutubeLoader.

    Args:
        url: The YouTube URL to load from.

    Returns:
        Whatever ``YoutubeLoader.load()`` returns for that URL
        (video info is not requested: ``add_video_info=False``).
    """
    # Function-scope import: langchain is only imported when this is called.
    from langchain.document_loaders import YoutubeLoader

    return YoutubeLoader.from_youtube_url(url, add_video_info=False).load()
qa/manager.py ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from qa.chains import retrieval_qa
2
+ from qa.loader import youtube_doc_loader
3
+ from qa.model import load_llm
4
+ from qa.split import split_document
5
+ from qa.vector_store import create_vector_store
6
+
7
class YoutubeQA:
    """Retrieval question answering over documents loaded from a YouTube URL.

    Expected call order: ``load_model()``, ``load_vector_store(url)``,
    ``load_retriever()``, then ``run(question)`` as often as needed.
    """

    def __init__(self):
        # Filled in by the load_* methods; None means "not loaded yet".
        self.llm = None
        # NOTE: attribute spelling ("retriver") kept for backward compatibility.
        self.retriver = None
        self.retrieval_qa = None

    def load_model(self) -> None:
        """Create the LLM with the defaults of ``load_llm``."""
        self.llm = load_llm()

    def load_vector_store(self, url: str) -> None:
        """Load the video documents, split them and build the retriever.

        Args:
            url: The YouTube URL to index.
        """
        data = youtube_doc_loader(url=url)
        docs = split_document(data=data)
        self.retriver = create_vector_store(docs=docs)

    def load_retriever(self) -> None:
        """Combine the loaded LLM and retriever into the QA chain.

        Raises:
            RuntimeError: If the model or the vector store is not loaded yet.
        """
        if self.llm is None or self.retriver is None:
            raise RuntimeError(
                "Call load_model() and load_vector_store(url) before load_retriever()."
            )
        self.retrieval_qa = retrieval_qa(llm=self.llm, retriever=self.retriver)

    def run(self, question: str) -> str:
        """Answer ``question`` with the QA chain.

        Raises:
            RuntimeError: If the QA chain has not been built yet.
        """
        if self.retrieval_qa is None:
            raise RuntimeError(
                "The QA chain is not ready: set the OpenAI key and the Youtube URL first."
            )
        return self.retrieval_qa.run(question)
qa/model.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from typing import Callable
2
+
3
def load_llm(temperature: float = 0.0, model: str = 'gpt-3.5-turbo') -> Callable:
    """Create the chat model used for question answering.

    Args:
        temperature: Forwarded to ``ChatOpenAI`` as ``temperature``.
        model: Forwarded to ``ChatOpenAI`` as ``model``.

    Returns:
        The constructed ``ChatOpenAI`` instance.
    """
    # Function-scope import: langchain is only imported when this is called.
    from langchain.chat_models import ChatOpenAI

    return ChatOpenAI(temperature=temperature, model=model)
qa/split.py ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ from typing import List
2
+
3
def split_document(data: List, chunk_size: int = 3000, chunk_overlap: int = 400) -> List:
    """Split documents into chunks with a RecursiveCharacterTextSplitter.

    Args:
        data: The documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.
            Defaults to 400, the value that was previously hard-coded.
            NOTE(review): presumably this must not exceed ``chunk_size`` —
            confirm against the installed langchain version.

    Returns:
        The result of ``split_documents(data)``.
    """
    # Function-scope import: langchain is only imported when this is called.
    from langchain.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
qa/vector_store.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Callable, List
2
+
3
def create_vector_store(
    docs: List,
    metric: str = 'cos',
    top_k: int = 4
) -> Callable:
    """Embed ``docs`` into a FAISS store and return a retriever over it.

    Args:
        docs: The documents to embed and index.
        metric: Stored in the retriever's search kwargs as
            ``distance_metric``.
            NOTE(review): it is not clear that the FAISS store honours this
            key — confirm against the installed langchain version.
        top_k: Stored in the retriever's search kwargs as ``k``.

    Returns:
        The retriever obtained from ``FAISS.as_retriever()`` with the two
        search kwargs above set.
    """
    # Function-scope imports: langchain is only imported when this is called.
    from langchain.vectorstores import FAISS
    from langchain.embeddings.openai import OpenAIEmbeddings

    # Embed the documents and combine them with the raw text in a pseudo db.
    # Note: This will make an API call to OpenAI
    vector_index = FAISS.from_documents(docs, OpenAIEmbeddings())

    # Retriever object
    retriever = vector_index.as_retriever()

    # Retriever configs
    retriever.search_kwargs.update(distance_metric=metric, k=top_k)

    return retriever
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ langchain
2
+ openai
3
+ youtube-transcript-api
4
+ faiss-cpu
5
+ tiktoken