ubermenchh committed on
Commit
5af81e5
1 Parent(s): b3c1ba9

Create app.py

Files changed (1)
  1. app.py +82 -0
app.py ADDED
@@ -0,0 +1,82 @@
+ import torch, os, argparse, shutil, textwrap, time, streamlit as st
+ from urllib.parse import urlparse, parse_qs
+ from langchain.document_loaders import YoutubeLoader
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+ from langchain.vectorstores import Chroma
+ from langchain.embeddings import OpenAIEmbeddings, HuggingFaceInstructEmbeddings, HuggingFaceBgeEmbeddings
+ from langchain.chains import RetrievalQA
+ from langchain.llms import OpenAI
+ from langchain.chat_models import ChatOpenAI
+ from langchain import HuggingFaceHub
+ from transformers import pipeline
+ from deep_translator import GoogleTranslator
+ from langdetect import detect
+
+ def typewriter(text, speed):
+     # Render the text one character at a time for a typewriter effect.
+     container = st.empty()
+     displayed_text = ''
+
+     for char in text:
+         displayed_text += char
+         container.markdown(displayed_text)
+         time.sleep(1 / speed)
+
+ def wrap_text_preserve_newlines(text, width=110):
+     lines = text.split('\n')
+     wrapped_lines = [textwrap.fill(line, width=width) for line in lines]
+     wrapped_text = '\n'.join(wrapped_lines)
+     return wrapped_text
+
+ def process_llm_response(llm_response):
+     typewriter(llm_response['result'], speed=40)
+
+ def extract_video_id(youtube_url):
+     # Pull the 'v' query parameter out of a standard YouTube watch URL.
+     try:
+         parsed_url = urlparse(youtube_url)
+         query_params = parse_qs(parsed_url.query)
+         video_id = query_params.get('v', [None])[0]
+
+         return video_id
+     except Exception as e:
+         print(f"Error extracting video ID: {e}")
+         return None
+
+ def chat():
+     HF_TOKEN = os.environ.get('HF_TOKEN', False)
+     model_name = "BAAI/bge-base-en"
+     encode_kwargs = {'normalize_embeddings': True}
+
+     st.title('YouTube ChatBot')
+
+     video_url = st.text_input('Insert video URL', placeholder='Format should be like: https://www.youtube.com/watch?v=pSLeYvld8Mk')
+     query = st.text_input("Ask any question about the video")
+
+     if st.button('Submit', type='primary'):
+         with st.spinner('Processing the video...'):
+             video_id = extract_video_id(video_url)
+             loader = YoutubeLoader(video_id)
+             documents = loader.load()
+
+             # Split the transcript into overlapping chunks for retrieval.
+             text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
+             documents = text_splitter.split_documents(documents)
+
+             # Embed the chunks with BGE embeddings and index them in Chroma.
+             embeddings = HuggingFaceBgeEmbeddings(
+                 model_name=model_name,
+                 model_kwargs={'device': 'cuda' if torch.cuda.is_available() else 'cpu'},
+                 encode_kwargs=encode_kwargs
+             )
+             vector_db = Chroma.from_documents(documents, embeddings)
+
+             # Build a RetrievalQA chain backed by Falcon-7B-Instruct on the Hugging Face Hub.
+             repo_id = "tiiuae/falcon-7b-instruct"
+             qa_chain = RetrievalQA.from_chain_type(
+                 llm=HuggingFaceHub(
+                     huggingfacehub_api_token=HF_TOKEN,
+                     repo_id=repo_id,
+                     model_kwargs={'temperature': 0.1, 'max_new_tokens': 1000},
+                 ),
+                 retriever=vector_db.as_retriever(),
+                 return_source_documents=False,
+                 verbose=False
+             )
+
+         with st.spinner('Generating Answer...'):
+             llm_response = qa_chain(query)
+             process_llm_response(llm_response)
+
+ chat()