import argparse
import os
import shutil
import textwrap
import time
from urllib.parse import parse_qs, urlparse

import streamlit as st
import torch
from deep_translator import GoogleTranslator
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import YoutubeLoader
from langchain.embeddings import (
    HuggingFaceBgeEmbeddings,
    HuggingFaceInstructEmbeddings,
    OpenAIEmbeddings,
)
from langchain.llms import OpenAI
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langdetect import detect
from transformers import pipeline

# URL path prefixes after which the first path segment is taken as the video ID.
_VIDEO_PATH_PREFIXES = ('/embed/', '/shorts/', '/live/', '/v/')

# Host names (and their sub-domains) for which the path prefixes above apply.
_YOUTUBE_DOMAINS = ('youtube.com', 'youtube-nocookie.com')


def typewriter(text, speed):
    """Render *text* into a Streamlit placeholder one character at a time.

    Args:
        text: The string to display.
        speed: Characters per second. A value that is zero or negative
            disables the animation and shows the whole text at once
            (previously this raised ZeroDivisionError / ValueError).
    """
    container = st.empty()
    if speed <= 0:
        if text:
            container.markdown(text)
        return

    delay = 1 / speed  # loop-invariant, computed once
    for end in range(1, len(text) + 1):
        # Re-render the growing prefix so the text appears to be typed.
        container.markdown(text[:end])
        time.sleep(delay)


def wrap_text_preserve_newlines(text, width=110):
    """Wrap each line of *text* to *width* columns, keeping its line breaks.

    Args:
        text: The text to wrap.
        width: Maximum line width passed to ``textwrap.fill``.

    Returns:
        The wrapped text, with the original lines re-joined by ``'\\n'``.
    """
    return '\n'.join(
        textwrap.fill(line, width=width) for line in text.split('\n')
    )


def process_llm_response(llm_originalresponse2):
    """Display the ``'result'`` entry of a chain response with ``typewriter``.

    Args:
        llm_originalresponse2: Mapping returned by the QA chain; only its
            ``'result'`` key is read.
    """
    typewriter(llm_originalresponse2['result'], speed=40)


def _first_path_segment(path):
    """Return the first non-empty segment of a URL path, or None."""
    segment = path.lstrip('/').split('/', 1)[0]
    return segment or None


def extract_video_id(youtube_url):
    """Extract the video ID from a YouTube URL.

    Recognised forms:
        * ``...watch?v=<id>`` (any URL carrying a ``v`` query parameter,
          as before),
        * ``youtu.be/<id>`` short links,
        * ``/embed/<id>``, ``/shorts/<id>``, ``/live/<id>`` and ``/v/<id>``
          paths on a YouTube host.

    URLs written without a scheme (``youtu.be/<id>``) are accepted too.

    Args:
        youtube_url: The URL entered by the user.

    Returns:
        The video ID string, or None when the input is not a string, is
        blank, cannot be parsed, or contains no recognisable ID.
    """
    if not isinstance(youtube_url, str):
        return None
    url = youtube_url.strip()
    if not url:
        return None
    if '://' not in url:
        # Without a scheme urlparse() puts the host into the path, which
        # would defeat the host checks below.
        url = 'https://' + url

    try:
        parsed_url = urlparse(url)
        host = (parsed_url.hostname or '').lower()
    except ValueError as e:
        # urlparse raises ValueError for malformed network locations.
        print(f"Error extracting video ID: {e}")
        return None

    # Standard watch URLs: the ID is the first value of the "v" parameter.
    video_id = parse_qs(parsed_url.query).get('v', [None])[0]
    if video_id:
        return video_id

    if host in ('youtu.be', 'www.youtu.be'):
        return _first_path_segment(parsed_url.path)

    on_youtube = any(
        host == domain or host.endswith('.' + domain)
        for domain in _YOUTUBE_DOMAINS
    )
    if on_youtube:
        for prefix in _VIDEO_PATH_PREFIXES:
            if parsed_url.path.startswith(prefix):
                return _first_path_segment(parsed_url.path[len(prefix):])

    return None


def chat():
    """Render the Streamlit page and answer a question about a video.

    On "Submit" the function loads the video's documents through
    ``YoutubeLoader``, splits them into overlapping chunks, indexes them in
    a Chroma store using BGE embeddings, runs a ``RetrievalQA`` chain backed
    by the ``tiiuae/falcon-7b-instruct`` Hugging Face Hub model, and shows
    the answer with the typewriter effect.

    Invalid input (unrecognised URL, blank question, missing API token,
    no loaded documents) is reported with ``st.error`` instead of being
    passed on to the loader or the model.
    """
    # NOTE(review): the original passed False when HF_TOKEN was unset, which
    # presumably let LangChain fall back to HUGGINGFACEHUB_API_TOKEN; reading
    # that variable here keeps such setups working - confirm against the
    # pinned LangChain version.
    HF_TOKEN = (
        os.environ.get('HF_TOKEN')
        or os.environ.get('HUGGINGFACEHUB_API_TOKEN')
    )
    model_name = "BAAI/bge-base-en"
    encode_kwargs = {'normalize_embeddings': True}

    st.title('YouTube ChatBot')
    video_url = st.text_input(
        'Insert video URL',
        placeholder='Format should be like: https://www.youtube.com/watch?v=pSLeYvld8Mk',
    )
    query = st.text_input("Ask any question about the video")

    if not st.button('Submit', type='primary'):
        return

    # Validate everything that is cheap to check before doing heavy work.
    video_id = extract_video_id(video_url)
    if not video_id:
        st.error('Could not find a video ID in that URL. Please check the link.')
        return
    if not query or not query.strip():
        st.error('Please enter a question about the video.')
        return
    if not HF_TOKEN:
        st.error('No Hugging Face API token found. Set the HF_TOKEN environment variable.')
        return

    with st.spinner('Processing the video...'):
        loader = YoutubeLoader(video_id)
        documents = loader.load()
        if not documents:
            st.error('No transcript could be loaded for this video.')
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=1000, chunk_overlap=100
        )
        documents = text_splitter.split_documents(documents)

        # Run the embedding model on the GPU when one is available.
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        vector_db = Chroma.from_documents(
            documents,
            embedding=HuggingFaceBgeEmbeddings(
                model_name=model_name,
                model_kwargs={'device': device},
                encode_kwargs=encode_kwargs,
            ),
        )

        repo_id = "tiiuae/falcon-7b-instruct"
        qa_chain = RetrievalQA.from_chain_type(
            llm=HuggingFaceHub(
                huggingfacehub_api_token=HF_TOKEN,
                repo_id=repo_id,
                model_kwargs={'temperature': 0.1, 'max_new_tokens': 1000},
            ),
            retriever=vector_db.as_retriever(),
            return_source_documents=False,
            verbose=False,
        )

    with st.spinner('Generating Answer...'):
        llm_response = qa_chain(query)

    process_llm_response(llm_response)


chat()