# Streamlit app that answers questions "about me" with a RetrievalQA chain
# built over my YouTube video transcripts and Medium blog posts.
import os
import re

import requests
import streamlit as st
from bs4 import BeautifulSoup
from langchain.chains import RetrievalQA
from langchain.document_loaders import WebBaseLoader, YoutubeLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import Replicate
from langchain.prompts import PromptTemplate
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import FAISS
from langchain.vectorstores.base import VectorStoreRetriever

st.set_page_config(page_title="🦜🔗 Ask an LLM to know more about me")
st.title('🦜🔗 Ask an LLM to know more about me')


def has_numbers(input_string):
    """Return True if the string contains at least one digit."""
    return any(char.isdigit() for char in input_string)


@st.cache_resource
def get_query_chain():
    # Embedding model used to index the documents.
    model_name = "sentence-transformers/all-mpnet-base-v2"
    model_kwargs = {'device': 'cpu'}
    encode_kwargs = {'normalize_embeddings': False}
    hf = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs=model_kwargs,
        encode_kwargs=encode_kwargs,
    )

    # Scrape my channel page and collect the video ids linked on it.
    my_url = "https://www.youtube.com/@rrwithdeku8677/videos"
    r = requests.get(my_url)
    page = r.text
    pattern = r'watch\?v=([^"]+)'
    matches = re.findall(pattern, page, re.IGNORECASE)
    ids = [x.split('=')[-1] for x in matches]

    base_url = "https://www.youtube.com/watch?v="
    video_data = []
    # TODO - Cache this and only do this if there is a new video
    for video_id in ids:
        loader = YoutubeLoader.from_youtube_url(
            base_url + video_id, add_video_info=True
        )
        data = loader.load()
        video_data.extend(data)

    # Collect links to my Medium posts from the profile page.
    profile_url = "https://ayushtues.medium.com"
    response = requests.get(profile_url)
    soup = BeautifulSoup(response.content, 'html.parser')
    links = []
    for link in soup.findAll('a'):
        href = link.get('href')
        # Post links are relative paths that contain a numeric post id.
        if href and href.startswith('/') and has_numbers(href):
            links.append(href)
    links = list(set(links))
    links = [profile_url + x.split('?source')[0] for x in links]

    loader = WebBaseLoader(links)
    data = loader.load()
    video_data.extend(data)

    # Split documents into small chunks, embed them, and build the retriever.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)
    all_splits = text_splitter.split_documents(video_data)
    vectorstore = FAISS.from_documents(documents=all_splits, embedding=hf)
    retriever = VectorStoreRetriever(vectorstore=vectorstore)

    template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum and keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
    QA_CHAIN_PROMPT = PromptTemplate.from_template(template)

    llm = Replicate(
        model="a16z-infra/llama13b-v2-chat:df7690f1994d94e96ad9d568eac121aecf50684a0b0963b25a41cc40061269e5",
        input={"temperature": 0.75, "max_length": 500, "top_p": 1},
    )
    qa_chain = RetrievalQA.from_chain_type(
        llm,
        retriever=retriever,
        chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
    )
    return qa_chain


def generate_response(topic, query_chain):
    result = query_chain({"query": topic})
    return st.info(result)


with st.form('myform'):
    topic_text = st.text_input('Enter keyword:', '')
    submitted = st.form_submit_button('Submit')
    if submitted:
        os.environ["REPLICATE_API_TOKEN"] = st.secrets["REPLICATE_API_TOKEN"]
        query_chain = get_query_chain()
        generate_response(topic_text, query_chain)