# DocArrayInMemorySearch is a document index provided by DocArray that stores documents in memory.
# It is a great starting point for small datasets, where you may not want to launch a database server.

# import libraries
import streamlit as st
import requests
from bs4 import BeautifulSoup
# from langchain.indexes import VectorstoreIndexCreator  # logic for creating indexes
# from langchain.vectorstores import DocArrayInMemorySearch  # in-memory document index provided by DocArray
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import RetrievalQA
from transformers import AutoModel  # only needed for the commented-out AutoModel alternative below
import os

# import vertexai
# from langchain.llms import VertexAI
# from langchain.embeddings import VertexAIEmbeddings
# vertexai.init(project=PROJECT, location=LOCATION)  # PROJECT is the GCP project ID, LOCATION the region
# The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of
# natural language tasks such as sentiment analysis, entity extraction, and content creation. The
# types of content that the PaLM 2 for Text models can create include document summaries, answers
# to questions, and labels that classify content.

api_key = os.getenv("huggingface_api_key")
llm = HuggingFaceEndpoint(
    repo_id="mistralai/Mistral-7B-Instruct-v0.2",
    temperature=0.3,
    huggingfacehub_api_token=api_key,
)
# llm = AutoModel.from_pretrained("mistralai/Mistral-7B-Instruct-v0.2", use_auth_token=api_key)
# model = SentenceTransformer("all-MiniLM-L6-v2")
# llm = VertexAI(model_name="text-bison@001", max_output_tokens=256, temperature=0.1, top_p=0.8, top_k=40, verbose=True)
# embeddings = VertexAIEmbeddings()
# embeddings = model.encode(sentences)


# The code below scrapes all the text from the webpage link provided by the user
# and saves it in a text file.
def get_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Create a BeautifulSoup object from the HTML content
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the specific element or elements containing the text we want to scrape;
    # here, we find all <p> tags and extract their text
    paragraphs = soup.find_all("p")
    # Loop through the paragraphs and write their text to the file
    with open("text\\temp.txt", "w", encoding="utf-8") as file:
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")
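# A quick standalone check of get_text (a sketch, commented out so it does not run on
# every Streamlit rerun; the URL is an illustrative placeholder, not part of the app):
# if __name__ == "__main__":
#     get_text("https://en.wikipedia.org/wiki/Streamlit")
#     with open("text\\temp.txt", encoding="utf-8") as f:
#         print(f.read()[:500])  # preview the first 500 scraped characters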

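# A minimal sketch of the DocArrayInMemorySearch alternative mentioned at the top, for
# small datasets where the persistent Chroma store built below may be overkill. It
# assumes the docarray package is installed and reuses docs/embeddings as built inside
# create_langchain_index; commented out because the app itself sticks with Chroma:
# from langchain_community.vectorstores import DocArrayInMemorySearch
# in_memory_db = DocArrayInMemorySearch.from_documents(docs, embeddings)
# in_memory_retriever = in_memory_db.as_retriever()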
@st.cache_resource
def create_langchain_index(input_text):
    print("--indexing---")
    get_text(input_text)
    loader = TextLoader("text\\temp.txt", encoding="utf-8")
    documents = loader.load()
    # Split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # Create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # Load the chunks into a persistent Chroma store, then reopen it from disk
    persist_directory = "chroma_db"
    Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=persist_directory)
    db = Chroma(persist_directory=persist_directory, embedding_function=embeddings)
    return db


# @st.cache_resource
# def get_basic_page_details(input_text, summary_query, tweet_query, ln_query):
#     index = create_langchain_index(input_text)
#     summary_response = index.query(summary_query)
#     tweet_response = index.query(tweet_query)
#     ln_response = index.query(ln_query)
#     return summary_response, tweet_response, ln_response


@st.cache_data
def get_response(input_text, query, _db):
    print(f"--querying---{query}")
    # The leading underscore in _db tells Streamlit's cache not to try to hash the vector store
    retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=_db.as_retriever())
    response = retrieval_chain.run(query)
    # response = index.query(query, llm=llm)
    return response


# The code below is a simple flow that accepts the webpage link and processes the queries
# using the get_response function created above. Thanks to the cache, the same link and
# queries are not re-indexed or re-queried on every rerun.
st.title("Webpage Question and Answering")
input_text = st.text_input("Provide the link to the webpage...")

summary_response = ""
tweet_response = ""
ln_response = ""
# if st.button("Load"):
if input_text:
    db = create_langchain_index(input_text)
    summary_query = "Write a 100-word summary of the document"
    summary_response = get_response(input_text, summary_query, db)
    tweet_query = "Write a twitter tweet"
    tweet_response = get_response(input_text, tweet_query, db)
    ln_query = "Write a linkedin post for the document"
    ln_response = get_response(input_text, ln_query, db)

    with st.expander("Page Summary"):
        st.info(summary_response)
    with st.expander("Tweet"):
        st.info(tweet_response)
    with st.expander("LinkedIn Post"):
        st.info(ln_response)

    st.session_state.input_text = ""

question = st.text_input("Ask a question from the link you shared...")
if st.button("Ask"):
    if question:
        db = create_langchain_index(input_text)
        response = get_response(input_text, question, db)
        st.write(response)
    else:
        st.warning("Please enter a question.")
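# A small sketch for sanity-checking the persisted Chroma store outside the Streamlit
# flow (commented out; it assumes the app has already indexed a page into chroma_db):
# embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
# db = Chroma(persist_directory="chroma_db", embedding_function=embeddings)
# for doc in db.similarity_search("What is this page about?", k=2):
#     print(doc.page_content[:200])  # preview the two most similar chunks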
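# If answers feel thin, one knob worth trying is the number of chunks the retriever
# hands to the "stuff" chain. A hedged variant of the chain in get_response (k=4 is an
# assumption, not a tuned value):
# retrieval_chain = RetrievalQA.from_chain_type(
#     llm,
#     chain_type="stuff",
#     retriever=db.as_retriever(search_kwargs={"k": 4}),
# )
# print(retrieval_chain.run("Write a 100-word summary of the document"))

# Run the app with: streamlit run app.py  (assuming the script is saved as app.py)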