# DocArrayInMemorySearch is a document index provided by DocArray that stores documents in memory.
# It is a great starting point for small datasets, where you may not want to launch a database server.

# import libraries
import streamlit as st
import requests
from bs4 import BeautifulSoup
# from langchain.indexes import VectorstoreIndexCreator  # logic for creating indexes
# from langchain.vectorstores import DocArrayInMemorySearch  # in-memory document index provided by DocArray
from sentence_transformers import SentenceTransformer
from langchain_community.llms import HuggingFaceEndpoint
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain.chains import RetrievalQA

# import vertexai
# from langchain.llms import VertexAI
# from langchain.embeddings import VertexAIEmbeddings
# vertexai.init(project=PROJECT, location=LOCATION)  # PROJECT is the GCP project ID, LOCATION the region

# The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of
# natural language tasks such as sentiment analysis, entity extraction, and content creation. The
# types of content that the PaLM 2 for Text models can create include document summaries, answers
# to questions, and labels that classify content.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
# model = SentenceTransformer("all-MiniLM-L6-v2")
# llm = VertexAI(model_name="text-bison@001", max_output_tokens=256, temperature=0.1, top_p=0.8, top_k=40, verbose=True)
# embeddings = VertexAIEmbeddings()
# embeddings = model.encode(sentences)
# The code below scrapes all the text from the webpage link provided by the user
# and saves it in a text file.
def get_text(url):
    # Send a GET request to the URL
    response = requests.get(url)
    # Create a BeautifulSoup object with the HTML content
    soup = BeautifulSoup(response.content, "html.parser")
    # Find the specific elements containing the text we want to scrape;
    # here, we find all <p> tags and extract their text
    paragraphs = soup.find_all("p")
    # Loop through the paragraphs and write their text to the file
    with open("text/temp.txt", "w", encoding="utf-8") as file:
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")
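
# Small safeguard (an addition, not in the original app): open() does not create
# parent directories, so make sure the "text" folder exists before get_text() runs.
import os
os.makedirs("text", exist_ok=True)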
def create_langchain_index(input_text):
    print("--indexing---")
    get_text(input_text)
    loader = TextLoader("text/temp.txt", encoding="utf-8")
    documents = loader.load()
    # split the documents into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # load the chunks into Chroma, persisting the collection to disk
    persist_directory = "chroma_db"
    db = Chroma.from_documents(documents=docs, embedding=embeddings, persist_directory=persist_directory)
    return db
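
# Illustrative helper (an assumption, not part of the original flow): because the
# collection is persisted to "chroma_db", a later run could reload the index
# without re-scraping or re-embedding the page.
def load_existing_index(persist_directory="chroma_db"):
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    return Chroma(persist_directory=persist_directory, embedding_function=embeddings)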
| # @st.cache_resource | |
| # def get_basic_page_details(input_text,summary_query,tweet_query,ln_query): | |
| # index = create_langchain_index(input_text) | |
| # summary_response = index.query(summary_query) | |
| # tweet_response = index.query(tweet_query) | |
| # ln_response = index.query(ln_query) | |
| # return summary_response,tweet_response,ln_response | |
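
# Cached wrapper (an addition, not in the original app): st.cache_resource keeps
# one index per URL across Streamlit reruns, so the same page is not re-scraped
# and re-embedded every time a widget changes.
@st.cache_resource
def cached_index(url):
    return create_langchain_index(url)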
def get_response(input_text, query, db):
    print(f"--querying---{query}")
    retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=db.as_retriever())
    response = retrieval_chain.run(query)
    # response = index.query(query, llm=llm)
    return response
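
# Optional debugging sketch (not in the original app): preview which chunks the
# retriever would hand to the "stuff" chain for a given query.
def preview_retrieved_chunks(db, query, k=4):
    # similarity_search returns the k chunks most similar to the query
    for i, doc in enumerate(db.similarity_search(query, k=k)):
        print(f"--- chunk {i} ---\n{doc.page_content[:200]}")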
# The code below is a simple flow that accepts the webpage link and processes the
# queries using the get_response function defined above.
st.title('Webpage Question and Answering')

input_text = st.text_input("Provide the link to the webpage...")

summary_response = ""
tweet_response = ""
ln_response = ""

# if st.button("Load"):
if input_text:
    db = create_langchain_index(input_text)

    summary_query = "Write a 100-word summary of the document"
    summary_response = get_response(input_text, summary_query, db)

    tweet_query = "Write a Twitter tweet"
    tweet_response = get_response(input_text, tweet_query, db)

    ln_query = "Write a LinkedIn post for the document"
    ln_response = get_response(input_text, ln_query, db)

with st.expander('Page Summary'):
    st.info(summary_response)
with st.expander('Tweet'):
    st.info(tweet_response)
with st.expander('LinkedIn Post'):
    st.info(ln_response)

# st.session_state.input_text = ''  # leftover reset; no widget uses this key, so it has no effect
question = st.text_input("Ask a question from the link you shared...")
if st.button("Ask"):
    if question and input_text:
        db = create_langchain_index(input_text)
        response = get_response(input_text, question, db)
        st.write(response)
    else:
        st.warning("Please enter a link and a question.")
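
# To try the app locally (standard Streamlit invocation; the file name is an
# assumption, adjust it to match your script):
#   pip install streamlit requests beautifulsoup4 sentence-transformers langchain langchain-community langchain-chroma langchain-text-splitters chromadb
#   streamlit run app.py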
