Spaces:

HarryGGD
/

WebQA

Sleeping

App Files Files Community

WebQA / app.py

HarryGGD

Create app.py

b160e5c verified 5 months ago

raw

history blame

No virus

5.35 kB

	#DocArrayInMemorySearch is a document index provided by Docarray that stores documents in memory.
	#It is a great starting point for small datasets, where you may not want to launch a database server.

	# import libraries
	import streamlit as st
	import requests
	from bs4 import BeautifulSoup
	#from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
	#from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
	from sentence_transformers import SentenceTransformer
	from langchain_community.llms import HuggingFaceEndpoint
	from langchain_chroma import Chroma
	from langchain_community.document_loaders import TextLoader
	from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
	from langchain_text_splitters import CharacterTextSplitter
	from langchain.chains import RetrievalQA

	#import vertexai
	#from langchain.llms import VertexAI
	#from langchain.embeddings import VertexAIEmbeddings

	#vertexai.init(project=PROJECT, location=LOCATION) #GCP PROJECT ID, LOCATION as region.

	#The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of natural language
	#tasks such as sentiment analysis, entity extraction, and content creation. The types of content that the PaLM 2 for
	#Text models can create include document summaries, answers to questions, and labels that classify content.

	# Hosted Mistral-7B-Instruct endpoint shared by every query in this app.
	# The keyword must be lowercase `temperature`: Python keyword arguments are
	# case-sensitive, so `Temperature` was not recognised as the sampling setting.
	llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
	#model = SentenceTransformer("all-MiniLM-L6-v2")

	#llm = VertexAI(model_name="text-bison@001",max_output_tokens=256,temperature=0.1,top_p=0.8,top_k=40,verbose=True,)

	#embeddings = VertexAIEmbeddings()
	#embeddings = model.encode(sentences)

	#The below code scrapes the text of every <p> element from the webpage link provided by the user and saves it in a text file.
	def get_text(url, *, timeout=30):
	    """Download *url* and save the text of its ``<p>`` elements to the scratch file.

	    The scratch file is the same path that ``create_langchain_index`` loads
	    afterwards, so the path literal below must stay in sync with it.

	    Args:
	        url: Address of the web page to scrape.
	        timeout: Seconds to wait for the server before giving up. Without a
	            timeout ``requests.get`` can block indefinitely.

	    Raises:
	        requests.RequestException: If the request fails, times out, or the
	            server answers with an HTTP error status.
	    """
	    from pathlib import Path  # local import keeps this block self-contained

	    response = requests.get(url, timeout=timeout)
	    # Fail loudly on 4xx/5xx instead of indexing the text of an error page.
	    response.raise_for_status()

	    # Only paragraph elements are extracted; other page text is ignored.
	    soup = BeautifulSoup(response.content, "html.parser")
	    paragraphs = soup.find_all("p")

	    target = Path("text\\temp.txt")
	    # Where the backslash is a path separator (Windows) the "text" directory
	    # must exist before the file can be opened; elsewhere the parent is the
	    # current directory and this call is a no-op.
	    target.parent.mkdir(parents=True, exist_ok=True)

	    # One paragraph per line, overwriting whatever a previous page left behind.
	    with open(target, "w", encoding='utf-8') as file:
	        file.writelines(paragraph.get_text() + "\n" for paragraph in paragraphs)

	@st.cache_resource
	def create_langchain_index(input_text):
	    """Scrape the page at *input_text* and return a Chroma vector store of its text.

	    The page is downloaded with ``get_text``, split into chunks of up to 1000
	    characters, embedded with the ``all-MiniLM-L6-v2`` sentence-transformer
	    model and stored under the ``chroma_db`` directory.

	    Each URL gets its own collection. Previously every page was written to
	    the same default collection, so retrieval for one page could return
	    chunks scraped from earlier pages. The chunks were also embedded twice,
	    with the first store thrown away.

	    Args:
	        input_text: URL of the web page to index.

	    Returns:
	        A ``Chroma`` store containing only the chunks of this page.
	    """
	    import hashlib  # local import keeps this block self-contained

	    print("--indexing---")
	    get_text(input_text)
	    documents = TextLoader("text\\temp.txt", encoding='utf-8').load()

	    # split it into chunks
	    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
	    docs = text_splitter.split_documents(documents)

	    # create the open-source embedding function
	    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

	    # A URL digest gives a collection name made only of characters Chroma
	    # accepts, and keeps pages from leaking into each other's results.
	    url_digest = hashlib.sha256(input_text.encode("utf-8")).hexdigest()[:32]
	    collection_name = f"page_{url_digest}"
	    # Deterministic ids so re-indexing the same URL overwrites its chunks
	    # instead of appending duplicates to the persisted collection.
	    # NOTE(review): if a page shrinks, chunks beyond the new length remain.
	    chunk_ids = [f"{url_digest}-{position}" for position in range(len(docs))]

	    return Chroma.from_documents(
	        documents=docs,
	        embedding=embeddings,
	        ids=chunk_ids,
	        collection_name=collection_name,
	        persist_directory="chroma_db",
	    )

	# @st.cache_resource
	# def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):
	# index = create_langchain_index(input_text)
	# summary_response = index.query(summary_query)
	# tweet_response = index.query(tweet_query)
	# ln_response = index.query(ln_query)

	# return summary_response,tweet_response,ln_response


	@st.cache_data
	def get_response(input_text,query,_db):
	    """Answer *query* using the documents retrieved from *_db*.

	    Args:
	        input_text: URL the index was built from. It is not used in the body;
	            it is part of the cache key so answers for different pages are
	            cached separately.
	        query: Question or instruction passed to the retrieval chain.
	        _db: Vector store to retrieve context from. The leading underscore
	            tells Streamlit not to hash this argument.

	    Returns:
	        The text produced by the retrieval QA chain.
	    """
	    print(f"--querying---{query}")
	    # Use the store passed in by the caller. The previous code read the
	    # module-level `db` and silently ignored this parameter.
	    retrieval_chain = RetrievalQA.from_chain_type(llm, chain_type="stuff", retriever=_db.as_retriever())
	    return retrieval_chain.run(query)

	#The below code is a simple flow to accept the webpage link and process the queries
	#using the get_response function created above. Thanks to caching, repeating the same query for the same link does not recompute the answer.

	# ---- Page layout: URL input, three generated texts, then free-form Q&A ----
	st.title('Webpage Question and Answering ')

	input_text=st.text_input("Provide the link to the webpage...")

	# Defaults keep the expanders below renderable before any link is entered.
	summary_response = ""
	tweet_response = ""
	ln_response = ""
	# if st.button("Load"):
	if input_text:
	    db = create_langchain_index(input_text)
	    summary_query ="Write a 100 words summary of the document"
	    summary_response = get_response(input_text,summary_query,db)

	    tweet_query ="Write a twitter tweet"
	    tweet_response = get_response(input_text,tweet_query,db)

	    ln_query ="Write a linkedin post for the document"
	    ln_response = get_response(input_text,ln_query,db)

	with st.expander('Page Summary'):
	    st.info(summary_response)

	with st.expander('Tweet'):
	    st.info(tweet_response)

	with st.expander('LinkedIn Post'):
	    st.info(ln_response)

	# NOTE(review): neither text input is created with key="input_text", so this
	# assignment does not appear to clear a widget; kept as-is, confirm intent.
	st.session_state.input_text = ''
	question=st.text_input("Ask a question from the link you shared...")
	if st.button("Ask"):
	    if not input_text:
	        # Without a link there is nothing to index; indexing an empty URL
	        # would fail inside the HTTP request.
	        st.warning("Please provide a webpage link first.")
	    elif question:
	        db = create_langchain_index(input_text)
	        response = get_response(input_text,question,db)
	        st.write(response)
	    else:
	        st.warning("Please enter a question.")