HarryGGD commited on
Commit
b160e5c
1 Parent(s): 33bb8a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #DocArrayInMemorySearch is a document index provided by Docarray that stores documents in memory.
2
+ #It is a great starting point for small datasets, where you may not want to launch a database server.
3
+
4
+ # import libraries
5
+ import streamlit as st
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ #from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
9
+ #from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
10
+ from sentence_transformers import SentenceTransformer
11
+ from langchain_community.llms import HuggingFaceEndpoint
12
+ from langchain_chroma import Chroma
13
+ from langchain_community.document_loaders import TextLoader
14
+ from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
15
+ from langchain_text_splitters import CharacterTextSplitter
16
+ from langchain.chains import RetrievalQA
17
+
18
+ #import vertexai
19
+ #from langchain.llms import VertexAI
20
+ #from langchain.embeddings import VertexAIEmbeddings
21
+
22
+ #vertexai.init(project=PROJECT, location=LOCATION) #GCP PROJECT ID, LOCATION as region.
23
+
24
+ #The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of natural language
25
+ #tasks such as sentiment analysis, entity extraction, and content creation. The types of content that the PaLM 2 for
26
+ #Text models can create include document summaries, answers to questions, and labels that classify content.
27
+
28
# Remote Mistral-7B-Instruct endpoint used by the RetrievalQA chain below.
# BUG FIX: the keyword was `Temperature` (capital T). HuggingFaceEndpoint's
# parameter is lowercase `temperature`, so the capitalised kwarg was not applied
# and the endpoint ran at its default sampling temperature.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
29
+ #model = SentenceTransformer("all-MiniLM-L6-v2")
30
+
31
+ #llm = VertexAI(model_name="text-bison@001",max_output_tokens=256,temperature=0.1,top_p=0.8,top_k=40,verbose=True,)
32
+
33
+ #embeddings = VertexAIEmbeddings()
34
+ #embeddings = model.encode(sentences)
35
+
36
# The below code scrapes all the text data from the webpage link provided by the
# user and saves it in a text file.
def get_text(url):
    """Fetch *url*, extract the text of every <p> tag, and write one paragraph
    per line to the temp file under the 'text' directory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    import os  # local import: 'os' is not in the file-level imports

    # Send a GET request to the URL. A timeout keeps the Streamlit app from
    # hanging forever, and raise_for_status() fails fast instead of silently
    # indexing a 404/500 error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Create a BeautifulSoup object with the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Here, we'll find all <p> tags and extract their text — on most pages the
    # <p> elements hold the main article body.
    paragraphs = soup.find_all("p")

    # Make sure the output directory exists; open() does not create directories.
    os.makedirs("text", exist_ok=True)

    # NOTE(review): the backslash path is Windows-specific; kept byte-identical
    # because create_langchain_index reads the same literal path.
    with open("text\\temp.txt", "w", encoding='utf-8') as file:
        # Loop through the paragraphs and write their text to the file
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")
53
+
54
@st.cache_resource
def create_langchain_index(input_text):
    """Scrape the page at *input_text*, chunk it, embed it, and return a Chroma
    vector store persisted under 'chroma_db'.

    Decorated with st.cache_resource so the same URL is only scraped and
    embedded once per Streamlit session.
    """
    print("--indexing---")
    # Scrape the page into the temp text file that TextLoader reads back.
    get_text(input_text)
    loader = TextLoader("text\\temp.txt", encoding='utf-8')
    documents = loader.load()
    # split it into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # FIX: the original embedded the corpus twice — one throwaway in-memory
    # Chroma.from_documents whose result was discarded, then a persisted build,
    # then a reload from disk. A single persisted build produces an equivalent
    # store at half the embedding cost.
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory="chroma_db",
    )
    return db
71
+
72
+ # @st.cache_resource
73
+ # def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):
74
+ # index = create_langchain_index(input_text)
75
+ # summary_response = index.query(summary_query)
76
+ # tweet_response = index.query(tweet_query)
77
+ # ln_response = index.query(ln_query)
78
+
79
+ # return summary_response,tweet_response,ln_response
80
+
81
+
82
@st.cache_data
def get_response(input_text, query, _db):
    """Answer *query* about the page at *input_text* using the vector store *_db*.

    The leading underscore on `_db` tells st.cache_data to skip hashing that
    argument (Chroma stores are unhashable); (input_text, query) form the
    cache key.
    """
    print(f"--querying---{query}")
    # BUG FIX: the original built the retriever from the module-level global
    # `db`, silently ignoring the `_db` argument — so the cached answer could
    # come from whatever index the global happened to hold. Use the argument.
    retrieval_chain = RetrievalQA.from_chain_type(
        llm, chain_type="stuff", retriever=_db.as_retriever()
    )
    response = retrieval_chain.run(query)
    return response
89
+
90
# The below code is a simple flow to accept the webpage link and process the
# queries using the get_response function created above. Using the cache, the same.

st.title('Webpage Question and Answering ')


input_text = st.text_input("Provide the link to the webpage...")

# Defaults so the expanders below still render before any link is provided.
summary_response = ""
tweet_response = ""
ln_response = ""
if input_text:
    db = create_langchain_index(input_text)
    summary_response = get_response(input_text, "Write a 100 words summary of the document", db)
    tweet_response = get_response(input_text, "Write a twitter tweet", db)
    ln_response = get_response(input_text, "Write a linkedin post for the document", db)

# Render the three generated artefacts in collapsible sections.
for section_label, section_text in (
    ('Page Summary', summary_response),
    ('Tweet', tweet_response),
    ('LinkedIn Post', ln_response),
):
    with st.expander(section_label):
        st.info(section_text)


st.session_state.input_text = ''
question = st.text_input("Ask a question from the link you shared...")
if st.button("Ask"):
    if not question:
        st.warning("Please enter a question.")
    else:
        # Cached, so re-indexing the same link is a no-op.
        db = create_langchain_index(input_text)
        st.write(get_response(input_text, question, db))