HarryGGD commited on
Commit
b160e5c
1 Parent(s): 33bb8a0

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #DocArrayInMemorySearch is a document index provided by Docarray that stores documents in memory.
2
+ #It is a great starting point for small datasets, where you may not want to launch a database server.
3
+
4
+ # import libraries
5
+ import streamlit as st
6
+ import requests
7
+ from bs4 import BeautifulSoup
8
+ #from langchain.indexes import VectorstoreIndexCreator #Logic for creating indexes.
9
+ #from langchain.vectorstores import DocArrayInMemorySearch #document index provided by Docarray that stores documents in memory.
10
+ from sentence_transformers import SentenceTransformer
11
+ from langchain_community.llms import HuggingFaceEndpoint
12
+ from langchain_chroma import Chroma
13
+ from langchain_community.document_loaders import TextLoader
14
+ from langchain_community.embeddings.sentence_transformer import (SentenceTransformerEmbeddings,)
15
+ from langchain_text_splitters import CharacterTextSplitter
16
+ from langchain.chains import RetrievalQA
17
+
18
+ #import vertexai
19
+ #from langchain.llms import VertexAI
20
+ #from langchain.embeddings import VertexAIEmbeddings
21
+
22
+ #vertexai.init(project=PROJECT, location=LOCATION) #GCP PROJECT ID, LOCATION as region.
23
+
24
+ #The PaLM 2 for Text (text-bison, text-unicorn) foundation models are optimized for a variety of natural language
25
+ #tasks such as sentiment analysis, entity extraction, and content creation. The types of content that the PaLM 2 for
26
+ #Text models can create include document summaries, answers to questions, and labels that classify content.
27
+
28
# Remote Mistral-7B-Instruct endpoint used by the RetrievalQA chain below.
# BUG FIX: the keyword was `Temperature` (capital T). HuggingFaceEndpoint's
# parameter is lowercase `temperature`, so the capitalised kwarg was not applied
# and the endpoint ran at its default sampling temperature.
llm = HuggingFaceEndpoint(repo_id="mistralai/Mistral-7B-Instruct-v0.2", temperature=0.3)
29
+ #model = SentenceTransformer("all-MiniLM-L6-v2")
30
+
31
+ #llm = VertexAI(model_name="text-bison@001",max_output_tokens=256,temperature=0.1,top_p=0.8,top_k=40,verbose=True,)
32
+
33
+ #embeddings = VertexAIEmbeddings()
34
+ #embeddings = model.encode(sentences)
35
+
36
# The below code scrapes all the text data from the webpage link provided by the
# user and saves it in a text file.
def get_text(url):
    """Fetch *url*, extract the text of every <p> tag, and write one paragraph
    per line to the temp file under the 'text' directory.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        requests.Timeout: if the request exceeds the timeout.
    """
    import os  # local import: 'os' is not in the file-level imports

    # Send a GET request to the URL. A timeout keeps the Streamlit app from
    # hanging forever, and raise_for_status() fails fast instead of silently
    # indexing a 404/500 error page.
    response = requests.get(url, timeout=30)
    response.raise_for_status()

    # Create a BeautifulSoup object with the HTML content
    soup = BeautifulSoup(response.content, "html.parser")

    # Here, we'll find all <p> tags and extract their text — on most pages the
    # <p> elements hold the main article body.
    paragraphs = soup.find_all("p")

    # Make sure the output directory exists; open() does not create directories.
    os.makedirs("text", exist_ok=True)

    # NOTE(review): the backslash path is Windows-specific; kept byte-identical
    # because create_langchain_index reads the same literal path.
    with open("text\\temp.txt", "w", encoding='utf-8') as file:
        # Loop through the paragraphs and write their text to the file
        for paragraph in paragraphs:
            file.write(paragraph.get_text() + "\n")
53
+
54
@st.cache_resource
def create_langchain_index(input_text):
    """Scrape the page at *input_text*, chunk it, embed it, and return a Chroma
    vector store persisted under 'chroma_db'.

    Decorated with st.cache_resource so the same URL is only scraped and
    embedded once per Streamlit session.
    """
    print("--indexing---")
    # Scrape the page into the temp text file that TextLoader reads back.
    get_text(input_text)
    loader = TextLoader("text\\temp.txt", encoding='utf-8')
    documents = loader.load()
    # split it into chunks
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
    docs = text_splitter.split_documents(documents)
    # create the open-source embedding function
    embeddings = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")
    # FIX: the original embedded the corpus twice — one throwaway in-memory
    # Chroma.from_documents whose result was discarded, then a persisted build,
    # then a reload from disk. A single persisted build produces an equivalent
    # store at half the embedding cost.
    db = Chroma.from_documents(
        documents=docs,
        embedding=embeddings,
        persist_directory="chroma_db",
    )
    return db
71
+
72
+ # @st.cache_resource
73
+ # def get_basic_page_details(input_text,summary_query,tweet_query,ln_query):
74
+ # index = create_langchain_index(input_text)
75
+ # summary_response = index.query(summary_query)
76
+ # tweet_response = index.query(tweet_query)
77
+ # ln_response = index.query(ln_query)
78
+
79
+ # return summary_response,tweet_response,ln_response
80
+
81
+
82
@st.cache_data
def get_response(input_text, query, _db):
    """Answer *query* about the page at *input_text* using the vector store *_db*.

    The leading underscore on `_db` tells st.cache_data to skip hashing that
    argument (Chroma stores are unhashable); (input_text, query) form the
    cache key.
    """
    print(f"--querying---{query}")
    # BUG FIX: the original built the retriever from the module-level global
    # `db`, silently ignoring the `_db` argument — so the cached answer could
    # come from whatever index the global happened to hold. Use the argument.
    retrieval_chain = RetrievalQA.from_chain_type(
        llm, chain_type="stuff", retriever=_db.as_retriever()
    )
    response = retrieval_chain.run(query)
    return response
89
+
90
# The below code is a simple flow to accept the webpage link and process the
# queries using the get_response function created above. Using the cache, the same.

st.title('Webpage Question and Answering ')


input_text = st.text_input("Provide the link to the webpage...")

# Defaults so the expanders below still render before any link is provided.
summary_response = ""
tweet_response = ""
ln_response = ""
if input_text:
    db = create_langchain_index(input_text)
    summary_response = get_response(input_text, "Write a 100 words summary of the document", db)
    tweet_response = get_response(input_text, "Write a twitter tweet", db)
    ln_response = get_response(input_text, "Write a linkedin post for the document", db)

# Render the three generated artefacts in collapsible sections.
for section_label, section_text in (
    ('Page Summary', summary_response),
    ('Tweet', tweet_response),
    ('LinkedIn Post', ln_response),
):
    with st.expander(section_label):
        st.info(section_text)


st.session_state.input_text = ''
question = st.text_input("Ask a question from the link you shared...")
if st.button("Ask"):
    if not question:
        st.warning("Please enter a question.")
    else:
        # Cached, so re-indexing the same link is a no-op.
        db = create_langchain_index(input_text)
        st.write(get_response(input_text, question, db))