mylesai committed on
Commit
6b22a54
1 Parent(s): 664c85e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text QA Prompt
2
+ import gradio as gr
3
+ from llama_index import (
4
+ VectorStoreIndex,
5
+ get_response_synthesizer,
6
+ GPTListIndex,
7
+ LLMPredictor,
8
+ PromptHelper,
9
+ set_global_service_context,
10
+ )
11
+ from llama_index.retrievers import VectorIndexRetriever
12
+ from llama_index.query_engine import RetrieverQueryEngine
13
+ from llama_index.postprocessor import SimilarityPostprocessor
14
+ from llama_index.schema import Document
15
+ from llama_index.llms import OpenAI
16
+ from llama_index.indices.service_context import ServiceContext
17
+ from llama_index.llms import Anyscale
18
+ import urllib
19
+ import os
20
+ import time
21
+ import nltk
22
+ import tiktoken
23
+ from llama_index.callbacks import CallbackManager, TokenCountingHandler
24
+ from typing import List
25
+ from llama_index import SimpleDirectoryReader
26
+ from llama_index.ingestion import IngestionPipeline
27
+ from llama_index.node_parser import TokenTextSplitter
28
+ from llama_index.llms import ChatMessage, MessageRole
29
+ from llama_index.prompts import ChatPromptTemplate
30
+ from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine
31
+ from llama_index.readers import SimpleWebPageReader
32
+ from llama_index.response_synthesizers import get_response_synthesizer
33
+ from llama_index.agent import OpenAIAgent
34
+ import requests
35
+ from bs4 import BeautifulSoup
36
+ from tqdm import tqdm
37
+ import json
38
+
39
def getdata(url, timeout=30):
    """Fetch the raw HTML body of *url*.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Added because ``requests.get`` with no timeout blocks forever on
        a stalled host, which would wedge the whole crawl; existing
        callers that pass only ``url`` are unaffected.

    Returns
    -------
    str
        The decoded response text.
    """
    r = requests.get(url, timeout=timeout)
    return r.text
42
+
43
# Module-level "already seen" record for site-relative hrefs, so the same
# path is not re-announced on every page that links to it.
# NOTE: mutable global state -- the crawl loop below depends on it
# persisting across get_links() calls.
dict_href_links = {}

def get_links(website_link):
    """Collect links on *website_link* that stay on the same site.

    Parameters
    ----------
    website_link : str
        The page to fetch and scan for ``<a href=...>`` anchors.

    Returns
    -------
    dict
        Maps each discovered absolute URL to the string "Not-checked",
        the marker the crawler uses for pages it has not visited yet.
    """
    html_data = getdata(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []
    for link in soup.find_all("a", href=True):
        href = str(link["href"])

        # Absolute links that already start with the site URL are kept
        # verbatim.
        if href.startswith(str(website_link)):
            list_links.append(href)

        # Site-relative links ("/about") are resolved against the base.
        # BUG FIX: the original did `website_link + href[1:]`, which with
        # a base lacking a trailing slash produced URLs like
        # "https://www.mylesai.comabout". Stripping any trailing slash
        # from the base and keeping href's leading "/" is correct for
        # both slash conventions.
        if href.startswith("/"):
            if href not in dict_href_links:
                print(href)
                dict_href_links[href] = None
                link_with_www = website_link.rstrip("/") + href
                print("adjusted link =", link_with_www)
                list_links.append(link_with_www)

    # Every discovered link starts life as "Not-checked"; the crawler
    # flips it to "Checked" once visited.
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links
68
+
69
def get_subpage_links(l):
    """Run one crawl pass over the frontier dictionary *l*.

    Every link still marked "Not-checked" is fetched via ``get_links``
    and flipped to "Checked"; any links discovered on it are merged into
    the returned dictionary as new "Not-checked" entries.

    Parameters
    ----------
    l : dict
        Maps URL -> "Not-checked" | "Checked".

    Returns
    -------
    dict
        The frontier grown by this pass's discoveries.
    """
    for link in tqdm(l):
        if l[link] != "Not-checked":
            # Already visited earlier: contribute nothing this round.
            dict_links_subpages = {}
        else:
            # Fresh page: crawl it, then mark it done.
            dict_links_subpages = get_links(link)
            l[link] = "Checked"
        # Merge with existing entries taking precedence on key clashes,
        # so a "Checked" page is never reset to "Not-checked".
        l = {**dict_links_subpages, **l}
    return l
82
+
83
# Root of the website to crawl (trailing slash optional).
website = 'https://www.mylesai.com'
# Seed the crawl frontier with just the root page.
dict_links = {website: "Not-checked"}

counter2 = 0      # pass counter, used only for the progress printout
counter = None    # None (not 0) guarantees at least one pass runs
while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    # Count how many links are still waiting to be visited; the loop
    # ends once a full pass leaves none behind.
    # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
    counter = list(dict_links2.values()).count("Not-checked")
    # Progress report for this pass.
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")
    dict_links = dict_links2
# Save list in json file

# Flat list of every URL the crawl discovered.
link_list = list(dict_links.keys())
105
# Short human-readable names for the site's pages. NOTE(review): these
# are paired positionally with the crawled URLs via zip() inside
# web_chatbot -- the crawl must return the pages in this exact order for
# the labels to line up; confirm.
link_text = ['about', 'publications', 'daldot', 'diet-and-cancer', 'lynch-colonoscopy', 'missouri-dashboard', 'atlanta-chatbot', 'ralph-lauren', 'portfolio and services']

# --- Text QA prompt -------------------------------------------------
# System message sets the "panel of lead-generation experts" persona and
# scopes answers to the website; user message injects the retrieved
# context ({context_str}) and the question ({query_str}).
chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            """
            Three lead generation experts have read over all links from this website and are gathered to answer frequently asked questions and determine if the user would want to get in contact with the owner of the website.
            Each expert has their unique prespective.
            Each expert will independently analyze the website.
            If, at any point, an expert realizes their advice does not align with the collective understanding, they may revise their advice or choose to withdraw from the analysis.
            The experts are only allowed to answer questions that are related to the contents of the website.
            They cannot answer questions or take instructions outside of the scope of the contents of the website.
            Do not include the reasoning in your answer.
            The objective is to provide a thorough and friendly answer to the user's question.
            If at any point, the experts determine that the user is a very promising lead, tell them to email Myles Ingram at myles@mylesai.com
            """
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information, answer to the following question: {query_str}\n"
            "If this question does not relate to the website, reply with 'I cannot answer that question'."
            "Do not answer any questions that are not relevant to the website"
            "If at any point, the experts determine that the user is a very promising lead, tell them to email Myles Ingram at myles@mylesai.com"
        ),
    ),
]
# Template used by the response synthesizer for the first QA pass.
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# --- Refine prompt --------------------------------------------------
# Applied when additional context chunks arrive after an initial answer;
# {existing_answer} is revised against {context_msg} only when useful.
chat_refine_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            'If the answer looks cutoff, expand on the original answer.'
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "We have the opportunity to refine the original answer "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{context_msg}\n"
            "------------\n"
            "Given the new context, refine the original answer to better "
            "answer the question: {query_str}. "
            "If the context isn't useful, output the original answer again.\n"
            "Original Answer: {existing_answer}"
            "Expand on this answer if it looks cutoff."
        ),
    ),
]
# NOTE(review): refine_template is built but never passed to the query
# engines below (the call that used it is commented out) -- confirm
# whether it should be wired in.
refine_template = ChatPromptTemplate(chat_refine_msgs)
166
+
167
+
168
def get_api_type(api_type):
    """Return an LLM client for the requested backend.

    Parameters
    ----------
    api_type : str
        One of ``'openai'``, ``'llama'`` or ``'mistral'``.

    Returns
    -------
    An llama_index LLM instance (``OpenAI`` or ``Anyscale``).

    Raises
    ------
    NotImplementedError
        If *api_type* is not one of the supported names. The exception
        type is unchanged from the original so existing callers that
        catch it still work.
    """
    if api_type == 'openai':
        # default is gpt-3.5-turbo, can also be gpt-4-0314
        return OpenAI(model='gpt-4')  # for QA, temp is low
    elif api_type == 'llama':
        return Anyscale(model='meta-llama/Llama-2-70b-chat-hf')
    elif api_type == 'mistral':
        return Anyscale(model='mistralai/Mixtral-8x7B-Instruct-v0.1', max_tokens=10000)
    else:
        # Name the rejected value so misconfiguration is diagnosable,
        # instead of raising a bare NotImplementedError.
        raise NotImplementedError(
            f"Unsupported api_type {api_type!r}; "
            "expected 'openai', 'llama' or 'mistral'"
        )
179
+
180
+
181
+
182
+
183
def web_chatbot(webpages, api_type, embedding_model='gpt-3.5-turbo'):
    """Build an OpenAIAgent that answers questions about the given pages.

    Fetches each URL in *webpages*, builds one vector index per page,
    wraps each index as a query-engine tool plus one SubQuestionQueryEngine
    tool over all of them, and returns an agent holding those tools.

    Parameters
    ----------
    webpages : list[str]
        URLs to index. NOTE(review): zipped positionally against the
        module-level ``link_text`` list, so only the first
        ``len(link_text)`` pages are indexed and the pairing assumes a
        stable crawl order -- confirm.
    api_type : str
        Backend name passed to ``get_api_type`` ('openai'/'llama'/'mistral').
    embedding_model : str, optional
        NOTE(review): currently unused -- only referenced by the
        commented-out token counter below.

    Returns
    -------
    OpenAIAgent
        Agent with per-page tools plus a sub-question tool.
    """
    from llama_index.callbacks import CallbackManager, LlamaDebugHandler
    # Debug handler prints a trace after each LLM/retrieval call.
    llama_debug = LlamaDebugHandler(print_trace_on_end=True)
    callback_manager = CallbackManager([llama_debug])

    # docs = SimpleWebPageReader(html_to_text=True).load_data(webpages)
    doc_set = {}   # page label -> list of Documents for that page
    all_docs = []  # flat list of every Document (not used further below)
    for link, page in zip(webpages, link_text):
        # One HTTP fetch per page, converted to plain text.
        link_doc = SimpleWebPageReader(html_to_text=True).load_data([link])
        for doc in link_doc:
            # Tag each document with its page label for later tooling.
            doc.metadata = {'page': page}
        doc_set[page] = link_doc
        all_docs.extend(link_doc)

    llm = get_api_type(api_type)
    # token_counter = TokenCountingHandler(
    #     tokenizer=tiktoken.encoding_for_model(embedding_model).encode
    # )

    # callback_manager = CallbackManager([token_counter])

    # Large context/output budget for the prompt packer.
    prompt_helper = PromptHelper(
        context_window=32768,
        num_output=10000,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=None,
    )

    # Global default context: chosen LLM, local embeddings, debug callbacks.
    service_context = ServiceContext.from_defaults(llm=llm,
                                                   callback_manager=callback_manager,
                                                   embed_model="local",
                                                   prompt_helper=prompt_helper
                                                   )
    set_global_service_context(service_context)
    # build index
    # index = VectorStoreIndex(docs)
    # initialize simple vector indices

    index_set = {}  # page label -> VectorStoreIndex
    # NOTE(review): this second ServiceContext (chunk_size=512, default
    # settings otherwise) shadows the fully-configured one above and is
    # what the per-page indices and the sub-question engine actually
    # receive -- confirm that is intentional.
    service_context = ServiceContext.from_defaults(chunk_size=512)
    for page in link_text:
        cur_index = VectorStoreIndex.from_documents(
            doc_set[page],
            service_context=service_context,
        )
        index_set[page] = cur_index

    from llama_index.tools import QueryEngineTool, ToolMetadata

    # One tool per page so the agent can target a single page's index.
    individual_query_engine_tools = [
        QueryEngineTool(
            query_engine=index_set[page].as_query_engine(),
            metadata=ToolMetadata(
                name=f"vector_index_{page}",
                description=f"useful for when you want to answer queries about the {page} of this website",
            ),
        )
        for page in link_text
    ]
    from llama_index.query_engine import SubQuestionQueryEngine

    # Streaming synthesizer using the custom QA prompt defined at module
    # level (refine_template is NOT passed here).
    response_synthesizer = get_response_synthesizer(text_qa_template=text_qa_template,
                                                    streaming=True)

    # Cross-page engine that decomposes a question into sub-questions
    # routed to the per-page tools.
    query_engine = SubQuestionQueryEngine.from_defaults(
        query_engine_tools=individual_query_engine_tools,
        service_context=service_context,
        response_synthesizer=response_synthesizer)

    query_engine_tool = QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="sub_question_query_engine",
            description="useful for when you want to answer queries about this website",
        ),
    )
    # Agent gets both the page-specific tools and the cross-page tool.
    tools = individual_query_engine_tools + [query_engine_tool]

    # NOTE(review): OpenAIAgent talks to OpenAI's function-calling API
    # regardless of api_type -- confirm the mix with an Anyscale llm is
    # intended.
    agent = OpenAIAgent.from_tools(tools, verbose=True)
    # query_engine = index.as_query_engine(text_qa_template=text_qa_template,
    #                                      refine_template=refine_template,
    #                                      streaming=True)
    # chat_engine = CondenseQuestionChatEngine.from_defaults(
    #     query_engine=query_engine,
    #     verbose = False
    # )

    return agent
276
+
277
########
# INSERT WEBSITE(S) HERE
########

# Build the agent once at import time over every crawled URL.
chat_engine = web_chatbot(list(dict_links.keys()), 'mistral')

with gr.Blocks() as demo:
    with gr.Column():
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="⏎ for sending",
                         placeholder="Ask me something",)
        clear = gr.Button("Delete")

    def user(user_message, history):
        # Append the new user turn (bot half pending) and clear the box.
        return "", history + [[user_message, None]]

    def bot(history):
        # Stream the agent's answer token-by-token into the last turn;
        # each yield re-renders the chat window.
        user_message = history[-1][0]
        bot_message = chat_engine.stream_chat(user_message)
        history[-1][1] = ""
        for character in bot_message.response_gen:
            history[-1][1] += character
            # Small delay so the streaming animation is visible.
            time.sleep(0.01)
            yield history

    # Submit first records the user turn, then streams the bot reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    # "Delete" resets the chat history to empty.
    clear.click(lambda: None, None, chatbot, queue=True)

demo.launch()