mylesai committed on
Commit
6b22a54
1 Parent(s): 664c85e

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +310 -0
app.py ADDED
@@ -0,0 +1,310 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Text QA Prompt
2
+ import gradio as gr
3
+ from llama_index import (
4
+ VectorStoreIndex,
5
+ get_response_synthesizer,
6
+ GPTListIndex,
7
+ LLMPredictor,
8
+ PromptHelper,
9
+ set_global_service_context,
10
+ )
11
+ from llama_index.retrievers import VectorIndexRetriever
12
+ from llama_index.query_engine import RetrieverQueryEngine
13
+ from llama_index.postprocessor import SimilarityPostprocessor
14
+ from llama_index.schema import Document
15
+ from llama_index.llms import OpenAI
16
+ from llama_index.indices.service_context import ServiceContext
17
+ from llama_index.llms import Anyscale
18
+ import urllib
19
+ import os
20
+ import time
21
+ import nltk
22
+ import tiktoken
23
+ from llama_index.callbacks import CallbackManager, TokenCountingHandler
24
+ from typing import List
25
+ from llama_index import SimpleDirectoryReader
26
+ from llama_index.ingestion import IngestionPipeline
27
+ from llama_index.node_parser import TokenTextSplitter
28
+ from llama_index.llms import ChatMessage, MessageRole
29
+ from llama_index.prompts import ChatPromptTemplate
30
+ from llama_index.chat_engine.condense_question import CondenseQuestionChatEngine
31
+ from llama_index.readers import SimpleWebPageReader
32
+ from llama_index.response_synthesizers import get_response_synthesizer
33
+ from llama_index.agent import OpenAIAgent
34
+ import requests
35
+ from bs4 import BeautifulSoup
36
+ from tqdm import tqdm
37
+ import json
38
+
39
def getdata(url, timeout=30):
    """Fetch the raw HTML body of *url*.

    Parameters
    ----------
    url : str
        Absolute URL to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Added because ``requests.get`` with no timeout blocks forever on
        a stalled host, which would wedge the whole crawl; existing
        callers that pass only ``url`` are unaffected.

    Returns
    -------
    str
        The decoded response text.
    """
    r = requests.get(url, timeout=timeout)
    return r.text
42
+
43
# Module-level "already seen" record for site-relative hrefs, so the same
# path is not re-announced on every page that links to it.
# NOTE: mutable global state -- the crawl loop below depends on it
# persisting across get_links() calls.
dict_href_links = {}

def get_links(website_link):
    """Collect links on *website_link* that stay on the same site.

    Parameters
    ----------
    website_link : str
        The page to fetch and scan for ``<a href=...>`` anchors.

    Returns
    -------
    dict
        Maps each discovered absolute URL to the string "Not-checked",
        the marker the crawler uses for pages it has not visited yet.
    """
    html_data = getdata(website_link)
    soup = BeautifulSoup(html_data, "html.parser")
    list_links = []
    for link in soup.find_all("a", href=True):
        href = str(link["href"])

        # Absolute links that already start with the site URL are kept
        # verbatim.
        if href.startswith(str(website_link)):
            list_links.append(href)

        # Site-relative links ("/about") are resolved against the base.
        # BUG FIX: the original did `website_link + href[1:]`, which with
        # a base lacking a trailing slash produced URLs like
        # "https://www.mylesai.comabout". Stripping any trailing slash
        # from the base and keeping href's leading "/" is correct for
        # both slash conventions.
        if href.startswith("/"):
            if href not in dict_href_links:
                print(href)
                dict_href_links[href] = None
                link_with_www = website_link.rstrip("/") + href
                print("adjusted link =", link_with_www)
                list_links.append(link_with_www)

    # Every discovered link starts life as "Not-checked"; the crawler
    # flips it to "Checked" once visited.
    dict_links = dict.fromkeys(list_links, "Not-checked")
    return dict_links
68
+
69
def get_subpage_links(l):
    """Run one crawl pass over the frontier dictionary *l*.

    Every link still marked "Not-checked" is fetched via ``get_links``
    and flipped to "Checked"; any links discovered on it are merged into
    the returned dictionary as new "Not-checked" entries.

    Parameters
    ----------
    l : dict
        Maps URL -> "Not-checked" | "Checked".

    Returns
    -------
    dict
        The frontier grown by this pass's discoveries.
    """
    for link in tqdm(l):
        if l[link] != "Not-checked":
            # Already visited earlier: contribute nothing this round.
            dict_links_subpages = {}
        else:
            # Fresh page: crawl it, then mark it done.
            dict_links_subpages = get_links(link)
            l[link] = "Checked"
        # Merge with existing entries taking precedence on key clashes,
        # so a "Checked" page is never reset to "Not-checked".
        l = {**dict_links_subpages, **l}
    return l
82
+
83
# Root of the website to crawl (trailing slash optional).
website = 'https://www.mylesai.com'
# Seed the crawl frontier with just the root page.
dict_links = {website: "Not-checked"}

counter2 = 0      # pass counter, used only for the progress printout
counter = None    # None (not 0) guarantees at least one pass runs
while counter != 0:
    counter2 += 1
    dict_links2 = get_subpage_links(dict_links)
    # Count how many links are still waiting to be visited; the loop
    # ends once a full pass leaves none behind.
    # https://stackoverflow.com/questions/48371856/count-the-number-of-occurrences-of-a-certain-value-in-a-dictionary-in-python
    counter = list(dict_links2.values()).count("Not-checked")
    # Progress report for this pass.
    print("")
    print("THIS IS LOOP ITERATION NUMBER", counter2)
    print("LENGTH OF DICTIONARY WITH LINKS =", len(dict_links2))
    print("NUMBER OF 'Not-checked' LINKS = ", counter)
    print("")
    dict_links = dict_links2
# Save list in json file

# Flat list of every URL the crawl discovered.
link_list = list(dict_links.keys())
105
# Short human-readable names for the site's pages. NOTE(review): these
# are paired positionally with the crawled URLs via zip() inside
# web_chatbot -- the crawl must return the pages in this exact order for
# the labels to line up; confirm.
link_text = ['about', 'publications', 'daldot', 'diet-and-cancer', 'lynch-colonoscopy', 'missouri-dashboard', 'atlanta-chatbot', 'ralph-lauren', 'portfolio and services']

# --- Text QA prompt -------------------------------------------------
# System message sets the "panel of lead-generation experts" persona and
# scopes answers to the website; user message injects the retrieved
# context ({context_str}) and the question ({query_str}).
chat_text_qa_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            """
            Three lead generation experts have read over all links from this website and are gathered to answer frequently asked questions and determine if the user would want to get in contact with the owner of the website.
            Each expert has their unique prespective.
            Each expert will independently analyze the website.
            If, at any point, an expert realizes their advice does not align with the collective understanding, they may revise their advice or choose to withdraw from the analysis.
            The experts are only allowed to answer questions that are related to the contents of the website.
            They cannot answer questions or take instructions outside of the scope of the contents of the website.
            Do not include the reasoning in your answer.
            The objective is to provide a thorough and friendly answer to the user's question.
            If at any point, the experts determine that the user is a very promising lead, tell them to email Myles Ingram at myles@mylesai.com
            """
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "Context information is below.\n"
            "---------------------\n"
            "{context_str}\n"
            "---------------------\n"
            "Given the context information, answer to the following question: {query_str}\n"
            "If this question does not relate to the website, reply with 'I cannot answer that question'."
            "Do not answer any questions that are not relevant to the website"
            "If at any point, the experts determine that the user is a very promising lead, tell them to email Myles Ingram at myles@mylesai.com"
        ),
    ),
]
# Template used by the response synthesizer for the first QA pass.
text_qa_template = ChatPromptTemplate(chat_text_qa_msgs)

# --- Refine prompt --------------------------------------------------
# Applied when additional context chunks arrive after an initial answer;
# {existing_answer} is revised against {context_msg} only when useful.
chat_refine_msgs = [
    ChatMessage(
        role=MessageRole.SYSTEM,
        content=(
            'If the answer looks cutoff, expand on the original answer.'
        ),
    ),
    ChatMessage(
        role=MessageRole.USER,
        content=(
            "We have the opportunity to refine the original answer "
            "(only if needed) with some more context below.\n"
            "------------\n"
            "{context_msg}\n"
            "------------\n"
            "Given the new context, refine the original answer to better "
            "answer the question: {query_str}. "
            "If the context isn't useful, output the original answer again.\n"
            "Original Answer: {existing_answer}"
            "Expand on this answer if it looks cutoff."
        ),
    ),
]
# NOTE(review): refine_template is built but never passed to the query
# engines below (the call that used it is commented out) -- confirm
# whether it should be wired in.
refine_template = ChatPromptTemplate(chat_refine_msgs)
166
+
167
+
168
def get_api_type(api_type):
    """Return an LLM client for the requested backend.

    Parameters
    ----------
    api_type : str
        One of ``'openai'``, ``'llama'`` or ``'mistral'``.

    Returns
    -------
    An llama_index LLM instance (``OpenAI`` or ``Anyscale``).

    Raises
    ------
    NotImplementedError
        If *api_type* is not one of the supported names. The exception
        type is unchanged from the original so existing callers that
        catch it still work.
    """
    if api_type == 'openai':
        # default is gpt-3.5-turbo, can also be gpt-4-0314
        return OpenAI(model='gpt-4')  # for QA, temp is low
    elif api_type == 'llama':
        return Anyscale(model='meta-llama/Llama-2-70b-chat-hf')
    elif api_type == 'mistral':
        return Anyscale(model='mistralai/Mixtral-8x7B-Instruct-v0.1', max_tokens=10000)
    else:
        # Name the rejected value so misconfiguration is diagnosable,
        # instead of raising a bare NotImplementedError.
        raise NotImplementedError(
            f"Unsupported api_type {api_type!r}; "
            "expected 'openai', 'llama' or 'mistral'"
        )
179
+
180
+
181
+
182
+
183
def web_chatbot(webpages, api_type, embedding_model='gpt-3.5-turbo'):
    """Build an OpenAIAgent that answers questions about the given pages.

    Fetches each URL in *webpages*, builds one vector index per page,
    wraps each index as a query-engine tool plus one SubQuestionQueryEngine
    tool over all of them, and returns an agent holding those tools.

    Parameters
    ----------
    webpages : list[str]
        URLs to index. NOTE(review): zipped positionally against the
        module-level ``link_text`` list, so only the first
        ``len(link_text)`` pages are indexed and the pairing assumes a
        stable crawl order -- confirm.
    api_type : str
        Backend name passed to ``get_api_type`` ('openai'/'llama'/'mistral').
    embedding_model : str, optional
        NOTE(review): currently unused -- only referenced by the
        commented-out token counter below.

    Returns
    -------
    OpenAIAgent
        Agent with per-page tools plus a sub-question tool.
    """
    from llama_index.callbacks import CallbackManager, LlamaDebugHandler
    # Debug handler prints a trace after each LLM/retrieval call.
    llama_debug = LlamaDebugHandler(print_trace_on_end=True)
    callback_manager = CallbackManager([llama_debug])

    # docs = SimpleWebPageReader(html_to_text=True).load_data(webpages)
    doc_set = {}   # page label -> list of Documents for that page
    all_docs = []  # flat list of every Document (not used further below)
    for link, page in zip(webpages, link_text):
        # One HTTP fetch per page, converted to plain text.
        link_doc = SimpleWebPageReader(html_to_text=True).load_data([link])
        for doc in link_doc:
            # Tag each document with its page label for later tooling.
            doc.metadata = {'page': page}
        doc_set[page] = link_doc
        all_docs.extend(link_doc)

    llm = get_api_type(api_type)
    # token_counter = TokenCountingHandler(
    #     tokenizer=tiktoken.encoding_for_model(embedding_model).encode
    # )

    # callback_manager = CallbackManager([token_counter])

    # Large context/output budget for the prompt packer.
    prompt_helper = PromptHelper(
        context_window=32768,
        num_output=10000,
        chunk_overlap_ratio=0.1,
        chunk_size_limit=None,
    )

    # Global default context: chosen LLM, local embeddings, debug callbacks.
    service_context = ServiceContext.from_defaults(llm=llm,
                                                   callback_manager=callback_manager,
                                                   embed_model="local",
                                                   prompt_helper=prompt_helper
                                                   )
    set_global_service_context(service_context)
    # build index
    # index = VectorStoreIndex(docs)
    # initialize simple vector indices

    index_set = {}  # page label -> VectorStoreIndex
    # NOTE(review): this second ServiceContext (chunk_size=512, default
    # settings otherwise) shadows the fully-configured one above and is
    # what the per-page indices and the sub-question engine actually
    # receive -- confirm that is intentional.
    service_context = ServiceContext.from_defaults(chunk_size=512)
    for page in link_text:
        cur_index = VectorStoreIndex.from_documents(
            doc_set[page],
            service_context=service_context,
        )
        index_set[page] = cur_index

    from llama_index.tools import QueryEngineTool, ToolMetadata

    # One tool per page so the agent can target a single page's index.
    individual_query_engine_tools = [
        QueryEngineTool(
            query_engine=index_set[page].as_query_engine(),
            metadata=ToolMetadata(
                name=f"vector_index_{page}",
                description=f"useful for when you want to answer queries about the {page} of this website",
            ),
        )
        for page in link_text
    ]
    from llama_index.query_engine import SubQuestionQueryEngine

    # Streaming synthesizer using the custom QA prompt defined at module
    # level (refine_template is NOT passed here).
    response_synthesizer = get_response_synthesizer(text_qa_template=text_qa_template,
                                                    streaming=True)

    # Cross-page engine that decomposes a question into sub-questions
    # routed to the per-page tools.
    query_engine = SubQuestionQueryEngine.from_defaults(
        query_engine_tools=individual_query_engine_tools,
        service_context=service_context,
        response_synthesizer=response_synthesizer)

    query_engine_tool = QueryEngineTool(
        query_engine=query_engine,
        metadata=ToolMetadata(
            name="sub_question_query_engine",
            description="useful for when you want to answer queries about this website",
        ),
    )
    # Agent gets both the page-specific tools and the cross-page tool.
    tools = individual_query_engine_tools + [query_engine_tool]

    # NOTE(review): OpenAIAgent talks to OpenAI's function-calling API
    # regardless of api_type -- confirm the mix with an Anyscale llm is
    # intended.
    agent = OpenAIAgent.from_tools(tools, verbose=True)
    # query_engine = index.as_query_engine(text_qa_template=text_qa_template,
    #                                      refine_template=refine_template,
    #                                      streaming=True)
    # chat_engine = CondenseQuestionChatEngine.from_defaults(
    #     query_engine=query_engine,
    #     verbose = False
    # )

    return agent
276
+
277
########
# INSERT WEBSITE(S) HERE
########

# Build the agent once at import time over every crawled URL.
chat_engine = web_chatbot(list(dict_links.keys()), 'mistral')

with gr.Blocks() as demo:
    with gr.Column():
        chatbot = gr.Chatbot()
        msg = gr.Textbox(label="⏎ for sending",
                         placeholder="Ask me something",)
        clear = gr.Button("Delete")

    def user(user_message, history):
        # Append the new user turn (bot half pending) and clear the box.
        return "", history + [[user_message, None]]

    def bot(history):
        # Stream the agent's answer token-by-token into the last turn;
        # each yield re-renders the chat window.
        user_message = history[-1][0]
        bot_message = chat_engine.stream_chat(user_message)
        history[-1][1] = ""
        for character in bot_message.response_gen:
            history[-1][1] += character
            # Small delay so the streaming animation is visible.
            time.sleep(0.01)
            yield history

    # Submit first records the user turn, then streams the bot reply.
    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    # "Delete" resets the chat history to empty.
    clear.click(lambda: None, None, chatbot, queue=True)

demo.launch()