Sean-Case committed
Commit 275393f
Parent: d213c15

Changed embedding model, added reference to chat model on front page

Generation speed GPU test.txt DELETED
@@ -1,51 +0,0 @@
- With 5 gpu layers, batch size 8
-
- Num of generated tokens: 113
- Time for complete generation: 115.42684650421143s
- Tokens per secound: 0.9789750255013432
- Time per token: 1021.4765177363843ms
-
- With 5 gpu layers, batch size 512
-
- Num of generated tokens: 102
- Time for complete generation: 40.369266986846924s
- Tokens per secound: 2.5266745624396285
- Time per token: 395.77712732202866ms
-
- With 6 gpu layers -
-
- Num of generated tokens: 113
- Time for complete generation: 46.37785983085632s
- Tokens per secound: 2.4365074285902764
- Time per token: 410.42353832616215ms
-
- With 6 gpu layers, batch size 1024 -
- Five pillars Q:
- Num of generated tokens: 102
- Time for complete generation: 41.85241961479187s
- Tokens per secound: 2.4371350793766346
- Time per token: 410.31783936070457ms
-
- With 8 threads
- Num of generated tokens: 102
- Time for complete generation: 40.64410996437073s
- Tokens per secound: 2.5095887224351774
- Time per token: 398.4716663173601ms
-
- Vision statement Q:
- Num of generated tokens: 84
- Time for complete generation: 35.57932233810425s
- Tokens per secound: 2.360921863597128
- Time per token: 423.5633611679077ms
-
- Commitments Q:
- Num of generated tokens: 50
- Time for complete generation: 23.73319172859192s
- Tokens per secound: 2.106754142965266
- Time per token: 474.6638345718384ms
-
- Outcomes Q
- Num of generated tokens: 167
- Time for complete generation: 52.302518367767334s
- Tokens per secound: 3.1929628861412094
- Time per token: 313.1887327411217ms
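The deleted test log recorded throughput for different gpu_layers, batch-size, and thread settings; the two derived metrics are simply tokens divided by elapsed seconds, and (elapsed seconds / tokens) × 1000 ms. A minimal sketch of how such figures could be collected around any generate call (the timing helper below is illustrative, not code from this repo):

```python
import time

def report_generation_speed(generate_fn, prompt):
    """Time a generation call and print the same metrics as the deleted test log."""
    start = time.time()
    output_tokens = generate_fn(prompt)   # hypothetical: returns the list of generated tokens
    elapsed = time.time() - start

    num_tokens = len(output_tokens)
    print(f"Num of generated tokens: {num_tokens}")
    print(f"Time for complete generation: {elapsed}s")
    print(f"Tokens per second: {num_tokens / elapsed}")
    print(f"Time per token: {(elapsed / num_tokens) * 1000}ms")
```

As a sanity check against the log, the "batch size 512" entry works out to 102 tokens / 40.37 s ≈ 2.53 tokens per second.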
app.py CHANGED
@@ -28,7 +28,7 @@ import chatfuncs.ingest as ing
 
 ## Load preset embeddings, vectorstore, and model
 
- embeddings_name = "thenlper/gte-base"
+ embeddings_name = "BAAI/bge-base-en-v1.5"
 
 def load_embeddings(embeddings_name = "thenlper/gte-base"):
 
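The preset embedding model moves from thenlper/gte-base to BAAI/bge-base-en-v1.5 (the load_embeddings default argument still reads "thenlper/gte-base"; it is the module-level embeddings_name that actually gets passed in). The body of load_embeddings is not part of this diff; a minimal sketch of how such a loader is commonly written with LangChain's HuggingFaceEmbeddings wrapper, under that assumption:

```python
from langchain.embeddings import HuggingFaceEmbeddings

def load_embeddings(embeddings_name="BAAI/bge-base-en-v1.5"):
    # Wrap a sentence-transformers checkpoint so LangChain vectorstores can embed docs and queries.
    # Sketch only: the real load_embeddings body in app.py is not shown in this commit.
    return HuggingFaceEmbeddings(model_name=embeddings_name)

embeddings = load_embeddings("BAAI/bge-base-en-v1.5")
query_vector = embeddings.embed_query("What are the borough's commitments?")
print(len(query_vector))  # bge-base-en-v1.5 produces 768-dimensional vectors
```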
@@ -79,7 +79,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 if torch_device is None:
 torch_device = chatf.torch_device
 
- if model_type == "Orca Mini":
+ if model_type == "Orca Mini (larger, slow)":
 
 gpu_config.update_gpu(gpu_layers)
 cpu_config.update_gpu(gpu_layers)
@@ -103,7 +103,7 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 
 tokenizer = []
 
- if model_type == "Flan Alpaca":
+ if model_type == "Flan Alpaca (small, fast)":
 # Huggingface chat model
 hf_checkpoint = 'declare-lab/flan-alpaca-large'
 
@@ -135,14 +135,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 load_confirmation = "Finished loading model: " + model_type
 
 print(load_confirmation)
- return model_type, load_confirmation
+ return model_type, load_confirmation, model_type
 
 # Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
- model_type = "Orca Mini"
+ model_type = "Orca Mini (larger, slow)"
 
 load_model(model_type, chatf.gpu_layers, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
- model_type = "Flan Alpaca"
+ model_type = "Flan Alpaca (small, fast)"
 load_model(model_type, 0, chatf.gpu_config, chatf.cpu_config, chatf.torch_device)
 
 def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
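load_model keeps two branches: the renamed "Flan Alpaca (small, fast)" option loads the declare-lab/flan-alpaca-large checkpoint through Hugging Face transformers, while "Orca Mini (larger, slow)" takes a gpu_layers argument so part of the model can be offloaded to the GPU. The full branch bodies are not shown in this hunk; the sketch below only illustrates the general shape, assuming a ctransformers-style GGML backend for Orca Mini (the checkpoint name and exact arguments are placeholders, not taken from the repo):

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def load_model_sketch(model_type, gpu_layers=0):
    if model_type == "Flan Alpaca (small, fast)":
        # Hugging Face chat model loaded in full through transformers
        hf_checkpoint = 'declare-lab/flan-alpaca-large'
        tokenizer = AutoTokenizer.from_pretrained(hf_checkpoint)
        model = AutoModelForSeq2SeqLM.from_pretrained(hf_checkpoint)

    elif model_type == "Orca Mini (larger, slow)":
        # GGML-style model where gpu_layers controls how many layers are offloaded to the GPU
        from ctransformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(
            "TheBloke/orca_mini_3B-GGML",   # placeholder checkpoint name
            model_type="llama",
            gpu_layers=gpu_layers)
        tokenizer = []                      # the ctransformers model tokenizes internally

    return model, tokenizer, "Finished loading model: " + model_type
```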
@@ -181,16 +181,19 @@ with block:
 
 gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
- gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
+ gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini (larger, slow)), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
- current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
+ with gr.Row():
+ current_source = gr.Textbox(label="Current data source(s)", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf", scale = 10)
+ current_model = gr.Textbox(label="Current model", value=model_type, scale = 3)
 
 with gr.Tab("Chatbot"):
 
 with gr.Row():
 chat_height = 500
 chatbot = gr.Chatbot(height=chat_height, avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1)
- sources = gr.HTML(value = "Source paragraphs where I looked for answers will appear here", height=chat_height, scale = 2)
+ #sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", height=chat_height, scale = 2)
+ sources = gr.Markdown(value = "Source paragraphs with the most relevant text will appear here", height=chat_height, scale = 2)
 
 with gr.Row():
 message = gr.Textbox(
@@ -228,7 +231,7 @@ with block:
 ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
 with gr.Tab("Advanced features"):
- model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices = ["Flan Alpaca", "Orca Mini"])
+ model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca (small, fast)", choices = ["Flan Alpaca (small, fast)", "Orca Mini (larger, slow)"])
 with gr.Row():
 gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (WARNING: please don't modify unless you have a GPU).", value=0, minimum=0, maximum=6, step = 1, visible=False)
 change_model_button = gr.Button(value="Load model", scale=0)
@@ -241,7 +244,7 @@ with block:
 examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
 
 change_model_button.click(fn=chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
- then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text]).\
+ then(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state, load_text, current_model]).\
 then(lambda: chatf.restore_interactivity(), None, [message], queue=False).\
 then(chatf.clear_chat, inputs=[chat_history_state, sources, message, current_topic], outputs=[chat_history_state, sources, message, current_topic]).\
 then(lambda: None, None, chatbot, queue=False)
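This is why load_model now returns model_type twice: Gradio maps each returned value onto the corresponding entry in outputs, so one click handler can write both the hidden model_type_state and the new visible current_model textbox. A stripped-down sketch of that wiring (component names mirror the app, but this is a standalone illustration rather than the app's own layout):

```python
import gradio as gr

def load_model_stub(model_choice, gpu_layers):
    # Stand-in for load_model: returns (state value, status text, visible textbox value).
    confirmation = f"Finished loading model: {model_choice}"
    return model_choice, confirmation, model_choice

with gr.Blocks() as demo:
    model_type_state = gr.State("Flan Alpaca (small, fast)")
    current_model = gr.Textbox(label="Current model")
    load_text = gr.Textbox(label="Load status")
    model_choice = gr.Radio(choices=["Flan Alpaca (small, fast)", "Orca Mini (larger, slow)"],
                            value="Flan Alpaca (small, fast)", label="Choose a chat model")
    gpu_layer_choice = gr.Slider(0, 6, value=0, step=1, label="GPU layers")
    change_model_button = gr.Button("Load model")

    # One output component per returned value: state, status text, visible textbox.
    change_model_button.click(fn=load_model_stub,
                              inputs=[model_choice, gpu_layer_choice],
                              outputs=[model_type_state, load_text, current_model])

demo.launch()
```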
chatfuncs/chatfuncs.py CHANGED
@@ -7,7 +7,7 @@ import pandas as pd
 import numpy as np
 
 # Model packages
- import torch
+ import torch.cuda
 from threading import Thread
 from transformers import pipeline, TextIteratorStreamer
 
@@ -21,16 +21,16 @@ from langchain.retrievers import SVMRetriever
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.docstore.document import Document
 
- # For keyword extraction
- import nltk
- nltk.download('wordnet')
+ # For keyword extraction (not currently used)
+ #import nltk
+ #nltk.download('wordnet')
 from nltk.corpus import stopwords
 from nltk.tokenize import RegexpTokenizer
 from nltk.stem import WordNetLemmatizer
- import keybert
+ from keybert import KeyBERT
 
 # For Name Entity Recognition model
- from span_marker import SpanMarkerModel
+ #from span_marker import SpanMarkerModel # Not currently used
 
 # For BM25 retrieval
 from gensim.corpora import Dictionary
@@ -60,7 +60,7 @@ hlt_strat = [" ", ". ", "! ", "? ", ": ", "\n\n", "\n", ", "]
 hlt_overlap = 4
 
 ## Initialise NER model ##
- ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd")
+ ner_model = []#SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-multinerd") # Not currently used
 
 ## Initialise keyword model ##
 # Used to pull out keywords from chat history to add to user queries behind the scenes
@@ -78,7 +78,7 @@ print("Running on device:", torch_device)
 threads = 8 #torch.get_num_threads()
 print("CPU threads:", threads)
 
- # Flan Alpaca Model parameters
+ # Flan Alpaca (small, fast) Model parameters
 temperature: float = 0.1
 top_k: int = 3
 top_p: float = 1
@@ -202,7 +202,7 @@ def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
 
 # Prompt functions
 
- def base_prompt_templates(model_type = "Flan Alpaca"):
+ def base_prompt_templates(model_type = "Flan Alpaca (small, fast)"):
 
 #EXAMPLE_PROMPT = PromptTemplate(
 # template="\nCONTENT:\n\n{page_content}\n\nSOURCE: {source}\n\n",
@@ -313,9 +313,9 @@ QUESTION: {question}
 ### RESPONSE:
 """
 
- if model_type == "Flan Alpaca":
+ if model_type == "Flan Alpaca (small, fast)":
 INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_alpaca, input_variables=['question', 'summaries'])
- elif model_type == "Orca Mini":
+ elif model_type == "Orca Mini (larger, slow)":
 INSTRUCTION_PROMPT=PromptTemplate(template=instruction_prompt_template_wizard_orca, input_variables=['question', 'summaries'])
 
 return INSTRUCTION_PROMPT, CONTENT_PROMPT
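base_prompt_templates selects a different instruction template for each model family, and the chosen PromptTemplate is later filled with the retrieved source paragraphs and the user question. A small sketch of that render step, using an abbreviated stand-in for the Alpaca-style template (the app's real template text is longer):

```python
from langchain.prompts import PromptTemplate

# Abbreviated stand-in for instruction_prompt_template_alpaca in chatfuncs.py.
instruction_prompt_template_alpaca = """### Instruction:
Answer the QUESTION using only the CONTENT provided.

CONTENT: {summaries}

QUESTION: {question}

### RESPONSE:
"""

INSTRUCTION_PROMPT = PromptTemplate(template=instruction_prompt_template_alpaca,
                                    input_variables=['question', 'summaries'])

full_prompt = INSTRUCTION_PROMPT.format(question="What are the plan's five pillars?",
                                        summaries="<retrieved source paragraphs>")
print(full_prompt)
```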
@@ -359,6 +359,9 @@ def generate_expanded_prompt(inputs: Dict[str, str], instruction_prompt, content
 
 def create_full_prompt(user_input, history, extracted_memory, vectorstore, embeddings, model_type):
 
+ if not user_input.strip():
+ return history, "", ""
+
 #if chain_agent is None:
 # history.append((user_input, "Please click the button to submit the Huggingface API key before using the chatbot (top right)"))
 # return history, history, "", ""
@@ -385,7 +388,13 @@
 def produce_streaming_answer_chatbot(history, full_prompt, model_type):
 #print("Model type is: ", model_type)
 
- if model_type == "Flan Alpaca":
+ #if not full_prompt.strip():
+ # if history is None:
+ # history = []
+
+ # return history
+
+ if model_type == "Flan Alpaca (small, fast)":
 # Get the model and tokenizer, and tokenize the user text.
 model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
 
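For the "Flan Alpaca (small, fast)" branch the prompt is tokenized with the Hugging Face tokenizer, and the imports at the top of the file (TextIteratorStreamer, Thread) indicate that generation is then streamed token by token into the chat history. The rest of the branch sits outside this hunk; a minimal sketch of that streaming pattern with transformers, assuming a seq2seq checkpoint like flan-alpaca-large:

```python
from threading import Thread
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, TextIteratorStreamer

checkpoint = 'declare-lab/flan-alpaca-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

full_prompt = "Summarise the following passage: ..."
model_inputs = tokenizer(text=full_prompt, return_tensors="pt")

# The streamer yields decoded text chunks while generate() runs in a background thread.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
generate_kwargs = dict(**model_inputs, streamer=streamer, max_new_tokens=512,
                       temperature=0.1, do_sample=True)

thread = Thread(target=model.generate, kwargs=generate_kwargs)
thread.start()

answer = ""
for new_text in streamer:
    answer += new_text   # in the app this would be appended to history[-1][1] and yielded to Gradio
print(answer)
```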
@@ -425,7 +434,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type):
 print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
 print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
 
- elif model_type == "Orca Mini":
+ elif model_type == "Orca Mini (larger, slow)":
 tokens = model.tokenize(full_prompt)
 
 gen_config = CtransGenGenerationConfig()
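The "Orca Mini (larger, slow)" branch tokenizes the prompt with model.tokenize and builds a CtransGenGenerationConfig, which points at a ctransformers-style GGML backend where generation is an iterator over token ids. The generation loop itself is outside this hunk; a hedged sketch of how such a loop typically looks, reusing the sampling values shown earlier in the file (the checkpoint name is a placeholder, not taken from the repo):

```python
from ctransformers import AutoModelForCausalLM

# Placeholder checkpoint; the repo's actual Orca Mini weights are configured elsewhere.
model = AutoModelForCausalLM.from_pretrained("TheBloke/orca_mini_3B-GGML",
                                             model_type="llama", gpu_layers=0)

full_prompt = "### User: What are the plan's commitments?\n### Response:"
tokens = model.tokenize(full_prompt)

answer = ""
for token in model.generate(tokens, top_k=3, top_p=1.0, temperature=0.1):
    answer += model.detokenize([token])   # stream one decoded piece at a time
print(answer)
```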
@@ -460,7 +469,7 @@ def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_
 
 if chat_history_str:
 # Keyword extraction is now done in the add_inputs_to_history function
- extracted_memory = extracted_memory#remove_q_stopwords(str(chat_history_first_q) + " " + str(chat_history_first_ans))
+ #remove_q_stopwords(str(chat_history_first_q) + " " + str(chat_history_first_ans))
 
 
 new_question_kworded = str(extracted_memory) + ". " + question #+ " " + new_question_keywords
@@ -966,7 +975,7 @@ def keybert_keywords(text, n, kw_model):
 tokens_lemma = apply_lemmatize(text)
 lemmatised_text = ' '.join(tokens_lemma)
 
- keywords_text = keybert.KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
+ keywords_text = KeyBERT(model=kw_model).extract_keywords(lemmatised_text, stop_words='english', top_n=n,
 keyphrase_ngram_range=(1, 1))
 keywords_list = [item[0] for item in keywords_text]
 
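The keyword helper now uses the class imported as "from keybert import KeyBERT" rather than the keybert module path. extract_keywords returns (keyword, score) pairs, from which only the keyword strings are kept. A small usage sketch (the kw_model backend is whatever chatfuncs.py initialises; a plain sentence-transformers name is used here purely for illustration):

```python
from keybert import KeyBERT

kw_model = "all-MiniLM-L6-v2"   # illustrative embedding backend; the app passes its own kw_model
text = "Lambeth 2030 sets out the borough's vision, commitments and outcomes for the next decade."

keywords_text = KeyBERT(model=kw_model).extract_keywords(text, stop_words='english', top_n=3,
                                                         keyphrase_ngram_range=(1, 1))
print(keywords_text)                               # e.g. [('lambeth', 0.62), ('borough', 0.48), ...]
keywords_list = [item[0] for item in keywords_text]
print(keywords_list)
```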
chatfuncs/ingest_borough_plan.py CHANGED
@@ -1,16 +1,14 @@
 import ingest as ing
- import pandas as pd
 
 
- borough_plan_text = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
+ borough_plan_text, file_names = ing.parse_file([open("Lambeth_2030-Our_Future_Our_Lambeth.pdf")])
 print("Borough plan text created")
 
- #print(borough_plan_text)
+ print(borough_plan_text)
 
 borough_plan_docs = ing.text_to_docs(borough_plan_text)
 print("Borough plan docs created")
 
- embedding_model = "thenlper/gte-base"
+ embedding_model = "BAAI/bge-base-en-v1.5"
 
 embeddings = ing.load_embeddings(model_name = embedding_model)
 ing.embed_faiss_save_to_zip(borough_plan_docs, save_to="faiss_embedding", model_name = embedding_model)
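The ingest script parses the borough plan PDF, splits it into documents, embeds them with the new BAAI/bge-base-en-v1.5 model, and writes a FAISS index to faiss_embedding.zip (the binary file that also changes in this commit). The bodies of ing.load_embeddings and ing.embed_faiss_save_to_zip live in chatfuncs/ingest.py and are not shown here; a hedged sketch of the usual LangChain FAISS save path they presumably wrap:

```python
import shutil
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document

def embed_faiss_save_to_zip_sketch(docs, save_to="faiss_embedding",
                                    model_name="BAAI/bge-base-en-v1.5"):
    # Sketch only: approximates what ing.embed_faiss_save_to_zip likely does.
    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vectorstore = FAISS.from_documents(docs, embeddings)   # embed and index the documents
    vectorstore.save_local(save_to)                        # writes index.faiss / index.pkl
    shutil.make_archive(save_to, 'zip', save_to)           # -> faiss_embedding.zip

docs = [Document(page_content="Our Future, Our Lambeth sets out the borough's 2030 vision.",
                 metadata={"source": "Lambeth_2030-Our_Future_Our_Lambeth.pdf"})]
embed_faiss_save_to_zip_sketch(docs)
```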
faiss_embedding/faiss_embedding.zip CHANGED
Binary files a/faiss_embedding/faiss_embedding.zip and b/faiss_embedding/faiss_embedding.zip differ