seanpedrickcase committed on
Commit f301d67
1 Parent(s): 8aa3ebb

Upgraded Gradio and packages to latest. Switched from Ctransformers to Llama.cpp Python

Files changed (6)
  1. Dockerfile +3 -2
  2. README.md +1 -1
  3. app.py +105 -20
  4. chatfuncs/chatfuncs.py +209 -45
  5. chatfuncs/ingest.py +1 -1
  6. requirements.txt +8 -5
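In short, this commit replaces the ctransformers GGUF loader with llama-cpp-python: the model file is fetched with `hf_hub_download` and handed to `llama_cpp.Llama`, with layer offloading controlled by `n_gpu_layers`. A minimal sketch of that loading path, assuming the same repo/file defaults as the new app.py (the `n_ctx`, `n_gpu_layers`, thread and batch values below are illustrative, not the app's exact config):

# Sketch of the llama-cpp-python loading path this commit switches to.
# REPO_ID / MODEL_FILE defaults mirror app.py; the numeric settings are assumptions to tune per machine.
import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

model_path = hf_hub_download(
    repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
    filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
)
llm = Llama(model_path=model_path, n_ctx=4096, n_gpu_layers=0, n_threads=8, n_batch=256)  # n_gpu_layers=0 keeps everything on CPU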
Dockerfile CHANGED
@@ -13,13 +13,14 @@ USER user
 # Set home to the user's home directory
 ENV HOME=/home/user \
 PATH=/home/user/.local/bin:$PATH \
- PYTHONPATH=$HOME/app \
+ PYTHONPATH=$HOME/app \
 PYTHONUNBUFFERED=1 \
 GRADIO_ALLOW_FLAGGING=never \
 GRADIO_NUM_PORTS=1 \
 GRADIO_SERVER_NAME=0.0.0.0 \
 GRADIO_THEME=huggingface \
- SYSTEM=spaces
+ SYSTEM=spaces \
+ LLAMA_CUBLAS=1

 # Set the working directory to the user's home directory
 WORKDIR $HOME/app
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 🌍
 colorFrom: yellow
 colorTo: yellow
 sdk: gradio
- sdk_version: 3.50.0
+ sdk_version: 4.31.5
 app_file: app.py
 pinned: false
 license: apache-2.0
app.py CHANGED
@@ -11,6 +11,12 @@ import pandas as pd
 from transformers import AutoTokenizer
 from ctransformers import AutoModelForCausalLM

+ import torch
+
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
 PandasDataFrame = Type[pd.DataFrame]

 # Disable cuda devices if necessary
@@ -38,7 +44,7 @@ def get_faiss_store(faiss_vstore_folder,embeddings):
 with zipfile.ZipFile(faiss_vstore_folder + '/' + faiss_vstore_folder + '.zip', 'r') as zip_ref:
 zip_ref.extractall(faiss_vstore_folder)

- faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings)
+ faiss_vstore = FAISS.load_local(folder_path=faiss_vstore_folder, embeddings=embeddings, allow_dangerous_deserialization=True)
 os.remove(faiss_vstore_folder + "/index.faiss")
 os.remove(faiss_vstore_folder + "/index.pkl")

@@ -53,6 +59,78 @@ import chatfuncs.chatfuncs as chatf
 chatf.embeddings = load_embeddings(embeddings_name)
 chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])

+ # def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
+ # print("Loading model")
+
+ # # Default values inside the function
+ # if gpu_config is None:
+ # gpu_config = chatf.gpu_config
+ # if cpu_config is None:
+ # cpu_config = chatf.cpu_config
+ # if torch_device is None:
+ # torch_device = chatf.torch_device
+
+ # if model_type == "Mistral Open Orca (larger, slow)":
+ # if torch_device == "cuda":
+ # gpu_config.update_gpu(gpu_layers)
+ # else:
+ # gpu_config.update_gpu(gpu_layers)
+ # cpu_config.update_gpu(gpu_layers)
+
+ # print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
+
+ # print(vars(gpu_config))
+ # print(vars(cpu_config))
+
+ # try:
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+ # model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+ # #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+
+ # except:
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
+ # #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
+ # model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
+ # #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
+
+ # tokenizer = []
+
+ # if model_type == "Flan Alpaca (small, fast)":
+ # # Huggingface chat model
+ # hf_checkpoint = 'declare-lab/flan-alpaca-large'#'declare-lab/flan-alpaca-base' # # #
+
+ # def create_hf_model(model_name):
+
+ # from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
+
+ # if torch_device == "cuda":
+ # if "flan" in model_name:
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
+ # else:
+ # model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+ # else:
+ # if "flan" in model_name:
+ # model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+ # else:
+ # model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+
+ # tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)
+
+ # return model, tokenizer, model_type
+
+ # model, tokenizer, model_type = create_hf_model(model_name = hf_checkpoint)
+
+ # chatf.model = model
+ # chatf.tokenizer = tokenizer
+ # chatf.model_type = model_type
+
+ # load_confirmation = "Finished loading model: " + model_type
+
+ # print(load_confirmation)
+ # return model_type, load_confirmation, model_type
+
+
 def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_device=None):
 print("Loading model")

@@ -67,26 +145,35 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d
 if model_type == "Mistral Open Orca (larger, slow)":
 if torch_device == "cuda":
 gpu_config.update_gpu(gpu_layers)
+ print("Loading with", gpu_config.n_gpu_layers, "model layers sent to GPU.")
 else:
 gpu_config.update_gpu(gpu_layers)
 cpu_config.update_gpu(gpu_layers)

- print("Loading with", cpu_config.gpu_layers, "model layers sent to GPU.")
+ print("Loading with", cpu_config.n_gpu_layers, "model layers sent to GPU.")

 print(vars(gpu_config))
 print(vars(cpu_config))

 try:
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
- model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
- #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(gpu_config)) # **asdict(CtransRunConfig_cpu())
+ model = Llama(
+ model_path=hf_hub_download(
+ repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
+ filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+ ),
+ **vars(gpu_config) # change n_gpu_layers if you have more or less VRAM
+ )

- except:
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Orca-Mini-3B-gguf', model_type='llama', model_file='q5_0-orca-mini-3b.gguf', **vars(cpu_config)) #**asdict(CtransRunConfig_gpu())
- #model = AutoModelForCausalLM.from_pretrained('Aryanne/Wizard-Orca-3B-gguf', model_type='llama', model_file='q4_1-wizard-orca-3b.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
- model = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
- #model = AutoModelForCausalLM.from_pretrained('TheBloke/MistralLite-7B-GGUF', model_type='mistral', model_file='mistrallite.Q4_K_M.gguf', **vars(cpu_config)) # **asdict(CtransRunConfig_cpu())
+ except Exception as e:
+ print("GPU load failed")
+ print(e)
+ model = Llama(
+ model_path=hf_hub_download(
+ repo_id=os.environ.get("REPO_ID", "TheBloke/Mistral-7B-OpenOrca-GGUF"),
+ filename=os.environ.get("MODEL_FILE", "mistral-7b-openorca.Q4_K_M.gguf"),
+ ),
+ **vars(cpu_config)
+ )

 tokenizer = []

@@ -100,14 +187,14 @@ def load_model(model_type, gpu_layers, gpu_config=None, cpu_config=None, torch_d

 if torch_device == "cuda":
 if "flan" in model_name:
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
 else:
- model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
+ model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto", torch_dtype=torch.float16)
 else:
 if "flan" in model_name:
- model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
+ model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.float16)
 else:
- model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
+ model = AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True, torch_dtype=torch.float16)

 tokenizer = AutoTokenizer.from_pretrained(model_name, model_max_length = chatf.context_length)

@@ -179,7 +266,7 @@ with block:
 #chat_height = 500
 chatbot = gr.Chatbot(avatar_images=('user.jfif', 'bot.jpg'),bubble_full_width = False, scale = 1) # , height=chat_height
 with gr.Accordion("Open this tab to see the source paragraphs used to generate the answer", open = False):
- sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here", scale = 1) # , height=chat_height
+ sources = gr.HTML(value = "Source paragraphs with the most relevant text will appear here") # , height=chat_height

 with gr.Row():
 message = gr.Textbox(
@@ -233,7 +320,7 @@ with block:


 gr.HTML(
- "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, Ctransformers, and Langchain.</a></center>"
+ "<center>This app is based on the models Flan Alpaca and Mistral Open Orca. It powered by Gradio, Transformers, and Llama.cpp.</a></center>"
 )

 examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
@@ -289,6 +376,4 @@ with block:
 # Thumbs up or thumbs down voting function
 chatbot.like(chatf.vote, [chat_history_state, instruction_prompt_out, model_type_state], None)

- block.queue(concurrency_count=1).launch(debug=True)
- # -
-
+ block.queue().launch(debug=True)
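Beyond the model swap, the app.py changes track the Gradio 3 to 4 API: `queue()` no longer accepts `concurrency_count`, the unsupported `scale` argument is dropped from `gr.HTML`, and event handlers return component constructors (or `gr.update`) instead of calling `Component.update()`. The `allow_dangerous_deserialization=True` flag on `FAISS.load_local` is likewise required by newer LangChain versions when loading a pickled index from disk. A small sketch of the Gradio 4 update pattern, with placeholder labels and dropdown choices not taken from the app:

# Minimal Gradio 4 sketch of the update pattern the diff adopts; labels/choices are illustrative.
import gradio as gr

def update_message(dropdown_value):
    # Gradio 4: return a new component (or gr.update(...)) rather than gr.Textbox.update(...)
    return gr.Textbox(value=dropdown_value)

with gr.Blocks() as block:
    examples_set = gr.Dropdown(choices=["Example question 1", "Example question 2"], label="Examples")
    message = gr.Textbox(label="Enter your question")
    examples_set.change(fn=update_message, inputs=[examples_set], outputs=[message])

block.queue().launch()  # concurrency_count was removed from queue() in Gradio 4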
chatfuncs/chatfuncs.py CHANGED
@@ -38,6 +38,11 @@ from gensim.corpora import Dictionary
 from gensim.models import TfidfModel, OkapiBM25Model
 from gensim.similarities import SparseMatrixSimilarity

+ import copy
+ import llama_cpp
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
 import gradio as gr

 torch.cuda.empty_cache()
@@ -70,7 +75,7 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
 # Currently set gpu_layers to 0 even with cuda due to persistent bugs in implementation with cuda
 if torch.cuda.is_available():
 torch_device = "cuda"
- gpu_layers = 0
+ gpu_layers = 100
 else:
 torch_device = "cpu"
 gpu_layers = 0
@@ -96,67 +101,129 @@ context_length:int = 2048
 sample = True


+ # class CtransInitConfig_gpu:
+ # def __init__(self, temperature=temperature,
+ # top_k=top_k,
+ # top_p=top_p,
+ # repetition_penalty=repetition_penalty,
+ # last_n_tokens=last_n_tokens,
+ # max_new_tokens=max_new_tokens,
+ # seed=seed,
+ # reset=reset,
+ # stream=stream,
+ # threads=threads,
+ # batch_size=batch_size,
+ # context_length=context_length,
+ # gpu_layers=gpu_layers):
+ # self.temperature = temperature
+ # self.top_k = top_k
+ # self.top_p = top_p
+ # self.repetition_penalty = repetition_penalty# repetition_penalty
+ # self.last_n_tokens = last_n_tokens
+ # self.max_new_tokens = max_new_tokens
+ # self.seed = seed
+ # self.reset = reset
+ # self.stream = stream
+ # self.threads = threads
+ # self.batch_size = batch_size
+ # self.context_length = context_length
+ # self.gpu_layers = gpu_layers
+ # # self.stop: list[str] = field(default_factory=lambda: [stop_string])
+
+ # def update_gpu(self, new_value):
+ # self.gpu_layers = new_value
+
+ # class CtransInitConfig_cpu(CtransInitConfig_gpu):
+ # def __init__(self):
+ # super().__init__()
+ # self.gpu_layers = 0
+
 class CtransInitConfig_gpu:
- def __init__(self, temperature=temperature,
- top_k=top_k,
- top_p=top_p,
- repetition_penalty=repetition_penalty,
+ def __init__(self, #temperature=temperature,
+ #top_k=top_k,
+ #top_p=top_p,
+ #repetition_penalty=repetition_penalty,
 last_n_tokens=last_n_tokens,
- max_new_tokens=max_new_tokens,
+ #max_new_tokens=max_new_tokens,
 seed=seed,
- reset=reset,
- stream=stream,
- threads=threads,
- batch_size=batch_size,
- context_length=context_length,
- gpu_layers=gpu_layers):
- self.temperature = temperature
- self.top_k = top_k
- self.top_p = top_p
- self.repetition_penalty = repetition_penalty# repetition_penalty
+ #reset=reset,
+ #stream=stream,
+ n_threads=threads,
+ n_batch=batch_size,
+ n_ctx=4096,
+ n_gpu_layers=gpu_layers):
+ #self.temperature = temperature
+ #self.top_k = top_k
+ #self.top_p = top_p
+ #self.repetition_penalty = repetition_penalty# repetition_penalty
 self.last_n_tokens = last_n_tokens
- self.max_new_tokens = max_new_tokens
+ #self.max_new_tokens = max_new_tokens
 self.seed = seed
- self.reset = reset
- self.stream = stream
- self.threads = threads
- self.batch_size = batch_size
- self.context_length = context_length
- self.gpu_layers = gpu_layers
+ #self.reset = reset
+ #self.stream = stream
+ self.n_threads = n_threads
+ self.n_batch = n_batch
+ self.n_ctx = n_ctx
+ self.n_gpu_layers = n_gpu_layers
 # self.stop: list[str] = field(default_factory=lambda: [stop_string])

 def update_gpu(self, new_value):
- self.gpu_layers = new_value
+ self.n_gpu_layers = new_value

 class CtransInitConfig_cpu(CtransInitConfig_gpu):
 def __init__(self):
 super().__init__()
- self.gpu_layers = 0
+ self.n_gpu_layers = 0

 gpu_config = CtransInitConfig_gpu()
 cpu_config = CtransInitConfig_cpu()


+ # class CtransGenGenerationConfig:
+ # def __init__(self, temperature=temperature,
+ # top_k=top_k,
+ # top_p=top_p,
+ # repetition_penalty=repetition_penalty,
+ # last_n_tokens=last_n_tokens,
+ # seed=seed,
+ # threads=threads,
+ # batch_size=batch_size,
+ # reset=True
+ # ):
+ # self.temperature = temperature
+ # self.top_k = top_k
+ # self.top_p = top_p
+ # self.repetition_penalty = repetition_penalty# repetition_penalty
+ # self.last_n_tokens = last_n_tokens
+ # self.seed = seed
+ # self.threads = threads
+ # self.batch_size = batch_size
+ # self.reset = reset
+
 class CtransGenGenerationConfig:
 def __init__(self, temperature=temperature,
 top_k=top_k,
 top_p=top_p,
- repetition_penalty=repetition_penalty,
- last_n_tokens=last_n_tokens,
+ repeat_penalty=repetition_penalty,
+ #last_n_tokens=last_n_tokens,
 seed=seed,
- threads=threads,
- batch_size=batch_size,
- reset=True
+ stream=stream,
+ max_tokens=max_new_tokens
+ #threads=threads,
+ #batch_size=batch_size,
+ #reset=True
 ):
 self.temperature = temperature
 self.top_k = top_k
 self.top_p = top_p
- self.repetition_penalty = repetition_penalty# repetition_penalty
- self.last_n_tokens = last_n_tokens
+ self.repeat_penalty = repeat_penalty
+ #self.last_n_tokens = last_n_tokens
 self.seed = seed
- self.threads = threads
- self.batch_size = batch_size
- self.reset = reset
+ self.max_tokens=max_tokens
+ self.stream = stream
+ #self.threads = threads
+ #self.batch_size = batch_size
+ #self.reset = reset

 def update_temp(self, new_value):
 self.temperature = new_value
@@ -352,6 +419,94 @@ def create_full_prompt(user_input, history, extracted_memory, vectorstore, embed
 return history, docs_content_string, instruction_prompt_out

 # Chat functions
+ # def produce_streaming_answer_chatbot(history, full_prompt, model_type,
+ # temperature=temperature,
+ # max_new_tokens=max_new_tokens,
+ # sample=sample,
+ # repetition_penalty=repetition_penalty,
+ # top_p=top_p,
+ # top_k=top_k
+ # ):
+ # #print("Model type is: ", model_type)
+
+ # #if not full_prompt.strip():
+ # # if history is None:
+ # # history = []
+
+ # # return history
+
+ # if model_type == "Flan Alpaca (small, fast)":
+ # # Get the model and tokenizer, and tokenize the user text.
+ # model_inputs = tokenizer(text=full_prompt, return_tensors="pt", return_attention_mask=False).to(torch_device) # return_attention_mask=False was added
+
+ # # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
+ # # in the main thread. Adds timeout to the streamer to handle exceptions in the generation thread.
+ # streamer = TextIteratorStreamer(tokenizer, timeout=120., skip_prompt=True, skip_special_tokens=True)
+ # generate_kwargs = dict(
+ # model_inputs,
+ # streamer=streamer,
+ # max_new_tokens=max_new_tokens,
+ # do_sample=sample,
+ # repetition_penalty=repetition_penalty,
+ # top_p=top_p,
+ # temperature=temperature,
+ # top_k=top_k
+ # )
+
+ # print(generate_kwargs)
+
+ # t = Thread(target=model.generate, kwargs=generate_kwargs)
+ # t.start()
+
+ # # Pull the generated text from the streamer, and update the model output.
+ # start = time.time()
+ # NUM_TOKENS=0
+ # print('-'*4+'Start Generation'+'-'*4)
+
+ # history[-1][1] = ""
+ # for new_text in streamer:
+ # if new_text == None: new_text = ""
+ # history[-1][1] += new_text
+ # NUM_TOKENS+=1
+ # yield history
+
+ # time_generate = time.time() - start
+ # print('\n')
+ # print('-'*4+'End Generation'+'-'*4)
+ # print(f'Num of generated tokens: {NUM_TOKENS}')
+ # print(f'Time for complete generation: {time_generate}s')
+ # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
+ # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
+
+ # elif model_type == "Mistral Open Orca (larger, slow)":
+ # tokens = model.tokenize(full_prompt)
+
+ # gen_config = CtransGenGenerationConfig()
+ # gen_config.update_temp(temperature)
+
+ # print(vars(gen_config))
+
+ # # Pull the generated text from the streamer, and update the model output.
+ # start = time.time()
+ # NUM_TOKENS=0
+ # print('-'*4+'Start Generation'+'-'*4)
+
+ # history[-1][1] = ""
+ # for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
+ # if new_text == None: new_text = ""
+ # history[-1][1] += model.detokenize(new_text) #new_text
+ # NUM_TOKENS+=1
+ # yield history
+
+ # time_generate = time.time() - start
+ # print('\n')
+ # print('-'*4+'End Generation'+'-'*4)
+ # print(f'Num of generated tokens: {NUM_TOKENS}')
+ # print(f'Time for complete generation: {time_generate}s')
+ # print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
+ # print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')
+
+
 def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 temperature=temperature,
 max_new_tokens=max_new_tokens,
@@ -412,7 +567,9 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

 elif model_type == "Mistral Open Orca (larger, slow)":
- tokens = model.tokenize(full_prompt)
+ #tokens = model.tokenize(full_prompt)
+
+ temp = ""

 gen_config = CtransGenGenerationConfig()
 gen_config.update_temp(temperature)
@@ -424,13 +581,19 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 NUM_TOKENS=0
 print('-'*4+'Start Generation'+'-'*4)

+ output = model(
+ full_prompt, **vars(gen_config))
+
 history[-1][1] = ""
- for new_text in model.generate(tokens, **vars(gen_config)): #CtransGen_generate(prompt=full_prompt)#, config=CtransGenGenerationConfig()): # #top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty,
- if new_text == None: new_text = ""
- history[-1][1] += model.detokenize(new_text) #new_text
- NUM_TOKENS+=1
- yield history
-
+ for out in output:
+
+ if "choices" in out and len(out["choices"]) > 0 and "text" in out["choices"][0]:
+ history[-1][1] += out["choices"][0]["text"]
+ NUM_TOKENS+=1
+ yield history
+ else:
+ print(f"Unexpected output structure: {out}")
+
 time_generate = time.time() - start
 print('\n')
 print('-'*4+'End Generation'+'-'*4)
@@ -439,6 +602,7 @@ def produce_streaming_answer_chatbot(history, full_prompt, model_type,
 print(f'Tokens per secound: {NUM_TOKENS/time_generate}')
 print(f'Time per token: {(time_generate/NUM_TOKENS)*1000}ms')

+
 # Chat helper functions

 def adapt_q_from_chat_history(question, chat_history, extracted_memory, keyword_model=""):#keyword_model): # new_question_keywords,
@@ -614,7 +778,7 @@ def hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val, out_p
 # 3rd level check on retrieved docs with SVM retriever

 svm_retriever = SVMRetriever.from_texts(content_keep, embeddings, k = k_val)
- svm_result = svm_retriever.get_relevant_documents(new_question_kworded)
+ svm_result = svm_retriever.invoke(new_question_kworded)


 svm_rank=[]
@@ -994,10 +1158,10 @@ def restore_interactivity():
 return gr.update(interactive=True)

 def update_message(dropdown_value):
- return gr.Textbox.update(value=dropdown_value)
+ return gr.Textbox(value=dropdown_value)

 def hide_block():
- return gr.Radio.update(visible=False)
+ return gr.Radio(visible=False)

 # Vote function

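The rewritten generation branch relies on llama-cpp-python's streaming interface: calling the `Llama` object with `stream=True` yields OpenAI-style completion chunks, and the text of each chunk sits at `chunk["choices"][0]["text"]`, which is what the new loop appends to the chat history. A sketch of that streaming loop in isolation (the model path, prompt, and sampling values are placeholders, not the app's exact settings):

# Streaming from llama-cpp-python: each chunk is a dict shaped like an OpenAI completion.
from llama_cpp import Llama

llm = Llama(model_path="mistral-7b-openorca.Q4_K_M.gguf", n_ctx=4096)  # placeholder path

response_text = ""
for chunk in llm(
    "### Instruction: Summarise the document.\n### Response:",  # placeholder prompt
    max_tokens=512, temperature=0.1, top_p=1.0, top_k=3, repeat_penalty=1.1,
    stream=True,
):
    piece = chunk["choices"][0]["text"]
    response_text += piece
    print(piece, end="", flush=True)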
chatfuncs/ingest.py CHANGED
@@ -21,7 +21,7 @@ from pypdf import PdfReader
 PandasDataFrame = Type[pd.DataFrame]

 split_strat = ["\n\n", "\n", ". ", "! ", "? "]
- chunk_size = 500
+ chunk_size = 300
 chunk_overlap = 0
 start_index = True

requirements.txt CHANGED
@@ -3,15 +3,18 @@ langchain-community
 beautifulsoup4
 pandas
 transformers
- --extra-index-url https://download.pytorch.org/whl/cu118
- torch
+ llama-cpp-python --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu121
+ #torch \
+ #--extra-index-url https://download.pytorch.org/whl/cu121
 sentence_transformers
 faiss-cpu
 pypdf
 python-docx
- ctransformers[cuda]
+ #ctransformers[cuda]
 keybert
 span_marker
 gensim
- gradio==3.50.0
- gradio_client
+ gradio==4.31.5
+ gradio_client
+ nltk
+ scipy<1.13
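Two notes on the new requirements: the `llama-cpp-python ... cu121` line points pip at the project's prebuilt CUDA 12.1 wheel index so the package does not have to be compiled inside the Space, and `scipy<1.13` is most likely there because gensim imported `scipy.linalg.triu`, which SciPy 1.13 removed. A quick post-install sanity check (the version comments are expectations, not guarantees):

# Verify the upgraded stack imports cleanly after `pip install -r requirements.txt`.
import gradio, llama_cpp, scipy, gensim

print("gradio:", gradio.__version__)        # pinned to 4.31.5
print("llama_cpp:", llama_cpp.__version__)  # from the cu121 wheel index when CUDA 12.1 is available
print("scipy:", scipy.__version__)          # kept below 1.13 so gensim still imports
print("gensim:", gensim.__version__)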