Sean-Case committed
Commit d2ddc62
1 Parent(s): f6036ad

Attempt to switch to Orca Mini GGUF

Files changed (3):
  1. app.py (+3 -4)
  2. chatfuncs/chatfuncs.py (+49 -31)
  3. requirements.txt (+1 -1)
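
For orientation, here is a minimal standalone sketch of what this commit attempts: loading a GGUF build of Orca Mini 3B through ctransformers instead of the previous Hugging Face pipeline. The repository and file names match those referenced in chatfuncs.py below; the context_length and gpu_layers values are illustrative assumptions, not the app's final settings.

# Illustrative sketch only - not part of this commit's code.
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF',        # GGUF-format Orca Mini 3B (as in the diff)
    model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf',
    context_length=2048,                 # assumption, mirrors the diff's default
    gpu_layers=0,                        # 0 = CPU only; >0 offloads layers (needs ctransformers[cuda])
)

# High-level call: generate plain text for a prompt.
print(llm("### User:\nSay hello.\n### Response:", max_new_tokens=32))
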
app.py CHANGED
@@ -11,7 +11,6 @@ from langchain.vectorstores import FAISS
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 # Disable cuda devices if necessary
-
 #os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
 
 #from chatfuncs.chatfuncs import *
@@ -155,7 +154,7 @@ with block:
     ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
     gr.HTML(
-        "<center>Powered by Flan Alpaca and Langchain</a></center>"
+        "<center>Powered by Orca Mini and Langchain</a></center>"
     )
 
     examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
@@ -177,14 +176,14 @@ with block:
     # Click/enter to send message action
     response_click = submit.click(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False, api_name="retrieval").\
         then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-        then(chatf.produce_streaming_answer_chatbot_hf, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
+        then(chatf.produce_streaming_answer_chatbot_ctrans, inputs=[chatbot, instruction_prompt_out], outputs=chatbot)
     response_click.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
         then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
        then(lambda: gr.update(interactive=True), None, [message], queue=False)
 
     response_enter = message.submit(chatf.get_history_sources_final_input_prompt, inputs=[message, chat_history_state, current_topic, vectorstore_state, embeddings_state], outputs=[chat_history_state, sources, instruction_prompt_out], queue=False).\
         then(chatf.turn_off_interactivity, inputs=[message, chatbot], outputs=[message, chatbot], queue=False).\
-        then(chatf.produce_streaming_answer_chatbot_hf, [chatbot, instruction_prompt_out], chatbot)
+        then(chatf.produce_streaming_answer_chatbot_ctrans, [chatbot, instruction_prompt_out], chatbot)
     response_enter.then(chatf.highlight_found_text, [chatbot, sources], [sources]).\
         then(chatf.add_inputs_answer_to_history,[message, chatbot, current_topic], [chat_history_state, current_topic]).\
         then(lambda: gr.update(interactive=True), None, [message], queue=False)
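
The two .then(chatf.produce_streaming_answer_chatbot_ctrans, ...) steps above depend on Gradio's generator-callback streaming: the chained function receives the chatbot history, fills in the last assistant turn piece by piece, and yields the history after each piece so the UI redraws incrementally. A minimal, self-contained sketch of that pattern with hypothetical names (fake_stream, add_user_message), not the app's actual callbacks:

# Sketch of Gradio 3.x generator streaming into a Chatbot (hypothetical example).
import time
import gradio as gr

def add_user_message(message, history):
    # Append the user turn with an empty assistant slot, then clear the textbox.
    return "", history + [[message, None]]

def fake_stream(history):
    # Stream text into the last assistant slot, yielding after every chunk.
    history[-1][1] = ""
    for chunk in ["Hello", ", ", "world", "!"]:
        history[-1][1] += chunk
        time.sleep(0.1)
        yield history

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    msg.submit(add_user_message, [msg, chatbot], [msg, chatbot], queue=False).\
        then(fake_stream, chatbot, chatbot)

demo.queue()
demo.launch()
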
chatfuncs/chatfuncs.py CHANGED
@@ -7,12 +7,13 @@ import numpy as np
 
 # Model packages
 import torch
+torch.cuda.empty_cache()
 from threading import Thread
 from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
 
 # Alternative model sources
 from gpt4all import GPT4All
-from ctransformers import AutoModelForCausalLM
+from ctransformers import AutoModelForCausalLM#, AutoTokenizer
 
 from dataclasses import asdict, dataclass
 
@@ -44,7 +45,11 @@ from gensim.similarities import SparseMatrixSimilarity
 
 import gradio as gr
 
-torch_device = "cuda" if torch.cuda.is_available() else "cpu"
+if torch.cuda.is_available():
+    torch_device = "cuda"
+    gpu_layers = 1
+else: torch_device = "cpu"
+
 print("Running on device:", torch_device)
 threads = 8#torch.get_num_threads()
 print("CPU threads:", threads)
@@ -72,9 +77,27 @@ stream: bool = True
 threads: int = threads
 batch_size:int = 512
 context_length:int = 2048
-gpu_layers:int = 0
+gpu_layers:int = 0#10#gpu_layers
 sample = True
 
+@dataclass
+class GenerationConfig:
+    temperature: float = temperature
+    top_k: int = top_k
+    top_p: float = top_p
+    repetition_penalty: float = repetition_penalty
+    last_n_tokens: int = last_n_tokens
+    max_new_tokens: int = max_new_tokens
+    #seed: int = 42
+    reset: bool = reset
+    stream: bool = stream
+    threads: int = threads
+    batch_size:int = batch_size
+    context_length:int = context_length
+    gpu_layers:int = gpu_layers
+    #stop: list[str] = field(default_factory=lambda: [stop_string])
+
+
 ## Highlight text constants
 hlt_chunk_size = 20
 hlt_strat = [" ", ".", "!", "?", ":", "\n\n", "\n", ","]
@@ -87,17 +110,20 @@ ner_model = SpanMarkerModel.from_pretrained("tomaarsen/span-marker-mbert-base-mu
 # Used to pull out keywords from chat history to add to user queries behind the scenes
 kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniLM-L6-v2")
 
+
+
 ## Chat models ##
 ctrans_llm = [] # Not leaded by default
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q4_0.bin')
-#ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/orca_mini_3B-GGML', model_type='llama', model_file='orca-mini-3b.ggmlv3.q8_0.bin')
+ctrans_llm = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **asdict(GenerationConfig()))
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/vicuna-13B-v1.5-16K-GGUF', model_type='llama', model_file='vicuna-13b-v1.5-16k.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeUp-Llama-2-13B-Chat-HF-GGUF', model_type='llama', model_file='codeup-llama-2-13b-chat-hf.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/CodeLlama-13B-Instruct-GGUF', model_type='llama', model_file='codellama-13b-instruct.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-Instruct-v0.1-GGUF', model_type='mistral', model_file='mistral-7b-instruct-v0.1.Q4_K_M.gguf')
 #ctrans_llm = AutoModelForCausalLM.from_pretrained('TheBloke/Mistral-7B-OpenOrca-GGUF', model_type='mistral', model_file='mistral-7b-openorca.Q4_K_M.gguf')
 
-#gpt4all_model = GPT4All(model_name= "orca-mini-3b.ggmlv3.q4_0.bin", model_path="models/") # "ggml-mpt-7b-chat.bin"
+
+#ctokenizer = AutoTokenizer.from_pretrained(ctrans_llm)
 
 # Huggingface chat model
 #hf_checkpoint = 'jphme/phi-1_5_Wizard_Vicuna_uncensored'
@@ -128,7 +154,7 @@ def create_hf_model(model_name):
 
     return model, tokenizer, torch_device
 
-model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
+#model, tokenizer, torch_device = create_hf_model(model_name = hf_checkpoint)
 
 # Vectorstore funcs
 
@@ -196,6 +222,17 @@ def create_prompt_templates():
 
     ### Response:"""
 
+    instruction_prompt_template_orca_input = """
+    ### System:
+    You are an AI assistant that follows instruction extremely well. Help as much as you can.
+    ### User:
+    Answer the QUESTION using information from the following input.
+    ### Input:
+    {summaries}
+    QUESTION: {question}
+
+    ### Response:"""
+
 
 
 
@@ -581,9 +618,6 @@ def create_final_prompt(inputs: Dict[str, str], instruction_prompt, content_prom
     #print("The question passed to the vector search is:")
     #print(new_question_kworded)
 
-    #docs_keep_as_doc, docs_content, docs_url = find_relevant_passages(new_question_kworded, k_val = 5, out_passages = 3,
-    #                                    vec_score_cut_off = 1.3, vec_weight = 1, tfidf_weight = 0.5, svm_weight = 1)
-
     docs_keep_as_doc, doc_df, docs_keep_out = hybrid_retrieval(new_question_kworded, vectorstore, embeddings, k_val = 5, out_passages = 2,
                                     vec_score_cut_off = 1, vec_weight = 1, bm25_weight = 1, svm_weight = 1)#,
                                     #vectorstore=globals()["vectorstore"], embeddings=globals()["embeddings"])
@@ -868,8 +902,8 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
     print("The question is: ")
     print(full_prompt)
 
-    #tokens = ctrans_llm.tokenize(full_prompt)
-
+    tokens = ctrans_llm.tokenize(full_prompt)
+
    #import psutil
     #from loguru import logger
 
@@ -884,29 +918,13 @@ def produce_streaming_answer_chatbot_ctrans(history, full_prompt):
     #logger.debug(f"{cpu_count=}")
 
     # Pull the generated text from the streamer, and update the model output.
-    config = GenerationConfig(reset=True)
+    #config = GenerationConfig(reset=True)
     history[-1][1] = ""
-    for new_text in ctrans_generate(prompt=full_prompt, config=config):
-        if new_text == None: new_text = ""
-        history[-1][1] += new_text
+    for new_text in ctrans_llm.generate(tokens, top_k=top_k, temperature=temperature, repetition_penalty=repetition_penalty): #ctrans_generate(prompt=tokens, config=config):
+        if new_text == None: new_text = ""
+        history[-1][1] += ctrans_llm.detokenize(new_text) #new_text
     yield history
 
-@dataclass
-class GenerationConfig:
-    temperature: float = temperature
-    top_k: int = top_k
-    top_p: float = top_p
-    repetition_penalty: float = repetition_penalty
-    last_n_tokens: int = last_n_tokens
-    max_new_tokens: int = max_new_tokens
-    #seed: int = 42
-    reset: bool = reset
-    stream: bool = stream
-    threads: int = threads
-    batch_size:int = batch_size
-    #context_length:int = context_length
-    #gpu_layers:int = gpu_layers
-    #stop: list[str] = field(default_factory=lambda: [stop_string])
 
 def ctrans_generate(
     prompt: str,
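
The rewritten produce_streaming_answer_chatbot_ctrans streams at the token level instead of going through the ctrans_generate wrapper: it tokenizes the prompt, iterates over generate(), and detokenizes each token as it arrives. A minimal sketch of that loop, assuming the same GGUF model as above; the sampling values stand in for the module-level top_k, temperature and repetition_penalty variables, whose actual values are not shown in this diff:

# Sketch of ctransformers token-level streaming (illustrative values).
from ctransformers import AutoModelForCausalLM

llm = AutoModelForCausalLM.from_pretrained(
    'juanjgit/orca_mini_3B-GGUF', model_type='llama',
    model_file='orca-mini-3b.q4_0.gguf')

prompt = "### User:\nWhat is a vector store?\n### Response:"
tokens = llm.tokenize(prompt)            # str -> list of token ids

answer = ""
for i, token in enumerate(llm.generate(tokens, top_k=10, temperature=0.1,
                                       repetition_penalty=1.3)):
    answer += llm.detokenize(token)      # token id -> text fragment
    if i >= 256:                         # generate() streams until EOS; cap it here
        break
print(answer)
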
requirements.txt CHANGED
@@ -17,7 +17,7 @@ gradio
 gradio_client==0.2.7
 python-docx
 gpt4all
-ctransformers
+ctransformers[cuda]
 keybert
 span_marker
 gensim