Sean-Case committed on
Commit: 1365c48
Parent(s): 67af224

Hopefully fixed the Orca Mini CUDA load error. Orca Mini is now loaded on app start. A later Gradio version is installed.

Files changed (2):
  1. app.py +23 -15
  2. chatfuncs/chatfuncs.py +3 -6
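
The core of the fix, as the app.py diff below shows, is a GPU-first load of the Orca Mini GGUF model with a CPU fallback, done once at app start. A minimal standalone sketch of that pattern (the helper name load_orca_mini and the literal gpu_layers values are illustrative assumptions; the app builds its keyword arguments from chatfuncs.CtransInitConfig_gpu / CtransInitConfig_cpu):

from ctransformers import AutoModelForCausalLM

def load_orca_mini(gpu_layers: int = 6):
    # Hypothetical helper: try to offload layers to the GPU first.
    try:
        return AutoModelForCausalLM.from_pretrained(
            'juanjgit/orca_mini_3B-GGUF',
            model_type='llama',
            model_file='orca-mini-3b.q4_0.gguf',
            gpu_layers=gpu_layers)
    except Exception:
        # If the CUDA path fails, reload on CPU only.
        return AutoModelForCausalLM.from_pretrained(
            'juanjgit/orca_mini_3B-GGUF',
            model_type='llama',
            model_file='orca-mini-3b.q4_0.gguf',
            gpu_layers=0)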
app.py CHANGED
@@ -2,21 +2,22 @@
 
 # +
 import os
-os.system("pip uninstall -y gradio")
-os.system("pip install gradio==3.42.0")
+
+# Need to overwrite version of gradio present in Huggingface spaces as it doesn't have like buttons/avatars (Oct 2023)
+#os.system("pip uninstall -y gradio")
+os.system("pip install gradio==3.47.1")
 
 from typing import TypeVar
 from langchain.embeddings import HuggingFaceEmbeddings, HuggingFaceInstructEmbeddings
 from langchain.vectorstores import FAISS
 import gradio as gr
 
-from transformers import AutoTokenizer#, pipeline, TextIteratorStreamer
+from transformers import AutoTokenizer
 from dataclasses import asdict, dataclass
 
 # Alternative model sources
-from ctransformers import AutoModelForCausalLM#, AutoTokenizer
+from ctransformers import AutoModelForCausalLM
 
-#PandasDataFrame: type[pd.core.frame.DataFrame]
 PandasDataFrame = TypeVar('pd.core.frame.DataFrame')
 
 # Disable cuda devices if necessary
@@ -68,12 +69,14 @@ import chatfuncs.chatfuncs as chatf
 chatf.embeddings = load_embeddings(embeddings_name)
 chatf.vectorstore = get_faiss_store(faiss_vstore_folder="faiss_embedding",embeddings=globals()["embeddings"])
 
-model_type = "Flan Alpaca"
 
 
-def load_model(model_type, CtransInitConfig_gpu=chatf.CtransInitConfig_gpu, CtransInitConfig_cpu=chatf.CtransInitConfig_cpu, torch_device=chatf.torch_device):
+def load_model(model_type, gpu_layers, CtransInitConfig_gpu=chatf.CtransInitConfig_gpu, CtransInitConfig_cpu=chatf.CtransInitConfig_cpu, torch_device=chatf.torch_device):
     print("Loading model")
     if model_type == "Orca Mini":
+        CtransInitConfig_gpu.gpu_layers = gpu_layers
+        CtransInitConfig_cpu.gpu_layers = gpu_layers
+
        try:
            model = AutoModelForCausalLM.from_pretrained('juanjgit/orca_mini_3B-GGUF', model_type='llama', model_file='orca-mini-3b.q4_0.gguf', **asdict(CtransInitConfig_gpu()))
        except:
@@ -88,14 +91,12 @@ def load_model(model_type, CtransInitConfig_gpu=chatf.CtransInitConfig_gpu, Ctra
    def create_hf_model(model_name):
 
        from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM
-
-       # model_id = model_name
 
        if torch_device == "cuda":
            if "flan" in model_name:
-               model = AutoModelForSeq2SeqLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
+               model = AutoModelForSeq2SeqLM.from_pretrained(model_name, device_map="auto")
            else:
-               model = AutoModelForCausalLM.from_pretrained(model_name, load_in_8bit=True, device_map="auto")
+               model = AutoModelForCausalLM.from_pretrained(model_name, device_map="auto")
        else:
            if "flan" in model_name:
                model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
@@ -115,7 +116,13 @@ def load_model(model_type, CtransInitConfig_gpu=chatf.CtransInitConfig_gpu, Ctra
    print("Finished loading model: ", model_type)
    return model_type
 
-load_model(model_type, chatf.CtransInitConfig_gpu, chatf.CtransInitConfig_cpu, chatf.torch_device)
+# Both models are loaded on app initialisation so that users don't have to wait for the models to be downloaded
+model_type = "Orca Mini"
+
+load_model(model_type, chatf.gpu_layers, chatf.CtransInitConfig_gpu, chatf.CtransInitConfig_cpu, chatf.torch_device)
+
+model_type = "Flan Alpaca"
+load_model(model_type, 0, chatf.CtransInitConfig_gpu, chatf.CtransInitConfig_cpu, chatf.torch_device)
 
 def docs_to_faiss_save(docs_out:PandasDataFrame, embeddings=embeddings):
 
@@ -153,7 +160,7 @@ with block:
 
    gr.Markdown("<h1><center>Lightweight PDF / web page QA bot</center></h1>")
 
-   gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See advanced tab, temporarily disabled).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Likes and dislike responses will be saved to disk to improve the model. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
+   gr.Markdown("Chat with PDF or web page documents. The default is a small model (Flan Alpaca), that can only answer specific questions that are answered in the text. It cannot give overall impressions of, or summarise the document. The alternative (Orca Mini), can reason a little better, but is much slower (See Advanced tab).\n\nBy default the Lambeth Borough Plan '[Lambeth 2030 : Our Future, Our Lambeth](https://www.lambeth.gov.uk/better-fairer-lambeth/projects/lambeth-2030-our-future-our-lambeth)' is loaded. If you want to talk about another document or web page, please select from the second tab. If switching topic, please click the 'Clear chat' button.\n\nCaution: This is a public app. Likes and dislike responses will be saved to disk to improve the model. Please ensure that the document you upload is not sensitive is any way as other users may see it! Also, please note that LLM chatbots may give incomplete or incorrect information, so please use with care.")
 
    current_source = gr.Textbox(label="Current data source that is loaded into the app", value="Lambeth_2030-Our_Future_Our_Lambeth.pdf")
 
@@ -198,8 +205,9 @@ with block:
 
        ingest_embed_out = gr.Textbox(label="File/webpage preparation progress")
 
-   with gr.Tab("Advanced features (Currently disabled)"):
+   with gr.Tab("Advanced features"):
        model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices = ["Flan Alpaca", "Orca Mini"])
+       gpu_layer_choice = gr.Slider(label="Choose number of model layers to send to GPU (please don't change if you don't know what you're doing).", value=0, minimum=0, maximum=6, step = 1)
 
    gr.HTML(
        "<center>This app is based on the models Flan Alpaca and Orca Mini. It powered by Gradio, Transformers, Ctransformers, and Langchain.</a></center>"
@@ -207,7 +215,7 @@ with block:
 
    examples_set.change(fn=chatf.update_message, inputs=[examples_set], outputs=[message])
 
-   #model_choice.change(fn=load_model, inputs=[model_choice], outputs = [model_type_state])
+   model_choice.change(fn=load_model, inputs=[model_choice, gpu_layer_choice], outputs = [model_type_state])
 
    # Load in a pdf
    load_pdf_click = load_pdf.click(ing.parse_file, inputs=[in_pdf], outputs=[ingest_text, current_source]).\
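
For reference, the new Advanced tab wiring can be reproduced in isolation. A minimal sketch assuming gradio==3.47.1, where fake_load_model is a stand-in for the app's load_model:

import gradio as gr

def fake_load_model(model_type, gpu_layers):
    # Stand-in for app.load_model: reload the chosen model with the chosen
    # number of GPU layers and return the active model name as state.
    print(f"Loading {model_type} with {gpu_layers} GPU layers")
    return model_type

with gr.Blocks() as demo:
    model_type_state = gr.State("Flan Alpaca")
    with gr.Tab("Advanced features"):
        model_choice = gr.Radio(label="Choose a chat model", value="Flan Alpaca", choices=["Flan Alpaca", "Orca Mini"])
        gpu_layer_choice = gr.Slider(label="Model layers to send to GPU", value=0, minimum=0, maximum=6, step=1)
    # Changing the radio selection reloads the model with the current slider value.
    model_choice.change(fn=fake_load_model, inputs=[model_choice, gpu_layer_choice], outputs=[model_type_state])

demo.launch()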
chatfuncs/chatfuncs.py CHANGED
@@ -9,10 +9,9 @@ import numpy as np
 # Model packages
 import torch
 from threading import Thread
-from transformers import AutoTokenizer, pipeline, TextIteratorStreamer
+from transformers import pipeline, TextIteratorStreamer
 
 # Alternative model sources
-from ctransformers import AutoModelForCausalLM#, AutoTokenizer
 from dataclasses import asdict, dataclass
 
 # Langchain functions
@@ -69,8 +68,8 @@ kw_model = pipeline("feature-extraction", model="sentence-transformers/all-MiniL
 
 
 if torch.cuda.is_available():
-    torch_device = "cuda"
-    gpu_layers = 5
+    torch_device = "cuda"
+    gpu_layers = 6
 else:
    torch_device = "cpu"
    gpu_layers = 0
@@ -95,8 +94,6 @@ batch_size:int = 1024
 context_length:int = 4096
 sample = True
 
-# CtransGen model parameters
-gpu_layers:int = 6 #gpu_layers For serving on Huggingface set to 0 as using free CPU instance
 
 @dataclass
 class CtransInitConfig_gpu:
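
The CtransInitConfig_gpu referenced above is a dataclass whose fields are unpacked into the ctransformers load call via asdict(). A minimal sketch of that pattern, assuming only the gpu_layers and context_length fields that appear in this diff (the real class may define more):

from dataclasses import asdict, dataclass

@dataclass
class CtransInitConfig_gpu:
    # Only the fields visible in this diff; values are illustrative.
    gpu_layers: int = 6
    context_length: int = 4096

config = CtransInitConfig_gpu(gpu_layers=0)  # e.g. value chosen on the GPU layers slider
print(asdict(config))                        # {'gpu_layers': 0, 'context_length': 4096}
# app.py unpacks this dict into the model load call:
# AutoModelForCausalLM.from_pretrained(..., **asdict(config))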