Lihuchen committed
Commit f2ff742
1 Parent(s): d94406a

Upload 3 files

Files changed (3)
  1. app.py +1 -1
  2. cpu_llama_generate.py +13 -0
  3. llama_generate.py +0 -5
app.py CHANGED
@@ -1,5 +1,5 @@
  import gradio as gr
- from llama_generate import run
+ from cpu_llama_generate import run
  
  
  def greet(query):
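
For orientation, a minimal sketch of app.py after this change. Only the import swap above is part of the commit; the greet body and the gr.Interface wiring are assumptions about the surrounding file, which is not shown in this hunk.

import gradio as gr
from cpu_llama_generate import run


def greet(query):
    # Delegate generation to the CPU-only model loaded in cpu_llama_generate.py
    return run(query)


# Hypothetical wiring; the actual Interface setup lies outside this hunk
demo = gr.Interface(fn=greet, inputs="text", outputs="text")
demo.launch()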
cpu_llama_generate.py ADDED
@@ -0,0 +1,13 @@
+ from ctransformers import AutoModelForCausalLM
+ 
+ # device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # the device to load the model onto
+ device = 'cpu'
+ 
+ 
+ # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+ llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.Q4_K_M.gguf", model_type="llama", gpu_layers=0)
+ 
+ 
+ def run(query):
+     return llm(query)
+ 
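
A quick usage sketch for the new module, assuming ctransformers' default generation settings. The keyword arguments in the second call (max_new_tokens, temperature) are illustrative ctransformers generation parameters and are not set anywhere in this commit.

from cpu_llama_generate import run, llm

# Call through the helper added in this commit (default generation settings)
print(run("AI is going to"))

# Or call the underlying ctransformers model with explicit generation parameters
print(llm("AI is going to", max_new_tokens=128, temperature=0.7))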
llama_generate.py CHANGED
@@ -11,12 +11,7 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
  trust_remote_code=False,
  revision="main")
  
- from ctransformers import AutoModelForCausalLM
  
- # Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
- llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)
- 
- print(llm("AI is going to"))
  
  
  tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)