Lihuchen committed
Commit d94406a
1 Parent(s): a10a2a4

Upload 2 files

Files changed (2):
  1. app.py +2 -2
  2. llama_generate.py +9 -3
app.py CHANGED
@@ -3,7 +3,7 @@ from llama_generate import run
 
 
 def greet(query):
-    results = run(query)
+    results = run(query, 5)
     return results
 
 
@@ -12,5 +12,5 @@ sample_list = [
     "Who is Gaël Varoquaux?"
 ]
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=False)
+iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=True)
 iface.launch()
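
For reference, app.py after this commit is roughly the short script below. This is a sketch reconstructed from the two hunks: the gradio import line and any example prompts beyond the one shown above are assumptions, since the diff does not include them.

import gradio as gr
from llama_generate import run


def greet(query):
    # the second positional argument is the sample size forwarded to run()
    results = run(query, 5)
    return results


sample_list = [
    # the diff only shows this entry; the file may list further example queries
    "Who is Gaël Varoquaux?"
]

# cache_examples=True makes Gradio run greet() on each example once at startup and reuse the cached outputs
iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=True)
iface.launch()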
llama_generate.py CHANGED
@@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from nltk.tokenize import sent_tokenize
 
-torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # the device to load the model onto
 model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
 
 model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -11,6 +11,12 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              trust_remote_code=False,
                                              revision="main")
 
+from ctransformers import AutoModelForCausalLM
+
+# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)
+
+print(llm("AI is going to"))
 
 
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
@@ -41,7 +47,7 @@ def single_generate(query):
     model_inputs = encodeds.to(device)
     model.to(device)
 
-    generated_ids = model.generate(model_inputs, max_new_tokens=100, do_sample=True, temperature=1.0)
+    generated_ids = model.generate(model_inputs, max_new_tokens=150, do_sample=True, temperature=1.0)
     decoded = tokenizer.batch_decode(generated_ids)
     results = list()
     for index, result in enumerate(decoded):
@@ -158,5 +164,5 @@ if __name__ == '__main__':
     # print(result)
     # result = """
 
-    answer = run(query='WHo is Lihu Chen?', sample_size=10)
+    answer = run(query='Tell me something about Gaël Varoquaux, e.g., birth date and place and short bio ', sample_size=10)
     print(answer)
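
A note on the first hunk above: the pre-commit line constructed a torch.device but never assigned it, so the later encodeds.to(device) and model.to(device) calls in single_generate would raise a NameError unless device was defined elsewhere. A minimal sketch of the corrected pattern (the example tensor is illustrative, not from the commit):

import torch

# bind the selected device so later .to(device) calls can reference it
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

x = torch.ones(2, 3)   # illustrative tensor
x = x.to(device)       # lands on the GPU when CUDA is available, otherwise stays on CPU
print(x.device)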
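
One caveat on the newly added GGUF block: from ctransformers import AutoModelForCausalLM rebinds the name already imported from transformers at the top of the file. The GPTQ model is loaded before that import, so the committed code still runs, but any later use of AutoModelForCausalLM would resolve to the ctransformers class. Below is a sketch of an aliased import that avoids the collision; the alias name CTAutoModelForCausalLM is introduced here for illustration and is not part of the commit.

from transformers import AutoModelForCausalLM, AutoTokenizer
# alias the ctransformers loader so it does not shadow the transformers class
from ctransformers import AutoModelForCausalLM as CTAutoModelForCausalLM

# GGUF chat model via ctransformers; set gpu_layers=0 if no GPU acceleration is available
llm = CTAutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7b-Chat-GGUF",
    model_file="llama-2-7b-chat.q4_K_M.gguf",
    model_type="llama",
    gpu_layers=50,
)
print(llm("AI is going to"))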