Upload 2 files

- app.py +2 -2
- llama_generate.py +9 -3
app.py
CHANGED
@@ -3,7 +3,7 @@ from llama_generate import run
 
 
 def greet(query):
-    results = run(query)
+    results = run(query, 5)
     return results
 
 
@@ -12,5 +12,5 @@ sample_list = [
     "Who is Gaël Varoquaux?"
 ]
 
-iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=
+iface = gr.Interface(fn=greet, inputs="text", outputs="text", examples=sample_list, cache_examples=True)
 iface.launch()
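Note on the app.py change: greet now passes an explicit result count to run, and cache_examples=True tells Gradio to evaluate each listed example once at startup and replay the stored output when a user clicks it. A minimal runnable sketch of that wiring, with a stub standing in for the Space's real run function (the stub body and the reading of the second argument as a result count are assumptions, not the actual llama_generate code):

import gradio as gr

def run(query, sample_size):
    # Stub in place of llama_generate.run: the real function queries the Llama 2
    # model; here we only echo the inputs so the interface can be launched.
    return f"{query} (top {sample_size} results)"

def greet(query):
    results = run(query, 5)
    return results

sample_list = ["Who is Gaël Varoquaux?"]

# cache_examples=True pre-computes each example output at launch time and
# serves the cached result instead of re-running the model on every click.
iface = gr.Interface(fn=greet, inputs="text", outputs="text",
                     examples=sample_list, cache_examples=True)

if __name__ == "__main__":
    iface.launch()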
llama_generate.py
CHANGED
@@ -2,7 +2,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer
 import torch
 from nltk.tokenize import sent_tokenize
 
-torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # the device to load the model onto
+device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')  # the device to load the model onto
 model_name_or_path = "TheBloke/Llama-2-7b-Chat-GPTQ"
 
 model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
@@ -11,6 +11,12 @@ model = AutoModelForCausalLM.from_pretrained(model_name_or_path,
                                              trust_remote_code=False,
                                              revision="main")
 
+from ctransformers import AutoModelForCausalLM
+
+# Set gpu_layers to the number of layers to offload to GPU. Set to 0 if no GPU acceleration is available on your system.
+llm = AutoModelForCausalLM.from_pretrained("TheBloke/Llama-2-7b-Chat-GGUF", model_file="llama-2-7b-chat.q4_K_M.gguf", model_type="llama", gpu_layers=50)
+
+print(llm("AI is going to"))
 
 
 tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, use_fast=True)
@@ -41,7 +47,7 @@ def single_generate(query):
     model_inputs = encodeds.to(device)
     model.to(device)
 
-    generated_ids = model.generate(model_inputs, max_new_tokens=
+    generated_ids = model.generate(model_inputs, max_new_tokens=150, do_sample=True, temperature=1.0)
     decoded = tokenizer.batch_decode(generated_ids)
     results = list()
     for index, result in enumerate(decoded):
@@ -158,5 +164,5 @@ if __name__ == '__main__':
     # print(result)
     # result = """
 
-    answer = run(query='
+    answer = run(query='Tell me something about Gaël Varoquaux, e.g., birth date and place and short bio ', sample_size=10)
     print(answer)
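Note on the llama_generate.py change: the device returned by torch.device is now actually assigned to the variable used in single_generate, generation gains explicit sampling settings (max_new_tokens=150, do_sample=True, temperature=1.0), and a quantized GGUF build of Llama 2 is loaded through ctransformers alongside the existing GPTQ model. A minimal CPU-only sketch of that ctransformers call, following the added gpu_layers comment (the repo and file names come from the diff; gpu_layers=0, the prompt, and the max_new_tokens value are illustrative assumptions):

from ctransformers import AutoModelForCausalLM

# Load the 4-bit GGUF checkpoint named in the diff; gpu_layers=0 keeps every
# layer on the CPU, the fallback the added comment describes for systems
# without GPU acceleration.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Llama-2-7b-Chat-GGUF",
    model_file="llama-2-7b-chat.q4_K_M.gguf",
    model_type="llama",
    gpu_layers=0,
)

# Generation is a plain call on the model; max_new_tokens bounds the reply,
# mirroring the max_new_tokens=150 now passed to model.generate() above.
print(llm("AI is going to", max_new_tokens=150))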