Update app.py

app.py CHANGED
@@ -3,7 +3,6 @@ os.system("pip3 install transformers")
 os.system("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
 os.system("pip3 install tensorflow")
 os.system("pip3 install accelerate")
-os.system("pip3 install -U bitsandbytes")
 
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -12,20 +11,19 @@ import torch
 def load_model():
     model = AutoModelForCausalLM.from_pretrained(
         "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-        load_in_8bit=True,
         device_map="auto",
-        torch_dtype=torch.float16,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
         trust_remote_code=True
     )
+    return model
 
-
-
-
-
-
-    return model, tokenizer
+model = load_model()
+tokenizer = AutoTokenizer.from_pretrained(
+    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+    trust_remote_code=True
+)
 
-model, tokenizer = load_model()
 pipe = pipeline(
     "text-generation",
     model=model,
@@ -51,24 +49,15 @@ def generate_response(request):
         return outputs[0]["generated_text"][-1]['content']
 
     except Exception as e:
-        return f"
+        return f"Generation error: {str(e)}"
 
 demo = gr.Interface(
     fn=generate_response,
-    inputs=gr.Textbox(
-
-
-
-    ),
-    outputs=gr.Textbox(
-        label="Model response",
-        lines=5
-    ),
-    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
-    description="8-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model",
-    allow_flagging="never"
+    inputs=gr.Textbox(label="Your prompt", placeholder="Enter a question..."),
+    outputs=gr.Textbox(label="Model response"),
+    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
+    description="The model runs in float16 mode (without 8-bit quantization)"
 )
 
-# Launch the interface
 if __name__ == "__main__":
     demo.launch(share=True)
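A minimal sketch of the two loading strategies this commit switches between, assuming the model id shown in the diff (the helper names load_fp16, load_8bit, and MODEL_ID are illustrative, not from the commit). The quantization_config=BitsAndBytesConfig(load_in_8bit=True) form is the current transformers API for what the removed bare load_in_8bit=True kwarg did; bitsandbytes 8-bit loading generally requires a CUDA GPU, while the float16 path also works with the CPU-only torch wheels installed at the top of app.py.

# Sketch of the two loading strategies the commit switches between.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

MODEL_ID = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"  # model id from the diff

def load_fp16():
    # New path from the diff: plain float16 weights, no bitsandbytes needed,
    # compatible with the CPU-only torch wheels installed in app.py.
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

def load_8bit():
    # Old path from the diff, in the current API: 8-bit quantization via
    # bitsandbytes, which generally requires a CUDA GPU and so fails on a
    # CPU-only Space.
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        trust_remote_code=True,
    )

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
model = load_fp16()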
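The outputs[0]["generated_text"][-1]['content'] indexing inside generate_response only yields a dict with a 'content' key when the pipeline is called with chat-style messages. The actual call sits outside the shown hunks, so the following usage sketch is an assumption:

# Assumed usage, inferred from the [-1]['content'] indexing in the diff;
# the real pipe(...) call in generate_response is not part of the shown hunks.
messages = [{"role": "user", "content": "Hello!"}]
outputs = pipe(messages, max_new_tokens=256)
print(outputs[0]["generated_text"][-1]["content"])  # assistant reply text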