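# Gradio demo: chat with a 4-bit quantized nvidia/Llama-3.1-Nemotron-Nano-8B-v1.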
import os

# Install dependencies at startup (on a Hugging Face Space, requirements.txt
# is the more idiomatic place for these). bitsandbytes 4-bit quantization
# needs a CUDA build of torch, so the default GPU wheels are used.
os.system("pip3 install transformers")
os.system("pip3 install torch torchvision torchaudio")
os.system("pip3 install bitsandbytes accelerate")
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
# 4-bit NF4 quantization: weights are stored as 4-bit NormalFloat and computed
# in fp16; double quantization also compresses the quantization constants.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
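# Rough footprint: 8B parameters at 4 bits is about 4 GB of weights; with
# activations and KV cache, expect roughly 5-6 GB of VRAM in practice.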
# Load the model with the quantization config; device_map="auto" lets
# accelerate place the quantized weights on the available GPU(s).
model = AutoModelForCausalLM.from_pretrained(
    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-Nano-8B-v1")
# Wrap the model in a text-generation pipeline. device_map is not passed
# again: the model is already dispatched, and the pipeline reuses its placement.
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)
def textgen(request):
    # Single-turn chat; the pipeline applies the model's chat template.
    messages = [
        {"role": "user", "content": str(request)},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # With chat-format input, "generated_text" holds the whole conversation;
    # the last message is the assistant's reply.
    return outputs[0]["generated_text"][-1]["content"]
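# A minimal sketch of a variant that adds a system prompt. The model card for
# this checkpoint describes toggling reasoning with a system message like
# "detailed thinking on" / "detailed thinking off"; treat the exact wording as
# an assumption and verify it against the card.
def textgen_with_thinking(request, thinking=False):
    messages = [
        {"role": "system", "content": "detailed thinking on" if thinking else "detailed thinking off"},
        {"role": "user", "content": str(request)},
    ]
    outputs = pipe(messages, max_new_tokens=512, do_sample=True, temperature=0.7, top_p=0.9)
    return outputs[0]["generated_text"][-1]["content"]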
demo = gr.Interface(
    fn=textgen,
    inputs=gr.Textbox(label="Your prompt", placeholder="Type your question here..."),
    outputs=gr.Textbox(label="Model response"),
    title="Chat with Llama-3.1-Nemotron-Nano (4-bit quantized)",
    description="4-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model",
)
# share=True requests a public gradio.live link; a Hugging Face Space already
# serves the app publicly, so the flag is unnecessary there.
demo.launch(share=True)