Spaces:
Running
Running
import logging
import os
import subprocess
import sys
# Runtime dependencies installed at startup, before the third-party imports
# that follow. Each entry is the argument list handed to ``pip install``;
# the order matches the original install sequence.
_PIP_REQUIREMENTS = (
    ("transformers",),
    (
        "torch",
        "torchvision",
        "torchaudio",
        "--index-url",
        "https://download.pytorch.org/whl/cpu",
    ),
    ("tensorflow",),
    ("accelerate",),
    ("-U", "bitsandbytes"),
)


def _pip_install(*pip_args):
    """Run ``pip install`` with ``pip_args`` for the running interpreter.

    pip is invoked as ``<sys.executable> -m pip`` so packages land in the
    environment that is executing this script, and the command is passed as
    an argument list so no shell is involved.

    Installation stays best-effort: a failure is reported on stderr and the
    script continues, so an environment that already has the packages still
    works when pip cannot run.

    Args:
        *pip_args: Arguments appended after ``pip install``.

    Returns:
        True if pip exited with status 0, False otherwise.
    """
    command = [sys.executable, "-m", "pip", "install", *pip_args]
    try:
        exit_code = subprocess.call(command)
    except OSError as exc:
        # The interpreter itself could not be spawned.
        print(f"warning: could not run {command!r}: {exc}", file=sys.stderr)
        return False
    if exit_code != 0:
        print(
            f"warning: {command!r} exited with status {exit_code}",
            file=sys.stderr,
        )
        return False
    return True


for _requirement in _PIP_REQUIREMENTS:
    _pip_install(*_requirement)
import gradio as gr | |
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline | |
import torch | |
def load_model():
    """Load the 8-bit quantized Nemotron Nano model and its tokenizer.

    Both objects are loaded from the ``nvidia/Llama-3.1-Nemotron-Nano-8B-v1``
    checkpoint with ``trust_remote_code=True``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Imported here so the function is self-contained with respect to the
    # module-level ``from transformers import ...`` line.
    from transformers import BitsAndBytesConfig

    checkpoint = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        # Passing ``load_in_8bit=True`` straight to ``from_pretrained`` is
        # deprecated; the quantization settings go through a config object.
        quantization_config=BitsAndBytesConfig(load_in_8bit=True),
        device_map="auto",
        torch_dtype=torch.float16,
        # NOTE(review): this permits code shipped in the model repository to
        # run locally; confirm the checkpoint actually requires it.
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        trust_remote_code=True,
    )
    return model, tokenizer
# Load the model and tokenizer once, at import time; both remain available
# as module-level names.
model, tokenizer = load_model()

# Text-generation pipeline that generate_response calls for every request.
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
)
def generate_response(request):
    """Generate a single-turn chat reply for ``request``.

    The request is wrapped in a one-message conversation with the ``user``
    role and passed to the module-level ``pipe``.

    Args:
        request: The user's prompt; it is converted with ``str()`` before use.

    Returns:
        The ``content`` of the last message in the first generated
        conversation. If anything raises, the exception is logged and a
        string of the form ``"Произошла ошибка: <exception text>"`` is
        returned instead, so this function never propagates an exception.
    """
    try:
        messages = [{"role": "user", "content": str(request)}]
        outputs = pipe(
            messages,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        # The last entry of the generated conversation is the reply.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        # UI boundary: the caller only ever gets a string back, so record the
        # full traceback here; otherwise the failure leaves no server-side
        # trace at all.
        logging.getLogger(__name__).exception("Text generation failed")
        return f"Произошла ошибка: {str(e)}"
# Three-line input box for the user's prompt.
_request_box = gr.Textbox(
    lines=3,
    placeholder="Введите ваш вопрос здесь...",
    label="Ваш запрос",
)

# Five-line output box that shows the value returned by generate_response.
_answer_box = gr.Textbox(lines=5, label="Ответ модели")

# Gradio interface wiring the input box to generate_response and its result
# to the output box; flagging is turned off.
demo = gr.Interface(
    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
    description="8-битная квантованная версия модели NVIDIA Llama-3.1-Nemotron-Nano-8B",
    fn=generate_response,
    inputs=_request_box,
    outputs=_answer_box,
    allow_flagging="never",
)
# Launch the interface, but only when this file is executed as a script
# rather than imported.
if __name__ == "__main__":
    demo.launch(share=True)