# Hugging Face Spaces page header captured with this source (status: Running).
import os
import subprocess
import sys


def _pip_install(*args):
    """Best-effort ``pip install`` into the interpreter running this script.

    Args:
        *args: Arguments appended after ``pip install`` (package names and
            options), each as a separate string.

    Returns:
        The exit code of the pip process.
    """
    # ``sys.executable -m pip`` targets the running interpreter, whereas a bare
    # ``pip3`` is whatever happens to be first on PATH. Passing a list avoids
    # going through a shell.
    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", *args],
        check=False,
    )
    if result.returncode != 0:
        # Keep going on failure (the package may already be present), but make
        # the failure visible instead of ignoring it.
        print(
            f"pip install {' '.join(args)} exited with code {result.returncode}",
            file=sys.stderr,
        )
    return result.returncode


# The installs run before the third-party imports below, which is why those
# imports are not at the very top of the file.
_pip_install("transformers")
_pip_install(
    "torch",
    "torchvision",
    "torchaudio",
    "--index-url",
    "https://download.pytorch.org/whl/cpu",
)
_pip_install("tensorflow")
_pip_install("accelerate")

import gradio as gr  # noqa: E402
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline  # noqa: E402
import torch  # noqa: E402
def load_model():
    """Load the Nemotron Nano 8B causal language model with float16 weights.

    Returns:
        The object returned by ``AutoModelForCausalLM.from_pretrained`` for
        the ``nvidia/Llama-3.1-Nemotron-Nano-8B-v1`` checkpoint.
    """
    checkpoint = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
    # Loading options gathered in one place so they are easy to audit.
    load_options = {
        "device_map": "auto",
        "torch_dtype": torch.float16,
        "low_cpu_mem_usage": True,
        "trust_remote_code": True,
    }
    return AutoModelForCausalLM.from_pretrained(checkpoint, **load_options)
# Module-level objects, built once at import time; ``pipe`` is what
# generate_response() calls for every request.
_CHECKPOINT = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

model = load_model()
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT, trust_remote_code=True)

_pipeline_options = {
    "model": model,
    "tokenizer": tokenizer,
    "device_map": "auto",
}
pipe = pipeline("text-generation", **_pipeline_options)
def generate_response(request):
    """Generate a single-turn chat reply for the text submitted in the UI.

    Args:
        request: The user's prompt. It is converted with ``str()`` and sent
            as the only message, with role ``"user"``.

    Returns:
        The ``content`` of the last message in the pipeline's
        ``generated_text``. If the prompt is ``None``, empty, or only
        whitespace, a short hint is returned without calling the model. If
        anything raises, a string starting with ``"Ошибка генерации: "`` and
        followed by the exception text is returned.
    """
    import logging  # local import keeps this function self-contained

    empty_hint = "Пожалуйста, введите запрос."
    try:
        # Avoid sending the literal text "None" or a blank prompt to the model.
        if request is None:
            return empty_hint
        text = str(request)
        if not text.strip():
            return empty_hint

        messages = [
            {"role": "user", "content": text},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        # With chat-style input, generated_text is the message list; the last
        # entry is the newly generated reply.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        # UI boundary: report the failure to the user as text, but keep the
        # traceback in the server log so it can be diagnosed.
        logging.getLogger(__name__).exception("Text generation failed")
        return f"Ошибка генерации: {e}"
# Gradio UI: one prompt textbox wired to generate_response, one textbox for
# the reply.
_prompt_box = gr.Textbox(label="Ваш запрос", placeholder="Введите вопрос...")
_answer_box = gr.Textbox(label="Ответ модели")

_interface_options = {
    "fn": generate_response,
    "inputs": _prompt_box,
    "outputs": _answer_box,
    "title": "Chat with Llama-3.1-Nemotron-Nano (FP16)",
    "description": "Модель работает в режиме float16 (без 8-битного квантования)",
}
demo = gr.Interface(**_interface_options)

if __name__ == "__main__":
    # Only start the web server when the file is run as a script.
    demo.launch(share=True)