Spaces:
Running
Running
File size: 2,010 Bytes
f5d9b66 55537e2 9cf3054 1c12c7c 439d32c 9087d4e 7455e7d 439d32c 7455e7d 9087d4e 7455e7d 439d32c 7455e7d 439d32c 9087d4e 7455e7d 439d32c 7455e7d 9087d4e 439d32c 7455e7d 439d32c 7455e7d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 |
import os
import subprocess
import sys


def _pip_install(*pip_args):
    """Run ``pip install`` for the interpreter that executes this script.

    The command is built as an argument list and run without a shell, using
    ``sys.executable -m pip`` so the packages land in the same environment
    the imports below are resolved from.

    Args:
        *pip_args: Arguments placed after ``pip install`` (package names
            and options such as ``-U`` or ``--index-url``).

    Returns:
        True if pip exited with status 0, False otherwise. A failure is
        reported on stderr but never raised, so start-up stays best-effort.
    """
    command = [sys.executable, "-m", "pip", "install", *pip_args]
    completed = subprocess.run(command, check=False)
    if completed.returncode != 0:
        print(
            f"pip install {' '.join(pip_args)} failed "
            f"(exit code {completed.returncode})",
            file=sys.stderr,
        )
    return completed.returncode == 0


# Install the runtime dependencies at start-up. These calls must stay above
# the third-party imports that follow, which rely on the installed packages.
_pip_install("transformers")
_pip_install(
    "torch",
    "torchvision",
    "torchaudio",
    "--index-url",
    "https://download.pytorch.org/whl/cpu",
)
_pip_install("tensorflow")
_pip_install("accelerate")
_pip_install("-U", "bitsandbytes")

import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
import torch
def load_model(model_id="nvidia/Llama-3.1-Nemotron-Nano-8B-v1"):
    """Load the causal language model in 8-bit together with its tokenizer.

    Args:
        model_id: Identifier passed to both ``from_pretrained`` calls.
            Defaults to the NVIDIA Llama-3.1-Nemotron-Nano-8B-v1 checkpoint,
            so existing ``load_model()`` callers are unaffected.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    # Local import: keeps this function self-contained without touching the
    # file-level import list.
    from transformers import BitsAndBytesConfig

    # 8-bit loading is requested through an explicit quantization config;
    # passing ``load_in_8bit=True`` straight to ``from_pretrained`` is the
    # deprecated spelling of the same request.
    quantization = BitsAndBytesConfig(load_in_8bit=True)

    # NOTE(review): trust_remote_code=True lets code shipped in the model
    # repository run at load time; kept exactly as the original had it.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        quantization_config=quantization,
        device_map="auto",
        torch_dtype=torch.float16,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_id,
        trust_remote_code=True,
    )
    return model, tokenizer
# Load the model and tokenizer once at import time, then wrap them in a
# text-generation pipeline that the request handler reuses for every call.
model, tokenizer = load_model()
pipe = pipeline(
    task="text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
)
def generate_response(request):
    """Generate a single-turn chat reply for ``request``.

    Args:
        request: The user's prompt. It is converted with ``str()`` and sent
            as the content of one ``"user"`` message.

    Returns:
        The ``content`` of the last message in the pipeline's
        ``generated_text``. If ``request`` is ``None``, empty, or only
        whitespace, a short message asking for input is returned without
        calling the model. If generation raises, the error text is returned
        prefixed with "Произошла ошибка:" instead of propagating.
    """
    # Guard: do not spend a generation call on an empty prompt, and do not
    # turn a missing value into the literal prompt "None".
    if request is None or not str(request).strip():
        return "Пожалуйста, введите запрос."

    messages = [
        {"role": "user", "content": str(request)},
    ]
    try:
        outputs = pipe(
            messages,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        # With chat-style input the generated_text is a message list; the
        # last entry is read as the model's reply.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        # UI boundary: show the failure in the output box rather than raise.
        return f"Произошла ошибка: {str(e)}"
# Input component: a three-line textbox for the user's prompt.
_prompt_box = gr.Textbox(
    lines=3,
    label="Ваш запрос",
    placeholder="Введите ваш вопрос здесь...",
)

# Output component: a five-line textbox that shows the model's reply.
_answer_box = gr.Textbox(
    lines=5,
    label="Ответ модели",
)

# Wire the handler and both components into one Gradio interface.
demo = gr.Interface(
    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
    description="8-битная квантованная версия модели NVIDIA Llama-3.1-Nemotron-Nano-8B",
    fn=generate_response,
    inputs=_prompt_box,
    outputs=_answer_box,
    allow_flagging="never",
)
# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    # share=True asks Gradio to also create a public share link.
    demo.launch(share=True)