PlutoAI / app.py
plutostack's picture
Update app.py
439d32c verified
raw
history blame
1.61 kB
import os
# Bootstrap the runtime dependencies before the third-party imports below.
# Installing through ``sys.executable -m pip`` targets the interpreter that is
# actually running this script (a bare ``pip3`` found on PATH may belong to a
# different Python), and passing an argument list avoids going through a shell.
import subprocess
import sys

_PIP_REQUIREMENTS = (
    ("transformers",),
    # The torch packages are pulled from the PyTorch CPU wheel index.
    (
        "torch",
        "torchvision",
        "torchaudio",
        "--index-url",
        "https://download.pytorch.org/whl/cpu",
    ),
    ("tensorflow",),
    ("bitsandbytes", "accelerate"),
)

for _pip_args in _PIP_REQUIREMENTS:
    _completed = subprocess.run(
        [sys.executable, "-m", "pip", "install", *_pip_args],
        check=False,
    )
    # The previous os.system() calls discarded the exit status. Stay
    # best-effort (do not abort start-up), but make a failed install visible
    # so a later ImportError can be traced back to it.
    if _completed.returncode != 0:
        _described = " ".join(_pip_args)
        print(
            f"WARNING: pip install {_described} exited with status "
            f"{_completed.returncode}",
            file=sys.stderr,
        )
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import torch
# Hub identifier used for both the model and the tokenizer load.
_MODEL_ID = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

# Quantization settings: 4-bit weights, "nf4" quant type, double quantization
# enabled, float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

# The model is loaded with the quantization config above; device placement is
# delegated to device_map="auto". trust_remote_code=True is passed for the
# model only -- the tokenizer load below uses the library defaults.
model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID,
    trust_remote_code=True,
    device_map="auto",
    quantization_config=bnb_config,
)

tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# Text-generation pipeline built from the already-loaded model and tokenizer;
# this is the object textgen() calls.
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    device_map="auto",
)
def textgen(request):
    """Generate a chat reply for a single user request.

    The request is wrapped as a one-turn conversation (a single ``user``
    message) and passed to the module-level ``pipe`` with sampling enabled
    (``max_new_tokens=512``, ``temperature=0.7``, ``top_p=0.9``).

    Args:
        request: The user's prompt. Non-string values are converted with
            ``str``.

    Returns:
        The ``content`` field of the last message in the first output's
        ``generated_text``. An empty string is returned, without calling
        the pipeline, when ``request`` is ``None`` or contains only
        whitespace.
    """
    # Skip generation for a missing or blank prompt: str(None) would otherwise
    # be sent to the model as the literal text "None", and an empty prompt
    # would still trigger a generation of up to 512 new tokens.
    prompt = "" if request is None else str(request)
    if not prompt.strip():
        return ""

    messages = [
        {"role": "user", "content": prompt},
    ]
    outputs = pipe(
        messages,
        max_new_tokens=512,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
    )
    # generated_text is indexed as a list of message dicts; the last entry is
    # taken as the reply to return.
    return outputs[0]["generated_text"][-1]["content"]
# Gradio UI: one input textbox feeding textgen(), one output textbox showing
# the returned reply. The user-facing labels and description are in Russian.
_request_box = gr.Textbox(
    label="Ваш запрос",
    placeholder="Введите ваш вопрос здесь...",
)
_reply_box = gr.Textbox(label="Ответ модели")

demo = gr.Interface(
    fn=textgen,
    inputs=_request_box,
    outputs=_reply_box,
    title="Chat with Llama-3.1-Nemotron-Nano (4-bit quantized)",
    description="Квантованная 4-bit версия модели NVIDIA Llama-3.1-Nemotron-Nano-8B",
)

# Start the app at import/run time, requesting a share link.
demo.launch(share=True)