import os

# Install dependencies at startup (e.g. on a fresh Colab/Spaces environment).
# Note: the CPU-only torch wheel is installed; float16 inference on CPU is
# slow and some ops may not support it. TensorFlow is not used by this script.
os.system("pip3 install transformers")
os.system("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
os.system("pip3 install tensorflow")
os.system("pip3 install accelerate")

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


def load_model():
    # Load the model in float16; device_map="auto" lets accelerate place
    # the layers on the available devices.
    model = AutoModelForCausalLM.from_pretrained(
        "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )
    return model


model = load_model()

tokenizer = AutoTokenizer.from_pretrained(
    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
    trust_remote_code=True,
)

# The model is already dispatched via device_map="auto", so the pipeline
# must not receive device_map again (passing it for an already instantiated
# model raises an error).
pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)


def generate_response(request):
    try:
        messages = [
            {"role": "user", "content": str(request)},
        ]
        outputs = pipe(
            messages,
            max_new_tokens=512,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.1,
        )
        # With chat-style input the pipeline returns the whole conversation;
        # the last message is the assistant's reply.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        return f"Generation error: {e}"


demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your prompt", placeholder="Enter a question..."),
    outputs=gr.Textbox(label="Model response"),
    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
    description="The model runs in float16 (without 8-bit quantization)",
)

if __name__ == "__main__":
    demo.launch(share=True)
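
# --- Optional: 8-bit quantized loading (a sketch, not used above) ---
# The interface description notes the model runs WITHOUT 8-bit quantization.
# If GPU memory is tight, the same checkpoint could plausibly be loaded in
# 8 bits via bitsandbytes instead of float16. This is an assumption, not part
# of the original app: it requires a CUDA GPU and `pip3 install bitsandbytes`,
# and `load_model_8bit` is a hypothetical drop-in replacement for load_model().
# Kept commented out so the FP16 script above runs unchanged.
#
# from transformers import BitsAndBytesConfig
#
# def load_model_8bit():
#     quant_config = BitsAndBytesConfig(load_in_8bit=True)
#     return AutoModelForCausalLM.from_pretrained(
#         "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
#         quantization_config=quant_config,
#         device_map="auto",
#         trust_remote_code=True,
#     )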