plutostack committed on
Commit 1625b21 · verified · 1 Parent(s): 1c12c7c

Update app.py

Files changed (1)
  1. app.py +13 -24
app.py CHANGED
@@ -3,7 +3,6 @@ os.system("pip3 install transformers")
 os.system("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
 os.system("pip3 install tensorflow")
 os.system("pip3 install accelerate")
-os.system("pip3 install -U bitsandbytes")
 
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -12,20 +11,19 @@ import torch
 def load_model():
     model = AutoModelForCausalLM.from_pretrained(
         "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-        load_in_8bit=True,
         device_map="auto",
-        torch_dtype=torch.float16,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
         trust_remote_code=True
     )
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-        trust_remote_code=True
-    )
-
-    return model, tokenizer
+    return model
 
-model, tokenizer = load_model()
+model = load_model()
+tokenizer = AutoTokenizer.from_pretrained(
+    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+    trust_remote_code=True
+)
 
 pipe = pipeline(
     "text-generation",
     model=model,
@@ -51,24 +49,15 @@ def generate_response(request):
         return outputs[0]["generated_text"][-1]['content']
 
     except Exception as e:
-        return f"An error occurred: {str(e)}"
+        return f"Generation error: {str(e)}"
 
 demo = gr.Interface(
     fn=generate_response,
-    inputs=gr.Textbox(
-        label="Your query",
-        placeholder="Enter your question here...",
-        lines=3
-    ),
-    outputs=gr.Textbox(
-        label="Model response",
-        lines=5
-    ),
-    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
-    description="8-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model",
-    allow_flagging="never"
+    inputs=gr.Textbox(label="Your query", placeholder="Enter a question..."),
+    outputs=gr.Textbox(label="Model response"),
+    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
+    description="The model runs in float16 mode (without 8-bit quantization)"
 )
 
-# Launch the interface
 if __name__ == "__main__":
     demo.launch(share=True)
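
For context on what the new code path does end to end: the commit drops the bitsandbytes install and the load_in_8bit flag, and instead loads the checkpoint in float16 with low_cpu_mem_usage, keeping the same text-generation pipeline and Gradio interface. Below is a minimal, self-contained sketch of that resulting path. The chat-style messages format and the max_new_tokens value in generate_response are assumptions for illustration; the diff only shows how the pipeline output is indexed, not how the prompt is built.

# Minimal sketch of the FP16 path this commit switches to (assumptions noted inline).
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

MODEL_ID = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

def load_model():
    # float16 weights instead of bitsandbytes 8-bit quantization;
    # low_cpu_mem_usage streams weights in to reduce peak RAM while loading.
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

model = load_model()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_response(request):
    try:
        # Chat-style input; the exact prompt construction in app.py is not shown
        # in the diff, so this messages list is an assumption for illustration.
        messages = [{"role": "user", "content": request}]
        outputs = pipe(messages, max_new_tokens=256)
        # Matches the indexing shown in the diff: last message of the generated chat.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        return f"Generation error: {e}"

demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your query", placeholder="Enter a question..."),
    outputs=gr.Textbox(label="Model response"),
    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
)

if __name__ == "__main__":
    demo.launch(share=True)

Note that the Space installs the CPU-only torch wheel, so this float16 path runs on CPU unless the hardware provides a GPU; if fp16 ops turn out to be slow or unsupported there, falling back to torch.float32 is the usual workaround, though that is outside what this commit changes.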