plutostack committed on
Commit 1625b21 · verified · 1 Parent(s): 1c12c7c

Update app.py

Files changed (1)
  1. app.py +13 -24
app.py CHANGED
@@ -3,7 +3,6 @@ os.system("pip3 install transformers")
 os.system("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
 os.system("pip3 install tensorflow")
 os.system("pip3 install accelerate")
-os.system("pip3 install -U bitsandbytes")
 
 import gradio as gr
 from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
@@ -12,20 +11,19 @@ import torch
 def load_model():
     model = AutoModelForCausalLM.from_pretrained(
         "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-        load_in_8bit=True,
         device_map="auto",
-        torch_dtype=torch.float16,
+        torch_dtype=torch.float16,
+        low_cpu_mem_usage=True,
         trust_remote_code=True
     )
-
-    tokenizer = AutoTokenizer.from_pretrained(
-        "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-        trust_remote_code=True
-    )
-
-    return model, tokenizer
+    return model
 
-model, tokenizer = load_model()
+model = load_model()
+tokenizer = AutoTokenizer.from_pretrained(
+    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+    trust_remote_code=True
+)
 
 pipe = pipeline(
     "text-generation",
     model=model,
@@ -51,24 +49,15 @@ def generate_response(request):
         return outputs[0]["generated_text"][-1]['content']
 
     except Exception as e:
-        return f"An error occurred: {str(e)}"
+        return f"Generation error: {str(e)}"
 
 demo = gr.Interface(
     fn=generate_response,
-    inputs=gr.Textbox(
-        label="Your query",
-        placeholder="Enter your question here...",
-        lines=3
-    ),
-    outputs=gr.Textbox(
-        label="Model response",
-        lines=5
-    ),
-    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
-    description="8-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model",
-    allow_flagging="never"
+    inputs=gr.Textbox(label="Your query", placeholder="Enter a question..."),
+    outputs=gr.Textbox(label="Model response"),
+    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
+    description="The model runs in float16 mode (without 8-bit quantization)"
 )
 
-# Launch the interface
 if __name__ == "__main__":
     demo.launch(share=True)
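
For context on what the new code path does end to end: the commit drops the bitsandbytes install and the load_in_8bit flag, and instead loads the checkpoint in float16 with low_cpu_mem_usage, keeping the same text-generation pipeline and Gradio interface. Below is a minimal, self-contained sketch of that resulting path. The chat-style messages format and the max_new_tokens value in generate_response are assumptions for illustration; the diff only shows how the pipeline output is indexed, not how the prompt is built.

# Minimal sketch of the FP16 path this commit switches to (assumptions noted inline).
import torch
import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

MODEL_ID = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

def load_model():
    # float16 weights instead of bitsandbytes 8-bit quantization;
    # low_cpu_mem_usage streams weights in to reduce peak RAM while loading.
    return AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        device_map="auto",
        torch_dtype=torch.float16,
        low_cpu_mem_usage=True,
        trust_remote_code=True,
    )

model = load_model()
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

pipe = pipeline("text-generation", model=model, tokenizer=tokenizer)

def generate_response(request):
    try:
        # Chat-style input; the exact prompt construction in app.py is not shown
        # in the diff, so this messages list is an assumption for illustration.
        messages = [{"role": "user", "content": request}]
        outputs = pipe(messages, max_new_tokens=256)
        # Matches the indexing shown in the diff: last message of the generated chat.
        return outputs[0]["generated_text"][-1]["content"]
    except Exception as e:
        return f"Generation error: {e}"

demo = gr.Interface(
    fn=generate_response,
    inputs=gr.Textbox(label="Your query", placeholder="Enter a question..."),
    outputs=gr.Textbox(label="Model response"),
    title="Chat with Llama-3.1-Nemotron-Nano (FP16)",
)

if __name__ == "__main__":
    demo.launch(share=True)

Note that the Space installs the CPU-only torch wheel, so this float16 path runs on CPU unless the hardware provides a GPU; if fp16 ops turn out to be slow or unsupported there, falling back to torch.float32 is the usual workaround, though that is outside what this commit changes.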