plutostack committed on
Commit 7455e7d · verified · 1 Parent(s): 439d32c

Update app.py

Files changed (1)
  1. app.py +50 -35
app.py CHANGED
@@ -2,28 +2,28 @@ import os
 os.system("pip3 install transformers")
 os.system("pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu")
 os.system("pip3 install tensorflow")
-os.system("pip3 install bitsandbytes accelerate")
 
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
 import torch
 
-bnb_config = BitsAndBytesConfig(
-    load_in_4bit=True,
-    bnb_4bit_quant_type="nf4",
-    bnb_4bit_compute_dtype=torch.float16,
-    bnb_4bit_use_double_quant=True
-)
-
+def load_model():
+    model = AutoModelForCausalLM.from_pretrained(
+        "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+        load_in_8bit=True,
+        device_map="auto",
+        torch_dtype=torch.float16,
+        trust_remote_code=True
+    )
 
-model = AutoModelForCausalLM.from_pretrained(
-    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
-    quantization_config=bnb_config,
-    device_map="auto",
-    trust_remote_code=True
-)
-tokenizer = AutoTokenizer.from_pretrained("nvidia/Llama-3.1-Nemotron-Nano-8B-v1")
+    tokenizer = AutoTokenizer.from_pretrained(
+        "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
+        trust_remote_code=True
+    )
+
+    return model, tokenizer
 
+model, tokenizer = load_model()
 pipe = pipeline(
     "text-generation",
     model=model,
@@ -31,27 +31,42 @@ pipe = pipeline(
     device_map="auto"
 )
 
-def textgen(request):
-    messages = [
-        {"role": "user", "content": str(request)},
-    ]
-
-    outputs = pipe(
-        messages,
-        max_new_tokens=512,
-        do_sample=True,
-        temperature=0.7,
-        top_p=0.9
-    )
+def generate_response(request):
+    try:
+        messages = [
+            {"role": "user", "content": str(request)},
+        ]
+
+        outputs = pipe(
+            messages,
+            max_new_tokens=512,
+            do_sample=True,
+            temperature=0.7,
+            top_p=0.9,
+            repetition_penalty=1.1
+        )
+
+        return outputs[0]["generated_text"][-1]['content']
 
-    return outputs[0]["generated_text"][-1]['content']
+    except Exception as e:
+        return f"An error occurred: {str(e)}"
 
 demo = gr.Interface(
-    fn=textgen,
-    inputs=gr.Textbox(label="Your query", placeholder="Enter your question here..."),
-    outputs=gr.Textbox(label="Model response"),
-    title="Chat with Llama-3.1-Nemotron-Nano (4-bit quantized)",
-    description="4-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model"
+    fn=generate_response,
+    inputs=gr.Textbox(
+        label="Your query",
+        placeholder="Enter your question here...",
+        lines=3
+    ),
+    outputs=gr.Textbox(
+        label="Model response",
+        lines=5
+    ),
+    title="Chat with 8-bit Llama-3.1-Nemotron-Nano",
+    description="8-bit quantized version of the NVIDIA Llama-3.1-Nemotron-Nano-8B model",
+    allow_flagging="never"
 )
 
-demo.launch(share=True)
+# Launch the interface
+if __name__ == "__main__":
+    demo.launch(share=True)
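
Note on the new loading path: recent transformers releases deprecate passing load_in_8bit=True directly to from_pretrained in favor of a BitsAndBytesConfig, and 8-bit loading still depends on the bitsandbytes and accelerate packages (whose install line this commit removes) plus a CUDA-capable GPU, which the CPU-only torch wheel installed above does not provide. A minimal sketch of the equivalent load with the config object, assuming those packages are available:

# Sketch: the same 8-bit load expressed via BitsAndBytesConfig.
# Assumes bitsandbytes and accelerate are installed and a CUDA GPU is present.
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(load_in_8bit=True)
model = AutoModelForCausalLM.from_pretrained(
    "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
)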
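
For context on the return indexing in generate_response: when the text-generation pipeline is called with a list of chat messages, generated_text in the result holds the whole conversation, the input messages plus the newly generated assistant turn, so the last element's content field is the model's reply. A small sketch of the shape, using the pipe defined above (the reply text is illustrative):

# Chat-style pipeline call: generated_text is the full message list.
outputs = pipe([{"role": "user", "content": "Hi"}], max_new_tokens=32)
# outputs[0]["generated_text"] == [
#     {"role": "user", "content": "Hi"},
#     {"role": "assistant", "content": "Hello! How can I help?"},  # illustrative
# ]
reply = outputs[0]["generated_text"][-1]["content"]  # assistant's reply only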