File size: 5,585 Bytes
a9c8347
e713918
 
a9c8347
 
 
 
 
 
e713918
 
4d99f77
e713918
4d99f77
e713918
9d8bdfd
 
 
 
 
 
3b1bd69
9d8bdfd
 
3b1bd69
9d8bdfd
 
 
 
 
3b1bd69
852b612
659c3bd
 
 
 
0a35a9d
 
 
77a349f
 
 
0a35a9d
 
 
 
852b612
 
 
 
 
 
 
 
 
 
 
e713918
77a349f
 
 
e713918
852b612
 
 
 
 
 
 
 
a9c8347
a826f18
 
 
 
 
 
c8daf5a
a826f18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
00cfd74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a826f18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188

import os
##
# Grab the process environment mapping.
env_vars = os.environ

# Walk the mapping and print each variable as "NAME: value".
# NOTE(review): this writes every variable, including any secrets or tokens
# present in the environment, to stdout -- confirm that is acceptable.
for name in env_vars:
    print(f"{name}: {env_vars[name]}")
##
import subprocess
# Run nvidia-smi once at start-up; its output goes straight to this
# process's stdout because nothing is captured.
try:
    result = subprocess.run(
        ['nvidia-smi'], text=True
    )
except OSError as exc:
    # The binary is missing (or not executable) on hosts without the NVIDIA
    # tooling. This is only a diagnostic, so report it and keep starting up
    # instead of aborting the whole script with FileNotFoundError.
    result = None
    print(f"nvidia-smi could not be run: {exc}")

import spaces
from threading import Thread
from typing import Iterator

import gradio as gr
import torch
from modelscope import AutoModelForCausalLM, AutoTokenizer
from transformers import  TextIteratorStreamer

# Upper bound offered by the "Max new tokens" slider.
MAX_MAX_NEW_TOKENS = 2048
# Initial position of the "Max new tokens" slider.
DEFAULT_MAX_NEW_TOKENS = 1024
# Prompt length limit in tokens; overridable through the environment
# variable of the same name, falling back to 4096.
MAX_INPUT_TOKEN_LENGTH = int(os.environ.get("MAX_INPUT_TOKEN_LENGTH", "4096"))

##
# Commands whose output print_gpu() dumps when the caller passes no list.
_DIAGNOSTIC_COMMANDS = (
    ("nvidia-smi",),
    ("ps", "-ef"),
    ("pip", "list"),
)


def print_gpu(commands=_DIAGNOSTIC_COMMANDS) -> None:
    """Print host, process and CUDA diagnostics to stdout.

    Runs each diagnostic command without capturing its output, prints the
    current and parent process IDs, then reports whether torch can see CUDA
    and, if so, each device's name plus allocated and reserved memory in MB.

    Args:
        commands: Iterable of argv sequences to execute. Defaults to
            ``nvidia-smi``, ``ps -ef`` and ``pip list``.

    Returns:
        None. All information is written to stdout.
    """
    for command in commands:
        try:
            subprocess.run(list(command), text=True)
        except OSError as exc:
            # A missing or non-executable tool (for example nvidia-smi on a
            # CPU-only host) must not take the caller down with it.
            print(f"Could not run {' '.join(command)!r}: {exc}")

    print("当前进程ID:", os.getpid())
    print("父进程ID:", os.getppid())

    if not torch.cuda.is_available():
        print("CUDA is not available.")
        return

    print("CUDA is available. Listing available GPUs:")
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        # Memory figures are converted from bytes to MB.
        print(f"  Memory Allocated: {torch.cuda.memory_allocated(i) / 1024 ** 2:.0f} MB")
        print(f"  Memory Reserved: {torch.cuda.memory_reserved(i) / 1024 ** 2:.0f} MB")

# Marker line in the log, followed by the installed package list as seen
# from module top level.
print("outter")
result = subprocess.run(['pip', 'list'], text=True)

##
# Re-run the diagnostics under three CUDA_VISIBLE_DEVICES settings, in this
# order: "-1", then indices 0-8, then indices 0-14. The last value stays in
# the environment afterwards.
# NOTE(review): CUDA libraries generally read CUDA_VISIBLE_DEVICES only when
# CUDA is first initialised in a process, so the later assignments may have
# no effect on what torch reports -- confirm on the target host.
for _visible_devices in (
    '-1',
    ','.join(str(idx) for idx in range(9)),
    ','.join(str(idx) for idx in range(15)),
):
    os.environ['CUDA_VISIBLE_DEVICES'] = _visible_devices
    print_gpu()


# Status banner text. It must exist before the CPU branch appends to it:
# previously the name was never assigned, so the "+=" below raised NameError
# on hosts without CUDA.
DESCRIPTION = ""

if torch.cuda.is_available():
    # Load the chat model in half precision and let device_map="auto" decide
    # the device placement.
    model_id = "Qwen/Qwen1.5-14B-Chat"
    model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16, device_map="auto")
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.use_default_system_prompt = False
else:
    # No model or tokenizer is created on this path; only the banner records
    # that the demo cannot run.
    DESCRIPTION += "\n<p>Running on CPU 🥶 This demo does not work on CPU.</p>"

@spaces.GPU
def generate(
    message: str,
    chat_history: list[tuple[str, str]],
    system_prompt: str,
    max_new_tokens: int = 1024,
    temperature: float = 0.6,
    top_p: float = 0.9,
    top_k: int = 50,
    repetition_penalty: float = 1.2,
) -> Iterator[str]:
    """Stream the model's reply to ``message`` as a growing string.

    Args:
        message: The new user message.
        chat_history: Earlier ``(user, assistant)`` turns, oldest first.
        system_prompt: Optional system message; skipped when empty.
        max_new_tokens: Forwarded to ``model.generate``.
        temperature: Forwarded to ``model.generate``.
        top_p: Forwarded to ``model.generate``.
        top_k: Forwarded to ``model.generate``.
        repetition_penalty: Forwarded to ``model.generate``.

    Yields:
        The reply accumulated so far, once per chunk produced by the
        streamer. Every yielded value is a ``str``.
    """
    print_gpu()

    # Rebuild the whole dialogue as a list of role/content messages.
    conversation = []
    if system_prompt:
        conversation.append({"role": "system", "content": system_prompt})
    for user, assistant in chat_history:
        conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
    conversation.append({"role": "user", "content": message})

    # Render the chat template to text first, then tokenize that text.
    prompt = tokenizer.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    input_ids = tokenizer([prompt], return_tensors="pt").input_ids.to(model.device)

    # Enforce the configured prompt limit by keeping only the most recent
    # tokens; the oldest part of the conversation is dropped.
    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
        print(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")

    streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        input_ids=input_ids,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        top_k=top_k,
        temperature=temperature,
        repetition_penalty=repetition_penalty,
    )
    # Generation runs in a background thread so this generator can consume
    # the streamer while tokens are still being produced.
    worker = Thread(target=model.generate, kwargs=generate_kwargs)
    worker.start()

    outputs = []
    for text in streamer:
        outputs.append(text)
        yield "".join(outputs)
    worker.join()

    response = "".join(outputs)
    print(response)
    if not outputs:
        # Nothing was streamed: still hand the caller one (empty) string.
        # The old code ended by yielding the raw chunk list here, which does
        # not match the declared Iterator[str].
        yield response


# Extra controls shown with the chat box. They are created in the same order
# in which they are passed to generate() after (message, chat_history).
_system_prompt_box = gr.Textbox(label="System prompt", lines=6)
_max_new_tokens_slider = gr.Slider(
    label="Max new tokens",
    minimum=1,
    maximum=MAX_MAX_NEW_TOKENS,
    step=1,
    value=DEFAULT_MAX_NEW_TOKENS,
)
_temperature_slider = gr.Slider(label="Temperature", minimum=0.1, maximum=4.0, step=0.1, value=0.6)
_top_p_slider = gr.Slider(label="Top-p (nucleus sampling)", minimum=0.05, maximum=1.0, step=0.05, value=0.9)
_top_k_slider = gr.Slider(label="Top-k", minimum=1, maximum=1000, step=1, value=50)
_repetition_penalty_slider = gr.Slider(
    label="Repetition penalty",
    minimum=1.0,
    maximum=2.0,
    step=0.05,
    value=1.2,
)

# Clickable sample prompts; each example row holds just the message text.
_example_prompts = [
    [prompt]
    for prompt in (
        "你好!你是谁?",
        "请简单介绍一下大语言模型?",
        "请讲一个小人物成功的故事.",
        "浙江的省会在哪里?",
        "写一篇100字的文章,题目是'人工智能开源的优势'",
    )
]

# Chat UI backed by generate(); the stop button is disabled.
chat_interface = gr.ChatInterface(
    fn=generate,
    additional_inputs=[
        _system_prompt_box,
        _max_new_tokens_slider,
        _temperature_slider,
        _top_p_slider,
        _top_k_slider,
        _repetition_penalty_slider,
    ],
    stop_btn=None,
    examples=_example_prompts,
)

# Page layout: logo, title, a one-line model blurb, then the chat interface.
# The title and blurb name the 14B chat model this script loads; they
# previously advertised Qwen1.5-1.8B with a 7-billion-parameter description.
# The <p> and <font> tags are now closed as well.
with gr.Blocks(css="style.css") as demo:
    gr.Markdown("""<p align="center"><img src="https://modelscope.cn/api/v1/models/qwen/Qwen-VL-Chat/repo?Revision=master&FilePath=assets/logo.jpg&View=true" style="height: 80px"/></p>""")
    gr.Markdown("""<center><font size=8>Qwen1.5-14B-Chat Bot👾</font></center>""")
    gr.Markdown("""<center><font size=4>通义千问1.5-14B(Qwen1.5-14B) 是阿里云研发的通义千问大模型系列的140亿参数规模的模型。</font></center>""")
    chat_interface.render()

if __name__ == "__main__":
    # Enable request queuing (max_size=20), then start the app.
    queued_demo = demo.queue(max_size=20)
    queued_demo.launch()