John6666 committed
Commit f53e84c · verified · 1 Parent(s): b00887a

Upload 2 files

Files changed (1): app.py +58 -4
app.py CHANGED
@@ -15,6 +15,7 @@ torch.set_float32_matmul_precision("high")
 HF_TOKEN = os.getenv("HF_TOKEN", None)
 #REPO_ID = "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
 REPO_ID = "nicoboss/DeepSeek-R1-Distill-Qwen-32B-Uncensored"
+#REPO_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
 
 DESCRIPTION = f'''
 <div>
@@ -49,11 +50,10 @@ if torch.cuda.is_available():
     model = AutoModelForCausalLM.from_pretrained(REPO_ID, device_map="auto", quantization_config=nf4_config)
 else: model = AutoModelForCausalLM.from_pretrained(REPO_ID, torch_dtype=torch.float32)
 streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
-flush()
 
 @spaces.GPU(duration=59)
 @torch.inference_mode()
-def chat(message: str,
+def chat_stream(message: str,
          history: list[dict],
          temperature: float,
          max_new_tokens: int,
@@ -70,11 +70,15 @@ def chat(message: str,
     messages.append({"role": "system", "content": sys_prompt})
     messages.append({"role": "user", "content": message})
 
-    input_tensors = tokenizer.apply_chat_template([{"role": x["role"], "content": x["content"]} for x in history] + messages, add_generation_prompt=True, return_dict=True, add_special_tokens=False, return_tensors="pt").to(model.device)
+    input_tensors = tokenizer.apply_chat_template([{"role": x["role"], "content": x["content"]} for x in history + messages if "role" in x.keys()], add_generation_prompt=True, return_dict=True, add_special_tokens=False, return_tensors="pt").to(model.device)
 
     input_ids = input_tensors["input_ids"]
     attention_mask = input_tensors["attention_mask"]
 
+    #print("history: ", [{"role": x["role"], "content": x["content"]} for x in history if "role" in x.keys()])
+    #print("messages: ", [{"role": x["role"], "content": x["content"]} for x in messages if "role" in x.keys()])
+    #print("tokenized: ", tokenizer.apply_chat_template([{"role": x["role"], "content": x["content"]} for x in history + messages if "role" in x.keys()], add_generation_prompt=True, add_special_tokens=False, tokenize=False))
+
     generate_kwargs = dict(
         input_ids=input_ids,
         attention_mask=attention_mask,
@@ -102,10 +106,60 @@ def chat(message: str,
     finally:
         flush()
 
+@spaces.GPU(duration=59)
+@torch.inference_mode()
+def chat(message: str,
+         history: list[dict],
+         temperature: float,
+         max_new_tokens: int,
+         top_p: float,
+         top_k: int,
+         repetition_penalty: float,
+         sys_prompt: str,
+         progress=gr.Progress(track_tqdm=True)
+):
+    try:
+        messages = []
+        response = []
+        if not history: history = []
+        messages.append({"role": "system", "content": sys_prompt})
+        messages.append({"role": "user", "content": message})
+
+        input_tensors = tokenizer.apply_chat_template([{"role": x["role"], "content": x["content"]} for x in history + messages if "role" in x.keys()], add_generation_prompt=True, return_dict=True, add_special_tokens=False, return_tensors="pt").to(model.device)
+
+        input_ids = input_tensors["input_ids"]
+        attention_mask = input_tensors["attention_mask"]
+
+        generate_kwargs = dict(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            max_new_tokens=max_new_tokens,
+            do_sample=True,
+            temperature=temperature,
+            top_k=top_k,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+        if temperature == 0: generate_kwargs['do_sample'] = False
+        response.append({"role": "assistant", "content": ""})
+
+        output_ids = model.generate(**generate_kwargs)
+        output = tokenizer.decode(output_ids.tolist()[0][input_ids.size(1) :], skip_special_tokens=True)
+
+        response[-1]["content"] = output
+        return response
+    except Exception as e:
+        print(e)
+        gr.Warning(f"Error: {e}")
+        return response
+    finally:
+        flush()
+
 with gr.Blocks(fill_height=True, fill_width=True, css=css) as demo:
     gr.Markdown(DESCRIPTION)
     gr.ChatInterface(
-        fn=chat,
+        fn=chat_stream,
        type="messages",
        chatbot=gr.Chatbot(height=450, type="messages", placeholder=PLACEHOLDER, label='Gradio ChatInterface'),
        fill_height=True,
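
For context, the substantive change in both functions is how the prompt is assembled: rather than templating the history and concatenating raw message dicts afterwards, the history and the new system/user messages are merged first, and any entry without a "role" key (such as extra metadata items in Gradio's type="messages" history) is filtered out before tokenizing. A minimal sketch of that pattern, assuming a generic chat-tuned checkpoint (the repo id and toy messages below are illustrative, not taken from this Space):

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

    # Gradio message-style history; entries may carry extra keys, so each
    # item is reduced to {"role", "content"} and role-less entries are
    # dropped before templating.
    history = [{"role": "user", "content": "Hi"},
               {"role": "assistant", "content": "Hello!", "metadata": {}}]
    messages = [{"role": "system", "content": "You are helpful."},
                {"role": "user", "content": "What is 2 + 2?"}]

    inputs = tokenizer.apply_chat_template(
        [{"role": x["role"], "content": x["content"]} for x in history + messages if "role" in x],
        add_generation_prompt=True,  # end the prompt with the assistant turn marker
        return_dict=True,            # return input_ids plus attention_mask
        add_special_tokens=False,    # the chat template already inserts them
        return_tensors="pt",
    )
    print(inputs["input_ids"].shape, inputs["attention_mask"].shape)

The commit also adds a non-streaming chat() twin of chat_stream(): it runs model.generate() to completion and returns the assistant reply as a single {"role": "assistant", "content": ...} entry, while the ChatInterface itself stays wired to the streaming chat_stream.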