Ngixdev committed (verified)
Commit b4cb8c4 · 1 Parent(s): 13d1862

Switch to ZeroGPU with llama-cpp for GGUF model

Files changed (3):
  1. README.md +5 -0
  2. app.py +54 -87
  3. requirements.txt +2 -0
README.md CHANGED
@@ -8,6 +8,11 @@ sdk_version: 5.29.0
 app_file: app.py
 pinned: false
 license: apache-2.0
+tags:
+- qwen
+- uncensored
+- llama-cpp
+- zerogpu
 ---
 
 # Qwen3.5-9B Uncensored API Interface
app.py CHANGED
@@ -1,54 +1,44 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
+import spaces
+from huggingface_hub import hf_hub_download
+from llama_cpp import Llama
 
-MODEL_ID = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
+MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
+MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"
 
-client = InferenceClient()
+llm = None
 
-DEFAULT_PARAMS = {
-    "temperature": 0.7,
-    "top_p": 0.8,
-    "top_k": 20,
-    "max_tokens": 2048,
-}
+def load_model():
+    global llm
+    if llm is None:
+        model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+        llm = Llama(
+            model_path=model_path,
+            n_ctx=8192,
+            n_gpu_layers=-1,
+            verbose=False,
+        )
+    return llm
 
 
-def generate_response(
-    message: str,
-    history: list,
-    system_prompt: str = "",
-    temperature: float = 0.7,
-    top_p: float = 0.8,
-    top_k: int = 20,
-    max_tokens: int = 2048,
-) -> str:
-    messages = []
+def format_messages(message: str, history: list, system_prompt: str = "") -> str:
+    formatted = ""
 
     if system_prompt.strip():
-        messages.append({"role": "system", "content": system_prompt})
+        formatted += f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
 
     for user_msg, assistant_msg in history:
         if user_msg:
-            messages.append({"role": "user", "content": user_msg})
+            formatted += f"<|im_start|>user\n{user_msg}<|im_end|>\n"
         if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
+            formatted += f"<|im_start|>assistant\n{assistant_msg}<|im_end|>\n"
 
-    messages.append({"role": "user", "content": message})
-
-    try:
-        response = client.chat_completion(
-            model=MODEL_ID,
-            messages=messages,
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens,
-        )
-        return response.choices[0].message.content
-    except Exception as e:
-        return f"Error: {str(e)}"
+    formatted += f"<|im_start|>user\n{message}<|im_end|>\n<|im_start|>assistant\n"
+    return formatted
 
 
-def generate_stream(
+@spaces.GPU
+def generate_response(
     message: str,
     history: list,
     system_prompt: str = "",
@@ -56,39 +46,23 @@ def generate_stream(
     top_p: float = 0.8,
     top_k: int = 20,
     max_tokens: int = 2048,
-):
-    messages = []
-
-    if system_prompt.strip():
-        messages.append({"role": "system", "content": system_prompt})
-
-    for user_msg, assistant_msg in history:
-        if user_msg:
-            messages.append({"role": "user", "content": user_msg})
-        if assistant_msg:
-            messages.append({"role": "assistant", "content": assistant_msg})
+) -> str:
+    model = load_model()
+    prompt = format_messages(message, history, system_prompt)
 
-    messages.append({"role": "user", "content": message})
+    output = model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        top_k=top_k,
+        stop=["<|im_end|>", "<|im_start|>"],
+    )
 
-    try:
-        stream = client.chat_completion(
-            model=MODEL_ID,
-            messages=messages,
-            temperature=temperature,
-            top_p=top_p,
-            max_tokens=max_tokens,
-            stream=True,
-        )
-
-        partial_message = ""
-        for chunk in stream:
-            if chunk.choices[0].delta.content:
-                partial_message += chunk.choices[0].delta.content
-                yield partial_message
-    except Exception as e:
-        yield f"Error: {str(e)}"
+    return output["choices"][0]["text"].strip()
 
 
+@spaces.GPU
 def api_generate(
     prompt: str,
     system_prompt: str = "",
@@ -109,22 +83,16 @@ def api_generate(
     Returns:
         Dictionary with 'response' key containing generated text
     """
-    messages = []
-
-    if system_prompt.strip():
-        messages.append({"role": "system", "content": system_prompt})
-
-    messages.append({"role": "user", "content": prompt})
-
     try:
-        response = client.chat_completion(
-            model=MODEL_ID,
-            messages=messages,
+        response = generate_response(
+            message=prompt,
+            history=[],
+            system_prompt=system_prompt,
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
        )
-        return {"response": response.choices[0].message.content, "status": "success"}
+        return {"response": response, "status": "success"}
     except Exception as e:
         return {"response": None, "status": "error", "error": str(e)}
 
@@ -141,6 +109,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
     - Fully uncensored (0/465 refusals)
     - Multimodal capable (text, image, video)
     - Supports 201 languages
+    - Running on ZeroGPU with Q4_K_M quantization
 
     Use the chat interface below or access via API.
     """
@@ -189,8 +158,8 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
             )
             max_tokens = gr.Slider(
                 minimum=64,
-                maximum=8192,
-                value=2048,
+                maximum=4096,
+                value=1024,
                 step=64,
                 label="Max Tokens",
             )
@@ -207,8 +176,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
         message = history[-1][0]
         history_without_last = history[:-1]
 
-        response = ""
-        for partial in generate_stream(
+        response = generate_response(
            message,
            history_without_last,
            system_prompt,
@@ -216,10 +184,9 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
            top_p,
            top_k,
            max_tokens
-        ):
-            response = partial
-            history[-1][1] = response
-            yield history
+        )
+        history[-1][1] = response
+        return history
 
     msg.submit(
         user_submit,
@@ -262,7 +229,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
         system_prompt="You are a helpful assistant",
         temperature=0.7,
         top_p=0.8,
-        max_tokens=2048,
+        max_tokens=1024,
         api_name="/api_generate"
     )
     print(result)
@@ -279,7 +246,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
         "You are a helpful assistant",
         0.7,
         0.8,
-        2048
+        1024
       ]
     }'
     ```
@@ -301,7 +268,7 @@ with gr.Blocks(title="Qwen3.5-9B Uncensored API", theme=gr.themes.Soft()) as dem
             with gr.Row():
                 api_temp = gr.Slider(0.0, 2.0, 0.7, step=0.1, label="Temperature")
                 api_top_p = gr.Slider(0.0, 1.0, 0.8, step=0.05, label="Top P")
-                api_max_tokens = gr.Slider(64, 8192, 2048, step=64, label="Max Tokens")
+                api_max_tokens = gr.Slider(64, 4096, 1024, step=64, label="Max Tokens")
                 api_submit = gr.Button("Generate", variant="primary")
 
             with gr.Column():
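The switch to `llama-cpp-python` makes the inference path easy to smoke-test outside the Space. Below is a minimal sketch, assuming `llama-cpp-python` and `huggingface_hub` are installed locally; it pulls the same Q4_K_M GGUF named in the diff, builds the same ChatML framing that `format_messages()` emits, and runs one completion on CPU (`n_gpu_layers=0`, since `@spaces.GPU` only applies inside the Space):

```python
# Local sanity check of the new GGUF pipeline (sketch; not part of the commit).
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

MODEL_REPO = "HauhauCS/Qwen3.5-9B-Uncensored-HauhauCS-Aggressive"
MODEL_FILE = "Qwen3.5-9B-Uncensored-HauhauCS-Aggressive-Q4_K_M.gguf"

# Multi-gigabyte download on first run; cached by huggingface_hub afterwards.
model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)

# CPU-only here (n_gpu_layers=0); app.py offloads all layers with -1 on ZeroGPU.
llm = Llama(model_path=model_path, n_ctx=8192, n_gpu_layers=0, verbose=False)

# Same ChatML prompt shape that format_messages() produces in app.py.
prompt = (
    "<|im_start|>system\nYou are a helpful assistant<|im_end|>\n"
    "<|im_start|>user\nSay hello in one sentence.<|im_end|>\n"
    "<|im_start|>assistant\n"
)

output = llm(
    prompt,
    max_tokens=64,
    temperature=0.7,
    top_p=0.8,
    top_k=20,
    stop=["<|im_end|>", "<|im_start|>"],
)
print(output["choices"][0]["text"].strip())
```

The lazy `load_model()` also fits ZeroGPU's model: the GPU is attached only while a `@spaces.GPU`-decorated call runs, so deferring the `Llama(...)` construction until the first request (rather than at import time) is the natural pattern here.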
requirements.txt CHANGED
@@ -1,2 +1,4 @@
 gradio>=4.0.0
 huggingface_hub>=0.20.0
+llama-cpp-python
+spaces
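Once the Space rebuilds with these two additions, the `/api_generate` endpoint can be exercised remotely, mirroring the usage snippet in the app's own API docs. A sketch with `gradio_client`; the Space id below is a placeholder to substitute:

```python
# Sketch: call the /api_generate endpoint of the rebuilt Space.
from gradio_client import Client

client = Client("<user>/<space-name>")  # placeholder -- use the actual Space id
result = client.predict(
    prompt="Hello!",
    system_prompt="You are a helpful assistant",
    temperature=0.7,
    top_p=0.8,
    max_tokens=1024,  # the UI slider now caps at 4096 after this commit
    api_name="/api_generate",
)
print(result)  # expected: {"response": "...", "status": "success"}
```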