reach-vb HF staff committed on
Commit
098f871
·
verified ·
1 Parent(s): 18ff4e4
Files changed (1) hide show
  1. app.py +19 -183
app.py CHANGED
@@ -18,76 +18,8 @@ from textwrap import dedent
18
 
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
 
21
- def generate_importance_matrix(model_path, train_data_path):
22
- imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
23
 
24
- os.chdir("llama.cpp")
25
-
26
- print(f"Current working directory: {os.getcwd()}")
27
- print(f"Files in the current directory: {os.listdir('.')}")
28
-
29
- if not os.path.isfile(f"../{model_path}"):
30
- raise Exception(f"Model file not found: {model_path}")
31
-
32
- print("Running imatrix command...")
33
- process = subprocess.Popen(imatrix_command, shell=True)
34
-
35
- try:
36
- process.wait(timeout=60) # added wait
37
- except subprocess.TimeoutExpired:
38
- print("Imatrix computation timed out. Sending SIGINT to allow graceful termination...")
39
- process.send_signal(signal.SIGINT)
40
- try:
41
- process.wait(timeout=5) # grace period
42
- except subprocess.TimeoutExpired:
43
- print("Imatrix proc still didn't term. Forecfully terming process...")
44
- process.kill()
45
-
46
- os.chdir("..")
47
-
48
- print("Importance matrix generation completed.")
49
-
50
- def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
51
- if oauth_token.token is None:
52
- raise ValueError("You have to be logged in.")
53
-
54
- split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
55
- if split_max_size:
56
- split_cmd += f" --split-max-size {split_max_size}"
57
- split_cmd += f" {model_path} {model_path.split('.')[0]}"
58
-
59
- print(f"Split command: {split_cmd}")
60
-
61
- result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
62
- print(f"Split command stdout: {result.stdout}")
63
- print(f"Split command stderr: {result.stderr}")
64
-
65
- if result.returncode != 0:
66
- raise Exception(f"Error splitting the model: {result.stderr}")
67
- print("Model split successfully!")
68
-
69
-
70
- sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
71
- if sharded_model_files:
72
- print(f"Sharded model files: {sharded_model_files}")
73
- api = HfApi(token=oauth_token.token)
74
- for file in sharded_model_files:
75
- file_path = os.path.join('.', file)
76
- print(f"Uploading file: {file_path}")
77
- try:
78
- api.upload_file(
79
- path_or_fileobj=file_path,
80
- path_in_repo=file,
81
- repo_id=repo_id,
82
- )
83
- except Exception as e:
84
- raise Exception(f"Error uploading file {file_path}: {e}")
85
- else:
86
- raise Exception("No sharded files found.")
87
-
88
- print("Sharded model has been uploaded successfully!")
89
-
90
- def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
91
  if oauth_token.token is None:
92
  raise ValueError("You must be logged in to use GGUF-my-repo")
93
  model_name = model_id.split('/')[-1]
@@ -126,29 +58,11 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
126
  print("Model converted to fp16 successfully!")
127
  print(f"Converted model path: {fp16}")
128
 
129
- imatrix_path = "llama.cpp/imatrix.dat"
130
-
131
- if use_imatrix:
132
- if train_data_file:
133
- train_data_path = train_data_file.name
134
- else:
135
- train_data_path = "groups_merged.txt" #fallback calibration dataset
136
-
137
- print(f"Training data file path: {train_data_path}")
138
-
139
- if not os.path.isfile(train_data_path):
140
- raise Exception(f"Training data file not found: {train_data_path}")
141
-
142
- generate_importance_matrix(fp16, train_data_path)
143
- else:
144
- print("Not using imatrix quantization.")
145
  username = whoami(oauth_token.token)["name"]
146
  quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
147
  quantized_gguf_path = quantized_gguf_name
148
- if use_imatrix:
149
- quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
150
- else:
151
- quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
152
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
153
  if result.returncode != 0:
154
  raise Exception(f"Error quantizing: {result.stderr}")
@@ -218,32 +132,16 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
218
  )
219
  card.save(f"README.md")
220
 
221
- if split_model:
222
- split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
223
- else:
224
- try:
225
- print(f"Uploading quantized model: {quantized_gguf_path}")
226
- api.upload_file(
227
- path_or_fileobj=quantized_gguf_path,
228
- path_in_repo=quantized_gguf_name,
229
- repo_id=new_repo_id,
230
- )
231
- except Exception as e:
232
- raise Exception(f"Error uploading quantized model: {e}")
233
-
234
 
235
- imatrix_path = "llama.cpp/imatrix.dat"
236
- if os.path.isfile(imatrix_path):
237
- try:
238
- print(f"Uploading imatrix.dat: {imatrix_path}")
239
- api.upload_file(
240
- path_or_fileobj=imatrix_path,
241
- path_in_repo="imatrix.dat",
242
- repo_id=new_repo_id,
243
- )
244
- except Exception as e:
245
- raise Exception(f"Error uploading imatrix.dat: {e}")
246
-
247
  api.upload_file(
248
  path_or_fileobj=f"README.md",
249
  path_in_repo=f"README.md",
@@ -266,7 +164,7 @@ css="""/* Custom CSS to allow scrolling */
266
  """
267
  # Create Gradio interface
268
  with gr.Blocks(css=css) as demo:
269
- gr.Markdown("You must be logged in to use GGUF-my-repo.")
270
  gr.LoginButton(min_width=250)
271
 
272
  model_id = HuggingfaceHubSearch(
@@ -276,28 +174,14 @@ with gr.Blocks(css=css) as demo:
276
  )
277
 
278
  q_method = gr.Dropdown(
279
- ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
280
  label="Quantization Method",
281
- info="GGML quantization type",
282
- value="Q4_K_M",
283
  filterable=False,
284
  visible=True
285
  )
286
 
287
- imatrix_q_method = gr.Dropdown(
288
- ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
289
- label="Imatrix Quantization Method",
290
- info="GGML imatrix quants type",
291
- value="IQ4_NL",
292
- filterable=False,
293
- visible=False
294
- )
295
-
296
- use_imatrix = gr.Checkbox(
297
- value=False,
298
- label="Use Imatrix Quantization",
299
- info="Use importance matrix for quantization."
300
- )
301
 
302
  private_repo = gr.Checkbox(
303
  value=False,
@@ -305,73 +189,25 @@ with gr.Blocks(css=css) as demo:
305
  info="Create a private repo under your username."
306
  )
307
 
308
- train_data_file = gr.File(
309
- label="Training Data File",
310
- file_types=["txt"],
311
- visible=False
312
- )
313
-
314
- split_model = gr.Checkbox(
315
- value=False,
316
- label="Split Model",
317
- info="Shard the model using gguf-split."
318
- )
319
-
320
- split_max_tensors = gr.Number(
321
- value=256,
322
- label="Max Tensors per File",
323
- info="Maximum number of tensors per file when splitting model.",
324
- visible=False
325
- )
326
-
327
- split_max_size = gr.Textbox(
328
- label="Max File Size",
329
- info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
330
- visible=False
331
- )
332
-
333
- def update_visibility(use_imatrix):
334
- return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
335
-
336
- use_imatrix.change(
337
- fn=update_visibility,
338
- inputs=use_imatrix,
339
- outputs=[q_method, imatrix_q_method, train_data_file]
340
- )
341
-
342
  iface = gr.Interface(
343
  fn=process_model,
344
  inputs=[
345
  model_id,
346
  q_method,
347
- use_imatrix,
348
- imatrix_q_method,
349
  private_repo,
350
- train_data_file,
351
- split_model,
352
- split_max_tensors,
353
- split_max_size,
354
  ],
355
  outputs=[
356
  gr.Markdown(label="output"),
357
  gr.Image(show_label=False),
358
  ],
359
- title="Create your own GGUF Quants, blazingly fast ⚡!",
360
- description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
361
  api_name=False
362
  )
363
-
364
- def update_split_visibility(split_model):
365
- return gr.update(visible=split_model), gr.update(visible=split_model)
366
-
367
- split_model.change(
368
- fn=update_split_visibility,
369
- inputs=split_model,
370
- outputs=[split_max_tensors, split_max_size]
371
  )
372
 
373
  def restart_space():
374
- HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
375
 
376
  scheduler = BackgroundScheduler()
377
  scheduler.add_job(restart_space, "interval", seconds=21600)
 
18
 
19
  HF_TOKEN = os.environ.get("HF_TOKEN")
20
 
 
 
21
 
22
+ def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken | None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  if oauth_token.token is None:
24
  raise ValueError("You must be logged in to use GGUF-my-repo")
25
  model_name = model_id.split('/')[-1]
 
58
  print("Model converted to fp16 successfully!")
59
  print(f"Converted model path: {fp16}")
60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
  username = whoami(oauth_token.token)["name"]
62
  quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
63
  quantized_gguf_path = quantized_gguf_name
64
+
65
+ quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
 
 
66
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
67
  if result.returncode != 0:
68
  raise Exception(f"Error quantizing: {result.stderr}")
 
132
  )
133
  card.save(f"README.md")
134
 
135
+ try:
136
+ print(f"Uploading quantized model: {quantized_gguf_path}")
137
+ api.upload_file(
138
+ path_or_fileobj=quantized_gguf_path,
139
+ path_in_repo=quantized_gguf_name,
140
+ repo_id=new_repo_id,
141
+ )
142
+ except Exception as e:
143
+ raise Exception(f"Error uploading quantized model: {e}")
 
 
 
 
144
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  api.upload_file(
146
  path_or_fileobj=f"README.md",
147
  path_in_repo=f"README.md",
 
164
  """
165
  # Create Gradio interface
166
  with gr.Blocks(css=css) as demo:
167
+ gr.Markdown("You must be logged in to use MLX-my-repo.")
168
  gr.LoginButton(min_width=250)
169
 
170
  model_id = HuggingfaceHubSearch(
 
174
  )
175
 
176
  q_method = gr.Dropdown(
177
+ ["Q4", "Q8"],
178
  label="Quantization Method",
179
+ info="MLX quantization type",
180
+ value="Q4",
181
  filterable=False,
182
  visible=True
183
  )
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
 
186
  private_repo = gr.Checkbox(
187
  value=False,
 
189
  info="Create a private repo under your username."
190
  )
191
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
192
  iface = gr.Interface(
193
  fn=process_model,
194
  inputs=[
195
  model_id,
196
  q_method,
 
 
197
  private_repo,
 
 
 
 
198
  ],
199
  outputs=[
200
  gr.Markdown(label="output"),
201
  gr.Image(show_label=False),
202
  ],
203
+ title="Create your own MLX Quants, blazingly fast ⚡!",
204
+ description="The space takes an HF repo as an input, quantizes it and creates a Public/ Private repo containing the selected quant under your HF user namespace.",
205
  api_name=False
206
  )
 
 
 
 
 
 
 
 
207
  )
208
 
209
  def restart_space():
210
+ HfApi().restart_space(repo_id="reach-vb/mlx-my-repo", token=HF_TOKEN, factory_reboot=True)
211
 
212
  scheduler = BackgroundScheduler()
213
  scheduler.add_job(restart_space, "interval", seconds=21600)