Ffftdtd5dtft committed on
Commit bc1d53e (1 parent: 1af6864)

Update app.py

Files changed (1): app.py (+37 -240)
app.py CHANGED
@@ -1,74 +1,41 @@
1
  import os
2
  import shutil
3
  import subprocess
4
- import signal
5
- import time
6
  import torch
7
- from torch.nn.utils import prune
8
- from transformers import GPT2LMHeadModel, AutoTokenizer, AutoModelForCausalLM, DistilBertModel, AutoConfig
9
- from huggingface_hub import create_repo, HfApi, snapshot_download, whoami, ModelCard
+ from transformers import AutoConfig, AutoModelForCausalLM
+ from huggingface_hub import HfApi, snapshot_download, whoami, ModelCard
10
  from gradio_huggingfacehub_search import HuggingfaceHubSearch
11
  from apscheduler.schedulers.background import BackgroundScheduler
12
  from textwrap import dedent
13
  import gradio as gr
14
- import torch.quantization
15
- from torch.nn import functional as F
16
- from copy import deepcopy
17
- from torch.utils.checkpoint import checkpoint
18
  import hashlib
19
 
20
  os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
21
  HF_TOKEN = os.environ.get("HF_TOKEN")
22
 
23
  def generate_importance_matrix(model_path, train_data_path):
24
- # Change the working directory to the llama.cpp directory
25
  os.chdir("llama.cpp")
26
-
27
- # Check if the model file exists
28
  if not os.path.isfile(f"../{model_path}"):
29
  raise Exception(f"Model file not found: {model_path}")
30
-
31
- # Construct the command to generate the importance matrix
32
  imatrix_command = f"./llama-imatrix -m ../{model_path} -f {train_data_path} -ngl 99 --output-frequency 10"
33
-
34
- # Execute the command and wait for it to finish
35
  process = subprocess.Popen(imatrix_command, shell=True)
36
  try:
37
- process.wait(timeout=0)
+ process.wait(timeout=3600)
38
  except subprocess.TimeoutExpired:
39
- # If the process takes too long, send a SIGINT signal (interrupt)
40
- process.send_signal(signal.SIGINT)
41
- try:
42
- process.wait(timeout=0)
43
- except subprocess.TimeoutExpired:
44
- # If it still doesn't finish, kill the process
45
- process.kill()
46
-
47
- # Change the working directory back to the parent directory
+ process.kill()
48
  os.chdir("..")
49
 
50
  def split_upload_model(model_path, repo_id, oauth_token, split_max_tensors=256, split_max_size=None):
51
- # Check if the user is logged in
52
  if oauth_token.token is None:
53
  raise ValueError("You have to be logged in.")
54
-
55
- # Construct the command to split the model
56
  split_cmd = f"llama.cpp/llama-gguf-split --split --split-max-tensors {split_max_tensors}"
57
  if split_max_size:
58
  split_cmd += f" --split-max-size {split_max_size}"
59
  split_cmd += f" {model_path} {model_path.split('.')[0]}"
60
-
61
- # Execute the command and capture the output
62
  result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
63
-
64
- # Check if the command was successful
65
  if result.returncode != 0:
66
  raise Exception(f"Error splitting the model: {result.stderr}")
67
-
68
- # Get a list of sharded model files
69
  sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
70
-
71
- # If sharded files were found, upload them to the Hugging Face repository
72
  if sharded_model_files:
73
  api = HfApi(token=oauth_token.token)
74
  for file in sharded_model_files:
@@ -80,97 +47,54 @@ def split_upload_model(model_path, repo_id, oauth_token, split_max_tensors=256,
80
  else:
81
  raise Exception("No sharded files found.")
82
 
83
- def prune_model(model, amount=0.5):
84
- # Iterate over the model's modules and apply pruning to linear and convolutional layers
85
- for name, module in model.named_modules():
86
- if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
87
- # Apply L1 unstructured pruning
88
- prune.l1_unstructured(module, name='weight', amount=amount)
89
- # Remove the pruned weights
90
- prune.remove(module, 'weight')
91
- return model
92
-
93
  def quantize_to_q1_with_min(tensor, min_value=-1):
94
- # Quantize the tensor to -1, 0, or 1 based on the sign and minimum value
95
  tensor = torch.sign(tensor)
96
  tensor[tensor < min_value] = min_value
97
  return tensor
98
 
99
  def quantize_model_to_q1_with_min(model, min_value=-1):
100
- # Iterate over the model's parameters and apply quantization
101
  for name, param in model.named_parameters():
102
  if param.dtype in [torch.float32, torch.float16]:
103
  with torch.no_grad():
104
  param.copy_(quantize_to_q1_with_min(param.data, min_value))
105
 
106
  def disable_unnecessary_components(model):
107
- # Iterate over the model's modules and disable dropout and batch normalization
108
  for name, module in model.named_modules():
109
  if isinstance(module, torch.nn.Dropout):
110
- # Set dropout probability to 0
111
  module.p = 0.0
112
  elif isinstance(module, torch.nn.BatchNorm1d):
113
- # Set batch normalization to evaluation mode
114
  module.eval()
115
 
116
  def ultra_max_compress(model):
117
- # Apply a series of aggressive optimization techniques to the model
118
- model = prune_model(model, amount=0.8) # Prune 80% of the weights
119
- quantize_model_to_q1_with_min(model, min_value=-0.05) # Quantize weights to -1, 0, or 1
120
- disable_unnecessary_components(model) # Disable dropout and batch normalization
121
-
+ model = quantize_model_to_q1_with_min(model, min_value=-0.05)
+ disable_unnecessary_components(model)
122
  with torch.no_grad():
123
  for name, param in model.named_parameters():
124
  if param.requires_grad:
125
  param.requires_grad = False
126
- param.data = torch.nn.functional.hardtanh(param.data, min_val=-1.0, max_val=1.0) # Apply hardtanh activation
127
- param.data = param.data.half() # Convert weights to half precision
128
-
129
- try:
130
- # Attempt to convert the model to a TorchScript module
131
- model = torch.jit.script(model)
132
- except Exception:
133
- pass
134
-
135
- model = prune_model(model, amount=0.9) # Prune another 90% of the weights
136
- model.eval() # Set the model to evaluation mode
137
-
138
- # Remove empty buffers from the model
+ param.data = torch.nn.functional.hardtanh(param.data, min_val=-1.0, max_val=1.0)
+ param.data = param.data.half()
+ model.eval()
139
  for buffer_name, buffer in model.named_buffers():
140
  if buffer.numel() == 0:
141
  model._buffers.pop(buffer_name)
142
-
143
  return model
144
 
145
  def optimize_model_resources(model):
146
- # Disable gradient calculations
147
  torch.set_grad_enabled(False)
148
-
149
- # Set the model to evaluation mode
150
  model.eval()
151
-
152
- # Iterate over the model's parameters and convert float32 weights to half precision
153
  for name, param in model.named_parameters():
154
  param.requires_grad = False
155
  if param.dtype == torch.float32:
156
  param.data = param.data.half()
157
-
158
- # Adjust model configuration for resource optimization
159
  if hasattr(model, 'config'):
160
  if hasattr(model.config, 'max_position_embeddings'):
161
- # Limit the maximum position embeddings to 512
162
  model.config.max_position_embeddings = min(model.config.max_position_embeddings, 512)
163
  if hasattr(model.config, 'hidden_size'):
164
- # Limit the hidden size to 768
165
  model.config.hidden_size = min(model.config.hidden_size, 768)
166
-
167
- # Optimize the model for inference using TorchScript
168
- model = torch.jit.optimize_for_inference(model)
169
-
170
  return model
171
 
172
  def aggressive_optimize(model, reduce_layers_factor=0.5):
173
- # Reduce the number of attention heads and hidden size based on the reduction factor
174
  if hasattr(model.config, 'num_attention_heads'):
175
  model.config.num_attention_heads = int(model.config.num_attention_heads * reduce_layers_factor)
176
  if hasattr(model.config, 'hidden_size'):
@@ -178,7 +102,6 @@ def aggressive_optimize(model, reduce_layers_factor=0.5):
178
  return model
179
 
180
  def apply_quantization(model, use_int8_inference):
181
- # Apply dynamic quantization to linear layers if INT8 inference is enabled
182
  if use_int8_inference:
183
  quantized_model = torch.quantization.quantize_dynamic(
184
  model, {torch.nn.Linear}, dtype=torch.qint8
@@ -188,7 +111,6 @@ def apply_quantization(model, use_int8_inference):
188
  return model
189
 
190
  def reduce_layers(model, reduction_factor=0.5):
191
- # Reduce the number of layers in the transformer block
192
  if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
193
  original_num_layers = len(model.transformer.h)
194
  new_num_layers = int(original_num_layers * reduction_factor)
@@ -196,7 +118,6 @@ def reduce_layers(model, reduction_factor=0.5):
196
  return model
197
 
198
  def use_smaller_embeddings(model, reduction_factor=0.75):
199
- # Reduce the size of the embedding layer
200
  original_embedding_dim = model.config.hidden_size
201
  new_embedding_dim = int(original_embedding_dim * reduction_factor)
202
  model.config.hidden_size = new_embedding_dim
@@ -204,122 +125,101 @@ def use_smaller_embeddings(model, reduction_factor=0.75):
204
  return model
205
 
206
  def use_fp16_embeddings(model):
207
- # Convert the embedding weights to half precision (float16)
208
  model.transformer.wte = model.transformer.wte.half()
209
  return model
210
 
211
  def quantize_embeddings(model):
212
- # Quantize the embedding layer using dynamic quantization
213
  model.transformer.wte = torch.quantization.quantize_dynamic(
214
  model.transformer.wte, {torch.nn.Embedding}, dtype=torch.qint8
215
  )
216
  return model
217
 
218
  def use_bnb_f16(model):
219
- # Convert the model to BFLOAT16 (BF16) data type if supported by the hardware
220
  if torch.cuda.is_available() and torch.cuda.is_bf16_supported():
221
  model = model.to(dtype=torch.bfloat16)
222
  return model
223
 
224
  def use_group_quantization(model):
225
- # Apply group quantization to linear layers in the model
226
  for module in model.modules():
227
  if isinstance(module, torch.nn.Linear):
228
- # Fuse the linear layer's weight
229
  torch.quantization.fuse_modules(module, ['weight'], inplace=True)
230
- # Quantize the fused linear layer using dynamic quantization
231
  torch.quantization.quantize_dynamic(module, {torch.nn.Linear}, dtype=torch.qint8, inplace=True)
232
  return model
233
 
234
  def apply_layer_norm_trick(model):
235
- # Disable learnable parameters (elementwise_affine) in LayerNorm layers
236
  for name, module in model.named_modules():
237
  if isinstance(module, torch.nn.LayerNorm):
238
  module.elementwise_affine = False
239
  return model
240
 
241
  def remove_padding(inputs, attention_mask):
242
- # Remove padding from input sequences based on the attention mask
243
- last_non_padded = attention_mask.sum(dim=1) - 1 # Find the last non-padded token in each sequence
244
- gathered_inputs = torch.gather(inputs, dim=1, index=last_non_padded.unsqueeze(1).unsqueeze(2).expand(-1, -1, inputs.size(2))) # Gather the non-padded tokens
+ last_non_padded = attention_mask.sum(dim=1) - 1
+ gathered_inputs = torch.gather(inputs, dim=1, index=last_non_padded.unsqueeze(1).unsqueeze(2).expand(-1, -1, inputs.size(2)))
245
  return gathered_inputs
246
 
247
  def use_selective_quantization(model):
248
- # Apply dynamic quantization to multi-head attention layers
249
  for module in model.modules():
250
  if isinstance(module, torch.nn.MultiheadAttention):
251
  torch.quantization.quantize_dynamic(module, {torch.nn.Linear}, dtype=torch.qint8, inplace=True)
252
  return model
253
 
254
  def use_mixed_precision(model):
255
- # Convert the embedding weights to half precision (float16)
256
  model.transformer.wte = model.transformer.wte.half()
257
  return model
258
 
259
  def use_pruning_after_training(model, prune_amount=0.1):
260
- # Apply pruning to the model after training
261
- model = prune_model(model, amount=prune_amount)
+ for name, module in model.named_modules():
+ if isinstance(module, (torch.nn.Linear, torch.nn.Conv2d)):
+ prune.l1_unstructured(module, name='weight', amount=prune_amount)
+ prune.remove(module, 'weight')
 
 
262
  return model
263
 
264
  def use_knowledge_distillation(model, teacher_model, temperature=2.0, alpha=0.5):
265
- # Set the teacher model to evaluation mode
266
  teacher_model.eval()
267
-
268
- # Define the knowledge distillation loss function (Kullback-Leibler divergence)
269
  criterion = torch.nn.KLDivLoss(reduction='batchmean')
270
 
271
  def distillation_loss(student_logits, teacher_logits):
272
- # Calculate the distillation loss between student and teacher logits
273
  student_probs = F.log_softmax(student_logits / temperature, dim=-1)
274
  teacher_probs = F.softmax(teacher_logits / temperature, dim=-1)
275
  return criterion(student_probs, teacher_probs) * (temperature**2)
276
 
277
  def train_step(inputs, labels):
278
- # Define the training step for knowledge distillation
279
- student_outputs = model(**inputs, labels=labels) # Get student outputs
280
- student_logits = student_outputs.logits # Extract student logits
+ student_outputs = model(**inputs, labels=labels)
+ student_logits = student_outputs.logits
281
  with torch.no_grad():
282
- teacher_outputs = teacher_model(**inputs) # Get teacher outputs
283
- teacher_logits = teacher_outputs.logits # Extract teacher logits
284
- # Calculate the combined loss (student loss + distillation loss)
+ teacher_outputs = teacher_model(**inputs)
+ teacher_logits = teacher_outputs.logits
285
  loss = alpha * student_outputs.loss + (1 - alpha) * distillation_loss(student_logits, teacher_logits)
286
  return loss
287
 
288
  return train_step
289
 
290
  def use_weight_sharing(model):
291
- # Share weights between the first and last layers of the transformer block
292
  if hasattr(model, 'transformer') and hasattr(model.transformer, 'h'):
293
  model.transformer.h[-1].load_state_dict(model.transformer.h[0].state_dict())
294
  return model
295
 
296
  def use_low_rank_approximation(model, rank_factor=0.5):
297
- # Apply low-rank approximation to linear layers using Singular Value Decomposition (SVD)
298
  for module in model.modules():
299
  if isinstance(module, torch.nn.Linear):
300
  original_weight = module.weight.data
301
- U, S, V = torch.linalg.svd(original_weight) # Perform SVD
302
- rank = int(S.size(0) * rank_factor) # Calculate the reduced rank
303
- # Reconstruct the weight matrix with the reduced rank
+ U, S, V = torch.linalg.svd(original_weight)
+ rank = int(S.size(0) * rank_factor)
304
  module.weight.data = U[:, :rank] @ torch.diag(S[:rank]) @ V[:rank, :]
305
  return model
306
 
307
  def use_hashing_trick(model, num_hashes=1024):
308
  def hash_features(features):
309
- # Convert features to bytes
310
  features_bytes = features.cpu().numpy().tobytes()
311
- # Calculate hash using SHA256
312
  hash_object = hashlib.sha256(features_bytes)
313
  hash_value = hash_object.hexdigest()
314
- # Convert hash to integer and modulo by num_hashes
315
  hashed_features = int(hash_value, 16) % num_hashes
316
  return torch.tensor(hashed_features, device=features.device)
317
 
318
- # Modify the model's forward pass to incorporate hashing
319
  original_forward = model.forward
320
 
321
  def forward(*args, **kwargs):
322
- inputs = args[0] # Assuming the first argument is the input features
+ inputs = args[0]
323
  hashed_inputs = hash_features(inputs)
324
  return original_forward(hashed_inputs, *args[1:], **kwargs)
325
 
@@ -327,97 +227,54 @@ def use_hashing_trick(model, num_hashes=1024):
327
  return model
328
 
329
  def use_quantization_aware_training(model):
330
- # Set the quantization configuration for QAT
331
  model.qconfig = torch.quantization.get_default_qat_qconfig('fbgemm')
332
- # Prepare the model for quantization-aware training
333
  torch.quantization.prepare_qat(model, inplace=True)
334
- # ... (Train the model using quantization-aware training)
335
- # Convert the model to quantized form after training
336
  torch.quantization.convert(model, inplace=True)
337
  return model
338
 
339
  def use_gradient_checkpointing(model):
340
- # Enable gradient checkpointing for the model
341
  def custom_forward(*inputs):
342
  return checkpoint(model, *inputs)
343
  model.forward = custom_forward
344
  return model
345
 
346
- def use_model_pruning(model, prune_amount=0.1):
347
- # Apply pruning to the model
348
- return prune_model(model, amount=prune_amount)
349
-
350
- def use_distillation_then_pruning(model, teacher_model, prune_amount=0.1):
351
- # Apply knowledge distillation followed by pruning
352
- model = use_knowledge_distillation(model, teacher_model)
353
- model = prune_model(model, amount=prune_amount)
354
- return model
355
-
356
  def use_channel_pruning(model, prune_amount=0.1):
357
- # Apply channel pruning to convolutional layers in the model
358
  for module in model.modules():
359
  if isinstance(module, torch.nn.Conv2d):
360
- # Apply L1 structured pruning to the convolutional layer's weight
361
  prune.ln_structured(module, name="weight", amount=prune_amount, n=2, dim=0)
362
- # Remove the pruned weights
363
  prune.remove(module, 'weight')
364
  return model
365
 
366
  def use_sparse_tensors(model, sparsity_threshold=0.01):
367
- # Convert dense tensors to sparse tensors based on a sparsity threshold
368
  for name, param in model.named_parameters():
369
  if param.dim() >= 2 and param.is_floating_point():
370
- # Convert the parameter to a sparse tensor
371
  sparse_param = param.to_sparse()
372
- # Set values below the threshold to 0 in the sparse tensor
373
  sparse_param._values()[sparse_param._values().abs() < sparsity_threshold] = 0
374
- # Convert the sparse tensor back to a dense tensor
375
  param.data = sparse_param.to_dense()
376
  return model
377
 
378
- def use_hardware_acceleration(model):
379
- # Hardware acceleration is usually handled automatically by the deep learning framework
380
- return model
381
-
382
  def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size,
383
  oauth_token: gr.OAuthToken | None):
384
- # Check if the user is logged in
385
  if oauth_token.token is None:
386
  raise ValueError("You must be logged in to use GGUF-my-repo")
387
-
388
- # Extract the model name from the model ID
389
  model_name = model_id.split('/')[-1]
390
- # Define the filename for the FP16 GGUF model
391
  fp16 = f"{model_name}.fp16.gguf"
392
 
393
  try:
394
- # Initialize the Hugging Face API
395
  api = HfApi(token=oauth_token.token)
396
-
397
- # Define the file patterns to download from the repository
398
  dl_pattern = ["*.safetensors", "*.bin", "*.pt", "*.onnx", "*.h5", "*.tflite", "*.ckpt", "*.pb", "*.tar", "*.xml", "*.caffemodel", "*.md", "*.json", "*.model"]
399
  pattern = "*.safetensors" if any(file.path.endswith(".safetensors") for file in api.list_repo_tree(repo_id=model_id, recursive=True)) else "*.bin"
400
  dl_pattern += pattern
401
-
402
- # Download the model files from the Hugging Face repository
403
  api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
404
-
405
- # Define the command to convert the model to FP16 GGUF format
406
  conversion_script = "convert_hf_to_gguf.py"
407
  fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
408
-
409
- # Execute the conversion command
410
  result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
411
-
412
- # Check if the conversion was successful
413
  if result.returncode != 0:
414
  raise Exception(f"Error converting to fp16: {result.stderr}")
415
 
416
- # Load the model
417
  config = AutoConfig.from_pretrained(model_name)
418
  model = AutoModelForCausalLM.from_pretrained(model_name, config=config, torch_dtype=torch.float16)
419
 
420
- # Apply model optimization techniques
421
  model = optimize_model_resources(model)
422
  model = apply_quantization(model, use_int8_inference=True)
423
  model = reduce_layers(model, reduction_factor=0.5)
@@ -430,8 +287,6 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
430
  model = use_selective_quantization(model)
431
  model = use_mixed_precision(model)
432
  model = use_pruning_after_training(model, prune_amount=0.1)
433
- teacher_model = deepcopy(model) # Create a copy for knowledge distillation
434
- model = use_knowledge_distillation(model, teacher_model)
435
  model = use_weight_sharing(model)
436
  model = use_low_rank_approximation(model, rank_factor=0.5)
437
  model = use_quantization_aware_training(model)
@@ -440,72 +295,49 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
440
  model = use_sparse_tensors(model, sparsity_threshold=0.01)
441
  model = use_hashing_trick(model, num_hashes=1024)
442
 
443
- # Save the optimized model
444
  model.save_pretrained(model_name)
445
 
446
- # Define the path to the importance matrix file
447
  imatrix_path = "llama.cpp/imatrix.dat"
448
-
449
- # Generate the importance matrix if the use_imatrix flag is set
450
  if use_imatrix:
451
  if train_data_file:
452
  train_data_path = train_data_file.name
453
  else:
454
  train_data_path = "groups_merged.txt"
455
- # Check if the training data file exists
456
  if not os.path.isfile(train_data_path):
457
  raise Exception(f"Training data file not found: {train_data_path}")
458
- # Generate the importance matrix
459
  generate_importance_matrix(fp16, train_data_path)
460
 
461
- # Get the username of the logged-in user
462
  username = whoami(oauth_token.token)["name"]
463
-
464
- # Define the filename for the quantized GGUF model
465
  quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
466
  quantized_gguf_path = quantized_gguf_name
467
 
468
- # Construct the command to quantize the model using llama.cpp
469
  if use_imatrix:
470
  quantise_ggml = f"./llama.cpp/llama-quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
471
  else:
472
  quantise_ggml = f"./llama.cpp/llama-quantize {fp16} {quantized_gguf_path} {q_method}"
473
 
474
- # Execute the quantization command
475
  result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
476
-
477
- # Check if the quantization was successful
478
  if result.returncode != 0:
479
  raise Exception(f"Error quantizing: {result.stderr}")
480
 
481
- # Verify the processed model
482
  try:
483
- # Run the llama.cpp binary with the quantized model and a test prompt
484
  subprocess.run(["llama.cpp/llama", "-m", quantized_gguf_path, "-p", "Test prompt"], check=True)
485
  except Exception as e:
486
  raise Exception(f"Model verification failed: {e}")
487
 
488
- # Create a new Hugging Face repository for the quantized model
489
  new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
490
  new_repo_id = new_repo_url.repo_id
491
 
492
- # Load the model card from the original model
493
  try:
494
  card = ModelCard.load(model_id, token=oauth_token.token)
495
  except:
496
- # Create an empty model card if loading fails
497
  card = ModelCard("")
498
 
499
- # Add tags to the model card
500
  if card.data.tags is None:
501
  card.data.tags = []
502
  card.data.tags.append("llama-cpp")
503
  card.data.tags.append("gguf-my-repo")
504
-
505
- # Set the base model in the model card
506
  card.data.base_model = model_id
507
-
508
- # Set the model card text
509
  card.text = dedent(
510
  f"""
511
  # {new_repo_id}
@@ -550,10 +382,8 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
550
  ```
551
  """
552
  )
553
- # Save the model card to a file
554
  card.save(f"README.md")
555
 
556
- # Upload the quantized model to the Hugging Face repository
557
  if split_model:
558
  split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
559
  else:
@@ -562,105 +392,72 @@ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_rep
562
  except Exception as e:
563
  raise Exception(f"Error uploading quantized model: {e}")
564
 
565
- # Upload the importance matrix file if it exists
566
  if os.path.isfile(imatrix_path):
567
  try:
568
  api.upload_file(path_or_fileobj=imatrix_path, path_in_repo="imatrix.dat", repo_id=new_repo_id)
569
  except Exception as e:
570
  raise Exception(f"Error uploading imatrix.dat: {e}")
571
 
572
- # Upload the model card file
573
  api.upload_file(path_or_fileobj=f"README.md", path_in_repo=f"README.md", repo_id=new_repo_id)
574
 
575
- # Return a message with a link to the new repository
576
  return (f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>', "llama.png")
577
  except Exception as e:
578
- # Return an error message if an exception occurs
579
  return (f"Error: {e}", "error.png")
580
  finally:
581
- # Remove the downloaded model directory
582
  shutil.rmtree(model_name, ignore_errors=True)
583
 
584
- # Define the CSS styles for the Gradio interface
585
  css="""/* Custom CSS to allow scrolling */ .gradio-container {overflow-y: auto;}"""
586
 
587
- # Create the Gradio interface
588
  with gr.Blocks(css=css) as demo:
589
- # Display a message indicating that the user must be logged in
590
  gr.Markdown("You must be logged in to use GGUF-my-repo.")
591
- # Add a login button
592
  gr.LoginButton(min_width=250)
593
- # Add a search bar for Hugging Face model IDs
594
  model_id = HuggingfaceHubSearch(label="Hub Model ID", placeholder="Search for model id on Huggingface", search_type="model")
595
 
596
- # Quantization Options
597
- # Dropdown menu for selecting the quantization method
598
  q_method = gr.Dropdown(["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
599
  label="Quantization Method", info="GGML quantization type", value="Q2_K", filterable=False, visible=True)
600
- # Dropdown menu for selecting the imatrix quantization method
601
  imatrix_q_method = gr.Dropdown(["IQ1", "IQ1_S", "IQ1_XXS", "IQ2_S", "IQ2_XXS", "IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
602
  label="Imatrix Quantization Method", info="GGML imatrix quants type", value="IQ4_NL", filterable=False, visible=False)
603
- # Checkbox for enabling imatrix quantization
604
  use_imatrix = gr.Checkbox(value=False, label="Use Imatrix Quantization", info="Use importance matrix for quantization.")
605
- # File upload component for the training data file
606
  train_data_file = gr.File(label="Training Data File", file_types=["txt"], visible=False)
607
 
608
- # Repo Options
609
- # Checkbox for creating a private repository
610
  private_repo = gr.Checkbox(value=False, label="Private Repo", info="Create a private repo under your username.")
611
- # Checkbox for splitting the model into shards
612
  split_model = gr.Checkbox(value=False, label="Split Model", info="Shard the model using gguf-split.")
613
- # Number input for the maximum number of tensors per shard
614
  split_max_tensors = gr.Number(value=256, label="Max Tensors per File", info="Maximum number of tensors per file when splitting model.", visible=False)
615
- # Textbox for the maximum file size of each shard
616
  split_max_size = gr.Textbox(label="Max File Size", info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.", visible=False)
617
 
618
- # Dynamically show/hide options based on selections
619
- # Show/hide the quantization method dropdown based on the use_imatrix checkbox
620
  use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=not use_imatrix), inputs=use_imatrix, outputs=q_method)
621
- # Show/hide the imatrix quantization method dropdown based on the use_imatrix checkbox
622
  use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=imatrix_q_method)
623
- # Show/hide the training data file upload component based on the use_imatrix checkbox
624
  use_imatrix.change(fn=lambda use_imatrix: gr.update(visible=use_imatrix), inputs=use_imatrix, outputs=train_data_file)
625
- # Show/hide the maximum tensors per file number input based on the split_model checkbox
626
  split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_tensors)
627
- # Show/hide the maximum file size textbox based on the split_model checkbox
628
  split_model.change(fn=lambda split_model: gr.update(visible=split_model), inputs=split_model, outputs=split_max_size)
629
 
630
- # Define the Gradio interface
631
  iface = gr.Interface(
632
- fn=process_model, # The function to call when the interface is submitted
+ fn=process_model,
633
  inputs=[
634
- model_id, # The Hugging Face model ID
635
- q_method, # The quantization method
636
- use_imatrix, # Whether to use imatrix quantization
637
- imatrix_q_method, # The imatrix quantization method
638
- private_repo, # Whether to create a private repository
639
- train_data_file, # The training data file
640
- split_model, # Whether to split the model into shards
641
- split_max_tensors, # The maximum number of tensors per shard
642
- split_max_size # The maximum file size of each shard
+ model_id,
+ q_method,
+ use_imatrix,
+ imatrix_q_method,
+ private_repo,
+ train_data_file,
+ split_model,
+ split_max_tensors,
+ split_max_size
643
  ],
644
  outputs=[
645
- gr.Markdown(label="output"), # A Markdown component to display the output message
646
- gr.Image(show_label=False), # An image component to display the output image
+ gr.Markdown(label="output"),
+ gr.Image(show_label=False),
647
  ],
648
- title="Create your own GGUF Quants, blazingly fast ���!", # The title of the interface
649
- description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.", # The description of the interface
650
- api_name=False # Whether to expose the interface as an API
+ title="Create your own GGUF Quants, blazingly fast ⚡!",
+ description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
+ api_name=False
651
  )
652
 
653
- # Define a function to restart the Gradio space
654
  def restart_space():
655
- # Restart the space using the Hugging Face API
656
  HfApi().restart_space(repo_id="Ffftdtd5dtft/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
657
 
658
- # Create a background scheduler
659
  scheduler = BackgroundScheduler()
660
- # Add a job to restart the space every 6 hours (21600 seconds)
661
  scheduler.add_job(restart_space, "interval", seconds=21600)
662
- # Start the scheduler
663
  scheduler.start()
664
 
665
- # Launch the Gradio interface with queuing and debugging enabled
666
  demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True, show_api=False)
 