MekkCyber committed
Commit e5bb0c6 · 1 Parent(s): 3d2f5ba
Files changed (1)
  1. app.py +16 -9
app.py CHANGED
@@ -1,6 +1,6 @@
 import gradio as gr
 import torch
-from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer
+from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
 import tempfile
 from huggingface_hub import HfApi
 from huggingface_hub import list_models
@@ -59,14 +59,14 @@ model = AutoModel.from_pretrained("{model_name}")"""

     return model_card

-def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None):
+def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
     print(f"Quantizing model: {quantization_type}")
     if quantization_type == "int4_weight_only" :
         quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
     else :
         quantization_config = TorchAoConfig(quantization_type)
-        model = AutoModelForCausalLM.from_pretrained(model_name, device_map="cuda", torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)
+
+    model = AutoModel.from_pretrained(model_name, device_map=device, torch_dtype=torch.bfloat16, quantization_config=quantization_config, use_auth_token=auth_token.token)

     return model

@@ -97,7 +97,7 @@ def save_model(model, model_name, quantization_type, group_size=128, username=No

     return f"https://huggingface.co/{repo_name}"

-def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name):
+def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
     if oauth_token is None :
         return "Error : Please Sign In to your HuggingFace account to use the quantizer"
     if not profile:
@@ -105,14 +105,16 @@ def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToke
     exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
     if exists_message :
         return exists_message
-    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username)
+    if quantization_type == "int4_weight_only" and device == "cpu" :
+        return "int4_weight_only not supported on cpu"
+    quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
     return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)


 with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown(
         """
-        # 🚀 Model Quantization App :hugging-torch:
+        # 🚀 LLM Model Quantization App

         Quantize your favorite Hugging Face models and save them to your profile!
         """
@@ -141,6 +143,11 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             value=128,
             interactive=True
         )
+        device = gr.Dropdown(
+            label="Device (int4 only works with cuda)",
+            choices=["cuda", "cpu"],
+            value="cuda"
+        )
         quantized_model_name = gr.Textbox(
             label="Model Name (optional : to override default)",
             value="",
@@ -162,7 +169,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
         """
         ## Instructions
         1. Login to your HuggingFace account
-        2. Enter the name of the Hugging Face model you want to quantize (Make sure you have access to it)
+        2. Enter the name of the Hugging Face LLM model you want to quantize (Make sure you have access to it)
         3. Choose the quantization type.
        4. Optionally, specify the group size.
        5. Optionally, choose a custom name for the quantized model
@@ -193,7 +200,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:

     quantize_button.click(
         fn=quantize_and_save,
-        inputs=[model_name, quantization_type, group_size, quantized_model_name],
+        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
         outputs=[output_link]
     )
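
For context on the changed code path: building a TorchAoConfig and handing it to from_pretrained is what actually quantizes the weights at load time, and int4_weight_only is the only type here that takes a group_size (it also needs CUDA kernels, hence the new device guard). Below is a minimal standalone sketch of that flow; the quantize helper name, the model id, and the token handling are illustrative assumptions, not part of the app.

import torch
from transformers import AutoModelForCausalLM, TorchAoConfig

def quantize(model_name, quantization_type, group_size=128, device="cuda", token=None):
    # Illustrative helper, not the app's quantize_model function.
    if quantization_type == "int4_weight_only":
        if device == "cpu":
            # Mirrors the guard added in quantize_and_save: int4 weight-only needs CUDA.
            raise ValueError("int4_weight_only not supported on cpu")
        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
    else:
        quantization_config = TorchAoConfig(quantization_type)

    # Passing quantization_config quantizes the weights while the checkpoint loads.
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        device_map=device,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        token=token,  # newer spelling of the deprecated use_auth_token argument
    )

# Hypothetical usage:
# model = quantize("facebook/opt-125m", "int8_weight_only", device="cpu")

Note also why the click wiring only appends device to inputs even though quantize_and_save now takes profile and oauth_token first: when a Space uses the sign-in flow, Gradio fills parameters type-hinted as gr.OAuthProfile / gr.OAuthToken automatically, so only the UI components need to be listed.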