mgoin committed
Commit de81c99 · 1 Parent(s): 610c32f

Conversion app

Files changed (2):
  1. app.py +313 -51
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,64 +1,326 @@
- import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ import os
+ from typing import Optional, Tuple, List
+ import gradio as gr
+ import torch
+ import spaces
+ from dataclasses import dataclass
+ from huggingface_hub import HfApi, Repository, CommitOperationAdd
+ from transformers import AutoProcessor
+ from llmcompressor.modifiers.quantization import QuantizationModifier
+ from llmcompressor.transformers import oneshot, wrap_hf_model_class
+
+ @dataclass
+ class CommitInfo:
+     repo_url: str
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ def get_model_class(class_name: str):
+     """Dynamically import and return the specified model class from transformers"""
+     try:
+         # Default to AutoModelForCausalLM if not specified
+         if not class_name:
+             from transformers import AutoModelForCausalLM
+             return AutoModelForCausalLM
+
+         exec(f"from transformers import {class_name}")
+         return eval(class_name)
+     except Exception as e:
+         raise ValueError(f"Failed to import model class {class_name}: {str(e)}")
+
+ def parse_ignore_list(ignore_str: str) -> List[str]:
+     """Parse comma-separated ignore list string into list"""
+     if not ignore_str:
+         return ["lm_head"]  # Default ignore list
+     return [item.strip() for item in ignore_str.split(',') if item.strip()]
+
+ def create_quantized_model(
+     model_id: str,
+     work_dir: str,
+     api: HfApi,
+     ignore_list: List[str],
+     model_class_name: str
+ ) -> Tuple[str, List[Tuple[str, Exception]]]:
+     """Quantize model to FP8 and save to disk"""
+
+     errors = []
+     try:
+         # Get the appropriate model class
+         model_class = get_model_class(model_class_name)
+         wrapped_model_class = wrap_hf_model_class(model_class)
+
+         # Load model with ZeroGPU
+         model = wrapped_model_class.from_pretrained(
+             model_id,
+             device_map="auto",
+             torch_dtype="auto",
+             trust_remote_code=True,
+             _attn_implementation="eager"
+         )
+         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+         # Configure quantization
+         recipe = QuantizationModifier(
+             targets="Linear",
+             scheme="FP8_DYNAMIC",
+             ignore=ignore_list,
+         )
+
+         # Apply quantization
+         save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
+         oneshot(model=model, recipe=recipe, output_dir=save_dir)
+         processor.save_pretrained(save_dir)
+
+         return save_dir, errors
+
+     except Exception as e:
+         errors.append((model_id, e))
+         raise e
+
+ def push_to_hub(
+     api: HfApi,
+     model_id: str,
+     quantized_path: str,
+     token: str,
+     ignore_list: List[str],
+     model_class_name: str,
+ ) -> CommitInfo:
+     """Create new repository with quantized model"""
+
+     # Create new model repo name
+     original_owner = model_id.split('/')[0]
+     new_model_name = f"{model_id.split('/')[-1]}-fp8"
+
+     # Get the token owner's username
+     token_owner = api.whoami(token)["name"]
+
+     # Create the new repo under the token owner's account
+     target_repo = f"{token_owner}/{new_model_name}"
+
+     # Create model card content
+     model_card = f"""---
+ language:
+ - en
+ license: apache-2.0
+ tags:
+ - fp8
+ - quantized
+ - llmcompressor
+ base_model: {model_id}
+ quantization_config:
+   ignored_layers: {ignore_list}
+   model_class: {model_class_name}
+ ---
+
+ # {new_model_name}
+
+ This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) using [LLM Compressor](https://github.com/georgian-io/LLM-Compressor).
+
+ ## Quantization Details
+
+ - Weights quantized to FP8 with per channel PTQ
+ - Activations quantized to FP8 with dynamic per token
+ - Linear layers targeted for quantization
+ - Ignored layers: {ignore_list}
+ - Model class: {model_class_name}
+
+ ## Usage
+
+ ```python
+ from transformers import {model_class_name}, AutoProcessor
+
+ model = {model_class_name}.from_pretrained("{target_repo}")
+ processor = AutoProcessor.from_pretrained("{target_repo}")
+ ```
+ """
+
+     # Create new repository
+     api.create_repo(
+         repo_id=target_repo,
+         private=False,
+         exist_ok=True,
+     )
+
+     # Prepare operations for upload
+     operations = [
+         CommitOperationAdd(path_in_repo="README.md", path_or_content=model_card),
+     ]
+
+     # Add all files from quantized model
+     for root, _, files in os.walk(quantized_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             relative_path = os.path.relpath(file_path, quantized_path)
+             operations.append(
+                 CommitOperationAdd(
+                     path_in_repo=relative_path,
+                     path_or_content=file_path
+                 )
+             )
+
+     # Upload files
+     api.create_commit(
+         repo_id=target_repo,
+         operations=operations,
+         commit_message=f"Add FP8 quantized version of {model_id}",
+     )
+
+     return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")
+
+ @spaces.GPU(duration=300)  # 5 minutes timeout for large models
+ def run(
+     model_id: str,
+     is_private: bool,
+     token: str,
+     ignore_str: str,
+     model_class_name: str
+ ) -> str:
+     """Main function to handle quantization and model upload"""
+
+     if not token or model_id == "":
+         return """
+ ### Invalid input 🐞
+
+ Please provide both a token and model_id.
+ """
+
+     try:
+         # Parse ignore list
+         ignore_list = parse_ignore_list(ignore_str)
+
+         # Set up API with user's token
+         api = HfApi(token=token)
+
+         print("Processing model:", model_id)
+         print("Ignore list:", ignore_list)
+         print("Model class:", model_class_name)
+
+         # Create working directory
+         work_dir = "quantized_models"
+         os.makedirs(work_dir, exist_ok=True)
+
+         # Quantize model
+         quantized_path, errors = create_quantized_model(
+             model_id,
+             work_dir,
+             api,
+             ignore_list,
+             model_class_name
+         )
+
+         # Upload quantized model to new repository
+         commit_info = push_to_hub(
+             api,
+             model_id,
+             quantized_path,
+             token,
+             ignore_list,
+             model_class_name
+         )
+
+         response = f"""
+ ### Success 🔥
+
+ Your model has been successfully quantized to FP8 and uploaded to a new repository:
+
+ [{commit_info.repo_url}]({commit_info.repo_url})
+
+ Configuration:
+ - Ignored layers: {ignore_list}
+ - Model class: {model_class_name}
+
+ You can use this model directly with the transformers library!
+ """
+
+         if errors:
+             response += "\nWarnings during quantization:\n"
+             response += "\n".join(f"Warning for {filename}: {e}" for filename, e in errors)
+
+         return response
+
+     except Exception as e:
+         return f"""
+ ### Error 😢
+
+ An error occurred during processing:
+ {str(e)}
+ """
+
+ # Gradio Interface
+ DESCRIPTION = """
+ # Convert any model to FP8 using LLM Compressor
+
+ This space will quantize your model to FP8 format using LLM Compressor and create a new model repository under your account.
+
+ The steps are:
+ 1. Paste your HuggingFace token (from hf.co/settings/tokens) - needs write access
+ 2. Enter the model ID you want to quantize
+ 3. (Optional) Customize ignored layers and model class
+ 4. Click "Submit"
+ 5. You'll get a link to your new quantized model repository! 🚀
+
+ ## Advanced Options:
+ - **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
+   - Llama: `lm_head`
+   - Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
+   - Pixtral: `re:.*lm_head,re:multi_modal_projector.*`
+   - Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
+ - **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
+   - `MllamaForConditionalGeneration`
+   - `Qwen2VLForConditionalGeneration`
+   - `LlavaForConditionalGeneration`
+
+ Note:
+ - Processing may take several minutes depending on the model size
+ - The quantized model will be created as a new public repository under your account
+ - Your token needs write access to create the new repository
+ """
+
+ title = "FP8 Quantization with LLM Compressor"
+
+ with gr.Blocks(title=title) as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Row():
+         with gr.Column():
+             model_id = gr.Text(
+                 max_lines=1,
+                 label="model_id",
+                 placeholder="huggingface/model-name"
+             )
+             is_private = gr.Checkbox(
+                 label="Private model (requires read access to original model)"
+             )
+             token = gr.Text(
+                 max_lines=1,
+                 label="your_hf_token (requires write access)",
+                 placeholder="hf_..."
+             )
+             ignore_str = gr.Text(
+                 max_lines=1,
+                 label="ignore_list (comma-separated)",
+                 placeholder="lm_head,re:vision_model.*",
+                 value="lm_head"
+             )
+             model_class_name = gr.Text(
+                 max_lines=1,
+                 label="model_class_name (optional)",
+                 placeholder="AutoModelForCausalLM",
+                 value="AutoModelForCausalLM"
+             )
+
+             with gr.Row():
+                 clean = gr.ClearButton()
+                 submit = gr.Button("Submit", variant="primary")
+
+         with gr.Column():
+             output = gr.Markdown()
+
+     submit.click(
+         run,
+         inputs=[model_id, is_private, token, ignore_str, model_class_name],
+         outputs=output,
+         concurrency_limit=1
+     )
+
+ demo.queue(max_size=10).launch(show_api=True)
requirements.txt CHANGED
@@ -1 +1,2 @@
- huggingface_hub==0.25.2
+ huggingface_hub==0.25.2
+ llmcompressor==0.3.0
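
For reference, a minimal sketch of the same FP8_DYNAMIC flow the Space runs, executed locally against the `llmcompressor==0.3.0` API pinned above; the model ID and output directory below are placeholders, not values from this commit.

```python
# Minimal local sketch of the app's quantization step (assumes llmcompressor==0.3.0).
# The model ID and output directory are placeholders.
from transformers import AutoModelForCausalLM, AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

model_id = "your-org/your-model"        # placeholder
save_dir = "your-model-FP8-dynamic"     # placeholder

# Wrap the transformers class so llmcompressor can load and save it, then load the model
model_class = wrap_hf_model_class(AutoModelForCausalLM)
model = model_class.from_pretrained(model_id, device_map="auto", torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id)

# FP8 dynamic scheme on Linear layers, skipping lm_head (the app's default ignore list)
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# One-shot PTQ: apply the recipe and write the compressed checkpoint
oneshot(model=model, recipe=recipe, output_dir=save_dir)
processor.save_pretrained(save_dir)
```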