mgoin committed
Commit de81c99 · 1 Parent(s): 610c32f

Conversion app

Files changed (2):
  1. app.py +313 -51
  2. requirements.txt +2 -1
app.py CHANGED
@@ -1,64 +1,326 @@
- import gradio as gr
- from huggingface_hub import InferenceClient
-
- """
- For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
- """
- client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
-
-
- def respond(
-     message,
-     history: list[tuple[str, str]],
-     system_message,
-     max_tokens,
-     temperature,
-     top_p,
- ):
-     messages = [{"role": "system", "content": system_message}]
-
-     for val in history:
-         if val[0]:
-             messages.append({"role": "user", "content": val[0]})
-         if val[1]:
-             messages.append({"role": "assistant", "content": val[1]})
-
-     messages.append({"role": "user", "content": message})
-
-     response = ""
-
-     for message in client.chat_completion(
-         messages,
-         max_tokens=max_tokens,
-         stream=True,
-         temperature=temperature,
-         top_p=top_p,
-     ):
-         token = message.choices[0].delta.content
-
-         response += token
-         yield response
-
-
- """
- For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
- """
- demo = gr.ChatInterface(
-     respond,
-     additional_inputs=[
-         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-         gr.Slider(
-             minimum=0.1,
-             maximum=1.0,
-             value=0.95,
-             step=0.05,
-             label="Top-p (nucleus sampling)",
-         ),
-     ],
- )
-
-
- if __name__ == "__main__":
-     demo.launch()
+ import os
+ from typing import Optional, Tuple, List
+ import gradio as gr
+ import torch
+ import spaces
+ from dataclasses import dataclass
+ from huggingface_hub import HfApi, Repository, CommitOperationAdd
+ from transformers import AutoProcessor
+ from llmcompressor.modifiers.quantization import QuantizationModifier
+ from llmcompressor.transformers import oneshot, wrap_hf_model_class
+
+ @dataclass
+ class CommitInfo:
+     repo_url: str
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ def get_model_class(class_name: str):
+     """Dynamically import and return the specified model class from transformers"""
+     try:
+         # Default to AutoModelForCausalLM if not specified
+         if not class_name:
+             from transformers import AutoModelForCausalLM
+             return AutoModelForCausalLM
+
+         exec(f"from transformers import {class_name}")
+         return eval(class_name)
+     except Exception as e:
+         raise ValueError(f"Failed to import model class {class_name}: {str(e)}")
+
+ def parse_ignore_list(ignore_str: str) -> List[str]:
+     """Parse comma-separated ignore list string into list"""
+     if not ignore_str:
+         return ["lm_head"]  # Default ignore list
+     return [item.strip() for item in ignore_str.split(',') if item.strip()]
+
+ def create_quantized_model(
+     model_id: str,
+     work_dir: str,
+     api: HfApi,
+     ignore_list: List[str],
+     model_class_name: str
+ ) -> Tuple[str, List[Tuple[str, Exception]]]:
+     """Quantize model to FP8 and save to disk"""
+
+     errors = []
+     try:
+         # Get the appropriate model class
+         model_class = get_model_class(model_class_name)
+         wrapped_model_class = wrap_hf_model_class(model_class)
+
+         # Load model with ZeroGPU
+         model = wrapped_model_class.from_pretrained(
+             model_id,
+             device_map="auto",
+             torch_dtype="auto",
+             trust_remote_code=True,
+             _attn_implementation="eager"
+         )
+         processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
+
+         # Configure quantization
+         recipe = QuantizationModifier(
+             targets="Linear",
+             scheme="FP8_DYNAMIC",
+             ignore=ignore_list,
+         )
+
+         # Apply quantization
+         save_dir = os.path.join(work_dir, f"{model_id.split('/')[-1]}-FP8-dynamic")
+         oneshot(model=model, recipe=recipe, output_dir=save_dir)
+         processor.save_pretrained(save_dir)
+
+         return save_dir, errors
+
+     except Exception as e:
+         errors.append((model_id, e))
+         raise e
+
+ def push_to_hub(
+     api: HfApi,
+     model_id: str,
+     quantized_path: str,
+     token: str,
+     ignore_list: List[str],
+     model_class_name: str,
+ ) -> CommitInfo:
+     """Create new repository with quantized model"""
+
+     # Create new model repo name
+     original_owner = model_id.split('/')[0]
+     new_model_name = f"{model_id.split('/')[-1]}-fp8"
+
+     # Get the token owner's username
+     token_owner = api.whoami(token)["name"]
+
+     # Create the new repo under the token owner's account
+     target_repo = f"{token_owner}/{new_model_name}"
+
+     # Create model card content
+     model_card = f"""---
+ language:
+ - en
+ license: apache-2.0
+ tags:
+ - fp8
+ - quantized
+ - llmcompressor
+ base_model: {model_id}
+ quantization_config:
+   ignored_layers: {ignore_list}
+   model_class: {model_class_name}
+ ---
+
+ # {new_model_name}
+
+ This is an FP8-quantized version of [{model_id}](https://huggingface.co/{model_id}) using [LLM Compressor](https://github.com/georgian-io/LLM-Compressor).
+
+ ## Quantization Details
+
+ - Weights quantized to FP8 with per channel PTQ
+ - Activations quantized to FP8 with dynamic per token
+ - Linear layers targeted for quantization
+ - Ignored layers: {ignore_list}
+ - Model class: {model_class_name}
+
+ ## Usage
+
+ ```python
+ from transformers import {model_class_name}, AutoProcessor
+
+ model = {model_class_name}.from_pretrained("{target_repo}")
+ processor = AutoProcessor.from_pretrained("{target_repo}")
+ ```
+ """
+
+     # Create new repository
+     api.create_repo(
+         repo_id=target_repo,
+         private=False,
+         exist_ok=True,
+     )
+
+     # Prepare operations for upload
+     operations = [
+         CommitOperationAdd(path_in_repo="README.md", path_or_content=model_card),
+     ]
+
+     # Add all files from quantized model
+     for root, _, files in os.walk(quantized_path):
+         for file in files:
+             file_path = os.path.join(root, file)
+             relative_path = os.path.relpath(file_path, quantized_path)
+             operations.append(
+                 CommitOperationAdd(
+                     path_in_repo=relative_path,
+                     path_or_content=file_path
+                 )
+             )
+
+     # Upload files
+     api.create_commit(
+         repo_id=target_repo,
+         operations=operations,
+         commit_message=f"Add FP8 quantized version of {model_id}",
+     )
+
+     return CommitInfo(repo_url=f"https://huggingface.co/{target_repo}")
+
+ @spaces.GPU(duration=300)  # 5 minutes timeout for large models
+ def run(
+     model_id: str,
+     is_private: bool,
+     token: str,
+     ignore_str: str,
+     model_class_name: str
+ ) -> str:
+     """Main function to handle quantization and model upload"""
+
+     if not token or model_id == "":
+         return """
+ ### Invalid input 🐞
+
+ Please provide both a token and model_id.
+ """
+
+     try:
+         # Parse ignore list
+         ignore_list = parse_ignore_list(ignore_str)
+
+         # Set up API with user's token
+         api = HfApi(token=token)
+
+         print("Processing model:", model_id)
+         print("Ignore list:", ignore_list)
+         print("Model class:", model_class_name)
+
+         # Create working directory
+         work_dir = "quantized_models"
+         os.makedirs(work_dir, exist_ok=True)
+
+         # Quantize model
+         quantized_path, errors = create_quantized_model(
+             model_id,
+             work_dir,
+             api,
+             ignore_list,
+             model_class_name
+         )
+
+         # Upload quantized model to new repository
+         commit_info = push_to_hub(
+             api,
+             model_id,
+             quantized_path,
+             token,
+             ignore_list,
+             model_class_name
+         )
+
+         response = f"""
+ ### Success 🔥
+
+ Your model has been successfully quantized to FP8 and uploaded to a new repository:
+
+ [{commit_info.repo_url}]({commit_info.repo_url})
+
+ Configuration:
+ - Ignored layers: {ignore_list}
+ - Model class: {model_class_name}
+
+ You can use this model directly with the transformers library!
+ """
+
+         if errors:
+             response += "\nWarnings during quantization:\n"
+             response += "\n".join(f"Warning for {filename}: {e}" for filename, e in errors)
+
+         return response
+
+     except Exception as e:
+         return f"""
+ ### Error 😢
+
+ An error occurred during processing:
+ {str(e)}
+ """
+
+ # Gradio Interface
+ DESCRIPTION = """
+ # Convert any model to FP8 using LLM Compressor
+
+ This space will quantize your model to FP8 format using LLM Compressor and create a new model repository under your account.
+
+ The steps are:
+ 1. Paste your HuggingFace token (from hf.co/settings/tokens) - needs write access
+ 2. Enter the model ID you want to quantize
+ 3. (Optional) Customize ignored layers and model class
+ 4. Click "Submit"
+ 5. You'll get a link to your new quantized model repository! 🚀
+
+ ## Advanced Options:
+ - **Ignore List**: Comma-separated list of layer patterns to ignore during quantization. Examples:
+   - Llama: `lm_head`
+   - Phi3v: `re:.*lm_head,re:model.vision_embed_tokens.*`
+   - Pixtral: `re:.*lm_head,re:multi_modal_projector.*`
+   - Llama Vision: `re:.*lm_head,re:multi_modal_projector.*,re:vision_model.*`
+ - **Model Class**: Specific model class from transformers (default: AutoModelForCausalLM). Examples:
+   - `MllamaForConditionalGeneration`
+   - `Qwen2VLForConditionalGeneration`
+   - `LlavaForConditionalGeneration`
+
+ Note:
+ - Processing may take several minutes depending on the model size
+ - The quantized model will be created as a new public repository under your account
+ - Your token needs write access to create the new repository
+ """
+
+ title = "FP8 Quantization with LLM Compressor"
+
+ with gr.Blocks(title=title) as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Row():
+         with gr.Column():
+             model_id = gr.Text(
+                 max_lines=1,
+                 label="model_id",
+                 placeholder="huggingface/model-name"
+             )
+             is_private = gr.Checkbox(
+                 label="Private model (requires read access to original model)"
+             )
+             token = gr.Text(
+                 max_lines=1,
+                 label="your_hf_token (requires write access)",
+                 placeholder="hf_..."
+             )
+             ignore_str = gr.Text(
+                 max_lines=1,
+                 label="ignore_list (comma-separated)",
+                 placeholder="lm_head,re:vision_model.*",
+                 value="lm_head"
+             )
+             model_class_name = gr.Text(
+                 max_lines=1,
+                 label="model_class_name (optional)",
+                 placeholder="AutoModelForCausalLM",
+                 value="AutoModelForCausalLM"
+             )
+
+             with gr.Row():
+                 clean = gr.ClearButton()
+                 submit = gr.Button("Submit", variant="primary")
+
+         with gr.Column():
+             output = gr.Markdown()
+
+     submit.click(
+         run,
+         inputs=[model_id, is_private, token, ignore_str, model_class_name],
+         outputs=output,
+         concurrency_limit=1
+     )
+
+ demo.queue(max_size=10).launch(show_api=True)
requirements.txt CHANGED
@@ -1 +1,2 @@
- huggingface_hub==0.25.2
+ huggingface_hub==0.25.2
+ llmcompressor==0.3.0
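
For reference, a minimal sketch of the same FP8_DYNAMIC flow the Space runs, executed locally against the `llmcompressor==0.3.0` API pinned above; the model ID and output directory below are placeholders, not values from this commit.

```python
# Minimal local sketch of the app's quantization step (assumes llmcompressor==0.3.0).
# The model ID and output directory are placeholders.
from transformers import AutoModelForCausalLM, AutoProcessor
from llmcompressor.modifiers.quantization import QuantizationModifier
from llmcompressor.transformers import oneshot, wrap_hf_model_class

model_id = "your-org/your-model"        # placeholder
save_dir = "your-model-FP8-dynamic"     # placeholder

# Wrap the transformers class so llmcompressor can load and save it, then load the model
model_class = wrap_hf_model_class(AutoModelForCausalLM)
model = model_class.from_pretrained(model_id, device_map="auto", torch_dtype="auto")
processor = AutoProcessor.from_pretrained(model_id)

# FP8 dynamic scheme on Linear layers, skipping lm_head (the app's default ignore list)
recipe = QuantizationModifier(targets="Linear", scheme="FP8_DYNAMIC", ignore=["lm_head"])

# One-shot PTQ: apply the recipe and write the compressed checkpoint
oneshot(model=model, recipe=recipe, output_dir=save_dir)
processor.save_pretrained(save_dir)
```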