import gradio as gr
import torch
from transformers import TorchAoConfig, AutoModelForCausalLM, AutoTokenizer, AutoModel
import tempfile
from huggingface_hub import HfApi, list_models
from packaging import version
import os
import spaces
def hello(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None) -> str:
    # Expect a gr.OAuthProfile object as input to read the user's profile;
    # if the user is not logged in, profile is None.
    if profile is None:
        return "Hello!"
    return f"Hello {profile.name}!"
def check_model_exists(oauth_token: gr.OAuthToken | None, username, quantization_type, group_size, model_name, quantized_model_name):
    """Check if a model exists in the user's Hugging Face repository."""
    try:
        models = list_models(author=username, token=oauth_token.token)
        model_names = [model.id for model in models]
        if quantized_model_name:
            repo_name = f"{username}/{quantized_model_name}"
        else:
            if quantization_type == "int4_weight_only":
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
            else:
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
        if repo_name in model_names:
            return f"Model '{repo_name}' already exists in your repository."
        return None  # model does not exist yet
    except Exception as e:
        return f"Error checking model existence: {str(e)}"
def create_model_card(model_name, repo_name, quantization_type, group_size):
    model_card = f"""---
base_model:
- {model_name}
---

# {repo_name} (Quantized)

## Description
This model is a quantized version of the original model `{model_name}`. It has been quantized using {quantization_type} quantization with torchao.

## Quantization Details
- **Quantization Type**: {quantization_type}
- **Group Size**: {group_size if quantization_type == "int4_weight_only" else "N/A"}

## Usage
You can use this model in your applications by loading it directly from the Hugging Face Hub:

```python
from transformers import AutoModel

model = AutoModel.from_pretrained("{repo_name}")
```"""
    return model_card
@spaces.GPU  # request a GPU for this call when running on ZeroGPU Spaces
def load_model_gpu(model_name, quantization_config, auth_token):
    return AutoModel.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="cuda",
        token=auth_token.token,
    )

def load_model_cpu(model_name, quantization_config, auth_token):
    return AutoModel.from_pretrained(
        model_name,
        torch_dtype=torch.bfloat16,
        quantization_config=quantization_config,
        device_map="cpu",
        token=auth_token.token,
    )
def quantize_model(model_name, quantization_type, group_size=128, auth_token=None, username=None, device="cuda"):
    print(f"Quantizing model: {quantization_type}")
    if quantization_type == "int4_weight_only":
        quantization_config = TorchAoConfig(quantization_type, group_size=group_size)
    else:
        quantization_config = TorchAoConfig(quantization_type)
    if device == "cuda":
        model = load_model_gpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
    else:
        model = load_model_cpu(model_name, quantization_config=quantization_config, auth_token=auth_token)
    return model
def save_model(model, model_name, quantization_type, group_size=128, username=None, auth_token=None, quantized_model_name=None):
    print("Saving quantized model")
    with tempfile.TemporaryDirectory() as tmpdirname:
        # save_pretrained writes locally, so no Hub token is needed here
        model.save_pretrained(tmpdirname, safe_serialization=False)
        if quantized_model_name:
            repo_name = f"{username}/{quantized_model_name}"
        else:
            if quantization_type == "int4_weight_only":
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}-gs_{group_size}"
            else:
                repo_name = f"{username}/{model_name.split('/')[-1]}-torchao-{quantization_type.lower()}"
        model_card = create_model_card(model_name, repo_name, quantization_type, group_size)
        with open(os.path.join(tmpdirname, "README.md"), "w") as f:
            f.write(model_card)
        # Push to Hub
        api = HfApi(token=auth_token.token)
        api.create_repo(repo_name, exist_ok=True)
        api.upload_folder(
            folder_path=tmpdirname,
            repo_id=repo_name,
            repo_type="model",
        )
    return f"https://huggingface.co/{repo_name}"
def quantize_and_save(profile: gr.OAuthProfile | None, oauth_token: gr.OAuthToken | None, model_name, quantization_type, group_size, quantized_model_name, device):
    if oauth_token is None or profile is None:
        return "Error: Please sign in to your Hugging Face account to use the quantizer."
    exists_message = check_model_exists(oauth_token, profile.username, quantization_type, group_size, model_name, quantized_model_name)
    if exists_message:
        return exists_message
    if quantization_type == "int4_weight_only" and device == "cpu":
        return "Error: int4_weight_only quantization is not supported on CPU."
    try:
        quantized_model = quantize_model(model_name, quantization_type, group_size, oauth_token, profile.username, device)
        return save_model(quantized_model, model_name, quantization_type, group_size, profile.username, oauth_token, quantized_model_name)
    except Exception as e:
        return f"Error: {str(e)}"
# CSS for the interface: center the login button and style the
# (currently unused) username box
css = """
#username-box {
    background-color: #f0f8ff; /* light color */
    border-radius: 8px;
    padding: 10px;
}
.center-button {
    display: flex;
    justify-content: center;
    align-items: center;
    margin: 0 auto; /* center horizontally */
}
"""

with gr.Blocks(theme=gr.themes.Soft(), css=css) as app:
    gr.Markdown(
        """
        # 🚀 LLM Model Quantization App

        Quantize your favorite Hugging Face models and save them to your profile!
        """
    )
    gr.LoginButton(elem_id="login-button", elem_classes="center-button")
    m1 = gr.Markdown()
    app.load(hello, inputs=None, outputs=m1)
    with gr.Row():
        with gr.Column():
            model_name = gr.Textbox(
                label="Model Name",
                placeholder="e.g., meta-llama/Meta-Llama-3-8B",
                value="meta-llama/Meta-Llama-3-8B"
            )
            quantization_type = gr.Dropdown(
                label="Quantization Type",
                choices=["int4_weight_only", "int8_weight_only", "int8_dynamic_activation_int8_weight"],
                value="int8_weight_only"
            )
            group_size = gr.Number(
                label="Group Size (only for int4_weight_only)",
                value=128,
                interactive=True
            )
            device = gr.Dropdown(
                label="Device (int4 only works with cuda)",
                choices=["cuda", "cpu"],
                value="cuda"
            )
            quantized_model_name = gr.Textbox(
                label="Quantized Model Name (optional: overrides the default name)",
                value="",
                interactive=True
            )
            # with gr.Row():
            #     username = gr.Textbox(
            #         label="Hugging Face Username",
            #         placeholder="Enter your Hugging Face username",
            #         value="",
            #         interactive=True,
            #         elem_id="username-box"
            #     )
        with gr.Column():
            quantize_button = gr.Button("Quantize and Save Model", variant="primary")
            output_link = gr.Textbox(label="Quantized Model Link")
    gr.Markdown(
        """
        ## Instructions
        1. Log in to your Hugging Face account.
        2. Enter the name of the Hugging Face LLM you want to quantize (make sure you have access to it).
        3. Choose the quantization type.
        4. Optionally, specify the group size.
        5. Optionally, choose a custom name for the quantized model.
        6. Click "Quantize and Save Model" to start the process.
        7. Once complete, you'll receive a link to the quantized model on Hugging Face.

        Note: This process may take some time depending on the model size and your hardware. You can check the container logs to follow the progress.
        """
    )
    quantize_button.click(
        fn=quantize_and_save,
        inputs=[model_name, quantization_type, group_size, quantized_model_name, device],
        outputs=[output_link]
    )

# Launch the app
app.launch()
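# Minimal usage sketch (commented out so it does not run with the app):
# loading a model produced by this Space. The repo id below is hypothetical;
# substitute the link returned in the "Quantized Model Link" box.
#
#   from transformers import AutoModel
#   model = AutoModel.from_pretrained(
#       "your-username/Meta-Llama-3-8B-torchao-int8_weight_only",
#       device_map="auto",
#   )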