import os
import subprocess

os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
import gradio as gr

import huggingface_hub
from huggingface_hub import HfApi, ModelCard

from apscheduler.schedulers.background import BackgroundScheduler
from gradio_huggingfacehub_search import HuggingfaceHubSearch

HF_PATH = "https://huggingface.co/"

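# Conversation templates accepted by mlc_llm gen_config (forwarded via --conv-template).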
CONV_TEMPLATES = [
    "llama-3",
    "llama-3_1",
    "chatml",
    "chatml_nosystem",
    "qwen2",
    "open_hermes_mistral",
    "neural_hermes_mistral",
    "llama_default",
    "llama-2",
    "mistral_default",
    "gpt2",
    "codellama_completion",
    "codellama_instruct",
    "vicuna_v1.1",
    "conv_one_shot",
    "redpajama_chat",
    "rwkv_world",
    "rwkv",
    "gorilla",
    "gorilla-openfunctions-v2",
    "guanaco",
    "dolly",
    "oasst",
    "stablelm",
    "stablecode_completion",
    "stablecode_instruct",
    "minigpt",
    "moss",
    "LM",
    "stablelm-3b",
    "gpt_bigcode",
    "wizardlm_7b",
    "wizard_coder_or_math",
    "glm",
    "custom",  # for web-llm only
    "phi-2",
    "phi-3",
    "phi-3-vision",
    "stablelm-2",
    "gemma_instruction",
    "orion",
    "llava",
    "hermes2_pro_llama3",
    "hermes3_llama-3_1",
    "tinyllama_v1_0",
    "aya-23",
]

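# Quantization codes follow the qAfB(_id) scheme: A = bits for weights, B = bits for activations,
# _id = quantization algorithm variant (e.g. AWQ).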
QUANTIZATIONS = [
    "q0f16",
    "q0f32",
    "q3f16_1",
    "q4f16_1",
    "q4f32_1",
    "q4f16_awq",
]

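# Model architectures ("model_type" in the source repo's config.json) that MLC-LLM can convert.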
SUPPORTED_MODEL_TYPES = [
    "llama",
    "mistral",
    "gemma",
    "gemma2",
    "gpt2",
    "mixtral",
    "gpt_neox",
    "gpt_bigcode",
    "phi-msft",
    "phi",
    "phi3",
    "phi3_v",
    "qwen",
    "qwen2",
    "qwen2_moe",
    "stablelm",
    "baichuan",
    "internlm",
    "internlm2",
    "rwkv5",
    "orion",
    "llava",
    "rwkv6",
    "chatglm",
    "eagle",
    "bert",
    "medusa",
    "starcoder2",
    "cohere",
    "minicpm",
]

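# Model card body for the converted repo; the {placeholders} are filled in inside button_click.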
readme_template = """
---
library_name: mlc-llm
base_model: {base_model}
tags:
- mlc-llm
- web-llm
---

# {model_name}

This is the [{base_model_name}](https://huggingface.co/{base_model}) model in MLC format `{quant_format}`.
The conversion was done using the [MLC-Weight-Conversion](https://huggingface.co/spaces/mlc-ai/MLC-Weight-Conversion) space.
The model can be used in projects [MLC-LLM](https://github.com/mlc-ai/mlc-llm) and [WebLLM](https://github.com/mlc-ai/web-llm).

## Example Usage

Here are some examples of using this model in MLC LLM.
Before running the examples, please install MLC LLM by following the [installation documentation](https://llm.mlc.ai/docs/install/mlc_llm.html#install-mlc-packages).

### Chat

From the command line, run
```bash
mlc_llm chat HF://mlc-ai/{model_name}
```

### REST Server

From the command line, run
```bash
mlc_llm serve HF://mlc-ai/{model_name}
```

### Python API

```python
from mlc_llm import MLCEngine

# Create engine
model = "HF://mlc-ai/{model_name}"
engine = MLCEngine(model)

# Run chat completion using the OpenAI-compatible API.
for response in engine.chat.completions.create(
    messages=[{{"role": "user", "content": "What is the meaning of life?"}}],
    model=model,
    stream=True,
):
    for choice in response.choices:
        print(choice.delta.content, end="", flush=True)
print("\\n")

engine.terminate()
```

## Documentation

For more information on the MLC LLM project, please visit our [documentation](https://llm.mlc.ai/docs/) and [GitHub repo](http://github.com/mlc-ai/mlc-llm).
""".strip()


def button_click(hf_model_id, conv_template, quantization, oauth_token: gr.OAuthToken | None, progress=gr.Progress()):
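    """Validate the inputs, download the source weights, convert and quantize
    them with mlc_llm, then upload the result together with a generated model
    card to the logged-in user's Hugging Face account."""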
    if oauth_token is None or oauth_token.token is None:
        return "Log in to Hugging Face to use this space"
    elif not hf_model_id:
        return "Enter a Hugging Face model ID"
    elif not conv_template:
        return "Select a conversation template"
    elif not quantization:
        return "Select a quantization method"
    
    progress(0, desc="Verifying inputs...")
    
    api = HfApi(token=oauth_token.token)
    model_dir_name = hf_model_id.split("/")[1]
    mlc_model_name = f"{model_dir_name}-{quantization}-MLC"

    os.makedirs("dist/models", exist_ok=True)
    os.system("git lfs install")

    model_info = api.repo_info(hf_model_id)
    if not isinstance(model_info, huggingface_hub.hf_api.ModelInfo):
        os.system("rm -rf dist/")
        return "The entered Hugging Face model ID is not a model repository"
    if "model_type" not in model_info.config:
        os.system("rm -rf dist/")
        return "Cannot infer model type from config file"
    if model_info.config['model_type'] not in SUPPORTED_MODEL_TYPES:
        os.system("rm -rf dist/")
        return f"Model type ({model_info.config['model_type']}) currently not supported by MLC-LLM"

    progress(0.1, desc="Downloading weights from Huggingface...")

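    # Mirror the source repo into dist/models/; on failure, wipe dist/ and surface the error.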
    try:
        api.snapshot_download(repo_id=hf_model_id, local_dir=f"./dist/models/{model_dir_name}")
    except Exception as error:
        os.system("rm -rf dist/")
        return str(error)
    
    progress(0.5, desc="Converting weights to MLC format...")

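    # mlc_llm convert_weight quantizes the source weights and writes them out in MLC format.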
    convert_weight_result = subprocess.run(
        f"mlc_llm convert_weight ./dist/models/{model_dir_name}/"
        f" --quantization {quantization}"
        f" -o dist/{mlc_model_name}",
        shell=True, capture_output=True, text=True)
    if convert_weight_result.returncode != 0:
        os.system("rm -rf dist/")
        return convert_weight_result.stderr
    
    progress(0.8, desc="Generating config...")
    
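    # mlc_llm gen_config emits mlc-chat-config.json and processed tokenizer files for the chosen template.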
    gen_config_result = subprocess.run(
        f"mlc_llm gen_config ./dist/models/{model_dir_name}/"
        f" --quantization {quantization} --conv-template {conv_template}"
        f" -o dist/{mlc_model_name}/",
        shell=True, capture_output=True, text=True)
    if gen_config_result.returncode != 0:
        os.system("rm -rf dist/")
        return gen_config_result.stderr
    
    progress(0.9, desc="Creating your Huggingface repo...")

    # push to HF
    user_name = api.whoami()["name"]
    created_repo_url = api.create_repo(repo_id=f"{user_name}/{mlc_model_name}", private=False) # set public
    created_repo_id = created_repo_url.repo_id

    api.upload_large_folder(folder_path=f"./dist/{mlc_model_name}",
                            repo_id=f"{user_name}/{mlc_model_name}",
                            repo_type="model")
    
    # push model card to HF
    card = ModelCard.load(hf_model_id, token=oauth_token.token)
    if not card.data.tags:
        card.data.tags = []
    card.data.tags.append("mlc-ai")
    card.data.tags.append("MLC-Weight-Conversion")
    card.data.base_model = hf_model_id

    card.text = readme_template.format(
        model_name=f"{user_name}/{mlc_model_name}",
        base_model=hf_model_id,
        base_model_name=model_dir_name,
        quant_format=quantization,
    )
    card.save("./dist/README.md")

    api.upload_file(path_or_fileobj="./dist/README.md",
                    path_in_repo="README.md",
                    repo_id=created_repo_id,
                    repo_type="model")

    os.system("rm -rf dist/")
    return "Successful, please find your compiled LLM model on your personal account"

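# Remove any dist/ directory left behind by cancelled or failed conversions.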
def clean():
    os.system("rm -rf dist/")

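# Factory-reboot the Space periodically so accumulated disk and memory usage get reset.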
def restart_space():
    HfApi().restart_space(repo_id="mlc-ai/MLC-Weight-Conversion", token=os.environ.get("HF_TOKEN"), factory_reboot=True)

with gr.Blocks() as demo:
    gr.LoginButton()
    gr.Markdown(
    """
    # Compile your LLM with MLC-LLM and run it locally!
    ### This space takes in a Hugging Face model ID and converts the model using your selected conversation template and quantization method.
    """)
    model_id = HuggingfaceHubSearch(
        label="HF Model ID",
        placeholder="Search for your model on Huggingface",
        search_type="model",
    )
    conv = gr.Dropdown(CONV_TEMPLATES, label="Conversation Template")
    quant = gr.Dropdown(QUANTIZATIONS, label="Quantization Method", info="Quantization codes have the form qAfB(_id), where A is the number of bits for storing weights and B is the number of bits for storing activations. The optional _id is an integer identifier that distinguishes different quantization algorithms (e.g. symmetric, non-symmetric, AWQ).")
    btn = gr.Button("Convert to MLC")
    btn2 = gr.Button("Cancel Conversion")
    out = gr.Textbox(label="Conversion Result")
    click_event = btn.click(fn=button_click, inputs=[model_id, conv, quant], outputs=out)
    btn2.click(fn=None, inputs=None, outputs=None, cancels=[click_event], js="window.location.reload()")

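# Housekeeping: clear temp files every 6 hours (21600 s) and restart the Space every 24 hours (86400 s).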
scheduler = BackgroundScheduler()
scheduler.add_job(clean, "interval", seconds=21600)
scheduler.add_job(restart_space, "interval", seconds=86400)
scheduler.start()

demo.queue(max_size=5).launch()