Files changed (6)
  1. .gitattributes +37 -36
  2. Dockerfile +71 -57
  3. README.md +15 -15
  4. app.py +372 -282
  5. imatrix_calibration.txt +0 -0
  6. start.sh +5 -4
.gitattributes CHANGED
@@ -1,36 +1,37 @@
- *.7z filter=lfs diff=lfs merge=lfs -text
- *.arrow filter=lfs diff=lfs merge=lfs -text
- *.bin filter=lfs diff=lfs merge=lfs -text
- *.bz2 filter=lfs diff=lfs merge=lfs -text
- *.ckpt filter=lfs diff=lfs merge=lfs -text
- *.ftz filter=lfs diff=lfs merge=lfs -text
- *.gz filter=lfs diff=lfs merge=lfs -text
- *.h5 filter=lfs diff=lfs merge=lfs -text
- *.joblib filter=lfs diff=lfs merge=lfs -text
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
- *.model filter=lfs diff=lfs merge=lfs -text
- *.msgpack filter=lfs diff=lfs merge=lfs -text
- *.npy filter=lfs diff=lfs merge=lfs -text
- *.npz filter=lfs diff=lfs merge=lfs -text
- *.onnx filter=lfs diff=lfs merge=lfs -text
- *.ot filter=lfs diff=lfs merge=lfs -text
- *.parquet filter=lfs diff=lfs merge=lfs -text
- *.pb filter=lfs diff=lfs merge=lfs -text
- *.pickle filter=lfs diff=lfs merge=lfs -text
- *.pkl filter=lfs diff=lfs merge=lfs -text
- *.pt filter=lfs diff=lfs merge=lfs -text
- *.pth filter=lfs diff=lfs merge=lfs -text
- *.rar filter=lfs diff=lfs merge=lfs -text
- *.safetensors filter=lfs diff=lfs merge=lfs -text
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
- *.tar.* filter=lfs diff=lfs merge=lfs -text
- *.tar filter=lfs diff=lfs merge=lfs -text
- *.tflite filter=lfs diff=lfs merge=lfs -text
- *.tgz filter=lfs diff=lfs merge=lfs -text
- *.wasm filter=lfs diff=lfs merge=lfs -text
- *.xz filter=lfs diff=lfs merge=lfs -text
- *.zip filter=lfs diff=lfs merge=lfs -text
- *.zst filter=lfs diff=lfs merge=lfs -text
- *tfevents* filter=lfs diff=lfs merge=lfs -text
- llama.png filter=lfs diff=lfs merge=lfs -text
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ llama.png filter=lfs diff=lfs merge=lfs -text
+ imatrix_calibration.txt filter=lfs diff=lfs merge=lfs -text
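The only substantive change to `.gitattributes` is the final rule, which stores the new calibration file in Git LFS. For reference, this is the line that a `git lfs track` call like the following would append (a sketch; it assumes Git LFS is already set up for the repo):

```bash
# Track the calibration file with Git LFS, then stage the updated attributes file
git lfs track "imatrix_calibration.txt"
git add .gitattributes imatrix_calibration.txt
```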
Dockerfile CHANGED
@@ -1,57 +1,71 @@
- FROM python:3.9
- ENV DEBIAN_FRONTEND=noninteractive
- RUN apt-get update && \
-     apt-get upgrade -y && \
-     apt-get install -y --no-install-recommends \
-     git \
-     git-lfs \
-     wget \
-     curl \
-     # python build dependencies \
-     build-essential \
-     libssl-dev \
-     zlib1g-dev \
-     libbz2-dev \
-     libreadline-dev \
-     libsqlite3-dev \
-     libncursesw5-dev \
-     xz-utils \
-     tk-dev \
-     libxml2-dev \
-     libxmlsec1-dev \
-     libffi-dev \
-     liblzma-dev \
-     # gradio dependencies \
-     ffmpeg
-
- RUN useradd -m -u 1000 user
- USER user
- ENV HOME=/home/user \
-     PATH=/home/user/.local/bin:${PATH}
- WORKDIR ${HOME}/app
-
- RUN curl https://pyenv.run | bash
- ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
- ARG PYTHON_VERSION=3.10.13
- RUN pyenv install ${PYTHON_VERSION} && \
-     pyenv global ${PYTHON_VERSION} && \
-     pyenv rehash && \
-     pip install --no-cache-dir -U pip setuptools wheel && \
-     pip install "huggingface-hub" "hf-transfer" "gradio[oauth]>=4.28.0" "gradio_huggingfacehub_search==0.0.7" "APScheduler"
-
- COPY --chown=1000 . ${HOME}/app
- RUN git clone https://github.com/ggerganov/llama.cpp
- RUN pip install -r llama.cpp/requirements.txt
-
- ENV PYTHONPATH=${HOME}/app \
-     PYTHONUNBUFFERED=1 \
-     HF_HUB_ENABLE_HF_TRANSFER=1 \
-     GRADIO_ALLOW_FLAGGING=never \
-     GRADIO_NUM_PORTS=1 \
-     GRADIO_SERVER_NAME=0.0.0.0 \
-     GRADIO_THEME=huggingface \
-     TQDM_POSITION=-1 \
-     TQDM_MININTERVAL=1 \
-     SYSTEM=spaces
-
- ENTRYPOINT /bin/sh start.sh
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+
+ ENV DEBIAN_FRONTEND=noninteractive
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     ffmpeg \
+     nvidia-driver-515
+
+ RUN useradd -m -u 1000 user
+
+ USER user
+
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:${PATH}
+
+ WORKDIR ${HOME}/app
+
+ RUN curl https://pyenv.run | bash
+
+ ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
+
+ ARG PYTHON_VERSION=3.10.13
+
+ RUN pyenv install ${PYTHON_VERSION} && \
+     pyenv global ${PYTHON_VERSION} && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel && \
+     pip install "huggingface-hub" "hf-transfer" "gradio[oauth]>=4.28.0" "gradio_huggingfacehub_search==0.0.7" "APScheduler"
+
+ COPY --chown=1000 . ${HOME}/app
+
+ RUN git clone https://github.com/ggerganov/llama.cpp
+
+ RUN pip install -r llama.cpp/requirements.txt
+
+ COPY imatrix_calibration.txt ${HOME}/app/llama.cpp/
+
+ ENV PYTHONPATH=${HOME}/app \
+     PYTHONUNBUFFERED=1 \
+     HF_HUB_ENABLE_HF_TRANSFER=1 \
+     GRADIO_ALLOW_FLAGGING=never \
+     GRADIO_NUM_PORTS=1 \
+     GRADIO_SERVER_NAME=0.0.0.0 \
+     GRADIO_THEME=huggingface \
+     TQDM_POSITION=-1 \
+     TQDM_MININTERVAL=1 \
+     SYSTEM=spaces \
+     LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
+     PATH=/usr/local/nvidia/bin:${PATH}
+
+
+ ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j && cd .. && /bin/sh start.sh"]
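The image now starts from a CUDA development base and compiles llama.cpp with `LLAMA_CUDA=1` in the ENTRYPOINT before handing off to `start.sh`. For local smoke-testing, something along these lines should work (a sketch: the tag name is arbitrary, `--gpus all` assumes the NVIDIA Container Toolkit is installed, and 7860 is Gradio's default port on Spaces):

```bash
# Build the image from the Space repository root
docker build -t gguf-my-repo .

# Run with GPU access; the CUDA build of llama.cpp happens at container start
docker run --rm --gpus all -p 7860:7860 gguf-my-repo
```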
README.md CHANGED
@@ -1,15 +1,15 @@
- ---
- title: GGUF My Repo
- emoji: 🦙
- colorFrom: gray
- colorTo: pink
- sdk: docker
- hf_oauth: true
- hf_oauth_scopes:
- - read-repos
- - write-repos
- - manage-repos
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ ---
+ title: GGUF My Repo
+ emoji: 🦙
+ colorFrom: gray
+ colorTo: pink
+ sdk: docker
+ hf_oauth: true
+ hf_oauth_scopes:
+ - read-repos
+ - write-repos
+ - manage-repos
+ pinned: false
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,282 +1,372 @@
- import os
- import shutil
- import subprocess
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
- import gradio as gr
-
- from huggingface_hub import create_repo, HfApi
- from huggingface_hub import snapshot_download
- from huggingface_hub import whoami
- from huggingface_hub import ModelCard
-
- from gradio_huggingfacehub_search import HuggingfaceHubSearch
-
- from apscheduler.schedulers.background import BackgroundScheduler
-
- from textwrap import dedent
-
- HF_TOKEN = os.environ.get("HF_TOKEN")
-
- def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
-     if oauth_token.token is None:
-         raise ValueError("You have to be logged in.")
-
-     split_cmd = f"llama.cpp/gguf-split --split --split-max-tensors {split_max_tensors}"
-     if split_max_size:
-         split_cmd += f" --split-max-size {split_max_size}"
-     split_cmd += f" {model_path} {model_path.split('.')[0]}"
-
-     print(f"Split command: {split_cmd}")
-
-     result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
-     print(f"Split command stdout: {result.stdout}")
-     print(f"Split command stderr: {result.stderr}")
-
-     if result.returncode != 0:
-         raise Exception(f"Error splitting the model: {result.stderr}")
-     print("Model split successfully!")
-
-
-     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
-     if sharded_model_files:
-         print(f"Sharded model files: {sharded_model_files}")
-         api = HfApi(token=oauth_token.token)
-         for file in sharded_model_files:
-             file_path = os.path.join('.', file)
-             print(f"Uploading file: {file_path}")
-             try:
-                 api.upload_file(
-                     path_or_fileobj=file_path,
-                     path_in_repo=file,
-                     repo_id=repo_id,
-                 )
-             except Exception as e:
-                 raise Exception(f"Error uploading file {file_path}: {e}")
-     else:
-         raise Exception("No sharded files found.")
-
-     print("Sharded model has been uploaded successfully!")
-
- def process_model(model_id, q_method, private_repo, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
-     if oauth_token.token is None:
-         raise ValueError("You must be logged in to use GGUF-my-repo")
-     model_name = model_id.split('/')[-1]
-     fp16 = f"{model_name}.fp16.gguf"
-
-     try:
-         api = HfApi(token=oauth_token.token)
-
-         dl_pattern = ["*.md", "*.json", "*.model"]
-
-         pattern = (
-             "*.safetensors"
-             if any(
-                 file.path.endswith(".safetensors")
-                 for file in api.list_repo_tree(
-                     repo_id=model_id,
-                     recursive=True,
-                 )
-             )
-             else "*.bin"
-         )
-
-         dl_pattern += pattern
-
-         api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
-         print("Model downloaded successfully!")
-         print(f"Current working directory: {os.getcwd()}")
-         print(f"Model directory contents: {os.listdir(model_name)}")
-
-         conversion_script = "convert-hf-to-gguf.py"
-         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
-         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
-         print(result)
-         if result.returncode != 0:
-             raise Exception(f"Error converting to fp16: {result.stderr}")
-         print("Model converted to fp16 successfully!")
-         print(f"Converted model path: {fp16}")
-
-         username = whoami(oauth_token.token)["name"]
-         quantized_gguf_name = f"{model_name.lower()}-{q_method.lower()}.gguf"
-         quantized_gguf_path = quantized_gguf_name
-         quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
-         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
-         if result.returncode != 0:
-             raise Exception(f"Error quantizing: {result.stderr}")
-         print(f"Quantized successfully with {q_method} option!")
-         print(f"Quantized model path: {quantized_gguf_path}")
-
-         # Create empty repo
-         new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
-         new_repo_id = new_repo_url.repo_id
-         print("Repo created successfully!", new_repo_url)
-
-         try:
-             card = ModelCard.load(model_id, token=oauth_token.token)
-         except:
-             card = ModelCard("")
-         if card.data.tags is None:
-             card.data.tags = []
-         card.data.tags.append("llama-cpp")
-         card.data.tags.append("gguf-my-repo")
-         card.data.base_model = model_id
-         card.text = dedent(
-             f"""
-             # {new_repo_id}
-             This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
-             Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
-
-             ## Use with llama.cpp
-             Install llama.cpp through brew (works on Mac and Linux)
-
-             ```bash
-             brew install llama.cpp
-
-             ```
-             Invoke the llama.cpp server or the CLI.
-
-             ### CLI:
-             ```bash
-             llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-             ```
-
-             ### Server:
-             ```bash
-             llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-             ```
-
-             Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
-
-             Step 1: Clone llama.cpp from GitHub.
-             ```
-             git clone https://github.com/ggerganov/llama.cpp
-             ```
-
-             Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
-             ```
-             cd llama.cpp && LLAMA_CURL=1 make
-             ```
-
-             Step 3: Run inference through the main binary.
-             ```
-             ./main --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
-             ```
-             or
-             ```
-             ./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
-             ```
-             """
-         )
-         card.save(f"README.md")
-
-         if split_model:
-             split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
-         else:
-             try:
-                 print(f"Uploading quantized model: {quantized_gguf_path}")
-                 api.upload_file(
-                     path_or_fileobj=quantized_gguf_path,
-                     path_in_repo=quantized_gguf_name,
-                     repo_id=new_repo_id,
-                 )
-             except Exception as e:
-                 raise Exception(f"Error uploading quantized model: {e}")
-
-         api.upload_file(
-             path_or_fileobj=f"README.md",
-             path_in_repo=f"README.md",
-             repo_id=new_repo_id,
-         )
-         print(f"Uploaded successfully with {q_method} option!")
-
-         return (
-             f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
-             "llama.png",
-         )
-     except Exception as e:
-         return (f"Error: {e}", "error.png")
-     finally:
-         shutil.rmtree(model_name, ignore_errors=True)
-         print("Folder cleaned up successfully!")
-
-
- # Create Gradio interface
- with gr.Blocks() as demo:
-     gr.Markdown("You must be logged in to use GGUF-my-repo.")
-     gr.LoginButton(min_width=250)
-
-     model_id_input = HuggingfaceHubSearch(
-         label="Hub Model ID",
-         placeholder="Search for model id on Huggingface",
-         search_type="model",
-     )
-
-     q_method_input = gr.Dropdown(
-         ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
-         label="Quantization Method",
-         info="GGML quantization type",
-         value="Q4_K_M",
-         filterable=False
-     )
-
-     private_repo_input = gr.Checkbox(
-         value=False,
-         label="Private Repo",
-         info="Create a private repo under your username."
-     )
-
-     split_model_input = gr.Checkbox(
-         value=False,
-         label="Split Model",
-         info="Shard the model using gguf-split."
-     )
-
-     split_max_tensors_input = gr.Number(
-         value=256,
-         label="Max Tensors per File",
-         info="Maximum number of tensors per file when splitting model.",
-         visible=False
-     )
-
-     split_max_size_input = gr.Textbox(
-         label="Max File Size",
-         info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
-         visible=False
-     )
-
-     iface = gr.Interface(
-         fn=process_model,
-         inputs=[
-             model_id_input,
-             q_method_input,
-             private_repo_input,
-             split_model_input,
-             split_max_tensors_input,
-             split_max_size_input,
-         ],
-         outputs=[
-             gr.Markdown(label="output"),
-             gr.Image(show_label=False),
-         ],
-         title="Create your own GGUF Quants, blazingly fast ⚡!",
-         description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
-     )
-
-     def update_visibility(split_model):
-         return gr.update(visible=split_model), gr.update(visible=split_model)
-
-     split_model_input.change(
-         fn=update_visibility,
-         inputs=split_model_input,
-         outputs=[split_max_tensors_input, split_max_size_input]
-     )
-
- def restart_space():
-     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
-
- scheduler = BackgroundScheduler()
- scheduler.add_job(restart_space, "interval", seconds=21600)
- scheduler.start()
-
- # Launch the interface
- demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)
+ import os
+ import shutil
+ import subprocess
+ os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
+ import gradio as gr
+
+ from huggingface_hub import create_repo, HfApi
+ from huggingface_hub import snapshot_download
+ from huggingface_hub import whoami
+ from huggingface_hub import ModelCard
+
+ from gradio_huggingfacehub_search import HuggingfaceHubSearch
+
+ from apscheduler.schedulers.background import BackgroundScheduler
+
+ from textwrap import dedent
+
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ def generate_importance_matrix(model_path, train_data_path):
+     imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99"
+
+     os.chdir("llama.cpp")
+
+     compile_command = "LLAMA_CUDA=1 make -j VERBOSE=1"
+     compile_result = subprocess.run(compile_command, shell=True, capture_output=True, text=True)
+     if compile_result.returncode != 0:
+         raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
+
+
+     print(f"Current working directory: {os.getcwd()}")
+     print(f"Files in the current directory: {os.listdir('.')}")
+
+     if not os.path.isfile(f"../{model_path}"):
+         raise Exception(f"Model file not found: {model_path}")
+
+     print("Running imatrix command...")
+     result = subprocess.run(imatrix_command, shell=True, capture_output=True, text=True)
+
+     os.chdir("..")
+
+     if result.returncode != 0:
+         raise Exception(f"Error generating importance matrix: {result.stderr}")
+     print("Importance matrix generated successfully!")
+
+ def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
+     if oauth_token.token is None:
+         raise ValueError("You have to be logged in.")
+
+     split_cmd = f"llama.cpp/gguf-split --split --split-max-tensors {split_max_tensors}"
+     if split_max_size:
+         split_cmd += f" --split-max-size {split_max_size}"
+     split_cmd += f" {model_path} {model_path.split('.')[0]}"
+
+     print(f"Split command: {split_cmd}")
+
+     result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
+     print(f"Split command stdout: {result.stdout}")
+     print(f"Split command stderr: {result.stderr}")
+
+     if result.returncode != 0:
+         raise Exception(f"Error splitting the model: {result.stderr}")
+     print("Model split successfully!")
+
+
+     sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
+     if sharded_model_files:
+         print(f"Sharded model files: {sharded_model_files}")
+         api = HfApi(token=oauth_token.token)
+         for file in sharded_model_files:
+             file_path = os.path.join('.', file)
+             print(f"Uploading file: {file_path}")
+             try:
+                 api.upload_file(
+                     path_or_fileobj=file_path,
+                     path_in_repo=file,
+                     repo_id=repo_id,
+                 )
+             except Exception as e:
+                 raise Exception(f"Error uploading file {file_path}: {e}")
+     else:
+         raise Exception("No sharded files found.")
+
+     print("Sharded model has been uploaded successfully!")
+
+ def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
+     if oauth_token.token is None:
+         raise ValueError("You must be logged in to use GGUF-my-repo")
+     model_name = model_id.split('/')[-1]
+     fp16 = f"{model_name}.fp16.gguf"
+
+     try:
+         api = HfApi(token=oauth_token.token)
+
+         dl_pattern = ["*.md", "*.json", "*.model"]
+
+         pattern = (
+             "*.safetensors"
+             if any(
+                 file.path.endswith(".safetensors")
+                 for file in api.list_repo_tree(
+                     repo_id=model_id,
+                     recursive=True,
+                 )
+             )
+             else "*.bin"
+         )
+
+         dl_pattern += pattern
+
+         api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
+         print("Model downloaded successfully!")
+         print(f"Current working directory: {os.getcwd()}")
+         print(f"Model directory contents: {os.listdir(model_name)}")
+
+         conversion_script = "convert-hf-to-gguf.py"
+         fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
+         result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
+         print(result)
+         if result.returncode != 0:
+             raise Exception(f"Error converting to fp16: {result.stderr}")
+         print("Model converted to fp16 successfully!")
+         print(f"Converted model path: {fp16}")
+
+         imatrix_path = "llama.cpp/imatrix.dat"
+
+         if use_imatrix:
+             if train_data_file:
+                 train_data_path = train_data_file.name
+             else:
+                 train_data_path = "imatrix_calibration.txt"
+
+             print(f"Training data file path: {train_data_path}")
+
+             if not os.path.isfile(train_data_path):
+                 raise Exception(f"Training data file not found: {train_data_path}")
+
+             generate_importance_matrix(fp16, train_data_path)
+         else:
+             print("Not using imatrix quantization.")
+
+         username = whoami(oauth_token.token)["name"]
+         quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
+         quantized_gguf_path = quantized_gguf_name
+         if use_imatrix:
+             quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
+         else:
+             quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
+         result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
+         if result.returncode != 0:
+             raise Exception(f"Error quantizing: {result.stderr}")
+         print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+         print(f"Quantized model path: {quantized_gguf_path}")
+
+         # Create empty repo
+         new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
+         new_repo_id = new_repo_url.repo_id
+         print("Repo created successfully!", new_repo_url)
+
+         try:
+             card = ModelCard.load(model_id, token=oauth_token.token)
+         except:
+             card = ModelCard("")
+         if card.data.tags is None:
+             card.data.tags = []
+         card.data.tags.append("llama-cpp")
+         card.data.tags.append("gguf-my-repo")
+         card.data.base_model = model_id
+         card.text = dedent(
+             f"""
+             # {new_repo_id}
+             This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
+             Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
+
+             ## Use with llama.cpp
+             Install llama.cpp through brew (works on Mac and Linux)
+
+             ```bash
+             brew install llama.cpp
+
+             ```
+             Invoke the llama.cpp server or the CLI.
+
+             ### CLI:
+             ```bash
+             llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+             ```
+
+             ### Server:
+             ```bash
+             llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
+             ```
+
+             Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
+             Step 1: Clone llama.cpp from GitHub.
+             ```
+             git clone https://github.com/ggerganov/llama.cpp
+             ```
+             Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
+             ```
+             cd llama.cpp && LLAMA_CURL=1 make
+             ```
+             Step 3: Run inference through the main binary.
+             ```
+             ./main --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
+             ```
+             or
+             ```
+             ./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
+             ```
+             """
+         )
+         card.save(f"README.md")
+
+         if split_model:
+             split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
+         else:
+             try:
+                 print(f"Uploading quantized model: {quantized_gguf_path}")
+                 api.upload_file(
+                     path_or_fileobj=quantized_gguf_path,
+                     path_in_repo=quantized_gguf_name,
+                     repo_id=new_repo_id,
+                 )
+             except Exception as e:
+                 raise Exception(f"Error uploading quantized model: {e}")
+
+
+         imatrix_path = "llama.cpp/imatrix.dat"
+         if os.path.isfile(imatrix_path):
+             try:
+                 print(f"Uploading imatrix.dat: {imatrix_path}")
+                 api.upload_file(
+                     path_or_fileobj=imatrix_path,
+                     path_in_repo="imatrix.dat",
+                     repo_id=new_repo_id,
+                 )
+             except Exception as e:
+                 raise Exception(f"Error uploading imatrix.dat: {e}")
+
+         api.upload_file(
+             path_or_fileobj=f"README.md",
+             path_in_repo=f"README.md",
+             repo_id=new_repo_id,
+         )
+         print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
+
+         return (
+             f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
+             "llama.png",
+         )
+     except Exception as e:
+         return (f"Error: {e}", "error.png")
+     finally:
+         shutil.rmtree(model_name, ignore_errors=True)
+         print("Folder cleaned up successfully!")
+
+
+
+ with gr.Blocks(css=".gradio-container {max-height: 600px; overflow-y: auto;}") as demo:
+     gr.Markdown("You must be logged in to use GGUF-my-repo.")
+     gr.LoginButton(min_width=250)
+
+     model_id = HuggingfaceHubSearch(
+         label="Hub Model ID",
+         placeholder="Search for model id on Huggingface",
+         search_type="model",
+     )
+
+     q_method = gr.Dropdown(
+         ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
+         label="Quantization Method",
+         info="GGML quantization type",
+         value="Q4_K_M",
+         filterable=False,
+         visible=True
+     )
+
+     imatrix_q_method = gr.Dropdown(
+         ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
+         label="Imatrix Quantization Method",
+         info="GGML imatrix quants type",
+         value="IQ4_NL",
+         filterable=False,
+         visible=False
+     )
+
+     use_imatrix = gr.Checkbox(
+         value=False,
+         label="Use Imatrix Quantization",
+         info="Use importance matrix for quantization."
+     )
+
+     private_repo = gr.Checkbox(
+         value=False,
+         label="Private Repo",
+         info="Create a private repo under your username."
+     )
+
+     train_data_file = gr.File(
+         label="Training Data File",
+         file_types=["txt"],
+         visible=False
+     )
+
+     split_model = gr.Checkbox(
+         value=False,
+         label="Split Model",
+         info="Shard the model using gguf-split."
+     )
+
+     split_max_tensors = gr.Number(
+         value=256,
+         label="Max Tensors per File",
+         info="Maximum number of tensors per file when splitting model.",
+         visible=False
+     )
+
+     split_max_size = gr.Textbox(
+         label="Max File Size",
+         info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
+         visible=False
+     )
+
+     def update_visibility(use_imatrix):
+         return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
+
+     use_imatrix.change(
+         fn=update_visibility,
+         inputs=use_imatrix,
+         outputs=[q_method, imatrix_q_method, train_data_file]
+     )
+
+     iface = gr.Interface(
+         fn=process_model,
+         inputs=[
+             model_id,
+             q_method,
+             use_imatrix,
+             imatrix_q_method,
+             private_repo,
+             train_data_file,
+             split_model,
+             split_max_tensors,
+             split_max_size,
+         ],
+         outputs=[
+             gr.Markdown(label="output"),
+             gr.Image(show_label=False),
+         ],
+         title="Create your own GGUF Quants, blazingly fast ⚡!",
+         description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
+     )
+
+     def update_split_visibility(split_model):
+         return gr.update(visible=split_model), gr.update(visible=split_model)
+
+     split_model.change(
+         fn=update_split_visibility,
+         inputs=split_model,
+         outputs=[split_max_tensors, split_max_size]
+     )
+
+ def restart_space():
+     HfApi().restart_space(repo_id="ggml-org/gguf-my-repo", token=HF_TOKEN, factory_reboot=True)
+
+ scheduler = BackgroundScheduler()
+ scheduler.add_job(restart_space, "interval", seconds=21600)
+ scheduler.start()
+
+ # Launch the interface
+ demo.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)
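For context, when "Use Imatrix Quantization" is enabled the new code shells out to llama.cpp roughly as follows (a sketch of the equivalent commands with a placeholder model name; `generate_importance_matrix` runs the imatrix step from inside `llama.cpp/` and falls back to the bundled `imatrix_calibration.txt` when no training file is uploaded):

```bash
# Rebuild llama.cpp with CUDA support so imatrix can offload to the GPU
cd llama.cpp && LLAMA_CUDA=1 make -j VERBOSE=1

# Compute the importance matrix over the calibration text, offloading all layers (-ngl 99)
./imatrix -m ../model.fp16.gguf -f imatrix_calibration.txt -ngl 99
cd ..

# Quantize the fp16 GGUF with the generated llama.cpp/imatrix.dat
./llama.cpp/quantize --imatrix llama.cpp/imatrix.dat model.fp16.gguf model-iq4_nl-imat.gguf IQ4_NL
```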
imatrix_calibration.txt ADDED
The diff for this file is too large to render.
 
start.sh CHANGED
@@ -1,4 +1,5 @@
- cd llama.cpp
- make -j quantize gguf-split
- cd ..
- python app.py
+ cd llama.cpp
+ make -j quantize gguf-split imatrix
+
+ cd ..
+ python app.py
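`start.sh` now also builds the `imatrix` binary alongside `quantize` and `gguf-split`, the tools `app.py` shells out to. As an illustration of the sharding path, `split_upload_model` assembles a command of roughly this shape (file names and the size value are placeholders):

```bash
# Split a quantized GGUF into shards of at most 256 tensors each;
# the last argument is the output prefix for the shard files
llama.cpp/gguf-split --split --split-max-tensors 256 model-q4_k_m.gguf model-q4_k_m

# The UI's "Max File Size" field is forwarded as an additional --split-max-size limit
llama.cpp/gguf-split --split --split-max-tensors 256 --split-max-size 2G model-q4_k_m.gguf model-q4_k_m
```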