SixOpen committed on
Commit
70cc07f
·
1 Parent(s): 87a3f98
Files changed (3) hide show
  1. Dockerfile.bak +0 -63
  2. app.py.bak +0 -375
  3. start.sh.bak +0 -5
Dockerfile.bak DELETED
@@ -1,63 +0,0 @@
1
# CUDA "devel" base image: llama.cpp is compiled with LLAMA_CUDA=1 by the
# ENTRYPOINT below, so the CUDA toolchain has to be present at run time.
FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04

ENV DEBIAN_FRONTEND=noninteractive
# Install tooling, the libraries pyenv needs to build CPython, and ffmpeg.
# The apt package index is removed in the same layer so it does not bloat the image.
RUN apt-get update && \
    apt-get upgrade -y && \
    apt-get install -y --no-install-recommends \
    git \
    git-lfs \
    wget \
    curl \
    # python build dependencies
    build-essential \
    libssl-dev \
    zlib1g-dev \
    libbz2-dev \
    libreadline-dev \
    libsqlite3-dev \
    libncursesw5-dev \
    xz-utils \
    tk-dev \
    libxml2-dev \
    libxmlsec1-dev \
    libffi-dev \
    liblzma-dev \
    ffmpeg \
    nvidia-driver-515 && \
    rm -rf /var/lib/apt/lists/*

# Run everything below as an unprivileged user with uid 1000.
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:${PATH}
WORKDIR ${HOME}/app

# Install pyenv, build the requested Python version with it, then install the app's dependencies.
RUN curl https://pyenv.run | bash
ENV PATH=${HOME}/.pyenv/shims:${HOME}/.pyenv/bin:${PATH}
ARG PYTHON_VERSION=3.10.13
RUN pyenv install ${PYTHON_VERSION} && \
    pyenv global ${PYTHON_VERSION} && \
    pyenv rehash && \
    pip install --no-cache-dir -U pip setuptools wheel && \
    pip install "huggingface-hub" "hf-transfer" "gradio[oauth]>=4.28.0" "gradio_huggingfacehub_search==0.0.7" "APScheduler"

# Application sources, plus a llama.cpp checkout and the Python requirements of its scripts.
COPY --chown=1000 . ${HOME}/app
RUN git clone https://github.com/ggerganov/llama.cpp
RUN pip install -r llama.cpp/requirements.txt

# The calibration text is placed inside llama.cpp/ as well as in the app directory.
COPY imatrix_calibration.txt ${HOME}/app/llama.cpp/

ENV PYTHONPATH=${HOME}/app \
    PYTHONUNBUFFERED=1 \
    HF_HUB_ENABLE_HF_TRANSFER=1 \
    GRADIO_ALLOW_FLAGGING=never \
    GRADIO_NUM_PORTS=1 \
    GRADIO_SERVER_NAME=0.0.0.0 \
    GRADIO_THEME=huggingface \
    TQDM_POSITION=-1 \
    TQDM_MININTERVAL=1 \
    SYSTEM=spaces \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:${LD_LIBRARY_PATH} \
    PATH=/usr/local/nvidia/bin:${PATH}


# Build llama.cpp with CUDA when the container starts, then hand over to start.sh.
ENTRYPOINT ["/bin/bash", "-c", "cd llama.cpp && LLAMA_CUDA=1 make -j && cd .. && /bin/sh start.sh"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
app.py.bak DELETED
@@ -1,375 +0,0 @@
1
- import os
2
- import shutil
3
- import subprocess
4
- os.environ["GRADIO_ANALYTICS_ENABLED"] = "False"
5
- import gradio as gr
6
-
7
- from huggingface_hub import create_repo, HfApi
8
- from huggingface_hub import snapshot_download
9
- from huggingface_hub import whoami
10
- from huggingface_hub import ModelCard
11
-
12
- from gradio_huggingfacehub_search import HuggingfaceHubSearch
13
-
14
- from apscheduler.schedulers.background import BackgroundScheduler
15
-
16
- from textwrap import dedent
17
-
18
- HF_TOKEN = os.environ.get("HF_TOKEN")
19
-
20
- def generate_importance_matrix(model_path, train_data_path):
21
- imatrix_command = f"./imatrix -m ../{model_path} -f {train_data_path} -ngl 99"
22
-
23
- os.chdir("llama.cpp")
24
-
25
- compile_command = "LLAMA_CUDA=1 make -j"
26
- compile_result = subprocess.run(compile_command, shell=True, capture_output=True, text=True)
27
- if compile_result.returncode != 0:
28
- raise Exception(f"Error compiling imatrix: {compile_result.stderr}")
29
-
30
-
31
- print(f"Current working directory: {os.getcwd()}")
32
- print(f"Files in the current directory: {os.listdir('.')}")
33
-
34
- if not os.path.isfile(f"../{model_path}"):
35
- raise Exception(f"Model file not found: {model_path}")
36
-
37
- print("Running imatrix command...")
38
- result = subprocess.run(imatrix_command, shell=True, capture_output=True, text=True)
39
-
40
- os.chdir("..")
41
-
42
- if result.returncode != 0:
43
- raise Exception(f"Error generating importance matrix: {result.stderr}")
44
- print("Importance matrix generated successfully!")
45
-
46
- def split_upload_model(model_path, repo_id, oauth_token: gr.OAuthToken | None, split_max_tensors=256, split_max_size=None):
47
- if oauth_token.token is None:
48
- raise ValueError("You have to be logged in.")
49
-
50
- split_cmd = f"llama.cpp/gguf-split --split --split-max-tensors {split_max_tensors}"
51
- if split_max_size:
52
- split_cmd += f" --split-max-size {split_max_size}"
53
- split_cmd += f" {model_path} {model_path.split('.')[0]}"
54
-
55
- print(f"Split command: {split_cmd}")
56
-
57
- result = subprocess.run(split_cmd, shell=True, capture_output=True, text=True)
58
- print(f"Split command stdout: {result.stdout}")
59
- print(f"Split command stderr: {result.stderr}")
60
-
61
- if result.returncode != 0:
62
- raise Exception(f"Error splitting the model: {result.stderr}")
63
- print("Model split successfully!")
64
-
65
-
66
- sharded_model_files = [f for f in os.listdir('.') if f.startswith(model_path.split('.')[0])]
67
- if sharded_model_files:
68
- print(f"Sharded model files: {sharded_model_files}")
69
- api = HfApi(token=oauth_token.token)
70
- for file in sharded_model_files:
71
- file_path = os.path.join('.', file)
72
- print(f"Uploading file: {file_path}")
73
- try:
74
- api.upload_file(
75
- path_or_fileobj=file_path,
76
- path_in_repo=file,
77
- repo_id=repo_id,
78
- )
79
- except Exception as e:
80
- raise Exception(f"Error uploading file {file_path}: {e}")
81
- else:
82
- raise Exception("No sharded files found.")
83
-
84
- print("Sharded model has been uploaded successfully!")
85
-
86
- def process_model(model_id, q_method, use_imatrix, imatrix_q_method, private_repo, train_data_file, split_model, split_max_tensors, split_max_size, oauth_token: gr.OAuthToken | None):
87
- if oauth_token.token is None:
88
- raise ValueError("You must be logged in to use GGUF-my-repo")
89
- model_name = model_id.split('/')[-1]
90
- fp16 = f"{model_name}.fp16.gguf"
91
-
92
- try:
93
- api = HfApi(token=oauth_token.token)
94
-
95
- dl_pattern = ["*.md", "*.json", "*.model"]
96
-
97
- pattern = (
98
- "*.safetensors"
99
- if any(
100
- file.path.endswith(".safetensors")
101
- for file in api.list_repo_tree(
102
- repo_id=model_id,
103
- recursive=True,
104
- )
105
- )
106
- else "*.bin"
107
- )
108
-
109
- dl_pattern += pattern
110
-
111
- api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
112
- print("Model downloaded successfully!")
113
- print(f"Current working directory: {os.getcwd()}")
114
- print(f"Model directory contents: {os.listdir(model_name)}")
115
-
116
- conversion_script = "convert-hf-to-gguf.py"
117
- fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
118
- result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
119
- print(result)
120
- if result.returncode != 0:
121
- raise Exception(f"Error converting to fp16: {result.stderr}")
122
- print("Model converted to fp16 successfully!")
123
- print(f"Converted model path: {fp16}")
124
-
125
- imatrix_path = "llama.cpp/imatrix.dat"
126
-
127
- if use_imatrix:
128
- if train_data_file:
129
- train_data_path = train_data_file.name
130
- else:
131
- train_data_path = "imatrix_calibration.txt"
132
-
133
- print(f"Training data file path: {train_data_path}")
134
-
135
- if not os.path.isfile(train_data_path):
136
- raise Exception(f"Training data file not found: {train_data_path}")
137
-
138
- generate_importance_matrix(fp16, train_data_path)
139
- else:
140
- print("Not using imatrix quantization.")
141
- username = whoami(oauth_token.token)["name"]
142
- quantized_gguf_name = f"{model_name.lower()}-{imatrix_q_method.lower()}-imat.gguf" if use_imatrix else f"{model_name.lower()}-{q_method.lower()}.gguf"
143
- quantized_gguf_path = quantized_gguf_name
144
- if use_imatrix:
145
- quantise_ggml = f"./llama.cpp/quantize --imatrix {imatrix_path} {fp16} {quantized_gguf_path} {imatrix_q_method}"
146
- else:
147
- quantise_ggml = f"./llama.cpp/quantize {fp16} {quantized_gguf_path} {q_method}"
148
- result = subprocess.run(quantise_ggml, shell=True, capture_output=True)
149
- if result.returncode != 0:
150
- raise Exception(f"Error quantizing: {result.stderr}")
151
- print(f"Quantized successfully with {imatrix_q_method if use_imatrix else q_method} option!")
152
- print(f"Quantized model path: {quantized_gguf_path}")
153
-
154
- # Create empty repo
155
- new_repo_url = api.create_repo(repo_id=f"{username}/{model_name}-{imatrix_q_method if use_imatrix else q_method}-GGUF", exist_ok=True, private=private_repo)
156
- new_repo_id = new_repo_url.repo_id
157
- print("Repo created successfully!", new_repo_url)
158
-
159
- try:
160
- card = ModelCard.load(model_id, token=oauth_token.token)
161
- except:
162
- card = ModelCard("")
163
- if card.data.tags is None:
164
- card.data.tags = []
165
- card.data.tags.append("llama-cpp")
166
- card.data.tags.append("gguf-my-repo")
167
- card.data.base_model = model_id
168
- card.text = dedent(
169
- f"""
170
- # {new_repo_id}
171
- This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
172
- Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
173
-
174
- ## Use with llama.cpp
175
- Install llama.cpp through brew (works on Mac and Linux)
176
-
177
- ```bash
178
- brew install llama.cpp
179
-
180
- ```
181
- Invoke the llama.cpp server or the CLI.
182
-
183
- ### CLI:
184
- ```bash
185
- llama --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
186
- ```
187
-
188
- ### Server:
189
- ```bash
190
- llama-server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
191
- ```
192
-
193
- Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
194
-
195
- Step 1: Clone llama.cpp from GitHub.
196
- ```
197
- git clone https://github.com/ggerganov/llama.cpp
198
- ```
199
-
200
- Step 2: Move into the llama.cpp folder and build it with `LLAMA_CURL=1` flag along with other hardware-specific flags (for ex: LLAMA_CUDA=1 for Nvidia GPUs on Linux).
201
- ```
202
- cd llama.cpp && LLAMA_CURL=1 make
203
- ```
204
-
205
- Step 3: Run inference through the main binary.
206
- ```
207
- ./main --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -p "The meaning to life and the universe is"
208
- ```
209
- or
210
- ```
211
- ./server --hf-repo {new_repo_id} --hf-file {quantized_gguf_name} -c 2048
212
- ```
213
- """
214
- )
215
- card.save(f"README.md")
216
-
217
- if split_model:
218
- split_upload_model(quantized_gguf_path, new_repo_id, oauth_token, split_max_tensors, split_max_size)
219
- else:
220
- try:
221
- print(f"Uploading quantized model: {quantized_gguf_path}")
222
- api.upload_file(
223
- path_or_fileobj=quantized_gguf_path,
224
- path_in_repo=quantized_gguf_name,
225
- repo_id=new_repo_id,
226
- )
227
- except Exception as e:
228
- raise Exception(f"Error uploading quantized model: {e}")
229
-
230
-
231
- imatrix_path = "llama.cpp/imatrix.dat"
232
- if os.path.isfile(imatrix_path):
233
- try:
234
- print(f"Uploading imatrix.dat: {imatrix_path}")
235
- api.upload_file(
236
- path_or_fileobj=imatrix_path,
237
- path_in_repo="imatrix.dat",
238
- repo_id=new_repo_id,
239
- )
240
- except Exception as e:
241
- raise Exception(f"Error uploading imatrix.dat: {e}")
242
-
243
- api.upload_file(
244
- path_or_fileobj=f"README.md",
245
- path_in_repo=f"README.md",
246
- repo_id=new_repo_id,
247
- )
248
- print(f"Uploaded successfully with {imatrix_q_method if use_imatrix else q_method} option!")
249
-
250
- return (
251
- f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
252
- "llama.png",
253
- )
254
- except Exception as e:
255
- return (f"Error: {e}", "error.png")
256
- finally:
257
- shutil.rmtree(model_name, ignore_errors=True)
258
- print("Folder cleaned up successfully!")
259
-
260
-
261
- # Create Gradio interface
262
- with gr.Blocks(css=".gradio-container {max-height: 600px; overflow-y: auto;}") as demo:
263
- gr.Markdown("You must be logged in to use GGUF-my-repo.")
264
- gr.LoginButton(min_width=250)
265
-
266
- model_id = HuggingfaceHubSearch(
267
- label="Hub Model ID",
268
- placeholder="Search for model id on Huggingface",
269
- search_type="model",
270
- )
271
-
272
- q_method = gr.Dropdown(
273
- ["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
274
- label="Quantization Method",
275
- info="GGML quantization type",
276
- value="Q4_K_M",
277
- filterable=False,
278
- visible=True
279
- )
280
-
281
- imatrix_q_method = gr.Dropdown(
282
- ["IQ3_M", "IQ3_XXS", "Q4_K_M", "Q4_K_S", "IQ4_NL", "IQ4_XS", "Q5_K_M", "Q5_K_S"],
283
- label="Imatrix Quantization Method",
284
- info="GGML imatrix quants type",
285
- value="IQ4_NL",
286
- filterable=False,
287
- visible=False
288
- )
289
-
290
- use_imatrix = gr.Checkbox(
291
- value=False,
292
- label="Use Imatrix Quantization",
293
- info="Use importance matrix for quantization."
294
- )
295
-
296
- private_repo = gr.Checkbox(
297
- value=False,
298
- label="Private Repo",
299
- info="Create a private repo under your username."
300
- )
301
-
302
- train_data_file = gr.File(
303
- label="Training Data File",
304
- file_types=["txt"],
305
- visible=False
306
- )
307
-
308
- split_model = gr.Checkbox(
309
- value=False,
310
- label="Split Model",
311
- info="Shard the model using gguf-split."
312
- )
313
-
314
- split_max_tensors = gr.Number(
315
- value=256,
316
- label="Max Tensors per File",
317
- info="Maximum number of tensors per file when splitting model.",
318
- visible=False
319
- )
320
-
321
- split_max_size = gr.Textbox(
322
- label="Max File Size",
323
- info="Maximum file size when splitting model (--split-max-size). May leave empty to use the default.",
324
- visible=False
325
- )
326
-
327
- def update_visibility(use_imatrix):
328
- return gr.update(visible=not use_imatrix), gr.update(visible=use_imatrix), gr.update(visible=use_imatrix)
329
-
330
- use_imatrix.change(
331
- fn=update_visibility,
332
- inputs=use_imatrix,
333
- outputs=[q_method, imatrix_q_method, train_data_file]
334
- )
335
-
336
- iface = gr.Interface(
337
- fn=process_model,
338
- inputs=[
339
- model_id,
340
- q_method,
341
- use_imatrix,
342
- imatrix_q_method,
343
- private_repo,
344
- train_data_file,
345
- split_model,
346
- split_max_tensors,
347
- split_max_size,
348
- ],
349
- outputs=[
350
- gr.Markdown(label="output"),
351
- gr.Image(show_label=False),
352
- ],
353
- title="Create your own GGUF Quants, blazingly fast ⚡!",
354
- description="The space takes an HF repo as an input, quantizes it and creates a Public repo containing the selected quant under your HF user namespace.",
355
- api_name=False
356
- )
357
-
358
- def update_split_visibility(split_model):
359
- return gr.update(visible=split_model), gr.update(visible=split_model)
360
-
361
- split_model.change(
362
- fn=update_split_visibility,
363
- inputs=split_model,
364
- outputs=[split_max_tensors, split_max_size]
365
- )
366
-
367
def restart_space():
    """Request a factory reboot of the ggml-org/gguf-my-repo Space."""
    hub_client = HfApi()
    hub_client.restart_space(
        repo_id="ggml-org/gguf-my-repo",
        token=HF_TOKEN,
        factory_reboot=True,
    )


# Reboot the Space from a background thread every 21600 seconds (6 hours).
scheduler = BackgroundScheduler()
scheduler.add_job(restart_space, "interval", seconds=21600)
scheduler.start()

# Launch the interface
demo.queue(
    default_concurrency_limit=1,
    max_size=5,
).launch(
    debug=True,
    show_api=False,
)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
start.sh.bak DELETED
@@ -1,5 +0,0 @@
1
# Stop at the first failing command, so the app is never launched from the
# wrong directory or without the llama.cpp tools it shells out to.
set -e

# Build only the llama.cpp binaries the app invokes.
cd llama.cpp
make -j quantize gguf-split imatrix

# Start the Gradio app from the application root.
cd ..
python app.py