Minor improvement suggestions (#28)
Browse files- app : clear trailing whitespace (054f452b5cc97f343a1e3b9406be7e48006f580f)
- app : do not add trailing whitespace in prompt using llama-cli (36c74fdb2029320a98a0b09cf47b367d1e1ee52c)
- start : add -j to make command (should be faster) (21eb7b52991f49368a9fe02c2e88cf60120b8aff)
Co-authored-by: Georgi Gerganov <ggerganov@users.noreply.huggingface.co>
app.py
CHANGED
@@ -26,12 +26,12 @@ def script_to_use(model_id, api):
|
|
26 |
def process_model(model_id, q_method, hf_token, private_repo):
|
27 |
model_name = model_id.split('/')[-1]
|
28 |
fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"
|
29 |
-
|
30 |
try:
|
31 |
api = HfApi(token=hf_token)
|
32 |
|
33 |
dl_pattern = ["*.md", "*.json", "*.model"]
|
34 |
-
|
35 |
pattern = (
|
36 |
"*.safetensors"
|
37 |
if any(
|
@@ -48,7 +48,7 @@ def process_model(model_id, q_method, hf_token, private_repo):
|
|
48 |
|
49 |
snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, token=hf_token, allow_patterns=dl_pattern)
|
50 |
print("Model downloaded successully!")
|
51 |
-
|
52 |
conversion_script = script_to_use(model_id, api)
|
53 |
fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
|
54 |
result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
|
@@ -90,13 +90,13 @@ def process_model(model_id, q_method, hf_token, private_repo):
|
|
90 |
Invoke the llama.cpp server or the CLI.
|
91 |
|
92 |
CLI:
|
93 |
-
|
94 |
```bash
|
95 |
-
llama-cli --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -p "The meaning to life and the universe is
|
96 |
```
|
97 |
|
98 |
Server:
|
99 |
-
|
100 |
```bash
|
101 |
llama-server --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -c 2048
|
102 |
```
|
@@ -139,22 +139,22 @@ def process_model(model_id, q_method, hf_token, private_repo):
|
|
139 |
|
140 |
# Create Gradio interface
|
141 |
iface = gr.Interface(
|
142 |
-
fn=process_model,
|
143 |
inputs=[
|
144 |
gr.Textbox(
|
145 |
-
lines=1,
|
146 |
label="Hub Model ID",
|
147 |
info="Model repo ID",
|
148 |
),
|
149 |
gr.Dropdown(
|
150 |
-
["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
|
151 |
-
label="Quantization Method",
|
152 |
info="GGML quantisation type",
|
153 |
value="Q4_K_M",
|
154 |
filterable=False
|
155 |
),
|
156 |
gr.Textbox(
|
157 |
-
lines=1,
|
158 |
label="HF Write Token",
|
159 |
info="https://hf.co/settings/token",
|
160 |
type="password",
|
@@ -164,7 +164,7 @@ iface = gr.Interface(
|
|
164 |
label="Private Repo",
|
165 |
info="Create a private repo under your username."
|
166 |
)
|
167 |
-
],
|
168 |
outputs=[
|
169 |
gr.Markdown(label="output"),
|
170 |
gr.Image(show_label=False),
|
@@ -172,8 +172,7 @@ iface = gr.Interface(
|
|
172 |
title="Create your own GGUF Quants, blazingly fast ⚡!",
|
173 |
description="The space takes an HF repo as an input, quantises it and creates a Public repo containing the selected quant under your HF user namespace. You need to specify a write token obtained in https://hf.co/settings/tokens.",
|
174 |
article="<p>Find your write token at <a href='https://huggingface.co/settings/tokens' target='_blank'>token settings</a></p>",
|
175 |
-
|
176 |
)
|
177 |
|
178 |
# Launch the interface
|
179 |
-
iface.launch(debug=True)
|
|
|
26 |
def process_model(model_id, q_method, hf_token, private_repo):
|
27 |
model_name = model_id.split('/')[-1]
|
28 |
fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"
|
29 |
+
|
30 |
try:
|
31 |
api = HfApi(token=hf_token)
|
32 |
|
33 |
dl_pattern = ["*.md", "*.json", "*.model"]
|
34 |
+
|
35 |
pattern = (
|
36 |
"*.safetensors"
|
37 |
if any(
|
|
|
48 |
|
49 |
snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, token=hf_token, allow_patterns=dl_pattern)
|
50 |
print("Model downloaded successully!")
|
51 |
+
|
52 |
conversion_script = script_to_use(model_id, api)
|
53 |
fp16_conversion = f"python llama.cpp/{conversion_script} {model_name} --outtype f16 --outfile {fp16}"
|
54 |
result = subprocess.run(fp16_conversion, shell=True, capture_output=True)
|
|
|
90 |
Invoke the llama.cpp server or the CLI.
|
91 |
|
92 |
CLI:
|
93 |
+
|
94 |
```bash
|
95 |
+
llama-cli --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -p "The meaning to life and the universe is"
|
96 |
```
|
97 |
|
98 |
Server:
|
99 |
+
|
100 |
```bash
|
101 |
llama-server --hf-repo {new_repo_id} --model {qtype.split("/")[-1]} -c 2048
|
102 |
```
|
|
|
139 |
|
140 |
# Create Gradio interface
|
141 |
iface = gr.Interface(
|
142 |
+
fn=process_model,
|
143 |
inputs=[
|
144 |
gr.Textbox(
|
145 |
+
lines=1,
|
146 |
label="Hub Model ID",
|
147 |
info="Model repo ID",
|
148 |
),
|
149 |
gr.Dropdown(
|
150 |
+
["Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L", "Q4_0", "Q4_K_S", "Q4_K_M", "Q5_0", "Q5_K_S", "Q5_K_M", "Q6_K", "Q8_0"],
|
151 |
+
label="Quantization Method",
|
152 |
info="GGML quantisation type",
|
153 |
value="Q4_K_M",
|
154 |
filterable=False
|
155 |
),
|
156 |
gr.Textbox(
|
157 |
+
lines=1,
|
158 |
label="HF Write Token",
|
159 |
info="https://hf.co/settings/token",
|
160 |
type="password",
|
|
|
164 |
label="Private Repo",
|
165 |
info="Create a private repo under your username."
|
166 |
)
|
167 |
+
],
|
168 |
outputs=[
|
169 |
gr.Markdown(label="output"),
|
170 |
gr.Image(show_label=False),
|
|
|
172 |
title="Create your own GGUF Quants, blazingly fast ⚡!",
|
173 |
description="The space takes an HF repo as an input, quantises it and creates a Public repo containing the selected quant under your HF user namespace. You need to specify a write token obtained in https://hf.co/settings/tokens.",
|
174 |
article="<p>Find your write token at <a href='https://huggingface.co/settings/tokens' target='_blank'>token settings</a></p>",
|
|
|
175 |
)
|
176 |
|
177 |
# Launch the interface
|
178 |
+
iface.launch(debug=True)
|
start.sh
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
cd llama.cpp
|
2 |
-
make quantize
|
3 |
cd ..
|
4 |
python app.py
|
|
|
1 |
cd llama.cpp
|
2 |
+
make -j quantize
|
3 |
cd ..
|
4 |
python app.py
|