gguf-my-repo / app.py
Wauplin's picture
Wauplin HF staff
Use OAuth ("Sign in with Hugging Face")
07dffe1 verified
raw
history blame
No virus
6.23 kB
import os
import shutil
import subprocess
import gradio as gr
from huggingface_hub import create_repo, HfApi
from huggingface_hub import snapshot_download
from huggingface_hub import whoami
from huggingface_hub import ModelCard
from gradio_huggingfacehub_search import HuggingfaceHubSearch
from textwrap import dedent
# Architectures for which script_to_use() selects "convert.py"; any other
# architecture gets "convert-hf-to-gguf.py".
LLAMA_LIKE_ARCHS = ["MistralForCausalLM",]


def script_to_use(model_id, api):
    """Pick the conversion script name for a Hub model.

    Args:
        model_id: Repository id passed to ``api.model_info``.
        api: Object exposing ``model_info(model_id)`` whose result has a
            ``config`` attribute (a mapping or ``None``).

    Returns:
        ``"convert.py"`` when the first entry of the config's
        ``"architectures"`` list is in ``LLAMA_LIKE_ARCHS``,
        ``"convert-hf-to-gguf.py"`` for any other architecture, or ``None``
        when the model has no config or no architecture is listed.
    """
    config = api.model_info(model_id).config
    if config is None:
        return None

    architectures = config.get("architectures")
    # Covers both a missing key and an empty list; indexing an empty list
    # would otherwise raise IndexError.
    if not architectures:
        return None

    # Only the first declared architecture decides the script.
    if architectures[0] in LLAMA_LIKE_ARCHS:
        return "convert.py"
    return "convert-hf-to-gguf.py"
def _weights_pattern(api, model_id):
    """Return the glob for the repo's weight files, preferring safetensors."""
    has_safetensors = any(
        file.path.endswith(".safetensors")
        for file in api.list_repo_tree(repo_id=model_id, recursive=True)
    )
    return "*.safetensors" if has_safetensors else "*.bin"


def _card_text(new_repo_id, model_id, gguf_name):
    """Render the README body that is uploaded next to the quantised file."""
    # The last command is written on one line: the backslash-newline pairs of
    # the earlier template were consumed by Python as line continuations, so
    # the rendered README already contained a single-line command.
    return dedent(
        f"""
        # {new_repo_id}
        This model was converted to GGUF format from [`{model_id}`](https://huggingface.co/{model_id}) using llama.cpp via the ggml.ai's [GGUF-my-repo](https://huggingface.co/spaces/ggml-org/gguf-my-repo) space.
        Refer to the [original model card](https://huggingface.co/{model_id}) for more details on the model.
        ## Use with llama.cpp
        Install llama.cpp through brew.
        ```bash
        brew install ggerganov/ggerganov/llama.cpp
        ```
        Invoke the llama.cpp server or the CLI.
        CLI:
        ```bash
        llama-cli --hf-repo {new_repo_id} --model {gguf_name} -p "The meaning to life and the universe is"
        ```
        Server:
        ```bash
        llama-server --hf-repo {new_repo_id} --model {gguf_name} -c 2048
        ```
        Note: You can also use this checkpoint directly through the [usage steps](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#usage) listed in the Llama.cpp repo as well.
        ```
        git clone https://github.com/ggerganov/llama.cpp && cd llama.cpp && make && ./main -m {gguf_name} -n 128
        ```
        """
    )


def process_model(model_id, q_method, private_repo, oauth_token: gr.OAuthToken | None):
    """Download a Hub model, convert it to GGUF, quantise it and upload it.

    Args:
        model_id: Hub repository id of the source model (``owner/name``).
        q_method: Quantisation type handed to ``llama.cpp/quantize``.
        private_repo: Whether the destination repository is created private.
        oauth_token: Token of the signed-in user, or ``None`` when nobody is
            signed in.

    Returns:
        A ``(message, image_path)`` tuple: a link to the new repository with
        ``"llama.png"`` on success, or the error text with ``"error.png"``
        when any step inside the pipeline fails.

    Raises:
        ValueError: If no usable OAuth token was supplied.
    """
    if oauth_token is None or oauth_token.token is None:
        raise ValueError("You must be logged in to use GGUF-my-repo")

    # All intermediate files live in a working directory named after the model.
    model_name = model_id.split('/')[-1]
    fp16 = f"{model_name}/{model_name.lower()}.fp16.bin"

    try:
        api = HfApi(token=oauth_token.token)

        # append(), not +=: extending a list with a string adds its individual
        # characters, including "*", which would match every file in the repo.
        dl_pattern = ["*.md", "*.json", "*.model"]
        dl_pattern.append(_weights_pattern(api, model_id))

        api.snapshot_download(repo_id=model_id, local_dir=model_name, local_dir_use_symlinks=False, allow_patterns=dl_pattern)
        print("Model downloaded successully!")

        conversion_script = script_to_use(model_id, api)
        if conversion_script is None:
            raise Exception(f"Could not determine the architecture of {model_id}")

        # Argument lists (no shell) so that user-supplied names cannot inject
        # shell syntax into the command.
        fp16_conversion = [
            "python", f"llama.cpp/{conversion_script}", model_name,
            "--outtype", "f16",
            "--outfile", fp16,
        ]
        result = subprocess.run(fp16_conversion, capture_output=True)
        print(result)
        if result.returncode != 0:
            raise Exception(f"Error converting to fp16: {result.stderr}")
        print("Model converted to fp16 successully!")

        qtype = f"{model_name}/{model_name.lower()}.{q_method.upper()}.gguf"
        gguf_name = qtype.split("/")[-1]
        quantise_ggml = ["./llama.cpp/quantize", fp16, qtype, q_method]
        result = subprocess.run(quantise_ggml, capture_output=True)
        if result.returncode != 0:
            raise Exception(f"Error quantizing: {result.stderr}")
        print("Quantised successfully!")

        # Create empty repo
        new_repo_url = api.create_repo(repo_id=f"{model_name}-{q_method}-GGUF", exist_ok=True, private=private_repo)
        new_repo_id = new_repo_url.repo_id
        print("Repo created successfully!", new_repo_url)

        # Best effort: start from the source model's card, fall back to an
        # empty one when it cannot be loaded.
        try:
            card = ModelCard.load(model_id, token=oauth_token.token)
        except Exception:
            card = ModelCard("")
        if card.data.tags is None:
            card.data.tags = []
        card.data.tags.append("llama-cpp")
        card.data.tags.append("gguf-my-repo")
        card.text = _card_text(new_repo_id, model_id, gguf_name)

        readme_path = os.path.join(model_name, "README-new.md")
        card.save(readme_path)

        api.upload_file(
            path_or_fileobj=qtype,
            path_in_repo=gguf_name,
            repo_id=new_repo_id,
        )
        api.upload_file(
            path_or_fileobj=readme_path,
            path_in_repo="README.md",
            repo_id=new_repo_id,
        )
        print("Uploaded successfully!")

        return (
            f'Find your repo <a href=\'{new_repo_url}\' target="_blank" style="text-decoration:underline">here</a>',
            "llama.png",
        )
    except Exception as e:
        # Failures are reported through the UI outputs instead of propagating.
        return (f"Error: {e}", "error.png")
    finally:
        # Always drop the working directory, whether the run succeeded or not.
        shutil.rmtree(model_name, ignore_errors=True)
        print("Folder cleaned up successfully!")
def _build_interface():
    """Assemble the Gradio interface that drives ``process_model``.

    The four inputs are given in the same order as ``process_model``'s
    parameters: model id, quantisation method, private flag, login button.
    """
    quant_methods = [
        "Q2_K", "Q3_K_S", "Q3_K_M", "Q3_K_L",
        "Q4_0", "Q4_K_S", "Q4_K_M",
        "Q5_0", "Q5_K_S", "Q5_K_M",
        "Q6_K", "Q8_0",
    ]

    model_search = HuggingfaceHubSearch(
        label="Hub Model ID",
        placeholder="Search for model id on Huggingface",
        search_type="model",
    )
    quant_dropdown = gr.Dropdown(
        quant_methods,
        label="Quantization Method",
        info="GGML quantisation type",
        value="Q4_K_M",
        filterable=False
    )
    private_checkbox = gr.Checkbox(
        value=False,
        label="Private Repo",
        info="Create a private repo under your username."
    )
    login_button = gr.LoginButton(min_width=250)

    # The two outputs match the (message, image path) tuple process_model returns.
    message_output = gr.Markdown(label="output")
    image_output = gr.Image(show_label=False)

    return gr.Interface(
        fn=process_model,
        inputs=[model_search, quant_dropdown, private_checkbox, login_button],
        outputs=[message_output, image_output],
        title="Create your own GGUF Quants, blazingly fast ⚡!",
        description="The space takes an HF repo as an input, quantises it and creates a Public repo containing the selected quant under your HF user namespace.",
    )


# Create Gradio interface
iface = _build_interface()

# Enable the request queue, then launch the interface
iface.queue(default_concurrency_limit=1, max_size=5).launch(debug=True)