from typing import Any

import gradio as gr
import json
import requests
from html.parser import HTMLParser

quants = {
    "Q2_K": 3.35,
    "Q3_K_S": 3.5,
    "Q3_K_M": 3.91,
    "Q3_K_L": 4.27,
    "Q4_0": 4.55,
    "Q4_K_S": 4.58,
    "Q4_K_M": 4.85,
    "Q5_0": 5.54,
    "Q5_K_S": 5.54,
    "Q5_K_M": 5.69,
    "Q6_K": 6.59,
    "Q8_0": 8.5,
}
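
# Note: these appear to be approximate *effective* bits per weight for
# llama.cpp's GGUF quant types (block overhead included), not nominal bit
# widths, e.g. quants["Q4_K_M"] -> 4.85.
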
class SvelteHydratorExtractor(HTMLParser):
    """Pull the JSON payload out of a Svelte hydration target's data-props attribute."""

    def __init__(self):
        self.data = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        for attr in attrs:
            if attr[0] == "data-props":
                # data-props carries HTML-escaped JSON; restore the quotes
                self.data = attr[1].replace("&quot;", '"')

def calc_model_size(parameters: int, quant: float) -> int:
    # weight bytes = parameter count * bits per weight / 8
    return parameters * quant // 8
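
# A rough worked example (hypothetical parameter count): a 7.24B-parameter
# model at Q4_K_S (4.58 bits per weight) comes to about
#   7.24e9 * 4.58 // 8 = 4,144,900,000 bytes, i.e. ~4.14 GB (decimal).
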

def get_model_config(hf_model: str) -> dict[str, Any]:
    config = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/config.json"
    ).json()
    model_size = 0
    try:
        # preferred: the safetensors index lists the total weight size in bytes
        model_size = requests.get(
            f"https://huggingface.co/{hf_model}/raw/main/model.safetensors.index.json"
        ).json()["metadata"]["total_size"]
    except Exception:
        try:
            # fall back to the pytorch checkpoint index
            model_size = requests.get(
                f"https://huggingface.co/{hf_model}/raw/main/pytorch_model.bin.index.json"
            ).json()["metadata"]["total_size"]
        except Exception:
            # last resort: scrape the parameter count out of the model page's
            # Svelte hydration data
            model_page = requests.get(f"https://huggingface.co/{hf_model}").text
            param_props_idx = model_page.find('data-target="ModelSafetensorsParams"')
            if param_props_idx != -1:
                param_props_start = model_page.rfind("<div", 0, param_props_idx)
                param_props_end = model_page.find(">", param_props_idx)
                extractor = SvelteHydratorExtractor()
                extractor.feed(model_page[param_props_start : param_props_end + 1])
                model_size = json.loads(extractor.data)["safetensors"]["total"] * 2
            else:
                param_props_idx = model_page.find('data-target="ModelHeader"')
                param_props_start = model_page.rfind("<div", 0, param_props_idx)
                param_props_end = model_page.find(">", param_props_idx)
                extractor = SvelteHydratorExtractor()
                extractor.feed(model_page[param_props_start : param_props_end + 1])
                model_size = (
                    json.loads(extractor.data)["model"]["safetensors"]["total"] * 2
                )
    # assume fp16 weights: parameter count = total bytes / 2
    config["parameters"] = model_size / 2
    return config
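
# Usage sketch (network access to huggingface.co assumed; values depend on the
# hosted files):
#   cfg = get_model_config("mistralai/Mistral-7B-v0.1")
#   cfg["hidden_size"], cfg["parameters"]  # config.json fields plus the derived count
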

def calc_input_buffer_size(model_config, context: int) -> float:
    # rough estimate of the input/scratch buffers, in bytes
    return 4096 + 2048 * model_config["hidden_size"] + context * 4 + context * 2048
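
# A rough worked example (hypothetical config): hidden_size=4096, context=8192
# gives 4096 + 2048*4096 + 8192*4 + 8192*2048 = 25,202,688 bytes (~25 MB).
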

def calc_compute_buffer_size(model_config, context: int) -> float:
    # rough estimate of the compute buffer, in bytes
    return (
        (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
    )
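
# Worked example (hypothetical config): context=8192, num_attention_heads=32
# gives (8192/1024 * 2 + 0.75) * 32 MiB = 536 MiB ≈ 0.56 GB.
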

def calc_context_size(model_config, context: int) -> float:
    # KV cache size in bytes: K and V each stored at 2 bytes per element (fp16),
    # with the per-layer KV width reduced by the GQA ratio
    n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    n_embd_gqa = model_config["hidden_size"] / n_gqa
    n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    return 2 * n_elements * 2
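
# Worked example (Mistral-7B-like config): hidden_size=4096, 32 attention heads,
# 8 KV heads, 32 layers, context=8192 -> n_gqa=4, n_embd_gqa=1024,
# n_elements = 1024 * 32 * 8192 = 268,435,456, so the KV cache is
# 4 * 268,435,456 = 1,073,741,824 bytes ≈ 1.07 GB.
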

def calc(model_base, context, quant_size):
    model_config = get_model_config(model_base)
    quant_bpw = 0
    try:
        # a raw bits-per-weight value (e.g. "4.5" for exl2)
        quant_bpw = float(quant_size)
    except ValueError:
        # otherwise a named GGUF quant (e.g. "Q4_K_S")
        quant_bpw = quants[quant_size]
    model_size = round(
        calc_model_size(model_config["parameters"], quant_bpw) / 1000 / 1000 / 1000, 2
    )
    context_size = round(
        (
            calc_input_buffer_size(model_config, context)
            + calc_context_size(model_config, context)
            + calc_compute_buffer_size(model_config, context)
        )
        / 1000
        / 1000
        / 1000,
        2,
    )
    return model_size, context_size, round(model_size + context_size, 2)
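
# Usage sketch (network access assumed; exact numbers depend on the hosted files):
#   calc("mistralai/Mistral-7B-v0.1", 8192, "Q4_K_S")
# returns a (model_gb, context_gb, total_gb) tuple of rounded floats.
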

title = "GGUF VRAM Calculator"

with gr.Blocks(title=title, theme=gr.themes.Monochrome()) as app:
    default_model = "mistralai/Mistral-7B-v0.1"
    default_quant = "Q4_K_S"
    default_context = 8192
    default_size = calc(default_model, default_context, default_quant)
    gr.Markdown(
        f"# {app.title}\n## This space has been superseded by [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator), which has model search built in and doesn't rely on Gradio\nThis is meant only as a guide and will not be 100% accurate; it also does not account for anything that might be running in the background on your system, or for CUDA system memory fallback on Windows"
    )
    model = gr.Textbox(
        value=default_model,
        label="Enter Unquantized HF Model Name (e.g. mistralai/Mistral-7B-v0.1)",
    )
    context = gr.Number(
        minimum=1, value=default_context, label="Desired Context Size (Tokens)"
    )
    quant = gr.Dropdown(
        choices=list(quants.keys()),
        value=default_quant,
        allow_custom_value=True,
        label="Enter GGUF Quant (e.g. Q4_K_S) or the specific BPW for other quantization schemes such as exl2 (e.g. 4.5)",
    )
    btn = gr.Button(value="Submit", variant="primary")
    btn.click(
        calc,
        inputs=[
            model,
            context,
            quant,
        ],
        outputs=[
            gr.Number(
                label="Model Size (GB)",
                value=default_size[0],
            ),
            gr.Number(
                label="Context Size (GB)",
                value=default_size[1],
            ),
            gr.Number(
                label="Total Size (GB)",
                value=default_size[2],
            ),
        ],
    )

app.launch()