Spaces:
Runtime error
Runtime error
File size: 5,966 Bytes
177e22a c028b5a 4c8f20d 177e22a 880094b 177e22a 880094b 177e22a 260ecb5 177e22a 260ecb5 177e22a 260ecb5 177e22a 260ecb5 b38ea7a 880094b b38ea7a 880094b b38ea7a 880094b b38ea7a 880094b b38ea7a 177e22a 260ecb5 177e22a c028b5a 177e22a c028b5a 177e22a 4cc80a6 177e22a 4cc80a6 177e22a c028b5a 177e22a c028b5a 177e22a c028b5a 177e22a c028b5a 177e22a 18845ec 177e22a c028b5a 6c720bd 177e22a 6c720bd 81e7dde c028b5a 177e22a c028b5a 177e22a c028b5a 177e22a c028b5a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 |
from typing import Any
import gradio as gr
import pandas as pd
import json
import requests
from html.parser import HTMLParser
# Effective bits per weight (BPW) for each llama.cpp GGUF quantization type.
# Used to estimate the quantized model size from the raw parameter count.
# NOTE(review): values appear to come from llama.cpp's published quant sizes —
# confirm against the llama.cpp quantize docs if they drift.
quants = {
"Q2_K": 3.35,
"Q3_K_S": 3.5,
"Q3_K_M": 3.91,
"Q3_K_L": 4.27,
"Q4_0": 4.55,
"Q4_K_S": 4.58,
"Q4_K_M": 4.85,
"Q5_0": 5.54,
"Q5_K_S": 5.54,
"Q5_K_M": 5.69,
"Q6_K": 6.59,
"Q8_0": 8.5,
}
class SvelteHydratorExtractor(HTMLParser):
    """HTML parser that captures the JSON payload of a ``data-props``
    attribute from a Hugging Face model page (Svelte hydration data)."""

    def __init__(self):
        # Raw JSON string from the last data-props attribute seen, or None
        # if no such attribute was encountered while feeding.
        self.data = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        # BUG FIX: the original line was garbled (`replace("":", '"')`),
        # which is a syntax error. HF HTML-escapes double quotes in the
        # attribute value as &quot;; restore them so it parses as JSON.
        # (Harmless no-op if HTMLParser already unescaped the value.)
        # Also removed a leftover debug print of every start tag.
        for attr in attrs:
            if attr[0] == "data-props":
                self.data = attr[1].replace("&quot;", '"')
def calc_model_size(parameters: int, quant: float) -> int:
    """Approximate on-disk model size in bytes.

    *parameters* is the weight count and *quant* the effective bits per
    weight; the total bit count is floor-divided by 8 to get bytes.
    """
    total_bits = parameters * quant
    return total_bits // 8
def get_model_config(hf_model: str) -> dict[str, Any]:
    """Fetch *hf_model*'s ``config.json`` from the Hugging Face Hub and
    attach an estimated parameter count under the ``"parameters"`` key.

    The total weight size in bytes is resolved in order of preference:
    the safetensors index, the pytorch index, and finally the parameter
    count scraped from the model page (assumed stored as fp16, 2 B/param).

    Raises whatever ``requests`` raises on a failed config.json fetch.
    """
    config = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/config.json"
    ).json()
    model_size = 0
    try:
        # BUG FIX: the index key is "metadata", not "metadta" — the typo
        # made both index lookups always fail into the scraping fallback.
        model_size = requests.get(
            f"https://huggingface.co/{hf_model}/raw/main/model.safetensors.index.json"
        ).json()["metadata"]["total_size"]
    except (requests.RequestException, ValueError, KeyError):
        try:
            model_size = requests.get(
                f"https://huggingface.co/{hf_model}/raw/main/pytorch_model.bin.index.json"
            ).json()["metadata"]["total_size"]
        except (requests.RequestException, ValueError, KeyError):
            # Fallback: scrape the parameter count out of the model page's
            # Svelte hydration data. The JSON path differs between the
            # dedicated params widget and the generic model header.
            model_page = requests.get(f"https://huggingface.co/{hf_model}").text
            param_props_idx = model_page.find('data-target="ModelSafetensorsParams"')
            if param_props_idx != -1:
                keys = ("safetensors", "total")
            else:
                param_props_idx = model_page.find('data-target="ModelHeader"')
                keys = ("model", "safetensors", "total")
            # Isolate the opening <div ...> tag that carries data-props.
            param_props_start = model_page.rfind("<div", 0, param_props_idx)
            param_props_end = model_page.find(">", param_props_idx)
            extractor = SvelteHydratorExtractor()
            extractor.feed(model_page[param_props_start : param_props_end + 1])
            props = json.loads(extractor.data)
            for key in keys:
                props = props[key]
            # Parameter count -> bytes at 2 bytes/param (fp16 weights).
            model_size = props * 2
    # assume fp16 weights
    config["parameters"] = model_size / 2
    return config
def calc_input_buffer_size(model_config, context: int) -> float:
    """Estimate the inference input buffer size in bytes.

    Combines a fixed overhead, a term scaling with the model's hidden
    dimension, and two terms scaling with the requested context length.
    """
    fixed_overhead = 4096
    hidden_term = 2048 * model_config["hidden_size"]
    context_term = context * 4
    embedding_term = context * 2048
    return fixed_overhead + hidden_term + context_term + embedding_term
def calc_compute_buffer_size(model_config, context: int) -> float:
    """Estimate the compute (scratch) buffer size in bytes.

    Scales linearly with context (per-1024-token blocks) and with the
    model's attention head count, in MiB units.
    """
    per_context_scale = context / 1024 * 2 + 0.75
    heads = model_config["num_attention_heads"]
    return per_context_scale * heads * 1024 * 1024
def calc_context_size(model_config, context: int) -> float:
    """Estimate the KV-cache size in bytes for *context* tokens.

    Accounts for grouped-query attention: the effective KV embedding width
    is the hidden size divided by the query/KV head ratio. The final factor
    of 2 * 2 covers both K and V caches at 2 bytes per element (fp16).
    """
    gqa_ratio = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    kv_embed_width = model_config["hidden_size"] / gqa_ratio
    # Keep the original multiplication order so float results match exactly.
    element_count = kv_embed_width * (model_config["num_hidden_layers"] * context)
    return 2 * element_count * 2
def calc(model_base, context, quant_size):
    """Compute (model GB, context GB, total GB) for a HF model at a quant.

    *quant_size* is either a literal bits-per-weight value (e.g. "4.5"
    for exl2-style quants) or a named GGUF quant from ``quants``.

    Raises KeyError if *quant_size* is neither numeric nor a known quant.
    """
    model_config = get_model_config(model_base)
    try:
        quant_bpw = float(quant_size)
    except (TypeError, ValueError):
        # BUG FIX: narrowed from a bare `except:` — only a non-numeric
        # quant_size should fall through to the named-quant lookup.
        quant_bpw = quants[quant_size]
    # Sizes are reported in decimal gigabytes (1 GB = 1e9 bytes).
    model_size = round(
        calc_model_size(model_config["parameters"], quant_bpw) / 1000 / 1000 / 1000, 2
    )
    context_size = round(
        (
            calc_input_buffer_size(model_config, context)
            + calc_context_size(model_config, context)
            + calc_compute_buffer_size(model_config, context)
        )
        / 1000
        / 1000
        / 1000,
        2,
    )
    return model_size, context_size, round(model_size + context_size, 2)
title = "GGUF VRAM Calculator"

# Top-level Gradio UI: a model name, context size, and quant selector feed
# `calc`, whose three outputs (model/context/total GB) populate the numbers.
with gr.Blocks(title=title, theme=gr.themes.Monochrome()) as app:
    # Pre-compute sizes for the default model so outputs start populated.
    default_model = "mistralai/Mistral-7B-v0.1"
    default_quant = "Q4_K_S"
    default_context = 8192
    default_size = calc(default_model, default_context, default_quant)
    default_model_size = default_size[0]
    default_context_size = default_size[1]
    # BUG FIX: user-facing typos corrected ("superseeded" -> "superseded",
    # "is will not be" -> "will not be").
    gr.Markdown(
        f"# {app.title}\n## This space has been superseded by the [NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator), which has model search built in, and doesn't rely on gradio\nThis is meant only as a guide and will not be 100% accurate; this also does not account for anything that might be running in the background on your system or CUDA system memory fallback on Windows"
    )
    model = gr.Textbox(
        value=default_model,
        label="Enter Unquantized HF Model Name (e.g. mistralai/Mistral-7B-v0.1)",
    )
    context = gr.Number(
        minimum=1, value=default_context, label="Desired Context Size (Tokens)"
    )
    quant = gr.Dropdown(
        choices=list(quants.keys()),
        value=default_quant,
        allow_custom_value=True,
        label="Enter GGUF Quant (e.g. Q4_K_S) or the specific BPW for other quantization schemes such as exl2 (e.g. 4.5)",
    )
    btn = gr.Button(value="Submit", variant="primary")
    btn.click(
        calc,
        inputs=[
            model,
            context,
            quant,
        ],
        outputs=[
            gr.Number(
                label="Model Size (GB)",
                value=default_size[0],
            ),
            gr.Number(
                label="Context Size (GB)",
                value=default_size[1],
            ),
            gr.Number(
                label="Total Size (GB)",
                value=default_size[2],
            ),
        ],
    )

app.launch()
|