from typing import Any

import gradio as gr
import json
import requests
from html.parser import HTMLParser

# Approximate bits per weight for common GGUF quantization types.
quants = {
    "Q2_K": 3.35,
    "Q3_K_S": 3.5,
    "Q3_K_M": 3.91,
    "Q3_K_L": 4.27,
    "Q4_0": 4.55,
    "Q4_K_S": 4.58,
    "Q4_K_M": 4.85,
    "Q5_0": 5.54,
    "Q5_K_S": 5.54,
    "Q5_K_M": 5.69,
    "Q6_K": 6.59,
    "Q8_0": 8.5,
}

class SvelteHydratorExtractor(HTMLParser):
    """Extract the JSON hydration payload from a Svelte `data-props` attribute."""

    def __init__(self):
        self.data = None
        super().__init__()

    def handle_starttag(self, tag, attrs):
        for attr in attrs:
            if attr[0] == "data-props":
                # The attribute value HTML-escapes its quotes; restore them.
                self.data = attr[1].replace("&quot;", '"')

def calc_model_size(parameters: int, quant: float) -> int:
    # Model bytes = parameter count * bits per weight / 8.
    return parameters * quant // 8
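
# Worked example (illustrative): a 7.24B-parameter model at Q4_K_S
# (~4.58 bits per weight) needs 7.24e9 * 4.58 // 8 ≈ 4.14e9 bytes ≈ 4.14 GB.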

def get_model_config(hf_model: str) -> dict[str, Any]:
    config = requests.get(
        f"https://huggingface.co/{hf_model}/raw/main/config.json"
    ).json()
    model_size = 0
    try:
        # Preferred source: the safetensors index lists the total weight size.
        model_size = requests.get(
            f"https://huggingface.co/{hf_model}/raw/main/model.safetensors.index.json"
        ).json()["metadata"]["total_size"]
    except Exception:
        try:
            # Fall back to the PyTorch bin index.
            model_size = requests.get(
                f"https://huggingface.co/{hf_model}/raw/main/pytorch_model.bin.index.json"
            ).json()["metadata"]["total_size"]
        except Exception:
            # Last resort: scrape the parameter count out of the model page's
            # Svelte hydration data.
            model_page = requests.get(f"https://huggingface.co/{hf_model}").text
            param_props_idx = model_page.find('data-target="ModelSafetensorsParams"')
            if param_props_idx != -1:
                param_props_start = model_page.rfind("<div", 0, param_props_idx)
                param_props_end = model_page.find(">", param_props_idx)
                extractor = SvelteHydratorExtractor()
                extractor.feed(model_page[param_props_start:param_props_end + 1])
                model_size = json.loads(extractor.data)["safetensors"]["total"] * 2
            else:
                param_props_idx = model_page.find('data-target="ModelHeader"')
                param_props_start = model_page.rfind("<div", 0, param_props_idx)
                param_props_end = model_page.find(">", param_props_idx)
                extractor = SvelteHydratorExtractor()
                extractor.feed(model_page[param_props_start:param_props_end + 1])
                model_size = (
                    json.loads(extractor.data)["model"]["safetensors"]["total"] * 2
                )
    # Assume fp16 weights: 2 bytes per parameter.
    config["parameters"] = model_size / 2
    return config
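
# For example (illustrative): get_model_config("mistralai/Mistral-7B-v0.1")
# returns the model's config.json fields plus a "parameters" entry holding
# the parameter count derived from the total weight size.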

def calc_input_buffer_size(model_config, context: int) -> float:
    # Heuristic for the input buffers (token/embedding scratch space).
    return 4096 + 2048 * model_config["hidden_size"] + context * 4 + context * 2048

def calc_compute_buffer_size(model_config, context: int) -> float:
    # Heuristic for the compute buffer, which grows with context length.
    return (
        (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
    )

def calc_context_size(model_config, context: int) -> float:
    # KV cache size: with grouped-query attention, only num_key_value_heads
    # worth of the hidden dimension is cached per layer.
    n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
    n_embd_gqa = model_config["hidden_size"] / n_gqa
    n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
    # 2 bytes per fp16 element, times 2 for the K and V caches.
    return 2 * n_elements * 2
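
# Sanity check (illustrative, Mistral-7B-like numbers: 32 heads, 8 KV heads,
# hidden_size 4096, 32 layers, 8192-token context):
#   n_gqa = 32 / 8 = 4; n_embd_gqa = 4096 / 4 = 1024
#   n_elements = 1024 * 32 * 8192 ≈ 2.68e8 -> 2 * 2 * 2.68e8 ≈ 1.07 GB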

def calc(model_base, context, quant_size):
    model_config = get_model_config(model_base)
    # Accept either a raw bits-per-weight value or a named GGUF quant.
    try:
        quant_bpw = float(quant_size)
    except Exception:
        quant_bpw = quants[quant_size]
    model_size = round(
        calc_model_size(model_config["parameters"], quant_bpw) / 1000 / 1000 / 1000, 2
    )
    context_size = round(
        (
            calc_input_buffer_size(model_config, context)
            + calc_context_size(model_config, context)
            + calc_compute_buffer_size(model_config, context)
        )
        / 1000
        / 1000
        / 1000,
        2,
    )
    return model_size, context_size, round(model_size + context_size, 2)
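
# Example (illustrative): calc("mistralai/Mistral-7B-v0.1", 8192, "Q4_K_S")
# returns (model_gb, context_gb, total_gb), each rounded to two decimals.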

title = "GGUF VRAM Calculator"

with gr.Blocks(title=title, theme=gr.themes.Monochrome()) as app:
    default_model = "mistralai/Mistral-7B-v0.1"
    default_quant = "Q4_K_S"
    default_context = 8192
    default_size = calc(default_model, default_context, default_quant)
    default_model_size = default_size[0]
    default_context_size = default_size[1]
    gr.Markdown(
        f"# {app.title}\n"
        "## This space has been superseded by "
        "[NyxKrage/LLM-Model-VRAM-Calculator](https://huggingface.co/spaces/NyxKrage/LLM-Model-VRAM-Calculator), "
        "which has model search built in and doesn't rely on Gradio.\n"
        "This is meant only as a guide and will not be 100% accurate. It also does "
        "not account for anything running in the background on your system, or for "
        "CUDA system memory fallback on Windows."
    )
    model = gr.Textbox(
        value=default_model,
        label="Enter Unquantized HF Model Name (e.g. mistralai/Mistral-7B-v0.1)",
    )
    context = gr.Number(
        minimum=1, value=default_context, label="Desired Context Size (Tokens)"
    )
    quant = gr.Dropdown(
        choices=list(quants.keys()),
        value=default_quant,
        allow_custom_value=True,
        label="Enter GGUF Quant (e.g. Q4_K_S) or the specific BPW for other quantization schemes such as exl2 (e.g. 4.5)",
    )
    btn = gr.Button(value="Submit", variant="primary")
    btn.click(
        calc,
        inputs=[
            model,
            context,
            quant,
        ],
        outputs=[
            gr.Number(
                label="Model Size (GB)",
                value=default_size[0],
            ),
            gr.Number(
                label="Context Size (GB)",
                value=default_size[1],
            ),
            gr.Number(
                label="Total Size (GB)",
                value=default_size[2],
            ),
        ],
    )

app.launch()
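
# To run locally (assuming this file is saved as app.py and gradio/requests
# are installed): `python app.py`, then open the local URL Gradio prints.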