Spaces:

SadP0i
/

GGUF-Model-VRAM-Calculator

Running

App Files Files Community

GGUF-Model-VRAM-Calculator / index.html

SadP0i

Upload 3 files

bb71914 verified 8 months ago

raw

history blame

21.6 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="UTF-8" />
	<meta name="viewport" content="width=device-width, initial-scale=1.0" />
	<script>
	/**
	 * Parse a string of HTML markup into a document.
	 *
	 * @param {string} str - markup to parse
	 * @returns {Document} result of DOMParser.parseFromString with the "text/html" type
	 */
	function strToHtml(str) {
		return new DOMParser().parseFromString(str, "text/html");
	}

	//Short, jQuery-independent function to read html table and write them into an Array.
	//Kudos to RobG at StackOverflow
	/**
	 * Convert a table into an array of plain objects, one per data row.
	 *
	 * The first row supplies the property names; every following row becomes
	 * one object keyed by those names. All cell text is trimmed.
	 *
	 * @param {{rows: ArrayLike<{cells: ArrayLike<{textContent?: string, innerText?: string}>}>}} table
	 *   an HTMLTableElement, or any object exposing the same rows/cells shape
	 * @returns {Object[]} one object per row after the first; an empty array
	 *   when the table has no rows
	 */
	function tableToObj(table) {
		const rows = table.rows;
		// An empty table has no header row to read property names from.
		if (!rows || rows.length === 0) {
			return [];
		}

		// Trimmed text of a cell; a missing cell or missing text yields "".
		const cellText = function (cell) {
			if (!cell) {
				return "";
			}
			return (cell.textContent || cell.innerText || "").trim();
		};

		// Use the first row for the property names.
		const propCells = rows[0].cells;
		const propNames = [];
		for (let i = 0; i < propCells.length; i++) {
			propNames.push(cellText(propCells[i]));
		}

		// Use the remaining rows for data. Rows shorter than the header get
		// empty strings for the missing columns instead of throwing.
		const results = [];
		for (let j = 1; j < rows.length; j++) {
			const cells = rows[j].cells;
			const obj = {};
			for (let k = 0; k < propNames.length; k++) {
				obj[propNames[k]] = cellText(cells[k]);
			}
			results.push(obj);
		}
		return results;
	}

	/**
	 * Build a display label for each GPU record.
	 *
	 * @param {Object[]} gpus - records with "Product Name" and "Memory" string fields
	 * @returns {string[]} labels of the form "<Product Name> - <Memory text before the first comma>"
	 */
	function formatGpu(gpus) {
		return gpus.map((gpu) => {
			const productName = gpu["Product Name"];
			// Only the part of the memory description before the first comma is shown.
			const memoryParts = gpu["Memory"].split(",");
			return `${productName} - ${memoryParts[0]}`;
		});
	}

	// Lookup table from GGUF quantization name to the bits-per-weight figure
	// used when estimating the size of the quantized model.
	const gguf_quants = {
		Q2_K: 3.35,
		Q3_K_S: 3.5,
		Q3_K_M: 3.91,
		Q3_K_L: 4.27,
		Q4_0: 4.55,
		Q4_K_S: 4.58,
		Q4_K_M: 4.85,
		Q5_0: 5.54,
		Q5_K_S: 5.54,
		Q5_K_M: 5.69,
		Q6_K: 6.59,
		Q8_0: 8.5,
	};

	/**
	 * Fetch a model's config.json from huggingface.co and attach a size
	 * figure to it as `config.parameters`.
	 *
	 * The size is looked up in this order:
	 *   1. `metadata.total_size / 2` from model.safetensors.index.json
	 *   2. `metadata.total_size / 2` from pytorch_model.bin.index.json
	 *   3. the `safetensors.total` value embedded in the model page's
	 *      `data-props` attributes, fetched through corsproxy.io
	 * NOTE(review): the division by 2 presumably converts a byte total into a
	 * parameter count assuming 2-byte weights — confirm for non-16-bit checkpoints.
	 *
	 * @param {string} hf_model - repository id, e.g. "org/model-name"
	 * @returns {Promise<Object>} the parsed config.json with `parameters` added
	 * @throws {Error} when config.json cannot be fetched, or when none of the
	 *   three sources yields a size
	 */
	async function modelConfig(hf_model) {
		const config_response = await fetch(
			`https://huggingface.co/${hf_model}/raw/main/config.json`
		)
		// Fail with a readable message instead of a JSON parse error (or an
		// error payload being treated as the config) for missing repositories.
		if (!config_response.ok) {
			throw new Error(`Could not load config.json for "${hf_model}" (HTTP ${config_response.status})`)
		}
		const config = await config_response.json()

		// Read metadata.total_size from a weight index file and halve it.
		// Throws when the file is missing, is not JSON, or carries no usable size.
		const sizeFromIndex = async (index_file) => {
			const index = await fetch(
				`https://huggingface.co/${hf_model}/resolve/main/${index_file}`
			).then(r => r.json())
			const size = index["metadata"]["total_size"] / 2
			if (Number.isNaN(size)) {
				throw new Error(`no size in ${index_file} metadata`)
			}
			return size
		}

		let model_size = 0
		try {
			model_size = await sizeFromIndex("model.safetensors.index.json")
		} catch {
			try {
				model_size = await sizeFromIndex("pytorch_model.bin.index.json")
			} catch {
				// Last resort: scrape the model page (via a CORS proxy).
				const model_page = await fetch(
					"https://corsproxy.io/?" + encodeURIComponent(`https://huggingface.co/${hf_model}`)
				).then(r => r.text())
				// Parse into a separate document rather than assigning innerHTML
				// on an element owned by this page, so markup coming back from
				// the third-party proxy is never attached to the live document.
				const page = new DOMParser().parseFromString(model_page, "text/html")
				const params_el = page.querySelector('div[data-target="ModelSafetensorsParams"]')
				if (params_el !== null) {
					model_size = JSON.parse(params_el.getAttribute("data-props"))["safetensors"]["total"]
				} else {
					const header_el = page.querySelector('div[data-target="ModelHeader"]')
					if (header_el === null) {
						throw new Error(`Could not determine the parameter count for "${hf_model}"`)
					}
					model_size = JSON.parse(header_el.getAttribute("data-props"))["model"]["safetensors"]["total"]
				}
			}
		}
		config.parameters = model_size
		return config
	}

	/**
	 * Sum the element counts of the llama.cpp input tensors.
	 *
	 * Mirrors the allocations in github:ggerganov/llama.cpp/llama.cpp:11248:
	 *   inp_tokens  : 1d I32, n_batch
	 *   inp_embd    : 2d F32, n_embd x n_batch
	 *   inp_pos     : 1d I32, n_batch
	 *   inp_KQ_mask : 2d F32, n_ctx x n_batch
	 *   inp_K_shift : 1d I32, n_ctx
	 *   inp_sum     : 2d F32, 1 x n_batch
	 * where n_embd is the hidden size (github:ggerganov/llama.cpp/convert.py:248).
	 *
	 * NOTE(review): the result is a count of elements; no per-element byte
	 * size is applied here — confirm that is what callers expect.
	 *
	 * @param {number} context - context length (n_ctx), default 8192
	 * @param {Object} model_config - model config; reads "hidden_size"
	 * @param {number} bsz - batch size (n_batch), default 512
	 * @returns {number} total number of elements across the six tensors
	 */
	function inputBuffer(context=8192, model_config, bsz=512) {
		const n_embd = model_config["hidden_size"]
		const tensor_elements = [
			bsz,            // inp_tokens
			n_embd * bsz,   // inp_embd
			bsz,            // inp_pos
			context * bsz,  // inp_KQ_mask
			context,        // inp_K_shift
			bsz,            // inp_sum
		]
		// Left-to-right sum, seeded with the first entry.
		return tensor_elements.reduce((total, count) => total + count)
	}

	/**
	 * Estimate the compute buffer size in bytes.
	 *
	 * The formula is (context / 1024 * 2 + 0.75) MiB per attention head and
	 * does not depend on the batch size; when a batch size other than 512 is
	 * passed, the user is warned through alert() and the same formula is used.
	 *
	 * @param {number} context - context length, default 8192
	 * @param {Object} model_config - model config; reads "num_attention_heads"
	 * @param {number} bsz - batch size, default 512 (only used for the warning)
	 * @returns {number} estimated compute buffer size in bytes
	 */
	function computeBuffer(context=8192, model_config, bsz=512) {
		// Number() keeps numeric strings such as "512" from triggering the warning.
		if (Number(bsz) !== 512) {
			alert("Batch sizes other than 512 are currently not supported for the compute buffer; using batch size 512 for the compute buffer calculation. The end result will be an overestimation.")
		}
		return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
	}

	/**
	 * Estimate the size of the key/value cache in bytes.
	 *
	 * Per layer and per context position, the cache holds a key and a value
	 * vector of hidden_size / (num_attention_heads / num_key_value_heads)
	 * elements, each cache_bit bits wide.
	 *
	 * @param {number} context - context length, default 8192
	 * @param {Object} model_config - model config; reads "num_attention_heads",
	 *   "num_key_value_heads" (optional), "hidden_size" and "num_hidden_layers"
	 * @param {number} cache_bit - bits per cached element, default 16
	 * @returns {number} estimated cache size in bytes
	 */
	function kvCache(context=8192, model_config, cache_bit=16) {
		const n_head = model_config["num_attention_heads"]
		// Configs without grouped-query attention may omit num_key_value_heads
		// (or set it to null); every attention head then has its own K/V head.
		// Without this fallback the whole estimate would become NaN.
		const n_head_kv = model_config["num_key_value_heads"] == null
			? n_head
			: model_config["num_key_value_heads"]
		const n_gqa = n_head / n_head_kv
		const n_embd_gqa = model_config["hidden_size"] / n_gqa
		const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
		// Factor 2: one tensor for keys and one for values.
		const size = 2 * n_elements
		return size * (cache_bit / 8)
	}

	/**
	 * Total context-dependent memory estimate: input buffer + KV cache +
	 * compute buffer, rounded to two decimal places.
	 *
	 * @param {number} context - context length, default 8192
	 * @param {Object} model_config - model config passed through to the three estimators
	 * @param {number} bsz - batch size, default 512
	 * @param {number} cache_bit - bits per KV cache element, default 16
	 * @returns {number} the rounded sum of the three estimates
	 */
	function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
		const input_part = inputBuffer(context, model_config, bsz)
		const cache_part = kvCache(context, model_config, cache_bit)
		const compute_part = computeBuffer(context, model_config, bsz)
		const combined = input_part + cache_part + compute_part
		return Number.parseFloat(combined.toFixed(2))
	}

	/**
	 * Size of the quantized model in bytes, rounded to two decimal places.
	 *
	 * @param {Object} model_config - model config; reads "parameters"
	 * @param {number} bpw - bits per weight, default 4.5
	 * @returns {number} parameters * bpw / 8, rounded to two decimals
	 */
	function modelSize(model_config, bpw=4.5) {
		const total_bits = model_config["parameters"] * bpw
		const rounded_bytes = (total_bits / 8).toFixed(2)
		return Number.parseFloat(rounded_bytes)
	}

	/**
	 * Read the form inputs, estimate the model and context memory, and write
	 * the results back into the page.
	 *
	 * Reads:  #modelsearch, #contextsize, #batchsize, #quantsize, #maxvram
	 * Writes: #resultmodel, #resultcontext, #resulttotal (text, in units of
	 *         2**30 bytes, plus a background color), #layersize, #layersoffload
	 *
	 * The total is colored green when it leaves more than 0.5 of headroom
	 * below the max VRAM value, yellow when it still fits, and red otherwise.
	 * Any error is reported to the user through alert().
	 *
	 * @param {string} format - currently ignored; the calculation always runs as "gguf"
	 * @returns {Promise<void>}
	 */
	async function calculateSizes(format) {
		// The format is forced to "gguf" regardless of the argument, so the
		// exl2 branch below is currently unreachable.
		format = "gguf"

		try {
			const model_config = await modelConfig(document.getElementById("modelsearch").value)
			const context = Number.parseInt(document.getElementById("contextsize").value, 10)
			if (Number.isNaN(context)) {
				throw new Error("Context size must be a number")
			}
			let bsz = 512
			let cache_bit = 16
			let bpw = 0
			if (format === "gguf") {
				bsz = Number.parseInt(document.getElementById("batchsize").value, 10)
				if (Number.isNaN(bsz)) {
					throw new Error("Batch size must be a number")
				}
				// The button's text may carry surrounding whitespace from the markup.
				const quant = document.getElementById("quantsize").innerText.trim()
				bpw = gguf_quants[quant]
				if (bpw === undefined) {
					throw new Error(`Unknown quantization "${quant}"`)
				}
			} else if (format === "exl2") {
				cache_bit = Number.parseInt(document.getElementById("kvCache").value, 10)
				bpw = Number.parseFloat(document.getElementById("bpw").value)
			}

			// parseFloat so that fractional values such as 23.5 are not truncated.
			const allocated_vram = Number.parseFloat(document.getElementById("maxvram").value)
			if (Number.isNaN(allocated_vram)) {
				throw new Error("Max allocated VRAM must be a number")
			}

			const GIB = 2**30
			const model_size = modelSize(model_config, bpw)
			const context_size = contextSize(context, model_config, bsz, cache_bit)
			const total_size = (model_size + context_size) / GIB
			document.getElementById("resultmodel").innerText = (model_size / GIB).toFixed(2)
			document.getElementById("resultcontext").innerText = (context_size / GIB).toFixed(2)
			const result_total_el = document.getElementById("resulttotal")
			result_total_el.innerText = total_size.toFixed(2)

			// Color the total by how much headroom is left.
			const headroom = allocated_vram - total_size
			if (headroom > 0.5) {
				result_total_el.style.backgroundColor = "#bef264"
			} else if (headroom > 0) {
				result_total_el.style.backgroundColor = "#facc15"
			} else {
				result_total_el.style.backgroundColor = "#ef4444"
			}

			// Average size of one layer: the model size spread evenly over the layers.
			const n_layers = model_config["num_hidden_layers"]
			const layer_size = (model_size / GIB) / n_layers
			document.getElementById("layersize").innerText = layer_size.toFixed(2)

			// Layers that fit into what remains after the context memory,
			// clamped to the range [0, n_layers].
			const layers_offload = Math.floor((allocated_vram - (context_size / GIB)) / layer_size)
			document.getElementById("layersoffload").innerText =
				layers_offload > n_layers ? n_layers : Math.max(0, layers_offload)
		} catch(e) {
			alert(e);
		}
	}
	</script>
	<link href="./styles.css" rel="stylesheet">
	<title>Can I split it? - GGUF VRAM Calculator</title>
	</head>
	<body class="p-8">
	<div x-data="{ format: 'gguf' }" class="flex flex-col max-h-screen items-center mt-16 gap-10">
	<div style="text-align: center;">
	<h1 class="text-xl font-semibold leading-6 text-gray-900">
	GGUF Model, Can I split it?
	</h1>
	<h3 class="font-semibold leading-6 text-gray-900">
	Based on NyxKrage's LLM VRAM calculator
	</h3>
	</div>
	<div class="flex flex-col gap-10">
	<div class="w-auto flex flex-col gap-4">
	<div class="relative">
	<label
	for="maxvram"
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Max Allocated VRAM
	</label>
	<input
	value="24"
	type="number"
	name="maxvram"
	id="maxvram"
	step="1"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	/>
	</div>

	<!-- Model Selector -->


	<div class="flex flex-row gap-4 relative">
	<label
	for="modelsearch"
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Model (unquantized)
	</label>
	<div
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	x-data="{
	open: false,
	value: 'Nexusflow/Starling-LM-7B-beta',
	results: null,
	toggle() {
	if (this.open) {
	return this.close()
	}

	this.$refs.input.focus()

	this.open = true
	},
	close(focusAfter) {
	if (! this.open) return

	this.open = false

	focusAfter && focusAfter.focus()
	}
	}"
	x-on:keydown.escape.prevent.stop="close($refs.input)"
	x-id="['model-typeahead']"
	class="relative"
	>
	<!-- Input -->
	<input
	id="modelsearch"
	x-ref="input"
	x-on:click="toggle()"
	@keypress.debounce.150ms="results = (await
	fetch('https://huggingface.co/api/quicksearch?type=model&q=' +
	encodeURIComponent(value)).then(r => r.json())).models.filter(m => !m.id.includes('GGUF') && !m.id.includes('AWQ') && !m.id.includes('GPTQ') && !m.id.includes('exl2'));"
	:aria-expanded="open"
	:aria-controls="$id('model-typeahead')"
	x-model="value"
	class="flex justify-between items-center gap-2 w-full"
	/>

	<!-- Panel -->
	<div
	x-ref="panel"
	x-show="open"
	x-transition.origin.top.left
	x-on:click.outside="close($refs.input)"
	:id="$id('model-typeahead')"
	style="display: none"
	class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10"
	>
	<template x-for="result in results">
	<a
	@click="value = result.id; close($refs.input)"
	x-text="result.id"
	class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"
	></a>
	</template>
	</div>
	</div>
	</div>


	<!-- Context Size Selector -->
	<div class="relative">
	<label
	for="contextsize"
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Context Size
	</label>
	<input
	value="8192"
	type="number"
	name="contextsize"
	id="contextsize"
	step="1024"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	/>
	</div>
	<!-- GGUF Options -->
	<div x-show="format === 'gguf'" class="relative">
	<div class="flex flex-row gap-4">
	<label
	for="quantsize"
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Quantization Size
	</label>
	<div
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	x-data="{
	open: false,
	value: '',
	toggle() {
	if (this.open) {
	return this.close()
	}

	this.$refs.button.focus()

	this.open = true
	},
	close(focusAfter) {
	if (! this.open) return

	this.open = false

	focusAfter && focusAfter.focus()
	}
	}"
	x-on:keydown.escape.prevent.stop="close($refs.button)"
	x-id="['dropdown-button']"
	class="relative"
	>
	<!-- Button -->
	<button
	x-ref="button"
	x-on:click="toggle()"
	:aria-expanded="open"
	:aria-controls="$id('dropdown-button')"
	type="button"
	id="quantsize"
	x-text="value.length === 0 ? 'Q4_K_S' : value"
	class="flex justify-between items-center gap-2 w-full"
	>
	Q4_K_S

	<!-- Heroicon: chevron-down -->
	<svg
	xmlns="http://www.w3.org/2000/svg"
	class="h-5 w-5 text-gray-400"
	viewBox="0 0 20 20"
	fill="currentColor"
	>
	<path
	fill-rule="evenodd"
	d="M5.293 7.293a1 1 0 011.414 0L10 10.586l3.293-3.293a1 1 0 111.414 1.414l-4 4a1 1 0 01-1.414 0l-4-4a1 1 0 010-1.414z"
	clip-rule="evenodd"
	/>
	</svg>
	</button>

	<!-- Panel -->
	<div
	x-data="{ quants: [
	'Q3_K_S',
	'Q3_K_M',
	'Q3_K_L',
	'Q4_0',
	'Q4_K_S',
	'Q4_K_M',
	'Q5_0',
	'Q5_K_S',
	'Q5_K_M',
	'Q6_K',
	'Q8_0'
	]}"
	x-ref="panel"
	x-show="open"
	x-transition.origin.top.left
	x-on:click.outside="close($refs.button)"
	:id="$id('dropdown-button')"
	style="display: none"
	class="absolute left-0 mt-4 w-full rounded-md bg-white shadow-sm ring-1 ring-inset ring-gray-300 z-10"
	>
	<template x-for="quant in quants">
	<a
	@click="value = quant; close($refs.button)"
	x-text="quant"
	class="flex cursor-pointer items-center gap-2 w-full first-of-type:rounded-t-md last-of-type:rounded-b-md px-4 py-2.5 text-left text-sm hover:bg-gray-500/5 disabled:text-gray-500"
	></a>
	</template>
	</div>
	</div>
	<div class="relative">
	<label
	for="batchsize"
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Batch Size
	</label>
	<input
	value="512"
	type="number"
	step="128"
	id="batchsize"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	/>
	</div>
	</div>
	</div>
	<button
	type="button"
	class="rounded-md bg-slate-800 px-3 py-2 text-sm font-semibold text-white shadow-sm hover:bg-slate-700 focus-visible:outline focus-visible:outline-2 focus-visible:outline-offset-2 focus-visible:outline-indigo-600"
	@click="calculateSizes(format)"
	>
	Submit
	</button>
	</div>
	<div class="w-auto flex flex-col gap-4">
	<div class="relative">
	<label
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Model Size (GB)
	</label>
	<div
	id="resultmodel"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	>4.20</div>
	</div>
	<div class="relative">
	<label
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Context Size (GB)
	</label>
	<div
	id="resultcontext"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	>6.90</div>
	</div>
	<div class="relative">
	<label
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Total Size (GB)
	</label>
	<div
	id="resulttotal"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	>420.69</div>
	</div>
	<div class="relative">
	<label
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Layer size (GB)
	</label>
	<div
	id="layersize"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	>42.69</div>
	</div>
	<div class="relative">
	<label
	class="absolute -top-2 left-2 inline-block bg-white px-1 text-xs font-medium text-gray-900"
	>
	Layers offloaded to GPU
	</label>
	<div
	id="layersoffload"
	class="block w-full rounded-md border-0 p-3 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
	>42</div>
	</div>
	</div>
	</div>
	</div>
	<script
	src="https://cdn.jsdelivr.net/npm/alpinejs@3.x.x/dist/cdn.min.js"
	></script>
	<script defer>
	// Run one calculation as soon as this script executes, so the result
	// fields show values for the default inputs instead of the placeholder
	// numbers written in the markup.
	calculateSizes("gguf")
	</script>
	</body>
	</html>