mo137 committed on
Commit
c8c7129
1 Parent(s): 9243531

Add support for EXL2 4 bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes)

Browse files

I'm sorry for mashing 2 unrelated issues together in one PR.

1. I changed the 8 bit cache checkbox to a drop-down that defaults to 16 bit cache, but 8 bit or 4 bit can be selected. The calculation now uses an int value instead of a conditional statement.

2. (Concerns lines **168-170** only) Your calculator over-estimated the memory use because it used metric gigabytes, equal to 1e9 bytes. But VRAM is measured in JEDEC Standard 100B.01 gigabytes, equal to 2^30 bytes. An RTX 4090 has 24 GB = 25.77e9 B of memory. This 7.4% difference may seem insignificant, but it matters when figuring out how big a model you can squeeze into your GPU. For instance, 22.5 GB is equal to 24.16e9 B. The first number suggests that the model will fit in 24 GB of VRAM; the second implies it won't.

Files changed (1) hide show
  1. index.html +17 -17
index.html CHANGED
@@ -128,19 +128,16 @@
128
  return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
129
  }
130
 
131
- function kvCache(context=8192, model_config, fp8_cache=false) {
132
  const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
133
  const n_embd_gqa = model_config["hidden_size"] / n_gqa
134
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
135
  const size = 2 * n_elements
136
- if (fp8_cache) {
137
- return size
138
- }
139
- return size * 2
140
  }
141
 
142
- function contextSize(context=8192, model_config, bsz=512, fp8_cache=false) {
143
- return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, fp8_cache) + computeBuffer(context, model_config, bsz)).toFixed(2))
144
  }
145
 
146
  function modelSize(model_config, bpw=4.5) {
@@ -152,22 +149,22 @@
152
  const model_config = await modelConfig(document.getElementById("modelsearch").value)
153
  const context = parseInt(document.getElementById("contextsize").value)
154
  let bsz = 512
155
- let fp8_cache = false
156
  let bpw = 0
157
  if (format === "gguf") {
158
  bsz = parseInt(document.getElementById("batchsize").value)
159
  bpw = gguf_quants[document.getElementById("quantsize").innerText]
160
 
161
  } else if (format == "exl2") {
162
- fp8_cache = document.getElementById("fp8cache").checked
163
  bpw = Number.parseFloat(document.getElementById("bpw").value)
164
  }
165
 
166
  const model_size = modelSize(model_config, bpw)
167
- const context_size = contextSize(context, model_config, bsz, fp8_cache)
168
- const total_size = ((model_size + context_size) / 1e+9)
169
- document.getElementById("resultmodel").innerText = (model_size / 1e+9).toFixed(2)
170
- document.getElementById("resultcontext").innerText = (context_size / 1e+9).toFixed(2)
171
  const result_total_el = document.getElementById("resulttotal");
172
  result_total_el.innerText = total_size.toFixed(2)
173
 
@@ -401,13 +398,16 @@
401
  class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
402
  >
403
  <label
404
- for="fp8cache"
405
  class="inline-block bg-white text-xs font-medium text-gray-900"
406
  >
407
- FP8 Cache
408
  </label>
409
- <input id="fp8cache" type="checkbox">
410
- </input>
 
 
 
411
  </div>
412
  </div>
413
  </div>
 
128
  return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
129
  }
130
 
131
+ function kvCache(context=8192, model_config, cache_bit=16) {
132
  const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
133
  const n_embd_gqa = model_config["hidden_size"] / n_gqa
134
  const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
135
  const size = 2 * n_elements
136
+ return size * (cache_bit / 8)
 
 
 
137
  }
138
 
139
+ function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
140
+ return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
141
  }
142
 
143
  function modelSize(model_config, bpw=4.5) {
 
149
  const model_config = await modelConfig(document.getElementById("modelsearch").value)
150
  const context = parseInt(document.getElementById("contextsize").value)
151
  let bsz = 512
152
+ let cache_bit = 16
153
  let bpw = 0
154
  if (format === "gguf") {
155
  bsz = parseInt(document.getElementById("batchsize").value)
156
  bpw = gguf_quants[document.getElementById("quantsize").innerText]
157
 
158
  } else if (format == "exl2") {
159
+ cache_bit = Number.parseInt(document.getElementById("kvCache").value)
160
  bpw = Number.parseFloat(document.getElementById("bpw").value)
161
  }
162
 
163
  const model_size = modelSize(model_config, bpw)
164
+ const context_size = contextSize(context, model_config, bsz, cache_bit)
165
+ const total_size = ((model_size + context_size) / 2**30)
166
+ document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
167
+ document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
168
  const result_total_el = document.getElementById("resulttotal");
169
  result_total_el.innerText = total_size.toFixed(2)
170
 
 
398
  class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
399
  >
400
  <label
401
+ for="kvCache"
402
  class="inline-block bg-white text-xs font-medium text-gray-900"
403
  >
404
+ KV Cache
405
  </label>
406
+ <select id="kvCache" name="kvCache">
407
+ <option value="16">16 bit</option>
408
+ <option value="8">8 bit</option>
409
+ <option value="4">4 bit</option>
410
+ </select>
411
  </div>
412
  </div>
413
  </div>