Spaces:

NyxKrage
/

LLM-Model-VRAM-Calculator

Running

App Files Community

Add support for EXL2 4 bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes)

by mo137 - opened Mar 16, 2024

base: refs/heads/main

←

from: refs/pr/2

Discussion Files changed

+17

-17

Files changed (1) hide show

index.html +17 -17

index.html CHANGED Viewed

@@ -128,19 +128,16 @@
         return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
       }
-      function kvCache(context=8192, model_config, fp8_cache=false) {
         // Estimates the KV-cache size for `context` tokens from the model's config values.
         // The result is later divided by a bytes-per-gigabyte constant by the caller,
         // so it is presumably a byte count — confirm against the rest of the file.
         // Ratio of attention heads to key/value heads.
         const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
         // Hidden size scaled down by that ratio.
         const n_embd_gqa = model_config["hidden_size"] / n_gqa
         // Element count across all hidden layers and all context positions.
         const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
         // Doubled — presumably one set of elements for keys and one for values (verify).
         const size = 2 * n_elements
         // With the FP8 cache flag set the element count is returned as-is;
         // otherwise it is doubled again.
-        if (fp8_cache) {
-          return size
-        }
-        return size * 2
       }
-      function contextSize(context=8192, model_config, bsz=512, fp8_cache=false) {
         // Sums the results of inputBuffer, kvCache and computeBuffer for the given
         // context length, model config and batch size, rounds the sum to two decimal
         // places with toFixed(2), and parses it back into a number.
-        return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, fp8_cache) + computeBuffer(context, model_config, bsz)).toFixed(2))
       }
       function modelSize(model_config, bpw=4.5) {
@@ -152,22 +149,22 @@
           const model_config = await modelConfig(document.getElementById("modelsearch").value)
           const context = parseInt(document.getElementById("contextsize").value)
           let bsz = 512
-          let fp8_cache = false
           let bpw = 0
           if (format === "gguf") {
             bsz = parseInt(document.getElementById("batchsize").value)
             bpw = gguf_quants[document.getElementById("quantsize").innerText]
           } else if (format == "exl2") {
-            fp8_cache = document.getElementById("fp8cache").checked
             bpw = Number.parseFloat(document.getElementById("bpw").value)
           }
           const model_size = modelSize(model_config, bpw)
-          const context_size = contextSize(context, model_config, bsz, fp8_cache)
-          const total_size = ((model_size + context_size) / 1e+9)
-          document.getElementById("resultmodel").innerText = (model_size / 1e+9).toFixed(2)
-          document.getElementById("resultcontext").innerText = (context_size / 1e+9).toFixed(2)
           const result_total_el = document.getElementById("resulttotal");
           result_total_el.innerText = total_size.toFixed(2)
@@ -401,13 +398,16 @@
                 class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
               >
                 <label
-                  for="fp8cache"
                   class="inline-block bg-white text-xs font-medium text-gray-900"
                 >
-                  FP8 Cache
                 </label>
-                <input id="fp8cache" type="checkbox">
-              </input>
               </div>
             </div>
           </div>

         return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
       }
+      function kvCache(context=8192, model_config, cache_bit=16) {
         // Estimates the KV-cache size for `context` tokens from the model's config values.
         // `cache_bit` is the number of bits stored per cache element (default 16).
         // Ratio of attention heads to key/value heads.
         const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
         // Hidden size scaled down by that ratio.
         const n_embd_gqa = model_config["hidden_size"] / n_gqa
         // Element count across all hidden layers and all context positions.
         const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
         // Doubled — presumably one set of elements for keys and one for values (verify).
         const size = 2 * n_elements
         // cache_bit / 8 converts bits per element into bytes per element:
         // 16 -> size * 2, 8 -> size, 4 -> size / 2.
+        return size * (cache_bit / 8)
       }
+      function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
         // Sums the results of inputBuffer, kvCache (at `cache_bit` bits per element)
         // and computeBuffer for the given context length, model config and batch size,
         // rounds the sum to two decimal places with toFixed(2), and parses it back
         // into a number.
+        return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
       }
       function modelSize(model_config, bpw=4.5) {
           const model_config = await modelConfig(document.getElementById("modelsearch").value)
           const context = parseInt(document.getElementById("contextsize").value)
           let bsz = 512
+          let cache_bit = 16
           let bpw = 0
           if (format === "gguf") {
             bsz = parseInt(document.getElementById("batchsize").value)
             bpw = gguf_quants[document.getElementById("quantsize").innerText]
           } else if (format == "exl2") {
+            cache_bit = Number.parseInt(document.getElementById("kvCache").value)
             bpw = Number.parseFloat(document.getElementById("bpw").value)
           }
           const model_size = modelSize(model_config, bpw)
+          const context_size = contextSize(context, model_config, bsz, cache_bit)
+          const total_size = ((model_size + context_size) / 2**30)
+          document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
+          document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
           const result_total_el = document.getElementById("resulttotal");
           result_total_el.innerText = total_size.toFixed(2)
                 class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
               >
                 <label
+                  for="kvCache"
                   class="inline-block bg-white text-xs font-medium text-gray-900"
                 >
+                  KV Cache
                 </label>
+                <select id="kvCache" name="kvCache">
+                  <option value="16">16 bit</option>
+                  <option value="8">8 bit</option>
+                  <option value="4">4 bit</option>
+                </select>
               </div>
             </div>
           </div>