Add support for EXL2 4 bit KV cache; switch from metric gigabytes (1e9 bytes) to JEDEC gigabytes (2^30 bytes)
#2
by
mo137
- opened
- index.html +17 -17
index.html
CHANGED
|
@@ -128,19 +128,16 @@
|
|
| 128 |
return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
|
| 129 |
}
|
| 130 |
|
| 131 |
-
function kvCache(context=8192, model_config,
|
| 132 |
const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
|
| 133 |
const n_embd_gqa = model_config["hidden_size"] / n_gqa
|
| 134 |
const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
|
| 135 |
const size = 2 * n_elements
|
| 136 |
-
|
| 137 |
-
return size
|
| 138 |
-
}
|
| 139 |
-
return size * 2
|
| 140 |
}
|
| 141 |
|
| 142 |
-
function contextSize(context=8192, model_config, bsz=512,
|
| 143 |
-
return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config,
|
| 144 |
}
|
| 145 |
|
| 146 |
function modelSize(model_config, bpw=4.5) {
|
|
@@ -152,22 +149,22 @@
|
|
| 152 |
const model_config = await modelConfig(document.getElementById("modelsearch").value)
|
| 153 |
const context = parseInt(document.getElementById("contextsize").value)
|
| 154 |
let bsz = 512
|
| 155 |
-
let
|
| 156 |
let bpw = 0
|
| 157 |
if (format === "gguf") {
|
| 158 |
bsz = parseInt(document.getElementById("batchsize").value)
|
| 159 |
bpw = gguf_quants[document.getElementById("quantsize").innerText]
|
| 160 |
|
| 161 |
} else if (format == "exl2") {
|
| 162 |
-
|
| 163 |
bpw = Number.parseFloat(document.getElementById("bpw").value)
|
| 164 |
}
|
| 165 |
|
| 166 |
const model_size = modelSize(model_config, bpw)
|
| 167 |
-
const context_size = contextSize(context, model_config, bsz,
|
| 168 |
-
const total_size = ((model_size + context_size) /
|
| 169 |
-
document.getElementById("resultmodel").innerText = (model_size /
|
| 170 |
-
document.getElementById("resultcontext").innerText = (context_size /
|
| 171 |
const result_total_el = document.getElementById("resulttotal");
|
| 172 |
result_total_el.innerText = total_size.toFixed(2)
|
| 173 |
|
|
@@ -401,13 +398,16 @@
|
|
| 401 |
class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
|
| 402 |
>
|
| 403 |
<label
|
| 404 |
-
for="
|
| 405 |
class="inline-block bg-white text-xs font-medium text-gray-900"
|
| 406 |
>
|
| 407 |
-
|
| 408 |
</label>
|
| 409 |
-
<
|
| 410 |
-
|
|
|
|
|
|
|
|
|
|
| 411 |
</div>
|
| 412 |
</div>
|
| 413 |
</div>
|
|
|
|
| 128 |
return (context / 1024 * 2 + 0.75) * model_config["num_attention_heads"] * 1024 * 1024
|
| 129 |
}
|
| 130 |
|
| 131 |
+
/**
 * Estimate the size of the key/value cache for a given context length.
 *
 * The result is (element count) * (cache_bit / 8), i.e. a byte count when
 * cache_bit is the number of bits stored per cached element. The caller in
 * this file divides the summed sizes by 2**30 before displaying them.
 *
 * @param {number} [context=8192] - Context length in tokens.
 * @param {Object} model_config - Model configuration; this function reads
 *   "num_attention_heads", "num_key_value_heads", "hidden_size" and
 *   "num_hidden_layers" from it.
 * @param {number} [cache_bit=16] - Bits per cached element (the UI in this
 *   change offers 16, 8 and 4).
 * @returns {number} Estimated cache size.
 *
 * NOTE(review): if model_config has no "num_key_value_heads" entry, the first
 * division produces NaN and the whole result becomes NaN; confirm that every
 * config passed in carries that key.
 */
function kvCache(context=8192, model_config, cache_bit=16) {
|
| 132 |
// Ratio of attention heads to key/value heads.
const n_gqa = model_config["num_attention_heads"] / model_config["num_key_value_heads"]
|
| 133 |
// Hidden size scaled down by that ratio.
const n_embd_gqa = model_config["hidden_size"] / n_gqa
|
| 134 |
// One entry of that width per layer per context position.
const n_elements = n_embd_gqa * (model_config["num_hidden_layers"] * context)
|
| 135 |
// Factor of 2: presumably one copy for keys and one for values (TODO confirm).
const size = 2 * n_elements
|
| 136 |
+
// cache_bit / 8 converts bits per element into bytes per element.
return size * (cache_bit / 8)
|
|
|
|
|
|
|
|
|
|
| 137 |
}
|
| 138 |
|
| 139 |
+
/**
 * Total context-related memory estimate: the sum of inputBuffer(), kvCache()
 * and computeBuffer() for the same context and model configuration.
 *
 * @param {number} [context=8192] - Context length, forwarded to all three helpers.
 * @param {Object} model_config - Model configuration, forwarded to all three helpers.
 * @param {number} [bsz=512] - Batch size, forwarded to inputBuffer() and computeBuffer().
 * @param {number} [cache_bit=16] - Bits per KV-cache element, forwarded to kvCache().
 * @returns {number} The sum rounded to two decimal places.
 */
function contextSize(context=8192, model_config, bsz=512, cache_bit=16) {
|
| 140 |
+
// toFixed(2) yields a string; Number.parseFloat turns it back into a number.
return Number.parseFloat((inputBuffer(context, model_config, bsz) + kvCache(context, model_config, cache_bit) + computeBuffer(context, model_config, bsz)).toFixed(2))
|
| 141 |
}
|
| 142 |
|
| 143 |
function modelSize(model_config, bpw=4.5) {
|
|
|
|
| 149 |
const model_config = await modelConfig(document.getElementById("modelsearch").value)
|
| 150 |
const context = parseInt(document.getElementById("contextsize").value)
|
| 151 |
let bsz = 512
|
| 152 |
+
let cache_bit = 16
|
| 153 |
let bpw = 0
|
| 154 |
if (format === "gguf") {
|
| 155 |
bsz = parseInt(document.getElementById("batchsize").value)
|
| 156 |
bpw = gguf_quants[document.getElementById("quantsize").innerText]
|
| 157 |
|
| 158 |
} else if (format == "exl2") {
|
| 159 |
+
cache_bit = Number.parseInt(document.getElementById("kvCache").value)
|
| 160 |
bpw = Number.parseFloat(document.getElementById("bpw").value)
|
| 161 |
}
|
| 162 |
|
| 163 |
const model_size = modelSize(model_config, bpw)
|
| 164 |
+
const context_size = contextSize(context, model_config, bsz, cache_bit)
|
| 165 |
+
const total_size = ((model_size + context_size) / 2**30)
|
| 166 |
+
document.getElementById("resultmodel").innerText = (model_size / 2**30).toFixed(2)
|
| 167 |
+
document.getElementById("resultcontext").innerText = (context_size / 2**30).toFixed(2)
|
| 168 |
const result_total_el = document.getElementById("resulttotal");
|
| 169 |
result_total_el.innerText = total_size.toFixed(2)
|
| 170 |
|
|
|
|
| 398 |
class="w-fit p-3 h-full flex items-center gap-2 justify-center rounded-md border-0 text-gray-900 shadow-sm ring-1 ring-inset ring-gray-300 placeholder:text-gray-400 focus:ring-2 focus:ring-inset focus:ring-indigo-600 sm:text-sm sm:leading-6"
|
| 399 |
>
|
| 400 |
<label
|
| 401 |
+
for="kvCache"
|
| 402 |
class="inline-block bg-white text-xs font-medium text-gray-900"
|
| 403 |
>
|
| 404 |
+
KV Cache
|
| 405 |
</label>
|
| 406 |
+
<select id="kvCache" name="kvCache">
|
| 407 |
+
<option value="16">16 bit</option>
|
| 408 |
+
<option value="8">8 bit</option>
|
| 409 |
+
<option value="4">4 bit</option>
|
| 410 |
+
</select>
|
| 411 |
</div>
|
| 412 |
</div>
|
| 413 |
</div>
|