pankajroark committed on
Commit
2319002
1 Parent(s): 18b067e

inflight batching engine for 7b-sq-int8kv-tp1

Browse files
7b-sq-int8kv-tp1/config.json CHANGED
@@ -24,7 +24,7 @@
24
  "plugin_config": {
25
  "attention_qk_half_accumulation": false,
26
  "bert_attention_plugin": false,
27
- "context_fmha_type": 0,
28
  "gemm_plugin": "float16",
29
  "gpt_attention_plugin": "float16",
30
  "identity_plugin": false,
@@ -32,14 +32,14 @@
32
  "layernorm_quantization_plugin": false,
33
  "lookup_plugin": false,
34
  "nccl_plugin": false,
35
- "paged_kv_cache": false,
36
  "quantize_per_token_plugin": true,
37
  "quantize_tensor_plugin": true,
38
- "remove_input_padding": false,
39
  "rmsnorm_plugin": false,
40
  "rmsnorm_quantization_plugin": "float16",
41
  "smooth_quant_gemm_plugin": "float16",
42
- "tokens_per_block": 0,
43
  "use_custom_all_reduce": false,
44
  "weight_only_groupwise_quant_matmul_plugin": false,
45
  "weight_only_quant_matmul_plugin": false
 
24
  "plugin_config": {
25
  "attention_qk_half_accumulation": false,
26
  "bert_attention_plugin": false,
27
+ "context_fmha_type": 1,
28
  "gemm_plugin": "float16",
29
  "gpt_attention_plugin": "float16",
30
  "identity_plugin": false,
 
32
  "layernorm_quantization_plugin": false,
33
  "lookup_plugin": false,
34
  "nccl_plugin": false,
35
+ "paged_kv_cache": true,
36
  "quantize_per_token_plugin": true,
37
  "quantize_tensor_plugin": true,
38
+ "remove_input_padding": true,
39
  "rmsnorm_plugin": false,
40
  "rmsnorm_quantization_plugin": "float16",
41
  "smooth_quant_gemm_plugin": "float16",
42
+ "tokens_per_block": 64,
43
  "use_custom_all_reduce": false,
44
  "weight_only_groupwise_quant_matmul_plugin": false,
45
  "weight_only_quant_matmul_plugin": false
7b-sq-int8kv-tp1/llama_float16_tp1_rank0.engine CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3bd67a974e1ba09ce35f4de7d03010dfdb4089e72aec6a74c1c2e3714c0bca1
3
- size 7006262500
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55022cb34074c613e09f1fd4d42109c0a375a04bf71d02106e92ef47c2fc795f
3
+ size 7006227084
7b-sq-int8kv-tp1/model.cache CHANGED
Binary files a/7b-sq-int8kv-tp1/model.cache and b/7b-sq-int8kv-tp1/model.cache differ