diff --git a/README.md b/README.md index 9b452a0e00dfcbe7dd2daf4e713b0a030b9444bd..6920d0366086c56fbc64e29edbfb41f898c0549f 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,17 @@ pinned: false ## 压缩率 Compress Rate -在 [cc-100](https://huggingface.co/datasets/cc100) 数据集,每个语言取1万条数据,测试不同tokenizer的压缩率。压缩率指标 `g_bytes/b_tokens` +在 [cc-100](https://huggingface.co/datasets/cc100) 数据集,每个语言取1万条数据,测试不同tokenizer的压缩率。 -您可通过以下脚本进行复现 +> 压缩率示例: +llama3扩充了词典,具有更高的压缩比。同样1T字节的简体中文语料,llama分词后是 0.56万亿个token,llama3只需要0.31万亿个token。 + +| tokenizer | vocab_size | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens | +|:-----------------------------|-------------:|-------------------:|-------------------:|-------------------:| +| llama | 32000 | 1.8 | 0.56 | 0.7 | +| llama3 | 128000 | 3.2 | 0.31 | 1.24 | + +可通过以下脚本进行复现 ```sh python utils/compress_rate_util.py ``` @@ -24,92 +32,177 @@ python utils/compress_rate_util.py +
英文压缩率 +在英文数据集 cc100-en 计算压缩率 + +| tokenizer | vocab_size | g_bytes/b_tokens | b_tokens/g_bytes | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens | +|:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| amber | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| aya_101 | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 | +| baichuan | 64000 | 3.74 | 0.27 | 3.65 | 0.27 | 4 | +| baichuan2 | 125696 | 3.89 | 0.26 | 3.8 | 0.26 | 4.17 | +| bert_base_cased | 28996 | 3.64 | 0.27 | 3.55 | 0.28 | 3.89 | +| bert_base_chinese | 21128 | 2.78 | 0.36 | 2.71 | 0.37 | 2.97 | +| bert_base_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 | +| bloom | 250680 | 4.07 | 0.25 | 3.97 | 0.25 | 4.36 | +| byt5_small | 256 | 0.92 | 1.08 | 0.9 | 1.11 | 0.99 | +| character_glm_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 | +| chatglm2_6b | 64794 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 | +| chatglm3_6b | 64798 | 3.62 | 0.28 | 3.54 | 0.28 | 3.88 | +| chatglm_6b | 150344 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 | +| chatyuan_large_v2 | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 | +| chinese_llama | 49953 | 3.59 | 0.28 | 3.51 | 0.28 | 3.85 | +| chinese_llama2 | 55296 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| code_davinci_002 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 | +| crystal_coder | 32000 | 3.68 | 0.27 | 3.59 | 0.28 | 3.94 | +| dbrx_instruct | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 | +| deepseek_coder_33b_instruct | 32000 | 3.64 | 0.27 | 3.56 | 0.28 | 3.9 | +| deepseek_llm_7b_base | 100000 | 3.85 | 0.26 | 3.76 | 0.27 | 4.12 | +| falcon_180b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 | +| falcon_7b | 65024 | 3.99 | 0.25 | 3.9 | 0.26 | 4.27 | +| fastchat_t5_3b | 32000 | 2.16 | 0.46 | 2.11 | 0.47 | 2.31 | +| flan_t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 | +| gemma_7b | 256000 | 3.91 | 0.26 | 3.82 | 0.26 | 4.18 | +| gpt2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 | +| gpt2_chinese | 21128 | 2.67 | 0.37 | 2.61 | 0.38 | 2.86 | +| gpt_35_turbo | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 | +| gpt_4 | 100277 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 | +| gpt_nexo_20b | 50254 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 | +| grok_1 | 131072 | 4.06 | 0.25 | 3.96 | 0.25 | 4.35 | +| internlm2_chat_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 | +| internlm2_math_7b | 92544 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 | +| internlm_chat_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 | +| internlm_xcomposer_7b | 103168 | 3.86 | 0.26 | 3.77 | 0.27 | 4.13 | +| jamba_v0_1 | 65536 | 3.82 | 0.26 | 3.73 | 0.27 | 4.09 | +| kplug | 10261 | 2.66 | 0.38 | 2.6 | 0.38 | 2.85 | +| llama | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| llama2 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| llama3 | 128000 | 4.11 | 0.24 | 4.01 | 0.25 | 4.4 | +| mistral_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 | +| mixtral_8_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 | +| mobilebert_uncased | 30522 | 3.73 | 0.27 | 3.65 | 0.27 | 4 | +| moss | 106029 | 4.08 | 0.25 | 3.98 | 0.25 | 4.36 | +| mt5_large | 250100 | 3.3 | 0.3 | 3.22 | 0.31 | 3.53 | +| olmo_7b | 50280 | 4.04 | 0.25 | 3.94 | 0.25 | 4.32 | +| orion_14b_chat | 84608 | 3.94 | 0.25 | 3.85 | 0.26 | 4.22 | +| phi_1 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 | +| phi_2 | 50257 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 | +| pko_t5_large | 50258 | 1.59 | 0.63 | 1.55 | 0.64 | 1.7 | +| prompt_clue | 32128 | 1.95 | 0.51 | 1.91 | 0.52 | 2.09 | +| qwen1_5_14b_chat | 151643 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 | +| qwen_1_8b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 | +| qwen_72b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 | +| qwen_7b_chat | 151851 | 4.06 | 0.25 | 3.97 | 0.25 | 4.35 | +| roberta_chinese_clue | 8021 | 1.8 | 0.56 | 1.75 | 0.57 | 1.92 | +| skywork_13b_base | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| skywork_13b_math | 65519 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| solar_10_7b | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 | +| starchat_alpha | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 | +| switch_c_2048 | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 | +| t5_base | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 | +| t5_large | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 | +| t5_small | 32100 | 3.61 | 0.28 | 3.53 | 0.28 | 3.87 | +| text_davinci_003 | 50281 | 4.05 | 0.25 | 3.96 | 0.25 | 4.34 | +| tigerbot_13b_chat_v2 | 60512 | 3.67 | 0.27 | 3.58 | 0.28 | 3.93 | +| tigerbot_70b_chat_v4_4k | 65107 | 3.65 | 0.27 | 3.57 | 0.28 | 3.91 | +| wizardcoder_15b_v1 | 49152 | 3.63 | 0.28 | 3.54 | 0.28 | 3.88 | +| wizardcoder_python_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| wizardlm_7b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| wizardmath_70b_v1 | 32000 | 3.56 | 0.28 | 3.47 | 0.29 | 3.81 | +| xlm_roberta | 250002 | 3.49 | 0.29 | 3.41 | 0.29 | 3.74 | +| yi_34b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 | +| yi_6b | 64000 | 3.87 | 0.26 | 3.78 | 0.26 | 4.15 | +| yi_vl34b | 64000 | 3.88 | 0.26 | 3.79 | 0.26 | 4.16 | +| zephyr_7b_beta | 32000 | 3.67 | 0.27 | 3.58 | 0.28 | 3.92 | + +
+
简体中文压缩率 在简体中文数据集 cc100-zh-Hans 计算压缩率 +| tokenizer | vocab_size | g_bytes/b_tokens | b_tokens/g_bytes | t_bytes/t_tokens | t_tokens/t_bytes | n_chars/n_tokens | +|:----------------------------|-------------:|-------------------:|-------------------:|-------------------:|-------------------:|-------------------:| +| amber | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| aya_101 | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 | +| baichuan | 64000 | 3.92 | 0.26 | 3.82 | 0.26 | 1.48 | +| baichuan2 | 125696 | 4.53 | 0.22 | 4.42 | 0.23 | 1.71 | +| bert_base_cased | 28996 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 | +| bert_base_chinese | 21128 | 2.74 | 0.37 | 2.67 | 0.37 | 1.03 | +| bert_base_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 | +| bloom | 250680 | 4.28 | 0.23 | 4.18 | 0.24 | 1.62 | +| byt5_small | 256 | 0.93 | 1.08 | 0.91 | 1.1 | 0.35 | +| character_glm_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 | +| chatglm2_6b | 64794 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 | +| chatglm3_6b | 64798 | 4.2 | 0.24 | 4.1 | 0.24 | 1.59 | +| chatglm_6b | 150344 | 4.65 | 0.22 | 4.54 | 0.22 | 1.76 | +| chatyuan_large_v2 | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 | +| chinese_llama | 49953 | 3.93 | 0.25 | 3.84 | 0.26 | 1.49 | +| chinese_llama2 | 55296 | 3.92 | 0.26 | 3.83 | 0.26 | 1.48 | +| code_davinci_002 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 | +| crystal_coder | 32000 | 1.86 | 0.54 | 1.81 | 0.55 | 0.7 | +| dbrx_instruct | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 | +| deepseek_coder_33b_instruct | 32000 | 3.4 | 0.29 | 3.32 | 0.3 | 1.29 | +| deepseek_llm_7b_base | 100000 | 4.05 | 0.25 | 3.96 | 0.25 | 1.53 | +| falcon_180b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 | +| falcon_7b | 65024 | 2.18 | 0.46 | 2.13 | 0.47 | 0.82 | +| fastchat_t5_3b | 32000 | 13.7 | 0.07 | 13.38 | 0.07 | 5.18 | +| flan_t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 | +| gemma_7b | 256000 | 3.82 | 0.26 | 3.73 | 0.27 | 1.44 | +| gpt2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 | +| gpt2_chinese | 21128 | 2.73 | 0.37 | 2.66 | 0.38 | 1.03 | +| gpt_35_turbo | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 | +| gpt_4 | 100277 | 2.26 | 0.44 | 2.21 | 0.45 | 0.85 | +| gpt_nexo_20b | 50254 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 | +| grok_1 | 131072 | 1.73 | 0.58 | 1.69 | 0.59 | 0.66 | +| internlm2_chat_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 | +| internlm2_math_7b | 92544 | 4.23 | 0.24 | 4.13 | 0.24 | 1.6 | +| internlm_chat_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 | +| internlm_xcomposer_7b | 103168 | 4.23 | 0.24 | 4.14 | 0.24 | 1.6 | +| jamba_v0_1 | 65536 | 2.3 | 0.44 | 2.24 | 0.45 | 0.87 | +| kplug | 10261 | 2.72 | 0.37 | 2.65 | 0.38 | 1.03 | +| llama | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| llama2 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| llama3 | 128000 | 3.28 | 0.3 | 3.2 | 0.31 | 1.24 | +| mistral_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 | +| mixtral_8_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 | +| mobilebert_uncased | 30522 | 2.73 | 0.37 | 2.67 | 0.38 | 1.03 | +| moss | 106029 | 4.4 | 0.23 | 4.3 | 0.23 | 1.66 | +| mt5_large | 250100 | 3.89 | 0.26 | 3.79 | 0.26 | 1.47 | +| olmo_7b | 50280 | 2.01 | 0.5 | 1.96 | 0.51 | 0.76 | +| orion_14b_chat | 84608 | 4.63 | 0.22 | 4.52 | 0.22 | 1.75 | +| phi_1 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 | +| phi_2 | 50257 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 | +| pko_t5_large | 50258 | 0.97 | 1.03 | 0.95 | 1.06 | 0.37 | +| prompt_clue | 32128 | 4.34 | 0.23 | 4.24 | 0.24 | 1.64 | +| qwen1_5_14b_chat | 151643 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 | +| qwen_1_8b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 | +| qwen_72b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 | +| qwen_7b_chat | 151851 | 4.16 | 0.24 | 4.06 | 0.25 | 1.57 | +| roberta_chinese_clue | 8021 | 2.7 | 0.37 | 2.64 | 0.38 | 1.02 | +| skywork_13b_base | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 | +| skywork_13b_math | 65519 | 3.69 | 0.27 | 3.61 | 0.28 | 1.4 | +| solar_10_7b | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 | +| starchat_alpha | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 | +| switch_c_2048 | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 | +| t5_base | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 | +| t5_large | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 | +| t5_small | 32100 | 14.13 | 0.07 | 13.8 | 0.07 | 5.34 | +| text_davinci_003 | 50281 | 1.31 | 0.77 | 1.28 | 0.78 | 0.49 | +| tigerbot_13b_chat_v2 | 60512 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 | +| tigerbot_70b_chat_v4_4k | 65107 | 4.25 | 0.24 | 4.15 | 0.24 | 1.61 | +| wizardcoder_15b_v1 | 49152 | 2.78 | 0.36 | 2.72 | 0.37 | 1.05 | +| wizardcoder_python_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| wizardlm_7b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| wizardmath_70b_v1 | 32000 | 1.84 | 0.54 | 1.8 | 0.56 | 0.7 | +| xlm_roberta | 250002 | 3.96 | 0.25 | 3.86 | 0.26 | 1.5 | +| yi_34b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 | +| yi_6b | 64000 | 4.17 | 0.24 | 4.07 | 0.25 | 1.58 | +| yi_vl34b | 64000 | 4.11 | 0.24 | 4.02 | 0.25 | 1.56 | +| zephyr_7b_beta | 32000 | 2.36 | 0.42 | 2.3 | 0.43 | 0.89 |
-| tokenizer | vocab_size | g_bytes/b_tokens | t_bytes/t_tokens | b_tokens/g_bytes | -|:----------------------------|-------------:|-------------------:|-------------------:|-------------------:| -| amber | 32000 | 1.84 | 1.8 | 0.54 | -| aya_101 | 250100 | 3.89 | 3.79 | 0.26 | -| baichuan | 64000 | 3.92 | 3.82 | 0.26 | -| baichuan2 | 125696 | 4.53 | 4.42 | 0.22 | -| bert_base_cased | 28996 | 2.73 | 2.66 | 0.37 | -| bert_base_chinese | 21128 | 2.74 | 2.67 | 0.37 | -| bert_base_uncased | 30522 | 2.73 | 2.67 | 0.37 | -| bloom | 250680 | 4.28 | 4.18 | 0.23 | -| byt5_small | 256 | 0.93 | 0.91 | 1.08 | -| character_glm_6b | 64794 | 4.2 | 4.1 | 0.24 | -| chatglm2_6b | 64794 | 4.2 | 4.1 | 0.24 | -| chatglm3_6b | 64798 | 4.2 | 4.1 | 0.24 | -| chatglm_6b | 150344 | 4.65 | 4.54 | 0.22 | -| chatyuan_large_v2 | 32128 | 4.34 | 4.24 | 0.23 | -| chinese_llama | 49953 | 3.93 | 3.84 | 0.25 | -| chinese_llama2 | 55296 | 3.92 | 3.83 | 0.26 | -| code_davinci_002 | 50281 | 1.31 | 1.28 | 0.77 | -| crystal_coder | 32000 | 1.86 | 1.81 | 0.54 | -| deepseek_coder_33b_instruct | 32000 | 3.4 | 3.32 | 0.29 | -| deepseek_llm_7b_base | 100000 | 4.05 | 3.96 | 0.25 | -| falcon_180b | 65024 | 2.18 | 2.13 | 0.46 | -| falcon_7b | 65024 | 2.18 | 2.13 | 0.46 | -| fastchat_t5_3b | 32000 | 13.7 | 13.38 | 0.07 | -| flan_t5_base | 32100 | 14.13 | 13.8 | 0.07 | -| gemma_7b | 256000 | 3.82 | 3.73 | 0.26 | -| gpt2 | 50257 | 1.31 | 1.28 | 0.77 | -| gpt2_chinese | 21128 | 2.73 | 2.66 | 0.37 | -| gpt_35_turbo | 100277 | 2.26 | 2.21 | 0.44 | -| gpt_4 | 100277 | 2.26 | 2.21 | 0.44 | -| gpt_nexo_20b | 50254 | 2.01 | 1.96 | 0.5 | -| internlm2_chat_7b | 92544 | 4.23 | 4.13 | 0.24 | -| internlm2_math_7b | 92544 | 4.23 | 4.13 | 0.24 | -| internlm_chat_7b | 103168 | 4.23 | 4.14 | 0.24 | -| internlm_xcomposer_7b | 103168 | 4.23 | 4.14 | 0.24 | -| kplug | 10261 | 2.72 | 2.65 | 0.37 | -| llama | 32000 | 1.84 | 1.8 | 0.54 | -| llama2 | 32000 | 1.84 | 1.8 | 0.54 | -| mistral_7b | 32000 | 2.36 | 2.3 | 0.42 | -| mixtral_8_7b | 32000 | 2.36 | 2.3 | 0.42 | -| mobilebert_uncased | 30522 | 2.73 | 2.67 | 0.37 | -| moss | 106029 | 4.4 | 4.3 | 0.23 | -| mt5_large | 250100 | 3.89 | 3.79 | 0.26 | -| olmo_7b | 50280 | 2.01 | 1.96 | 0.5 | -| orion_14b_chat | 84608 | 4.63 | 4.52 | 0.22 | -| phi_1 | 50257 | 1.31 | 1.28 | 0.77 | -| phi_2 | 50257 | 1.31 | 1.28 | 0.77 | -| pko_t5_large | 50258 | 0.97 | 0.95 | 1.03 | -| prompt_clue | 32128 | 4.34 | 4.24 | 0.23 | -| qwen1_5_14b_chat | 151643 | 4.16 | 4.06 | 0.24 | -| qwen_1_8b_chat | 151851 | 4.16 | 4.06 | 0.24 | -| qwen_72b_chat | 151851 | 4.16 | 4.06 | 0.24 | -| qwen_7b_chat | 151851 | 4.16 | 4.06 | 0.24 | -| roberta_chinese_clue | 8021 | 2.7 | 2.64 | 0.37 | -| skywork_13b_base | 65519 | 3.69 | 3.61 | 0.27 | -| skywork_13b_math | 65519 | 3.69 | 3.61 | 0.27 | -| solar_10_7b | 32000 | 2.36 | 2.3 | 0.42 | -| starchat_alpha | 49152 | 2.78 | 2.72 | 0.36 | -| switch_c_2048 | 32100 | 14.13 | 13.8 | 0.07 | -| t5_base | 32100 | 14.13 | 13.8 | 0.07 | -| t5_large | 32100 | 14.13 | 13.8 | 0.07 | -| t5_small | 32100 | 14.13 | 13.8 | 0.07 | -| text_davinci_003 | 50281 | 1.31 | 1.28 | 0.77 | -| tigerbot_13b_chat_v2 | 60512 | 4.25 | 4.15 | 0.24 | -| tigerbot_70b_chat_v4_4k | 65107 | 4.25 | 4.15 | 0.24 | -| wizardcoder_15b_v1 | 49152 | 2.78 | 2.72 | 0.36 | -| wizardcoder_python_7b_v1 | 32000 | 1.84 | 1.8 | 0.54 | -| wizardlm_7b_v1 | 32000 | 1.84 | 1.8 | 0.54 | -| wizardmath_70b_v1 | 32000 | 1.84 | 1.8 | 0.54 | -| xlm_roberta | 250002 | 3.96 | 3.86 | 0.25 | -| yi_34b | 64000 | 4.17 | 4.07 | 0.24 | -| yi_6b | 64000 | 4.17 | 4.07 | 0.24 | -| yi_vl34b | 64000 | 4.11 | 4.02 | 0.24 | -| zephyr_7b_beta | 32000 | 2.36 | 2.3 | 0.42 | - - -**结论** -larger vocabulary sizes diff --git a/app.py b/app.py index 72d4e5bb6715d07b59d40350aa6c26b4197e36a6..e89e6f4cef0254755b1ed32996ab5177c0bcadd0 100644 --- a/app.py +++ b/app.py @@ -78,13 +78,13 @@ with gr.Blocks(css="css/style.css", title="Tokenizer Arena") as demo: gr.Markdown("Please select corpus and unit of compress rate, get more details at [github](https://github.com/xu-song/tokenizer-arena/). ") with gr.Row(): compress_rate_corpus = gr.CheckboxGroup( - ["cc100-en", "cc100-zh-Hans", "cc100-es", "code"], + ["cc100-en", "cc100-zh-Hans", "cc100-es"], # , "code" value=["cc100-en", "cc100-zh-Hans"], label="corpus", # info="" ) compress_rate_unit = gr.Radio( - ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens"], + ["b_tokens/g_bytes", "g_bytes/b_tokens", "t_tokens/t_bytes", "t_bytes/t_tokens", "n_chars/n_tokens"], value="b_tokens/g_bytes", label="unit", ) diff --git a/stats/README.md b/stats/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/stats/compress_rate/amber.en.json b/stats/compress_rate/amber.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/amber.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/amber.zh-Hans.json b/stats/compress_rate/amber.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/amber.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/aya_101.en.json b/stats/compress_rate/aya_101.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a79af7b29730761f24588b314d0d330a5a95f7 --- /dev/null +++ b/stats/compress_rate/aya_101.en.json @@ -0,0 +1 @@ +{"vocab_size": 250100, "n_bytes": 1124813, "n_tokens": 317881, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/aya_101.zh-Hans.json b/stats/compress_rate/aya_101.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..86a0cf561d02c2e8e0065d494a398b7d8932260e --- /dev/null +++ b/stats/compress_rate/aya_101.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 250100, "n_bytes": 2633047, "n_tokens": 631182, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/baichuan.en.json b/stats/compress_rate/baichuan.en.json new file mode 100644 index 0000000000000000000000000000000000000000..319736dd6b1559c64a6b39813909cccb8296a9f5 --- /dev/null +++ b/stats/compress_rate/baichuan.en.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 280108, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/baichuan.zh-Hans.json b/stats/compress_rate/baichuan.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..8a8ef41d4932a2b78a8b798cdb8a09cb4b0fc640 --- /dev/null +++ b/stats/compress_rate/baichuan.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 626117, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/baichuan2.en.json b/stats/compress_rate/baichuan2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..1539f01488307203d6e022a2a1144e9f57aeb143 --- /dev/null +++ b/stats/compress_rate/baichuan2.en.json @@ -0,0 +1 @@ +{"vocab_size": 125696, "n_bytes": 1124813, "n_tokens": 269011, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/baichuan2.zh-Hans.json b/stats/compress_rate/baichuan2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..cb312131ee90fc3c54fd222f22e31eb8d50342a2 --- /dev/null +++ b/stats/compress_rate/baichuan2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 125696, "n_bytes": 2633047, "n_tokens": 541464, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_cased.en.json b/stats/compress_rate/bert_base_cased.en.json new file mode 100644 index 0000000000000000000000000000000000000000..2a8093fd72d7d8a726a8b55252137ead8ed0671e --- /dev/null +++ b/stats/compress_rate/bert_base_cased.en.json @@ -0,0 +1 @@ +{"vocab_size": 28996, "n_bytes": 1124813, "n_tokens": 288022, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_cased.zh-Hans.json b/stats/compress_rate/bert_base_cased.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..1d9d6a086851393e7a0a9d3b974bd1f571213f9d --- /dev/null +++ b/stats/compress_rate/bert_base_cased.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 28996, "n_bytes": 2633047, "n_tokens": 899709, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_chinese.en.json b/stats/compress_rate/bert_base_chinese.en.json new file mode 100644 index 0000000000000000000000000000000000000000..8d0205fc8d185e7c3169a84e58e9b33c011ae145 --- /dev/null +++ b/stats/compress_rate/bert_base_chinese.en.json @@ -0,0 +1 @@ +{"vocab_size": 21128, "n_bytes": 1124813, "n_tokens": 377068, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_chinese.zh-Hans.json b/stats/compress_rate/bert_base_chinese.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..edfdbda92cbb9e8ab01febedc505e69f47425102 --- /dev/null +++ b/stats/compress_rate/bert_base_chinese.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 21128, "n_bytes": 2633047, "n_tokens": 896599, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_uncased.en.json b/stats/compress_rate/bert_base_uncased.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b77bdaf83aadc81b0b0ad3ee9e373955bd3555f7 --- /dev/null +++ b/stats/compress_rate/bert_base_uncased.en.json @@ -0,0 +1 @@ +{"vocab_size": 30522, "n_bytes": 1124813, "n_tokens": 280575, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/bert_base_uncased.zh-Hans.json b/stats/compress_rate/bert_base_uncased.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..872b373a274914990dfc077b60c07e14779fccbf --- /dev/null +++ b/stats/compress_rate/bert_base_uncased.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 30522, "n_bytes": 2633047, "n_tokens": 898554, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/bloom.en.json b/stats/compress_rate/bloom.en.json new file mode 100644 index 0000000000000000000000000000000000000000..380f6398a5738eef8e9e971f088993f822383f7a --- /dev/null +++ b/stats/compress_rate/bloom.en.json @@ -0,0 +1 @@ +{"vocab_size": 250680, "n_bytes": 1124813, "n_tokens": 257405, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/bloom.zh-Hans.json b/stats/compress_rate/bloom.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ad01e115fbb06990797c1c7c178f1e999fdf27f5 --- /dev/null +++ b/stats/compress_rate/bloom.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 250680, "n_bytes": 2633047, "n_tokens": 573008, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/byt5_small.en.json b/stats/compress_rate/byt5_small.en.json new file mode 100644 index 0000000000000000000000000000000000000000..72134fe3c6522d9dee0f866fd79de1db0b7fd73f --- /dev/null +++ b/stats/compress_rate/byt5_small.en.json @@ -0,0 +1 @@ +{"vocab_size": 256, "n_bytes": 1124813, "n_tokens": 1134813, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/byt5_small.zh-Hans.json b/stats/compress_rate/byt5_small.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..b000742d49b285140e2c7dfcd530e5bea0733350 --- /dev/null +++ b/stats/compress_rate/byt5_small.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 256, "n_bytes": 2633047, "n_tokens": 2643047, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/character_glm_6b.en.json b/stats/compress_rate/character_glm_6b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a757a1d42375090a3faca990c86ab44a7b8af7a6 --- /dev/null +++ b/stats/compress_rate/character_glm_6b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/character_glm_6b.zh-Hans.json b/stats/compress_rate/character_glm_6b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..c3552a17033918dc51107f5a0826cfb686ab5b9a --- /dev/null +++ b/stats/compress_rate/character_glm_6b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chatglm2_6b.en.json b/stats/compress_rate/chatglm2_6b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..384ea5c511b4531418b954635ca1ee814416c7f0 --- /dev/null +++ b/stats/compress_rate/chatglm2_6b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64794, "n_bytes": 1124813, "n_tokens": 289329, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chatglm2_6b.zh-Hans.json b/stats/compress_rate/chatglm2_6b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..c3552a17033918dc51107f5a0826cfb686ab5b9a --- /dev/null +++ b/stats/compress_rate/chatglm2_6b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64794, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chatglm3_6b.en.json b/stats/compress_rate/chatglm3_6b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..6946a597f1bfdbebfdfce690a83955ce8bdff1d7 --- /dev/null +++ b/stats/compress_rate/chatglm3_6b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64798, "n_bytes": 1124813, "n_tokens": 289347, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chatglm3_6b.zh-Hans.json b/stats/compress_rate/chatglm3_6b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..e9db0458133a10f554f19068b04aeab5a68b4a5f --- /dev/null +++ b/stats/compress_rate/chatglm3_6b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64798, "n_bytes": 2633047, "n_tokens": 583646, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chatglm_6b.en.json b/stats/compress_rate/chatglm_6b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..14ce46f374239e8f20004eec8e36fdad07326ed6 --- /dev/null +++ b/stats/compress_rate/chatglm_6b.en.json @@ -0,0 +1 @@ +{"vocab_size": 150344, "n_bytes": 1124813, "n_tokens": 284761, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chatglm_6b.zh-Hans.json b/stats/compress_rate/chatglm_6b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..b3dc0942e55c3f773a123224c3f8fcce005a7ce0 --- /dev/null +++ b/stats/compress_rate/chatglm_6b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 150344, "n_bytes": 2633047, "n_tokens": 527384, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chatyuan_large_v2.en.json b/stats/compress_rate/chatyuan_large_v2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..dce7d8b1e30246491ffac1df4f27231548b1012d --- /dev/null +++ b/stats/compress_rate/chatyuan_large_v2.en.json @@ -0,0 +1 @@ +{"vocab_size": 32128, "n_bytes": 1124813, "n_tokens": 536033, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chatyuan_large_v2.zh-Hans.json b/stats/compress_rate/chatyuan_large_v2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..29d8ad785f9ae43c138de6da4f1e5bc28fc55c28 --- /dev/null +++ b/stats/compress_rate/chatyuan_large_v2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32128, "n_bytes": 2633047, "n_tokens": 564905, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chinese_llama.en.json b/stats/compress_rate/chinese_llama.en.json new file mode 100644 index 0000000000000000000000000000000000000000..48259f60ddc14bc83032b5fce7ac0d7c89872e6d --- /dev/null +++ b/stats/compress_rate/chinese_llama.en.json @@ -0,0 +1 @@ +{"vocab_size": 49953, "n_bytes": 1124813, "n_tokens": 291514, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chinese_llama.zh-Hans.json b/stats/compress_rate/chinese_llama.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..38fe6619b1f98ca1daaf918d2378f0e352e716cd --- /dev/null +++ b/stats/compress_rate/chinese_llama.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 49953, "n_bytes": 2633047, "n_tokens": 623219, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/chinese_llama2.en.json b/stats/compress_rate/chinese_llama2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..7e4c12068cdfafaa5eea675525af7b570aa17087 --- /dev/null +++ b/stats/compress_rate/chinese_llama2.en.json @@ -0,0 +1 @@ +{"vocab_size": 55296, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/chinese_llama2.zh-Hans.json b/stats/compress_rate/chinese_llama2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..db908e707b8f6272dd9350c043061921093dfa27 --- /dev/null +++ b/stats/compress_rate/chinese_llama2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 55296, "n_bytes": 2633047, "n_tokens": 625766, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/code_davinci_002.en.json b/stats/compress_rate/code_davinci_002.en.json new file mode 100644 index 0000000000000000000000000000000000000000..27ad8b617e80415a1c05a199574f4b4bbdeb7231 --- /dev/null +++ b/stats/compress_rate/code_davinci_002.en.json @@ -0,0 +1 @@ +{"vocab_size": 50281, "n_bytes": 1124813, "n_tokens": 258403, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/code_davinci_002.zh-Hans.json b/stats/compress_rate/code_davinci_002.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae85715f9abab2fa1b044de2ea87ffbcb031199 --- /dev/null +++ b/stats/compress_rate/code_davinci_002.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50281, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/crystal_coder.en.json b/stats/compress_rate/crystal_coder.en.json new file mode 100644 index 0000000000000000000000000000000000000000..2ddb08a97c614d528b0a2c8c182ae70b49174238 --- /dev/null +++ b/stats/compress_rate/crystal_coder.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 284627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/crystal_coder.zh-Hans.json b/stats/compress_rate/crystal_coder.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..240f82a179d6aa86b4f9916e7f015e8b73a3b417 --- /dev/null +++ b/stats/compress_rate/crystal_coder.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1320093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/dbrx_instruct.en.json b/stats/compress_rate/dbrx_instruct.en.json new file mode 100644 index 0000000000000000000000000000000000000000..98fec102df7513d5e8668e5a6b178e194457d849 --- /dev/null +++ b/stats/compress_rate/dbrx_instruct.en.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 1124813, "n_tokens": 254985, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/dbrx_instruct.zh-Hans.json b/stats/compress_rate/dbrx_instruct.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..39ba160f0c4a91e82191668aa76e59a00cdfbca8 --- /dev/null +++ b/stats/compress_rate/dbrx_instruct.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 2633047, "n_tokens": 1084939, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/deepseek_coder_33b_instruct.en.json b/stats/compress_rate/deepseek_coder_33b_instruct.en.json new file mode 100644 index 0000000000000000000000000000000000000000..9d2026ff9bcb8fee4655b554bba8fda8e109ae91 --- /dev/null +++ b/stats/compress_rate/deepseek_coder_33b_instruct.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 287408, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json b/stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..bc74e780a2e6b70de76bfec4613dba6eb4cf0c76 --- /dev/null +++ b/stats/compress_rate/deepseek_coder_33b_instruct.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 720577, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/deepseek_llm_7b_base.en.json b/stats/compress_rate/deepseek_llm_7b_base.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5dcb1b5909fbe553793723f555566a5b29c892ba --- /dev/null +++ b/stats/compress_rate/deepseek_llm_7b_base.en.json @@ -0,0 +1 @@ +{"vocab_size": 100000, "n_bytes": 1124813, "n_tokens": 272324, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/deepseek_llm_7b_base.zh-Hans.json b/stats/compress_rate/deepseek_llm_7b_base.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ebe1ef5595510e828bef871ccf4baa911c322d99 --- /dev/null +++ b/stats/compress_rate/deepseek_llm_7b_base.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 100000, "n_bytes": 2633047, "n_tokens": 605081, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/falcon_180b.en.json b/stats/compress_rate/falcon_180b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..9b531d9df44d557754de8ce3a77479bacf92ca16 --- /dev/null +++ b/stats/compress_rate/falcon_180b.en.json @@ -0,0 +1 @@ +{"vocab_size": 65024, "n_bytes": 1124813, "n_tokens": 262509, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/falcon_180b.zh-Hans.json b/stats/compress_rate/falcon_180b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ba1f5512ca44d793c3f9b5915b905947689f0616 --- /dev/null +++ b/stats/compress_rate/falcon_180b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65024, "n_bytes": 2633047, "n_tokens": 1124681, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/falcon_7b.en.json b/stats/compress_rate/falcon_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..9b531d9df44d557754de8ce3a77479bacf92ca16 --- /dev/null +++ b/stats/compress_rate/falcon_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 65024, "n_bytes": 1124813, "n_tokens": 262509, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/falcon_7b.zh-Hans.json b/stats/compress_rate/falcon_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ba1f5512ca44d793c3f9b5915b905947689f0616 --- /dev/null +++ b/stats/compress_rate/falcon_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65024, "n_bytes": 2633047, "n_tokens": 1124681, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/fastchat_t5_3b.en.json b/stats/compress_rate/fastchat_t5_3b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..3431f8959efbe1fa08ebe73df1340d0d0cade2e6 --- /dev/null +++ b/stats/compress_rate/fastchat_t5_3b.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 484941, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/fastchat_t5_3b.zh-Hans.json b/stats/compress_rate/fastchat_t5_3b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..58915b3215187dca95be11b27699250122c80ea6 --- /dev/null +++ b/stats/compress_rate/fastchat_t5_3b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 178974, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/flan_t5_base.en.json b/stats/compress_rate/flan_t5_base.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a7e21bb731a976080d974313aa72dc8c40837d66 --- /dev/null +++ b/stats/compress_rate/flan_t5_base.en.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 1124813, "n_tokens": 290104, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/flan_t5_base.zh-Hans.json b/stats/compress_rate/flan_t5_base.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..0c5d6d6ee81a4c101d6c1f831c25e218f2889359 --- /dev/null +++ b/stats/compress_rate/flan_t5_base.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 2633047, "n_tokens": 173520, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gemma_7b.en.json b/stats/compress_rate/gemma_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a258cfcd4562e39068951d497a69f59a2b76fe32 --- /dev/null +++ b/stats/compress_rate/gemma_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 256000, "n_bytes": 1124813, "n_tokens": 268010, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gemma_7b.zh-Hans.json b/stats/compress_rate/gemma_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..8aeaaab7bae09693dabc285108f70e6de89c73ea --- /dev/null +++ b/stats/compress_rate/gemma_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 256000, "n_bytes": 2633047, "n_tokens": 641795, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gpt2.en.json b/stats/compress_rate/gpt2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..e96328fcbfdc5731dc497ef4f5628c8c20cc6f7f --- /dev/null +++ b/stats/compress_rate/gpt2.en.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 1124813, "n_tokens": 258428, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gpt2.zh-Hans.json b/stats/compress_rate/gpt2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8fdf75f1fe9a7047bb545fbe0857625b8f7899 --- /dev/null +++ b/stats/compress_rate/gpt2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gpt2_chinese.en.json b/stats/compress_rate/gpt2_chinese.en.json new file mode 100644 index 0000000000000000000000000000000000000000..337df98cc44cab515162bd982a6a3cc2bdff826b --- /dev/null +++ b/stats/compress_rate/gpt2_chinese.en.json @@ -0,0 +1 @@ +{"vocab_size": 21128, "n_bytes": 1124813, "n_tokens": 392641, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gpt2_chinese.zh-Hans.json b/stats/compress_rate/gpt2_chinese.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..fc66e25bab026e99de996b9d09c6dc78395fe4f0 --- /dev/null +++ b/stats/compress_rate/gpt2_chinese.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 21128, "n_bytes": 2633047, "n_tokens": 899506, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gpt_35_turbo.en.json b/stats/compress_rate/gpt_35_turbo.en.json new file mode 100644 index 0000000000000000000000000000000000000000..98fec102df7513d5e8668e5a6b178e194457d849 --- /dev/null +++ b/stats/compress_rate/gpt_35_turbo.en.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 1124813, "n_tokens": 254985, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gpt_35_turbo.zh-Hans.json b/stats/compress_rate/gpt_35_turbo.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..39ba160f0c4a91e82191668aa76e59a00cdfbca8 --- /dev/null +++ b/stats/compress_rate/gpt_35_turbo.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 2633047, "n_tokens": 1084939, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gpt_4.cc100-en.cc100-zh-Hans.json b/stats/compress_rate/gpt_4.cc100-en.cc100-zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..7ddc0e187f430c5bea667b3cef1e817d213ac785 --- /dev/null +++ b/stats/compress_rate/gpt_4.cc100-en.cc100-zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 3757860, "n_tokens": 1339924, "n_chars": 2048671} \ No newline at end of file diff --git a/stats/compress_rate/gpt_4.en.json b/stats/compress_rate/gpt_4.en.json new file mode 100644 index 0000000000000000000000000000000000000000..98fec102df7513d5e8668e5a6b178e194457d849 --- /dev/null +++ b/stats/compress_rate/gpt_4.en.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 1124813, "n_tokens": 254985, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gpt_4.zh-Hans.json b/stats/compress_rate/gpt_4.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..39ba160f0c4a91e82191668aa76e59a00cdfbca8 --- /dev/null +++ b/stats/compress_rate/gpt_4.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 100277, "n_bytes": 2633047, "n_tokens": 1084939, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/gpt_nexo_20b.en.json b/stats/compress_rate/gpt_nexo_20b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..72274c03e56724c2f659e94cd8a04440bfe5e61c --- /dev/null +++ b/stats/compress_rate/gpt_nexo_20b.en.json @@ -0,0 +1 @@ +{"vocab_size": 50254, "n_bytes": 1124813, "n_tokens": 259357, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/gpt_nexo_20b.zh-Hans.json b/stats/compress_rate/gpt_nexo_20b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..75ad9c016d26cf7248d6b0f3aeee3ebf0091a25f --- /dev/null +++ b/stats/compress_rate/gpt_nexo_20b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50254, "n_bytes": 2633047, "n_tokens": 1220529, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/grok_1.en.json b/stats/compress_rate/grok_1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..748838acef602dd61af7a0f12fb50b26a213ad82 --- /dev/null +++ b/stats/compress_rate/grok_1.en.json @@ -0,0 +1 @@ +{"vocab_size": 131072, "n_bytes": 1124813, "n_tokens": 258048, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/grok_1.zh-Hans.json b/stats/compress_rate/grok_1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..22deb410b52d07736ca3b1a73809ecc5cc80ee6f --- /dev/null +++ b/stats/compress_rate/grok_1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 131072, "n_bytes": 2633047, "n_tokens": 1414508, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/internlm2_chat_7b.en.json b/stats/compress_rate/internlm2_chat_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b1aa88714c25fbc18f90294e17aa99c9822747f7 --- /dev/null +++ b/stats/compress_rate/internlm2_chat_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 92544, "n_bytes": 1124813, "n_tokens": 271583, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/internlm2_chat_7b.zh-Hans.json b/stats/compress_rate/internlm2_chat_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..65c786d41dd7a9c878052ea5903f1fc3079601e1 --- /dev/null +++ b/stats/compress_rate/internlm2_chat_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 92544, "n_bytes": 2633047, "n_tokens": 579976, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/internlm2_math_7b.en.json b/stats/compress_rate/internlm2_math_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b1aa88714c25fbc18f90294e17aa99c9822747f7 --- /dev/null +++ b/stats/compress_rate/internlm2_math_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 92544, "n_bytes": 1124813, "n_tokens": 271583, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/internlm2_math_7b.zh-Hans.json b/stats/compress_rate/internlm2_math_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..65c786d41dd7a9c878052ea5903f1fc3079601e1 --- /dev/null +++ b/stats/compress_rate/internlm2_math_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 92544, "n_bytes": 2633047, "n_tokens": 579976, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/internlm_chat_7b.en.json b/stats/compress_rate/internlm_chat_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..aca5fa514321c6536120807b5ecbbbd4559f885b --- /dev/null +++ b/stats/compress_rate/internlm_chat_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 103168, "n_bytes": 1124813, "n_tokens": 271293, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/internlm_chat_7b.zh-Hans.json b/stats/compress_rate/internlm_chat_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..e8cfd043e331261a6a4d955ed88e7374747ddae1 --- /dev/null +++ b/stats/compress_rate/internlm_chat_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 103168, "n_bytes": 2633047, "n_tokens": 579109, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/internlm_xcomposer_7b.en.json b/stats/compress_rate/internlm_xcomposer_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..aca5fa514321c6536120807b5ecbbbd4559f885b --- /dev/null +++ b/stats/compress_rate/internlm_xcomposer_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 103168, "n_bytes": 1124813, "n_tokens": 271293, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/internlm_xcomposer_7b.zh-Hans.json b/stats/compress_rate/internlm_xcomposer_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..e8cfd043e331261a6a4d955ed88e7374747ddae1 --- /dev/null +++ b/stats/compress_rate/internlm_xcomposer_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 103168, "n_bytes": 2633047, "n_tokens": 579109, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/jamba_v0_1.en.json b/stats/compress_rate/jamba_v0_1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..8768b9997dc3d345acca8dac430faa10d3ad6dff --- /dev/null +++ b/stats/compress_rate/jamba_v0_1.en.json @@ -0,0 +1 @@ +{"vocab_size": 65536, "n_bytes": 1124813, "n_tokens": 274242, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/jamba_v0_1.zh-Hans.json b/stats/compress_rate/jamba_v0_1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ff6898c4560eb20ebb3b420358aa8fdd96f66782 --- /dev/null +++ b/stats/compress_rate/jamba_v0_1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65536, "n_bytes": 2633047, "n_tokens": 1067054, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/kplug.en.json b/stats/compress_rate/kplug.en.json new file mode 100644 index 0000000000000000000000000000000000000000..dd72802062c5d39ea3ffa961a62fbd6cb187c9e7 --- /dev/null +++ b/stats/compress_rate/kplug.en.json @@ -0,0 +1 @@ +{"vocab_size": 10261, "n_bytes": 1124813, "n_tokens": 393564, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/kplug.zh-Hans.json b/stats/compress_rate/kplug.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..1716c55d43f319dfcc08b2e48eef10e4882b426d --- /dev/null +++ b/stats/compress_rate/kplug.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 10261, "n_bytes": 2633047, "n_tokens": 902451, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/llama.en.json b/stats/compress_rate/llama.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/llama.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/llama.zh-Hans.json b/stats/compress_rate/llama.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/llama.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/llama2.en.json b/stats/compress_rate/llama2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/llama2.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/llama2.zh-Hans.json b/stats/compress_rate/llama2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/llama2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/llama3.en.json b/stats/compress_rate/llama3.en.json new file mode 100644 index 0000000000000000000000000000000000000000..11ea75f8519ee91cf9cb421e499b8daf3249dba3 --- /dev/null +++ b/stats/compress_rate/llama3.en.json @@ -0,0 +1 @@ +{"vocab_size": 128000, "n_bytes": 1124813, "n_tokens": 254944, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/llama3.zh-Hans.json b/stats/compress_rate/llama3.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..ecce8e4c8d360f66099a5e434981437aaabc5e6d --- /dev/null +++ b/stats/compress_rate/llama3.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 128000, "n_bytes": 2633047, "n_tokens": 747405, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/mistral_7b.en.json b/stats/compress_rate/mistral_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..cd94afd2f30c9cf60767a4087a2542f1db3604b2 --- /dev/null +++ b/stats/compress_rate/mistral_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 285801, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/mistral_7b.zh-Hans.json b/stats/compress_rate/mistral_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..996afb808de899a48b4c7b78dca154e3b39e51f8 --- /dev/null +++ b/stats/compress_rate/mistral_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1041023, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/mixtral_8_7b.en.json b/stats/compress_rate/mixtral_8_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..cd94afd2f30c9cf60767a4087a2542f1db3604b2 --- /dev/null +++ b/stats/compress_rate/mixtral_8_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 285801, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/mixtral_8_7b.zh-Hans.json b/stats/compress_rate/mixtral_8_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..996afb808de899a48b4c7b78dca154e3b39e51f8 --- /dev/null +++ b/stats/compress_rate/mixtral_8_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1041023, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/mobilebert_uncased.en.json b/stats/compress_rate/mobilebert_uncased.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b77bdaf83aadc81b0b0ad3ee9e373955bd3555f7 --- /dev/null +++ b/stats/compress_rate/mobilebert_uncased.en.json @@ -0,0 +1 @@ +{"vocab_size": 30522, "n_bytes": 1124813, "n_tokens": 280575, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/mobilebert_uncased.zh-Hans.json b/stats/compress_rate/mobilebert_uncased.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..872b373a274914990dfc077b60c07e14779fccbf --- /dev/null +++ b/stats/compress_rate/mobilebert_uncased.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 30522, "n_bytes": 2633047, "n_tokens": 898554, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/moss.en.json b/stats/compress_rate/moss.en.json new file mode 100644 index 0000000000000000000000000000000000000000..6368bd06f914d9b725f0f45202f5d1c5f9fb5c49 --- /dev/null +++ b/stats/compress_rate/moss.en.json @@ -0,0 +1 @@ +{"vocab_size": 106029, "n_bytes": 1124813, "n_tokens": 257070, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/moss.zh-Hans.json b/stats/compress_rate/moss.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..f19a0fa735b8a4c2c318fe784cf84a6d8cc29fba --- /dev/null +++ b/stats/compress_rate/moss.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 106029, "n_bytes": 2633047, "n_tokens": 557455, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/mt5_large.en.json b/stats/compress_rate/mt5_large.en.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a79af7b29730761f24588b314d0d330a5a95f7 --- /dev/null +++ b/stats/compress_rate/mt5_large.en.json @@ -0,0 +1 @@ +{"vocab_size": 250100, "n_bytes": 1124813, "n_tokens": 317881, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/mt5_large.zh-Hans.json b/stats/compress_rate/mt5_large.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..86a0cf561d02c2e8e0065d494a398b7d8932260e --- /dev/null +++ b/stats/compress_rate/mt5_large.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 250100, "n_bytes": 2633047, "n_tokens": 631182, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/olmo_7b.en.json b/stats/compress_rate/olmo_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..e89e94468d7f6104d063658690ea668b65841d4c --- /dev/null +++ b/stats/compress_rate/olmo_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 50280, "n_bytes": 1124813, "n_tokens": 259357, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/olmo_7b.zh-Hans.json b/stats/compress_rate/olmo_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..626d069807f4d1aa6798d2ff63c85399b83800e5 --- /dev/null +++ b/stats/compress_rate/olmo_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50280, "n_bytes": 2633047, "n_tokens": 1220529, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/orion_14b_chat.en.json b/stats/compress_rate/orion_14b_chat.en.json new file mode 100644 index 0000000000000000000000000000000000000000..ae8e90f25379c3008fe7b7f85bfaf4b416ee3891 --- /dev/null +++ b/stats/compress_rate/orion_14b_chat.en.json @@ -0,0 +1 @@ +{"vocab_size": 84608, "n_bytes": 1124813, "n_tokens": 265948, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/orion_14b_chat.zh-Hans.json b/stats/compress_rate/orion_14b_chat.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..4007c5d91af1be0c02afdc24736dd136cf2e0781 --- /dev/null +++ b/stats/compress_rate/orion_14b_chat.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 84608, "n_bytes": 2633047, "n_tokens": 529926, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/phi_1.en.json b/stats/compress_rate/phi_1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..9350ee6d1c041e2fa6825baf711a3c3a3b9c6290 --- /dev/null +++ b/stats/compress_rate/phi_1.en.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 1124813, "n_tokens": 258409, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/phi_1.zh-Hans.json b/stats/compress_rate/phi_1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8fdf75f1fe9a7047bb545fbe0857625b8f7899 --- /dev/null +++ b/stats/compress_rate/phi_1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/phi_2.en.json b/stats/compress_rate/phi_2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..9350ee6d1c041e2fa6825baf711a3c3a3b9c6290 --- /dev/null +++ b/stats/compress_rate/phi_2.en.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 1124813, "n_tokens": 258409, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/phi_2.zh-Hans.json b/stats/compress_rate/phi_2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..3a8fdf75f1fe9a7047bb545fbe0857625b8f7899 --- /dev/null +++ b/stats/compress_rate/phi_2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50257, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/pko_t5_large.en.json b/stats/compress_rate/pko_t5_large.en.json new file mode 100644 index 0000000000000000000000000000000000000000..801619ff3ad8aca9d5a19076af5824125e596d25 --- /dev/null +++ b/stats/compress_rate/pko_t5_large.en.json @@ -0,0 +1 @@ +{"vocab_size": 50258, "n_bytes": 1124813, "n_tokens": 658985, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/pko_t5_large.zh-Hans.json b/stats/compress_rate/pko_t5_large.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..c46a78d6ad6214090f1c0bb712d7caf58188bafc --- /dev/null +++ b/stats/compress_rate/pko_t5_large.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50258, "n_bytes": 2633047, "n_tokens": 2533519, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/prompt_clue.en.json b/stats/compress_rate/prompt_clue.en.json new file mode 100644 index 0000000000000000000000000000000000000000..dce7d8b1e30246491ffac1df4f27231548b1012d --- /dev/null +++ b/stats/compress_rate/prompt_clue.en.json @@ -0,0 +1 @@ +{"vocab_size": 32128, "n_bytes": 1124813, "n_tokens": 536033, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/prompt_clue.zh-Hans.json b/stats/compress_rate/prompt_clue.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..29d8ad785f9ae43c138de6da4f1e5bc28fc55c28 --- /dev/null +++ b/stats/compress_rate/prompt_clue.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32128, "n_bytes": 2633047, "n_tokens": 564905, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/qwen1_5_14b_chat.en.json b/stats/compress_rate/qwen1_5_14b_chat.en.json new file mode 100644 index 0000000000000000000000000000000000000000..e36f560f9dec3ca31f85ddaf9025757a1aa34fd4 --- /dev/null +++ b/stats/compress_rate/qwen1_5_14b_chat.en.json @@ -0,0 +1 @@ +{"vocab_size": 151643, "n_bytes": 1124813, "n_tokens": 257983, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/qwen1_5_14b_chat.zh-Hans.json b/stats/compress_rate/qwen1_5_14b_chat.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..8fe2bec2f773a5d27847b218160b1c446d46013c --- /dev/null +++ b/stats/compress_rate/qwen1_5_14b_chat.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 151643, "n_bytes": 2633047, "n_tokens": 589211, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/qwen_1_8b_chat.en.json b/stats/compress_rate/qwen_1_8b_chat.en.json new file mode 100644 index 0000000000000000000000000000000000000000..d61addecb4504613433fb727584dab4080fec030 --- /dev/null +++ b/stats/compress_rate/qwen_1_8b_chat.en.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 1124813, "n_tokens": 257983, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/qwen_1_8b_chat.zh-Hans.json b/stats/compress_rate/qwen_1_8b_chat.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..196ca69c84e55e2bbd5483b1743baeace78fa3d3 --- /dev/null +++ b/stats/compress_rate/qwen_1_8b_chat.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 2633047, "n_tokens": 589211, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/qwen_72b_chat.en.json b/stats/compress_rate/qwen_72b_chat.en.json new file mode 100644 index 0000000000000000000000000000000000000000..d61addecb4504613433fb727584dab4080fec030 --- /dev/null +++ b/stats/compress_rate/qwen_72b_chat.en.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 1124813, "n_tokens": 257983, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/qwen_72b_chat.zh-Hans.json b/stats/compress_rate/qwen_72b_chat.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..196ca69c84e55e2bbd5483b1743baeace78fa3d3 --- /dev/null +++ b/stats/compress_rate/qwen_72b_chat.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 2633047, "n_tokens": 589211, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/qwen_7b_chat.en.json b/stats/compress_rate/qwen_7b_chat.en.json new file mode 100644 index 0000000000000000000000000000000000000000..d61addecb4504613433fb727584dab4080fec030 --- /dev/null +++ b/stats/compress_rate/qwen_7b_chat.en.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 1124813, "n_tokens": 257983, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/qwen_7b_chat.zh-Hans.json b/stats/compress_rate/qwen_7b_chat.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..196ca69c84e55e2bbd5483b1743baeace78fa3d3 --- /dev/null +++ b/stats/compress_rate/qwen_7b_chat.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 151851, "n_bytes": 2633047, "n_tokens": 589211, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/roberta_chinese_clue.en.json b/stats/compress_rate/roberta_chinese_clue.en.json new file mode 100644 index 0000000000000000000000000000000000000000..425cd445233c6dccacbaa3aeb41bcd1ca5e5a81a --- /dev/null +++ b/stats/compress_rate/roberta_chinese_clue.en.json @@ -0,0 +1 @@ +{"vocab_size": 8021, "n_bytes": 1124813, "n_tokens": 583058, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/roberta_chinese_clue.zh-Hans.json b/stats/compress_rate/roberta_chinese_clue.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..d6788ff1d8e4e04ea3069648357a7af3470f8af1 --- /dev/null +++ b/stats/compress_rate/roberta_chinese_clue.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 8021, "n_bytes": 2633047, "n_tokens": 907144, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/skywork_13b_base.en.json b/stats/compress_rate/skywork_13b_base.en.json new file mode 100644 index 0000000000000000000000000000000000000000..4dcc95ff3a3cdb3d1ea0208d42580837d56233f6 --- /dev/null +++ b/stats/compress_rate/skywork_13b_base.en.json @@ -0,0 +1 @@ +{"vocab_size": 65519, "n_bytes": 1124813, "n_tokens": 294617, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/skywork_13b_base.zh-Hans.json b/stats/compress_rate/skywork_13b_base.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..7bace43e8988eaf0dda1f5bcedf0f31600c876b3 --- /dev/null +++ b/stats/compress_rate/skywork_13b_base.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65519, "n_bytes": 2633047, "n_tokens": 663923, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/skywork_13b_math.en.json b/stats/compress_rate/skywork_13b_math.en.json new file mode 100644 index 0000000000000000000000000000000000000000..4dcc95ff3a3cdb3d1ea0208d42580837d56233f6 --- /dev/null +++ b/stats/compress_rate/skywork_13b_math.en.json @@ -0,0 +1 @@ +{"vocab_size": 65519, "n_bytes": 1124813, "n_tokens": 294617, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/skywork_13b_math.zh-Hans.json b/stats/compress_rate/skywork_13b_math.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..7bace43e8988eaf0dda1f5bcedf0f31600c876b3 --- /dev/null +++ b/stats/compress_rate/skywork_13b_math.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65519, "n_bytes": 2633047, "n_tokens": 663923, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/solar_10_7b.en.json b/stats/compress_rate/solar_10_7b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..cd94afd2f30c9cf60767a4087a2542f1db3604b2 --- /dev/null +++ b/stats/compress_rate/solar_10_7b.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 285801, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/solar_10_7b.zh-Hans.json b/stats/compress_rate/solar_10_7b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..996afb808de899a48b4c7b78dca154e3b39e51f8 --- /dev/null +++ b/stats/compress_rate/solar_10_7b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1041023, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/starchat_alpha.en.json b/stats/compress_rate/starchat_alpha.en.json new file mode 100644 index 0000000000000000000000000000000000000000..63d174be7a5367dcbd8edc2660055edb1654ff81 --- /dev/null +++ b/stats/compress_rate/starchat_alpha.en.json @@ -0,0 +1 @@ +{"vocab_size": 49152, "n_bytes": 1124813, "n_tokens": 288965, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/starchat_alpha.zh-Hans.json b/stats/compress_rate/starchat_alpha.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..f990d7f54bb4d3815aaac97429134f711937970c --- /dev/null +++ b/stats/compress_rate/starchat_alpha.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 49152, "n_bytes": 2633047, "n_tokens": 882018, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/switch_c_2048.en.json b/stats/compress_rate/switch_c_2048.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a7e21bb731a976080d974313aa72dc8c40837d66 --- /dev/null +++ b/stats/compress_rate/switch_c_2048.en.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 1124813, "n_tokens": 290104, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/switch_c_2048.zh-Hans.json b/stats/compress_rate/switch_c_2048.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..a0191aa4838161362ce63ad0ad07b4dc0db9f85a --- /dev/null +++ b/stats/compress_rate/switch_c_2048.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 2633047, "n_tokens": 173519, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/t5_base.en.json b/stats/compress_rate/t5_base.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a7e21bb731a976080d974313aa72dc8c40837d66 --- /dev/null +++ b/stats/compress_rate/t5_base.en.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 1124813, "n_tokens": 290104, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/t5_base.zh-Hans.json b/stats/compress_rate/t5_base.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..a0191aa4838161362ce63ad0ad07b4dc0db9f85a --- /dev/null +++ b/stats/compress_rate/t5_base.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 2633047, "n_tokens": 173519, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/t5_large.en.json b/stats/compress_rate/t5_large.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a7e21bb731a976080d974313aa72dc8c40837d66 --- /dev/null +++ b/stats/compress_rate/t5_large.en.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 1124813, "n_tokens": 290104, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/t5_large.zh-Hans.json b/stats/compress_rate/t5_large.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..a0191aa4838161362ce63ad0ad07b4dc0db9f85a --- /dev/null +++ b/stats/compress_rate/t5_large.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 2633047, "n_tokens": 173519, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/t5_small.en.json b/stats/compress_rate/t5_small.en.json new file mode 100644 index 0000000000000000000000000000000000000000..a7e21bb731a976080d974313aa72dc8c40837d66 --- /dev/null +++ b/stats/compress_rate/t5_small.en.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 1124813, "n_tokens": 290104, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/t5_small.zh-Hans.json b/stats/compress_rate/t5_small.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..a0191aa4838161362ce63ad0ad07b4dc0db9f85a --- /dev/null +++ b/stats/compress_rate/t5_small.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32100, "n_bytes": 2633047, "n_tokens": 173519, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/text_davinci_003.en.json b/stats/compress_rate/text_davinci_003.en.json new file mode 100644 index 0000000000000000000000000000000000000000..27ad8b617e80415a1c05a199574f4b4bbdeb7231 --- /dev/null +++ b/stats/compress_rate/text_davinci_003.en.json @@ -0,0 +1 @@ +{"vocab_size": 50281, "n_bytes": 1124813, "n_tokens": 258403, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/text_davinci_003.zh-Hans.json b/stats/compress_rate/text_davinci_003.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..4ae85715f9abab2fa1b044de2ea87ffbcb031199 --- /dev/null +++ b/stats/compress_rate/text_davinci_003.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 50281, "n_bytes": 2633047, "n_tokens": 1876809, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/tigerbot_13b_chat_v2.en.json b/stats/compress_rate/tigerbot_13b_chat_v2.en.json new file mode 100644 index 0000000000000000000000000000000000000000..01920c3c903a125d70be1cbe4377b1ed859e43ee --- /dev/null +++ b/stats/compress_rate/tigerbot_13b_chat_v2.en.json @@ -0,0 +1 @@ +{"vocab_size": 60512, "n_bytes": 1124813, "n_tokens": 285652, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/tigerbot_13b_chat_v2.zh-Hans.json b/stats/compress_rate/tigerbot_13b_chat_v2.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..b036e97b3a4585c74581a5aeb410a34341e125c3 --- /dev/null +++ b/stats/compress_rate/tigerbot_13b_chat_v2.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 60512, "n_bytes": 2633047, "n_tokens": 577385, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/tigerbot_70b_chat_v4_4k.en.json b/stats/compress_rate/tigerbot_70b_chat_v4_4k.en.json new file mode 100644 index 0000000000000000000000000000000000000000..f40accb66ceb9bdac1c4c168cb7cb7e7025905b4 --- /dev/null +++ b/stats/compress_rate/tigerbot_70b_chat_v4_4k.en.json @@ -0,0 +1 @@ +{"vocab_size": 65107, "n_bytes": 1124813, "n_tokens": 286946, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/tigerbot_70b_chat_v4_4k.zh-Hans.json b/stats/compress_rate/tigerbot_70b_chat_v4_4k.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..46385a7bdfd8aeeb48dc5f002e5af2300b2c14c2 --- /dev/null +++ b/stats/compress_rate/tigerbot_70b_chat_v4_4k.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 65107, "n_bytes": 2633047, "n_tokens": 577211, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/wizardcoder_15b_v1.en.json b/stats/compress_rate/wizardcoder_15b_v1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..63d174be7a5367dcbd8edc2660055edb1654ff81 --- /dev/null +++ b/stats/compress_rate/wizardcoder_15b_v1.en.json @@ -0,0 +1 @@ +{"vocab_size": 49152, "n_bytes": 1124813, "n_tokens": 288965, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/wizardcoder_15b_v1.zh-Hans.json b/stats/compress_rate/wizardcoder_15b_v1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..f990d7f54bb4d3815aaac97429134f711937970c --- /dev/null +++ b/stats/compress_rate/wizardcoder_15b_v1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 49152, "n_bytes": 2633047, "n_tokens": 882018, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/wizardcoder_python_7b_v1.en.json b/stats/compress_rate/wizardcoder_python_7b_v1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/wizardcoder_python_7b_v1.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/wizardcoder_python_7b_v1.zh-Hans.json b/stats/compress_rate/wizardcoder_python_7b_v1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/wizardcoder_python_7b_v1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/wizardlm_7b_v1.en.json b/stats/compress_rate/wizardlm_7b_v1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/wizardlm_7b_v1.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/wizardlm_7b_v1.zh-Hans.json b/stats/compress_rate/wizardlm_7b_v1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/wizardlm_7b_v1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/wizardmath_70b_v1.en.json b/stats/compress_rate/wizardmath_70b_v1.en.json new file mode 100644 index 0000000000000000000000000000000000000000..5d35bdd8f62e22e8f60aa2746b1336323526a3d1 --- /dev/null +++ b/stats/compress_rate/wizardmath_70b_v1.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 294627, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/wizardmath_70b_v1.zh-Hans.json b/stats/compress_rate/wizardmath_70b_v1.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..10b4d902001cac4d7cca7faaee0389193b8a8127 --- /dev/null +++ b/stats/compress_rate/wizardmath_70b_v1.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1330093, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/xlm_roberta.en.json b/stats/compress_rate/xlm_roberta.en.json new file mode 100644 index 0000000000000000000000000000000000000000..bf31c386f0a8e8849fc284fe1974b3359254639d --- /dev/null +++ b/stats/compress_rate/xlm_roberta.en.json @@ -0,0 +1 @@ +{"vocab_size": 250002, "n_bytes": 1124813, "n_tokens": 300026, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/xlm_roberta.zh-Hans.json b/stats/compress_rate/xlm_roberta.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..611ed9b02a8d0de2469da2028caea79e4a721ad6 --- /dev/null +++ b/stats/compress_rate/xlm_roberta.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 250002, "n_bytes": 2633047, "n_tokens": 619844, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/yi_34b.en.json b/stats/compress_rate/yi_34b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..967c2c297e424c9f0fb14519cc61b57085f47b69 --- /dev/null +++ b/stats/compress_rate/yi_34b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 270400, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/yi_34b.zh-Hans.json b/stats/compress_rate/yi_34b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..4563d5fcb9738acc8c1083867085db63fd69744d --- /dev/null +++ b/stats/compress_rate/yi_34b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 588729, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/yi_6b.en.json b/stats/compress_rate/yi_6b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..967c2c297e424c9f0fb14519cc61b57085f47b69 --- /dev/null +++ b/stats/compress_rate/yi_6b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 270400, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/yi_6b.zh-Hans.json b/stats/compress_rate/yi_6b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..4563d5fcb9738acc8c1083867085db63fd69744d --- /dev/null +++ b/stats/compress_rate/yi_6b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 588729, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/yi_vl34b.en.json b/stats/compress_rate/yi_vl34b.en.json new file mode 100644 index 0000000000000000000000000000000000000000..f4c34afe5f212a9ef2b41070f3b9b5766d0a00f3 --- /dev/null +++ b/stats/compress_rate/yi_vl34b.en.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 1124813, "n_tokens": 269738, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/yi_vl34b.zh-Hans.json b/stats/compress_rate/yi_vl34b.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..44fc216c3d33290504201b0f672ffc32e9601c58 --- /dev/null +++ b/stats/compress_rate/yi_vl34b.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 64000, "n_bytes": 2633047, "n_tokens": 596166, "n_chars": 927311} \ No newline at end of file diff --git a/stats/compress_rate/zephyr_7b_beta.en.json b/stats/compress_rate/zephyr_7b_beta.en.json new file mode 100644 index 0000000000000000000000000000000000000000..cd94afd2f30c9cf60767a4087a2542f1db3604b2 --- /dev/null +++ b/stats/compress_rate/zephyr_7b_beta.en.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 1124813, "n_tokens": 285801, "n_chars": 1121360} \ No newline at end of file diff --git a/stats/compress_rate/zephyr_7b_beta.zh-Hans.json b/stats/compress_rate/zephyr_7b_beta.zh-Hans.json new file mode 100644 index 0000000000000000000000000000000000000000..996afb808de899a48b4c7b78dca154e3b39e51f8 --- /dev/null +++ b/stats/compress_rate/zephyr_7b_beta.zh-Hans.json @@ -0,0 +1 @@ +{"vocab_size": 32000, "n_bytes": 2633047, "n_tokens": 1041023, "n_chars": 927311} \ No newline at end of file diff --git a/util.py b/util.py index 72261b0a5c1a662d87d68439542879173bdf102b..6120f63621ef6a6531800cbcb349b9d58bd78981 100644 --- a/util.py +++ b/util.py @@ -87,9 +87,8 @@ def basic_count(tokenizer_type): # return tokenizer.vocab_size, f'{stats["中文汉字数"]["中文单字"]}/{stats["中文汉字数"]["中文多字"]}' def get_compress_rate(tokenizer_type, all_corpus, unit): - corpus_name = all_corpus[0] tokenizer = load_tokener(tokenizer_type) - compress_rate_stats = tokenize_corpus(tokenizer, corpus_name) + compress_rate_stats = tokenize_corpus(tokenizer, all_corpus) compress_rate = unit_convertor(compress_rate_stats, unit) return compress_rate diff --git a/utils/compress_rate_util.py b/utils/compress_rate_util.py index 8d53753658212981c8f0b9b3f515cc0242cbf28e..701a33836cabba8b4a842347a5f91fad24258afd 100644 --- a/utils/compress_rate_util.py +++ b/utils/compress_rate_util.py @@ -5,38 +5,6 @@ 代码数据: 数字: -## 参考 -- https://github.com/baichuan-inc/Baichuan-7B 记录了不同分词器的压缩率 - - 指标:猜测是 n_tokens/n_chars (baichuan小,说明百川token少,压缩率高) - - Baichuan 0.73; llama 1.31; -- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md 记录了不同分词器的压缩率 - - 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) , - - Qwen-7B 在很多语言上压缩率都较高压缩率 (high compression rate) - - 中文: llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9; qwen7b 0.95 - - 英文: - - 指标:猜测是 n_tokens / n_tokens_xlmR -- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134 - - 有压缩率的计算方式 - - https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer - - 记录了不同分词器的压缩率 - - 指标: -- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24 - - 有压缩率的计算方式 = {n_chars} / {n_tokens} - - -- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354 - - 这个可能不是 -- https://github.com/bojone/bytepiece/blob/main/README_en.md - - "bytes/token": the average number of bytes per token -- Getting the most out of your tokenizer for pre-training and domain adaptation 👍 - - 定义: - - NSL: 两个分词器的编码长度 比例,通常以 llama为基准 - - average number of bytes per token. {n_bytes} / {n_tokens} - - higher compression rate -- -- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py - - 定义:{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0,用 % 来表示。也有>1的情况。 - - - - {Compressed Size} 指的是? - - 这里的压缩指的是 模型参数相关的。 """ import json @@ -46,6 +14,7 @@ import pandas as pd from datasets import load_dataset from utils.log_util import logger from vocab import load_tokener +from typing import List CURRENT_DIR = os.path.dirname(os.path.abspath(__file__)) @@ -75,16 +44,18 @@ def unit_convertor(stat, unit): value = n_tokens / n_chars elif unit == "g_bytes/b_tokens": value = n_bytes_in_gb / n_tokens_in_billion - elif unit == "t_bytes/t_tokens": # 重要: - value = n_bytes_in_tb / n_tokens_in_trillion elif unit == "b_tokens/g_bytes": value = n_tokens_in_billion / n_bytes_in_gb + elif unit == "t_bytes/t_tokens": # 重要: + value = n_bytes_in_tb / n_tokens_in_trillion + elif unit == "t_tokens/t_bytes": + value = n_tokens_in_trillion / n_bytes_in_tb else: raise "measure not support" return round(value, 2) -all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"] +all_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ] def pprint(stats): @@ -106,25 +77,26 @@ def pprint(stats): cache = {} -def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"): +def tokenize_corpus(tokenizer, corpuses, cache_dir="stats/compress_rate"): """ 这个要独立的cache,因为速度慢。 :param tokenizer: - :param lang: + :param corpuses: :param cache_dir: :return: """ - def _tokenize(tokenizer, dataset): + def _tokenize(tokenizer, datasets): n_tokens = 0 n_chars = 0 n_bytes = 0 - for item in dataset: - text = item["text"] - n_bytes += get_n_bytes_of_string(text) - n_chars += len(text) - encodings = tokenizer.encode(text) - n_tokens += len(encodings) + for dataset in datasets: + for item in dataset: + text = item["text"] + n_bytes += get_n_bytes_of_string(text) + n_chars += len(text) + encodings = tokenizer.encode(text) + n_tokens += len(encodings) stat = { "vocab_size": tokenizer.vocab_size, "n_bytes": n_bytes, @@ -134,8 +106,7 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"): return stat tokenizer_name = tokenizer.alias - lang = lang.replace("cc100-", "") - cache_id = f"{tokenizer_name}.{lang}" + cache_id = f"{tokenizer_name}.{'.'.join(corpuses)}" # L1: in-memory cache if cache_id in cache: logger.info(f"loading {cache_id} from in-memory cache") @@ -152,8 +123,8 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"): return stat # tokenize corpus - dataset = load_dataset("eson/cc100-samples", lang, split="train") - stat = _tokenize(tokenizer, dataset) + datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100-", ""), split="train") for corpus in corpuses] + stat = _tokenize(tokenizer, datasets) logger.info(f"saving {cache_id} to {cache_path}") json.dump(stat, open(cache_path, "w", encoding="utf-8")) logger.info(f"saving {cache_id} to in-memory cache") @@ -161,6 +132,13 @@ def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"): return stat +def test(): + tokenizer_name = "gpt_4" + tokenizer = load_tokener(tokenizer_name) + stats = {tokenizer_name: tokenize_corpus(tokenizer, ["cc100-en", "cc100-zh-Hans"])} + pprint(stats) + + def main(): from vocab import all_tokenizers if len(sys.argv) == 3: @@ -175,10 +153,11 @@ def main(): print("###" * 10 + lang) for tokenizer_name in tokenizers: tokenizer = load_tokener(tokenizer_name) - stat = tokenize_corpus(tokenizer, lang) + stat = tokenize_corpus(tokenizer, [lang]) stats[tokenizer_name] = stat pprint(stats) if __name__ == "__main__": main() + # test()