"""

中文数据:clue superclue
英文数据:glue cnn_dailymail gigaword
代码数据:
数字:

## 参考
- https://github.com/baichuan-inc/Baichuan-7B  记录了不同分词器的压缩率
  - 指标:猜测是 n_tokens/n_chars  (baichuan小,说明百川token少,压缩率高)
  - Baichuan 0.73; llama 1.31;
- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md  记录了不同分词器的压缩率
  - 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) ,
  - Qwen-7B 在很多语言上压缩率都较高压缩率 (high compression rate)
  - 中文: llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9;  qwen7b 0.95
  - 英文:
  - 指标:猜测是 n_tokens / n_tokens_xlmR
- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134
  - 有压缩率的计算方式
  - https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer
  - 记录了不同分词器的压缩率
  - 指标:
- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24
  - 有压缩率的计算方式 = {n_chars} / {n_tokens}
  -
- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354
  - 这个可能不是
- https://github.com/bojone/bytepiece/blob/main/README_en.md
  - "bytes/token": the average number of bytes per token
- Getting the most out of your tokenizer for pre-training and domain adaptation 👍
  - 定义:
    - NSL: 两个分词器的编码长度 比例,通常以 llama为基准
    - average number of bytes per token. {n_bytes} / {n_tokens}
  - higher compression rate  --
- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py
  - 定义:{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0,用 % 来表示。也有>1的情况。
    -
    - {Compressed Size} 指的是?
  - 这里的压缩指的是 模型参数相关的。
"""

import json
import os
import sys
import pandas as pd
from datasets import load_dataset
from utils.log_util import logger
from vocab import load_tokener

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))


def get_n_bytes_of_string(string_text):
    n_bytes = len(string_text.encode("utf-8"))
    return n_bytes


def unit_convertor(stat, unit):
    n_tokens = stat["n_tokens"]
    n_chars = stat["n_chars"]
    n_bytes = stat["n_bytes"]

    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
    n_bytes_in_mb = n_bytes / (1024 * 1024)
    n_bytes_in_gb = n_bytes_in_mb / 1024
    n_bytes_in_tb = n_bytes_in_gb / 1024
    # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)

    if unit == "n_tokens/n_bytes":
        value = n_tokens / n_bytes
    elif unit == "n_chars/n_tokens":  # 重要:平均一个token包含多少个字符。
        value = n_chars / n_tokens
    elif unit == "n_tokens/n_chars":  # 一个中文汉字需要几个token?
        value = n_tokens / n_chars
    elif unit == "g_bytes/b_tokens":
        value = n_bytes_in_gb / n_tokens_in_billion
    elif unit == "t_bytes/t_tokens":  # 重要:
        value = n_bytes_in_tb / n_tokens_in_trillion
    elif unit == "b_tokens/g_bytes":
        value = n_tokens_in_billion / n_bytes_in_gb
    else:
        raise ValueError(f"unit {unit} is not supported")
    return round(value, 2)
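# Usage sketch (hypothetical stat values, for illustration only):
#   stat = {"n_tokens": 250_000_000, "n_chars": 900_000_000, "n_bytes": 1_073_741_824}
#   unit_convertor(stat, "b_tokens/g_bytes")  # -> 0.25, i.e. 0.25 billion tokens per GB of text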


all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]
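# Hedged sketch (not part of the original pipeline): NSL, the "normalized sequence
# length" from "Getting the most out of your tokenizer for pre-training and domain
# adaptation" (see the module docstring): the ratio of the token counts produced by
# two tokenizers on the same corpus, conventionally with llama as the baseline.
# Both arguments are assumed to be stat dicts as produced by tokenize_corpus below.
def normalized_sequence_length(stat, baseline_stat):
    """NSL = n_tokens(tokenizer) / n_tokens(baseline); lower means better compression."""
    return round(stat["n_tokens"] / baseline_stat["n_tokens"], 2)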


def pprint(stats):
    table = []
    for tokenizer_name, stat in stats.items():
        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
        for unit in all_units:
            if unit not in stat:
                columns[unit] = unit_convertor(stat, unit)
            else:
                logger.error(f"unit {unit} not supported")

        table.append(columns)
    df = pd.DataFrame(table)
    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
    logger.info(f"\n{df.to_markdown(index=False)}")


cache = {}


def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
    """
    这个要独立的cache,因为速度慢。
    :param tokenizer:
    :param lang:
    :param cache_dir:
    :return:
    """

    def _tokenize(tokenizer, dataset):
        n_tokens = 0
        n_chars = 0
        n_bytes = 0
        for item in dataset:
            text = item["text"]
            n_bytes += get_n_bytes_of_string(text)
            n_chars += len(text)
            encodings = tokenizer.encode(text)
            n_tokens += len(encodings)
        stat = {
            "vocab_size": tokenizer.vocab_size,
            "n_bytes": n_bytes,
            "n_tokens": n_tokens,
            "n_chars": n_chars,
        }
        return stat

    tokenizer_name = tokenizer.alias
    lang = lang.replace("cc100-", "")
    cache_id = f"{tokenizer_name}.{lang}"
    # L1: in-memory cache
    if cache_id in cache:
        logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    # L2: file cache
    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
    if os.path.exists(cache_path):
        logger.info(f"loading {cache_id} from file cache")
        with open(cache_path, "r", encoding="utf-8") as f_in:
            stat = json.load(f_in)
        cache[cache_id] = stat
        return stat

    # tokenize corpus
    dataset = load_dataset("eson/cc100-samples", lang, split="train")
    stat = _tokenize(tokenizer, dataset)
    logger.info(f"saving {cache_id} to {cache_path}")
    with open(cache_path, "w", encoding="utf-8") as f_out:
        json.dump(stat, f_out)
    logger.info(f"saving {cache_id} to in-memory cache")
    cache[cache_id] = stat
    return stat
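# Usage sketch ("gpt2" is a hypothetical tokenizer alias; see vocab.load_tokener for
# the aliases actually registered in this repo):
#   tokenizer = load_tokener("gpt2")
#   stat = tokenize_corpus(tokenizer, "cc100-en")  # cached under stats/compress_rate/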


def main():
    from vocab import all_tokenizers
    if len(sys.argv) == 3:
        tokenizers = [sys.argv[1]]
        corpuses = [sys.argv[2]]
    else:
        tokenizers = all_tokenizers
        corpuses = ["en", "zh-Hans"]

    stats = {}
    for lang in corpuses:
        print("###" * 10 + lang)
        for tokenizer_name in tokenizers:
            tokenizer = load_tokener(tokenizer_name)
            stat = tokenize_corpus(tokenizer, lang)
            stats[tokenizer_name] = stat
        pprint(stats)


if __name__ == "__main__":
    main()