"""

中文数据:clue superclue
英文数据:glue cnn_dailymail gigaword
代码数据:
数字:

"""

import json
import os
import sys
import pandas as pd
from datasets import load_dataset
from utils.log_util import logger
from vocab import load_tokener
from typing import List

CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))

common_units = ["g_bytes/b_tokens", "b_tokens/g_bytes", "t_bytes/t_tokens", "t_tokens/t_bytes", "n_chars/n_tokens", ]
common_corpuses = ["cc100-en", "cc100-zh-Hans", "cc100-es"]
# code: https://huggingface.co/datasets/codeparrot/github-code-clean  python java c sql html
# math:

def get_n_bytes_of_string(string_text):
    n_bytes = len(string_text.encode("utf-8"))
    return n_bytes
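
# Note on the metric: in UTF-8 an ASCII character is 1 byte while a CJK character such
# as "中" is 3 bytes, so for the same number of characters Chinese text carries far more
# bytes than English text, e.g. get_n_bytes_of_string("hello") == 5 but
# get_n_bytes_of_string("中文") == 6.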


def unit_convertor(stat, unit):
    n_tokens = stat["n_tokens"]
    n_chars = stat["n_chars"]
    n_bytes = stat["n_bytes"]

    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
    n_bytes_in_mb = n_bytes / (1024 * 1024)
    n_bytes_in_gb = n_bytes_in_mb / 1024
    n_bytes_in_tb = n_bytes_in_gb / 1024
    # n_chars_in_billion = n_chars / (1000 * 1000 * 1000)

    if unit == "n_tokens/n_bytes":
        value = n_tokens / n_bytes
    elif unit == "n_chars/n_tokens":  # 重要:平均一个token包含多少个字符。
        value = n_chars / n_tokens
    elif unit == "n_tokens/n_chars":  # 一个中文汉字需要几个token?
        value = n_tokens / n_chars
    elif unit == "g_bytes/b_tokens":
        value = n_bytes_in_gb / n_tokens_in_billion
    elif unit == "b_tokens/g_bytes":
        value = n_tokens_in_billion / n_bytes_in_gb
    elif unit == "t_bytes/t_tokens":  # 重要:
        value = n_bytes_in_tb / n_tokens_in_trillion
    elif unit == "t_tokens/t_bytes":
        value = n_tokens_in_trillion / n_bytes_in_tb
    else:
        raise ValueError(f"unit {unit} is not supported")
    return round(value, 2)
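

# A minimal, self-contained sanity check for unit_convertor. The numbers are illustrative
# only, and the helper name is ad hoc (it is not called anywhere in this module). Note that
# the byte units above are 1024-based (MB/GB/TB) while the token units are 1000-based
# (billion/trillion).
def _demo_unit_convertor():
    demo_stat = {"n_tokens": 250 * 1000 * 1000, "n_chars": 500 * 1000 * 1000, "n_bytes": 1024 ** 3}
    assert unit_convertor(demo_stat, "b_tokens/g_bytes") == 0.25  # 0.25 billion tokens per GB of text
    assert unit_convertor(demo_stat, "g_bytes/b_tokens") == 4.0   # 4 GB of text per billion tokens
    assert unit_convertor(demo_stat, "n_chars/n_tokens") == 2.0   # 2 characters per token on average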



def pprint(stats):
    table = []
    for tokenizer_name, stat in stats.items():
        columns = {"tokenizer": tokenizer_name, "vocab_size": stat["vocab_size"]}
        for unit in common_units:
            if unit not in stat:
                columns[unit] = unit_convertor(stat, unit)
            else:
                logger.error(f"unit {unit} already exists in stat; skipping recomputation")

        table.append(columns)
    df = pd.DataFrame(table)
    # print(df.to_markdown(index=False, tablefmt='fancy_grid'))
    logger.info(f"\n{df.to_markdown(index=False)}")


cache = {}


def tokenize_corpus(tokenizer, corpuses, cache_dir="stats/compress_rate"):
    """
    This needs its own cache because tokenization is slow.
    :param tokenizer:
    :param corpuses:
    :param cache_dir:
    :return:
    """

    def _tokenize(tokenizer, datasets):
        n_tokens = 0
        n_chars = 0
        n_bytes = 0
        for dataset in datasets:
            for item in dataset:
                text = item["text"]
                n_bytes += get_n_bytes_of_string(text)
                n_chars += len(text)
                encodings = tokenizer.encode(text)
                n_tokens += len(encodings)
        stat = {
            "vocab_size": tokenizer.vocab_size,
            "n_bytes": n_bytes,
            "n_tokens": n_tokens,
            "n_chars": n_chars,
        }
        return stat

    tokenizer_name = tokenizer.alias
    cache_id = f"{tokenizer_name}.{'.'.join(corpuses)}"
    # L1: in-memory cache
    if cache_id in cache:
        logger.info(f"loading {cache_id} from in-memory cache")
        return cache[cache_id]

    # L2: file cache
    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
    if os.path.exists(cache_path):
        logger.info(f"loading {cache_id} from file cache")
        stat = json.load(open(cache_path, "r", encoding="utf-8"))
        cache[cache_id] = stat
        return stat

    # tokenize corpus
    datasets = [load_dataset("eson/cc100-samples", corpus.replace("cc100-", ""), split="train") for corpus in corpuses]
    stat = _tokenize(tokenizer, datasets)
    logger.info(f"saving {cache_id} to {cache_path}")
    json.dump(stat, open(cache_path, "w", encoding="utf-8"))
    logger.info(f"saving {cache_id} to in-memory cache")
    cache[cache_id] = stat
    return stat
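
# Note: with the default cache_dir, per-(tokenizer, corpus) stats land in a JSON file like
#   <this file's parent dir>/../stats/compress_rate/<tokenizer alias>.<corpus1>.<corpus2>.json
# e.g. a tokenizer whose alias is "gpt_4" run on cc100-en and cc100-zh-Hans would be cached as
# gpt_4.cc100-en.cc100-zh-Hans.json; delete that file to force re-tokenization.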


def test():
    tokenizer_name = "gpt_4"
    tokenizer = load_tokener(tokenizer_name)
    stats = {tokenizer_name: tokenize_corpus(tokenizer, ["cc100-en", "cc100-zh-Hans"])}
    pprint(stats)


def main():
    from vocab import all_tokenizers
    if len(sys.argv) == 3:
        tokenizers = [sys.argv[1]]
        corpuses = [sys.argv[2]]
    else:
        tokenizers = all_tokenizers
        corpuses = common_corpuses

    stats = {}
    for lang in corpuses:
        print("###" * 10 + lang)
        for tokenizer_name in tokenizers:
            tokenizer = load_tokener(tokenizer_name)
            stat = tokenize_corpus(tokenizer, [lang])
            stats[tokenizer_name] = stat
        pprint(stats)


if __name__ == "__main__":
    main()
    # test()