Spaces:

eson
/

tokenizer-arena

Running

App Files Files Community

tokenizer-arena / utils /compress_rate_util.py

eson

update

7d2062e 2 months ago

raw history blame

No virus

6.49 kB

	"""

	中文数据：clue superclue
	英文数据：glue cnn_dailymail gigaword
	代码数据:
	数字：

	## 参考
	- https://github.com/baichuan-inc/Baichuan-7B 记录了不同分词器的压缩率
	- 指标：猜测是 n_tokens/n_chars (baichuan小，说明百川token少，压缩率高)
	- Baichuan 0.73; llama 1.31;
	- https://github.com/QwenLM/Qwen/blob/main/tech_memo.md 记录了不同分词器的压缩率
	- 以 XLM-RoBERTa为基准 (Unsupervised Cross-lingual Representation Learning at Scale ) ，
	- Qwen-7B 在很多语言上的压缩率都较高 (high compression rate)
	- 中文： llama7b 2.2; baichuan7b 1.1; chatglm2-6b 0.9; qwen7b 0.95
	- 英文：
	- 指标：猜测是 n_tokens / n_tokens_xlmR
	- https://github.com/hpcaitech/ColossalAI/blob/4b8312c08e8d05a5f41453d63c8671aab601ed1c/applications/Colossal-LLaMA-2/prepare_pretrain_dataset.py#L134
	- 有压缩率的计算方式
	- https://github.com/hpcaitech/ColossalAI/blob/main/applications/Colossal-LLaMA-2/README.md#tokenizer
	- 记录了不同分词器的压缩率
	- 指标：
	- https://github.com/AUGMXNT/shisa/blob/6a823d77a71acbd18ab8f68a6b02f4b87ec9dddd/eval/tokenizer-efficiency-ja.py#L24
	- 有压缩率的计算方式 = {n_chars} / {n_tokens}
	-
	- https://github.com/huggingface/transformers/blob/cec773345aeffce3c04e8891303a3f748de7141e/src/transformers/models/whisper/generation_whisper.py#L354
	- 这个可能不是
	- https://github.com/bojone/bytepiece/blob/main/README_en.md
	- "bytes/token": the average number of bytes per token
	- Getting the most out of your tokenizer for pre-training and domain adaptation 👍
	- 定义：
	- NSL: 两个分词器的编码长度比例，通常以 llama为基准
	- average number of bytes per token. {n_bytes} / {n_tokens}
	- higher compression rate --
	- *** https://github.com/microsoft/LLMLingua/blob/main/llmlingua/prompt_compressor.py
	- 定义：{Compressed Size}/{Raw Size}, 来自论文 Language modeling is compression. 数值<=1.0，用 % 来表示。也有>1的情况。
	-
	- {Compressed Size} 指的是？
	- 这里的压缩指的是模型参数相关的。
	"""

	import json
	import os
	import sys
	import pandas as pd
	from datasets import load_dataset
	from utils.log_util import logger
	from vocab import load_tokener

	# Absolute path of the directory that contains this file.
	CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))


	def get_n_bytes_of_string(string_text):
	    """Return the number of bytes ``string_text`` occupies when encoded as UTF-8."""
	    return len(string_text.encode("utf-8"))


	def unit_convertor(stat, unit):
	    """Convert raw corpus counts into the compression metric named by ``unit``.

	    :param stat: mapping holding the counts under the keys "n_tokens",
	        "n_chars" and "n_bytes".
	    :param unit: one of "n_tokens/n_bytes", "n_chars/n_tokens",
	        "n_tokens/n_chars", "g_bytes/b_tokens", "t_bytes/t_tokens",
	        "b_tokens/g_bytes".
	    :return: the requested ratio rounded to two decimal places.
	    :raises ValueError: if ``unit`` is not one of the supported names.
	    """
	    n_tokens = stat["n_tokens"]
	    n_chars = stat["n_chars"]
	    n_bytes = stat["n_bytes"]

	    # Token counts are scaled by powers of 1000, byte counts by powers of 1024.
	    n_tokens_in_billion = n_tokens / (1000 * 1000 * 1000)
	    n_tokens_in_trillion = n_tokens / (1000 * 1000 * 1000 * 1000)
	    n_bytes_in_mb = n_bytes / (1024 * 1024)
	    n_bytes_in_gb = n_bytes_in_mb / 1024
	    n_bytes_in_tb = n_bytes_in_gb / 1024

	    if unit == "n_tokens/n_bytes":
	        value = n_tokens / n_bytes
	    elif unit == "n_chars/n_tokens":  # important: average number of characters per token
	        value = n_chars / n_tokens
	    elif unit == "n_tokens/n_chars":  # how many tokens one character (e.g. a Chinese character) needs
	        value = n_tokens / n_chars
	    elif unit == "g_bytes/b_tokens":
	        value = n_bytes_in_gb / n_tokens_in_billion
	    elif unit == "t_bytes/t_tokens":  # important
	        value = n_bytes_in_tb / n_tokens_in_trillion
	    elif unit == "b_tokens/g_bytes":
	        value = n_tokens_in_billion / n_bytes_in_gb
	    else:
	        # Raising a bare string is itself a TypeError in Python 3; raise a
	        # real exception that names the offending unit instead.
	        raise ValueError(f"measure not support: {unit}")
	    return round(value, 2)


	# Units that pprint reports as table columns for every tokenizer.
	all_units = ["g_bytes/b_tokens", "t_bytes/t_tokens", "b_tokens/g_bytes"]


	def pprint(stats):
	    """Log a markdown table with one row per tokenizer and one column per unit.

	    :param stats: mapping of tokenizer name to its stat dict; each stat dict
	        must contain "vocab_size" plus the counts read by ``unit_convertor``.
	    """
	    rows = []
	    for name, tokenizer_stat in stats.items():
	        row = {"tokenizer": name, "vocab_size": tokenizer_stat["vocab_size"]}
	        for unit in all_units:
	            # A unit name that already appears as a key of the stat dict is
	            # reported as an error and left out of the row.
	            if unit in tokenizer_stat:
	                logger.error(f"unit {unit} not support")
	                continue
	            row[unit] = unit_convertor(tokenizer_stat, unit)
	        rows.append(row)

	    frame = pd.DataFrame(rows)
	    logger.info(f"\n{frame.to_markdown(index=False)}")


	# In-memory cache used by tokenize_corpus, keyed by "<tokenizer alias>.<lang>".
	cache = {}


	def tokenize_corpus(tokenizer, lang, cache_dir="stats/compress_rate"):
	    """Count bytes, characters and tokens of a corpus sample for one tokenizer.

	    Tokenizing is slow, so the result is cached on two levels: the
	    module-level ``cache`` dict and a JSON file inside ``cache_dir``.

	    :param tokenizer: tokenizer object; its ``alias``, ``vocab_size`` and
	        ``encode(text)`` members are used.
	    :param lang: config name passed to ``load_dataset`` for
	        "eson/cc100-samples"; any "cc100-" substring is removed first.
	    :param cache_dir: file-cache directory, resolved relative to the parent
	        of this file's directory.
	    :return: dict with the keys "vocab_size", "n_bytes", "n_tokens", "n_chars".
	    """

	    def _tokenize(tokenizer, dataset):
	        """Accumulate byte, character and token counts over ``item["text"]``."""
	        n_tokens = 0
	        n_chars = 0
	        n_bytes = 0
	        for item in dataset:
	            text = item["text"]
	            n_bytes += get_n_bytes_of_string(text)
	            n_chars += len(text)
	            encodings = tokenizer.encode(text)
	            n_tokens += len(encodings)
	        return {
	            "vocab_size": tokenizer.vocab_size,
	            "n_bytes": n_bytes,
	            "n_tokens": n_tokens,
	            "n_chars": n_chars,
	        }

	    tokenizer_name = tokenizer.alias
	    lang = lang.replace("cc100-", "")
	    cache_id = f"{tokenizer_name}.{lang}"

	    # L1: in-memory cache
	    if cache_id in cache:
	        logger.info(f"loading {cache_id} from in-memory cache")
	        return cache[cache_id]

	    # L2: file cache
	    cache_dir = os.path.join(CURRENT_DIR, f"../{cache_dir}")
	    os.makedirs(cache_dir, exist_ok=True)
	    cache_path = os.path.join(cache_dir, f"{cache_id}.json")
	    if os.path.exists(cache_path):
	        logger.info(f"loading {cache_id} from file cache")
	        # Context manager so the handle is closed instead of leaked.
	        with open(cache_path, "r", encoding="utf-8") as f_in:
	            stat = json.load(f_in)
	        cache[cache_id] = stat
	        return stat

	    # Cache miss on both levels: tokenize the corpus.
	    dataset = load_dataset("eson/cc100-samples", lang, split="train")
	    stat = _tokenize(tokenizer, dataset)
	    logger.info(f"saving {cache_id} to {cache_path}")
	    # Closing the file guarantees the JSON is flushed to disk before returning.
	    with open(cache_path, "w", encoding="utf-8") as f_out:
	        json.dump(stat, f_out)
	    logger.info(f"saving {cache_id} to in-memory cache")
	    cache[cache_id] = stat
	    return stat


	def main():
	"""Compute corpus statistics for tokenizers and log them as a table.

	With exactly two command-line arguments (tokenizer name, corpus language)
	only that pair is processed; otherwise every entry of
	``vocab.all_tokenizers`` is run over the "en" and "zh-Hans" corpora.
	"""
	from vocab import all_tokenizers
	# sys.argv[0] is the script name, so a length of 3 means two user arguments.
	if len(sys.argv) == 3:
	tokenizers = [sys.argv[1]]
	corpuses = [sys.argv[2]]
	else:
	tokenizers = all_tokenizers
	corpuses = ["en", "zh-Hans"]

	# NOTE: stats is created once and keyed by tokenizer name only, so a later
	# language's result replaces an earlier one for the same tokenizer.
	stats = {}
	for lang in corpuses:
	print("###" * 10 + lang)
	for tokenizer_name in tokenizers:
	tokenizer = load_tokener(tokenizer_name)
	stat = tokenize_corpus(tokenizer, lang)
	stats[tokenizer_name] = stat
	pprint(stats)


	# Run main() only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	main()