# vocab/build_zh_vocab.py
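"""Aggregate Chinese tokens across several tokenizer vocabularies.

For each model listed below, reads `<model_name>_vocab.zh.jsonl`, collects the
tokens labeled as Chinese, and writes `vocab.freq.zh.txt`: one line per word
with the number of model vocabularies that contain it, most shared first.
"""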
import json
from collections import defaultdict

# For each Chinese word, count how many of the model vocabularies below contain it.
all_zh_words = defaultdict(int)

for model_name in [
"gpt2",
"gpt2_chinese",
"chinese_llama_lora_7b",
"bert_chinese",
"moss",
"bloom",
"bloomz_6b4_zh",
"gpt_nexo_20b",
"gpt_neox_chinese_v1",
"glm_chinese",
"chatglm"
]:
    zh_word_set = set()
    # Each line of <model_name>_vocab.zh.jsonl is a JSON object with at least
    # "token" and "type" fields; keep only tokens labeled as Chinese
    # ("中文单字" = single Chinese character, "中文多字" = multi-character Chinese word).
    with open(model_name + "_vocab.zh.jsonl", "r", encoding="utf-8") as f_in:
        for line in f_in:
            item = json.loads(line)
            token = item["token"]
            if item["type"] in ["中文单字", "中文多字"]:
                zh_word_set.add(token.strip())
    # A set is used above so each word is counted at most once per model.
    for word in zh_word_set:
        all_zh_words[word] += 1

# Sort by the number of model vocabularies containing each word, most shared first.
sorted_keywords = sorted(all_zh_words.items(), key=lambda kv: kv[1], reverse=True)

# Write one "<word>\t<count>" line per word.
with open("vocab.freq.zh.txt", "w", encoding="utf-8") as f_out:
    for word, count in sorted_keywords:
        f_out.write(f"{word}\t{count}\n")
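
# A minimal sanity check (a sketch, not part of the build itself): reload the
# output and print the ten words shared by the most model vocabularies.
#
# with open("vocab.freq.zh.txt", "r", encoding="utf-8") as f:
#     for line in list(f)[:10]:
#         word, count = line.rstrip("\n").split("\t")
#         print(word, count)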