eson committed
Commit: c75633b
1 Parent(s): 6bdf6c6

add more tokenizer

requirements.txt CHANGED
@@ -3,4 +3,5 @@ sentencepiece
 tiktoken
 icetk
 torch
-zhon
+zhon
+nltk
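(Side note: the new dependencies plausibly back the corpus statistics touched below — zhon for Chinese character/punctuation ranges and nltk for English word tokenization. A minimal sketch of the kind of nltk call this enables; the usage here is an assumption, not taken from the repo:

import nltk

nltk.download("punkt", quiet=True)  # data needed by word_tokenize
words = nltk.word_tokenize("Tokenizers compress text into subword units.")
print(len(words), words)
)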
utils/compress_rate_util.py CHANGED
@@ -1,9 +1,7 @@
 """
 
 
-Chinese data
-English data:
-
-
+Chinese data: clue superclue
+English data: glue cnn_dailymail gigaword
 
 """
utils/zh_util.py CHANGED
@@ -72,7 +72,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
     if from_cache and name in cache:
         return cache[name]
 
-    f_out = open(name + "_vocab.zh.jsonl", "w", encoding="utf-8")
+    f_out = open(name + "_vocab.jsonl", "w", encoding="utf-8")
     zh_token_count = {"total": 0, "中文单字": 0, "中文多字": 0}
 
     # zh_token_count = {"total": 0, "包含1个中文单字": 0, "中文多字": 0}
@@ -91,7 +91,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
         if isinstance(token, bytes):
             token = token.decode("utf-8", errors="ignore")
 
-        digit_count = get_digit_count(token)
+        digit_count = get_digit_count(decode_str)
         zh_count = get_zh_count(decode_str)
         space_count = get_space_count(decode_str)
 
@@ -99,7 +99,7 @@ def iter_vocab(tokenizer, name="", from_cache=True):
             {"id": token_id,
              "token": token,
              "token_decode": decode_str,
-             "token_len": len(token),
+             "token_len": len(decode_str),
              "zh_count": zh_count,
              "space_count": space_count,
              "digit_count": digit_count,
vocab/__init__.py CHANGED
@@ -130,6 +130,10 @@ all_tokenizers = [
     "phi_1",
     "phi_2",
     "solar_10_7b",
+    "mobilebert_uncased",
+    "mobilenet_v2",
+    "switch_c_2048",
+    "byt5_small",
     "wizardcoder_python_7b_v1",
     "wizardlm_7b_v1",
     "wizardmath_70b_v1",
vocab/byt5_small/__init__.py ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained('google/byt5-small')
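(ByT5 tokenizes raw UTF-8 bytes, so every byte becomes one id plus the trailing </s>; a quick check using the tokenizer defined above:

ids = tokenizer("中文 abc")["input_ids"]
print(len("中文 abc".encode("utf-8")), len(ids))  # 10 bytes -> 10 byte ids + </s>
)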
vocab/llama/demo.py CHANGED
@@ -30,4 +30,20 @@ tokens = [ 1, 29961, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492,
 text = tokenizer.decode(tokens)
 print(text)
 for token_id in tokens:
-    print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))
+    print(json.dumps({"token_id": token_id, "decode_str": tokenizer.decode([token_id]), "token": tokenizer.convert_ids_to_tokens([token_id][0])}, ensure_ascii=False))
+
+
+
+def byte_token():
+    """
+    Why is "\n" the piece "<0x0A>"?
+
+    8 11 145
+    :return:
+    """
+    for token_id in [8, 11, 145]:
+        token_str = tokenizer.decode([token_id])
+        print(token_str)
+
+
+byte_token()
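(On the docstring's question: the Llama SentencePiece model includes 256 byte-fallback pieces <0x00>..<0xFF>, and "\n" — byte 0x0A — is emitted as one of these byte pieces rather than a regular subword. A small variant of byte_token() that also prints the raw piece names; the specific ids 8, 11, 145 come from the demo and their mapping depends on the model:

for token_id in [8, 11, 145]:
    piece = tokenizer.convert_ids_to_tokens(token_id)   # e.g. a '<0x..>' byte-fallback piece
    print(token_id, piece, repr(tokenizer.decode([token_id])))
)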
vocab/mobilebert_uncased/__init__.py ADDED
@@ -0,0 +1,2 @@
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("google/mobilebert-uncased", trust_remote_code=True)
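(MobileBERT reuses the BERT-style uncased WordPiece vocabulary, so out-of-vocabulary words split into "##"-prefixed pieces; a quick look using the tokenizer above:

print(tokenizer.tokenize("tokenization"))  # e.g. ['token', '##ization']
)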
vocab/mobilenet_v2/__init__.py ADDED
@@ -0,0 +1,2 @@
+from transformers import AutoTokenizer
+tokenizer = AutoTokenizer.from_pretrained("google/mobilenet_v2_1.0_224", trust_remote_code=True)
vocab/switch_c_2048/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/switch-c-2048")
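(Switch-C is a T5-family mixture-of-experts model, so this should resolve to the standard T5 SentencePiece tokenizer of roughly 32k pieces; a quick sanity check:

print(len(tokenizer))                      # vocab size, ~32k for T5-style models
print(tokenizer.tokenize("Hello world"))   # e.g. ['▁Hello', '▁world']
)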