eson committed
Commit 293bad6 (1 parent: aa0c637)

add more tokenizers

vocab/__init__.py CHANGED
@@ -96,8 +96,8 @@ all_tokenizers = [
     # "alpaca_7b",
     "baichuan",
     "baichuan2",
-    "qwen",
     "internlm_chat_7b",
+    "falcon_7b",
     "falcon_180b",
     # "goat",
 
@@ -109,9 +109,18 @@ all_tokenizers = [
     "skywork_13b_base",
     "skywork_13b_math",
     "mistral",
+    "t5_small",
+    "t5_base",
+    "t5_large",
+    "flan_t5_base",
+    "fastchat_t5_3b",
+    "pko_t5_large",
+
 
 ]
 
+all_tokenizers = sorted(all_tokenizers)
+
 class TokenizerType(Enum):
     """
     - https://huggingface.co/docs/transformers/tokenizer_summary
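The new `all_tokenizers = sorted(all_tokenizers)` line keeps the registry alphabetical regardless of where entries are inserted above. Each name matches a vocab/<name> package that exposes a module-level `tokenizer` (as the files added below do), so a minimal consumption sketch, assuming importlib-based loading (the helper name here is hypothetical, not part of this commit), could look like:

import importlib

def load_tokenizer(name):
    # Assumed convention: "falcon_7b" -> vocab.falcon_7b, which defines
    # `tokenizer` at module level (see the added files below).
    module = importlib.import_module(f"vocab.{name}")
    return module.tokenizer

tokenizer = load_tokenizer("falcon_7b")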
vocab/falcon_7b/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-7b", trust_remote_code=True)
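Each added module downloads its tokenizer from the Hugging Face Hub at import time. A hedged usage sketch (the sample string is illustrative; encode and decode are standard transformers tokenizer methods):

from vocab.falcon_7b import tokenizer  # triggers the Hub download on first import

ids = tokenizer.encode("add more tokenizers")
print(ids)
print(tokenizer.decode(ids))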
vocab/fastchat_t5_3b/__init__.py ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
vocab/flan_t5_base/__init__.py ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base", trust_remote_code=True)
vocab/pko_t5_large/__init__.py ADDED
@@ -0,0 +1,3 @@
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("paust/pko-t5-large", trust_remote_code=True)
vocab/t5/__init__.py DELETED
@@ -1,7 +0,0 @@
-"""
-
-
-SentencePiece
-"""
-
-
vocab/t5_base/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-base
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-base", trust_remote_code=True)
vocab/t5_large/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-large
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-large", trust_remote_code=True)
vocab/t5_small/__init__.py ADDED
@@ -0,0 +1,8 @@
+"""
+https://huggingface.co/t5-small
+"""
+
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("t5-small", trust_remote_code=True)
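All six new T5-family modules follow the same pattern, so a quick comparison of what this commit adds might look like the sketch below. The repo ids come from the files above; vocab_size is a standard attribute on transformers tokenizers, and the loop needs network access to the Hub:

from transformers import AutoTokenizer

repos = [
    "t5-small", "t5-base", "t5-large",
    "google/flan-t5-base", "lmsys/fastchat-t5-3b-v1.0", "paust/pko-t5-large",
]
for repo in repos:
    tok = AutoTokenizer.from_pretrained(repo, trust_remote_code=True)
    print(f"{repo}: vocab_size={tok.vocab_size}")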