eson committed
Commit 2d550af
Parent: c7ed4a2
README.md CHANGED
@@ -15,7 +15,10 @@ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-
 ## ss
 
 
-## ss
+## TODO
+
+
+'MossTokenizer' object has no attribute 'encoder'
 
 
 
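The new TODO records an `AttributeError`: code that reaches for a GPT-2-style `.encoder` dict fails because the custom `MossTokenizer` class does not define that attribute. A minimal workaround sketch, assuming the MOSS remote-code tokenizer implements the standard `get_vocab()` interface from `PreTrainedTokenizer` (illustrative only, not part of this commit):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True)

# get_vocab() is the interface-level accessor; it avoids depending on the
# GPT-2-specific `.encoder` attribute that MossTokenizer lacks.
vocab = tokenizer.get_vocab()   # dict: token string -> token id
print(len(vocab))
```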
vocab/moss/__init__.py CHANGED
@@ -2,10 +2,11 @@
 import os
 from transformers import AutoTokenizer, BloomTokenizerFast
 
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
+# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "moss-moon-003-sft")
+# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("fnlp/moss-moon-003-sft", trust_remote_code=True)
 
 # vocab_size = len(tokenizer.get_vocab())
 # vocab_size = tokenizer.vocab_size
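The edit swaps the hard-coded local checkout for the Hub ID `fnlp/moss-moon-003-sft`, so the tokenizer files are downloaded and cached on first use instead of requiring a vendored copy. A quick round-trip smoke test, sketched against the module this diff defines:

```python
from vocab.moss import tokenizer  # module-level tokenizer defined above

ids = tokenizer.encode("你好,MOSS")  # encode returns a list of token ids
print(ids)
print(tokenizer.decode(ids))          # should roughly round-trip the input
```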
vocab/qwen_7b_chat/__init__.py CHANGED
@@ -9,12 +9,13 @@ https://github.com/QwenLM/Qwen/blob/main/tokenization_note_zh.md
 
 import os
 from transformers import AutoTokenizer
-CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
-TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Qwen-7B-Chat")
 
 # Note: the tokenizer's default behavior has changed; protection against special-token attacks is now disabled by default.
-# tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL-Chat", trust_remote_code=True)
-tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)
+
+# CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
+# TOKENIZER_DIR = os.path.join(CURRENT_DIR, "Qwen-7B-Chat")
+# tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_DIR, trust_remote_code=True)
 
 tokenizer.comments = "Based on the gpt4 vocabulary: removed 100 multi-digit tokens, added 10,000 Chinese word tokens, and improved special_token segmentation"
 
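This hunk loads `Qwen/Qwen-7B-Chat` straight from the Hub and keeps the local-directory variant as a commented fallback. The note about special-token attack protection matters for untrusted input: with protection off, a string such as `<|endoftext|>` inside user text may be parsed as the actual special token. A hedged probe of the installed revision's behavior (the tokenization note linked in the hunk header also documents `allowed_special`/`disallowed_special` knobs for Qwen's tiktoken-based tokenizer, but whether they are forwarded through `encode()` depends on the revision):

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-7B-Chat", trust_remote_code=True)

text = "user input containing <|endoftext|> verbatim"
try:
    ids = tokenizer.encode(text)
    # Few ids -> the surface form collapsed into the special token (the
    # "attack" the comment warns about); many ids -> treated as plain text.
    print(len(ids), tokenizer.decode(ids))
except ValueError as err:
    # Some revisions reject special-token surface forms in plain text.
    print("rejected:", err)
```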
vocab/skywork_13b_base/__init__.py ADDED
@@ -0,0 +1,6 @@
+
+from transformers import AutoTokenizer
+
+
+
+tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-base", trust_remote_code=True)
vocab/skywork_13b_math/__init__.py ADDED
@@ -0,0 +1,4 @@
+
+from transformers import AutoTokenizer
+
+tokenizer = AutoTokenizer.from_pretrained("Skywork/Skywork-13B-Math", trust_remote_code=True)
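Both new Skywork packages follow the convention used across `vocab/`: a module-level `tokenizer` pulled from the Hub with `trust_remote_code=True`. A sketch of exercising the entries this commit touches side by side, assuming this repo's package layout (note that `len(tokenizer.get_vocab())` also counts added tokens, unlike `tokenizer.vocab_size`, which is why both variants appear commented in the moss module):

```python
import importlib

# Module paths mirror the directories changed or added in this commit.
for name in ["vocab.moss", "vocab.qwen_7b_chat",
             "vocab.skywork_13b_base", "vocab.skywork_13b_math"]:
    tok = importlib.import_module(name).tokenizer
    print(f"{name}: {len(tok.get_vocab())} tokens")
```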