eson committed on
Commit
c766a08
1 Parent(s): adcfb97

fix fastchat_t5_3b

Browse files
tokenizer/tiktoken_patch.py CHANGED
@@ -7,11 +7,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
7
  默认的decode,可能会报错,详见 decode_test.py
8
  skip_special_tokens 是为了兼容 hf_tokenizer
9
 
10
- errors=replace, ignore, strict 有什么区别?
 
 
 
 
 
 
 
11
  """
12
  try:
13
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
14
- except:
 
15
  decode_str = "null"
16
  return decode_str
17
 
 
7
  默认的decode,可能会报错,详见 decode_test.py
8
  skip_special_tokens 是为了兼容 hf_tokenizer
9
 
10
+ errors:
11
+ decoded bytes are not guaranteed to be valid UTF-8.
12
+ "strict" Raise UnicodeError
13
+ "ignore" Ignore and continue
14
+ "replace" Replace with replacement character
15
+ "backslashreplace" Replace with backslashed escape sequence
16
+ "xmlcharrefreplace" Replace with XML character reference
17
+ "namereplace" Replace with \N{...} (named unicode character)
18
  """
19
  try:
20
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
21
+ except Exception as e:
22
+ logger.error(f"{e} -> return 'null'")
23
  decode_str = "null"
24
  return decode_str
25
 
vocab/fastchat_t5_3b/__init__.py CHANGED
@@ -1,3 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import AutoTokenizer
2
 
3
- tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
 
1
+ """
2
+
3
+ ## 默认 use_fast=True 报错
4
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 190, in <module>
5
+ print(iter_vocab(tokenizer, name=name))
6
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 144, in iter_vocab
7
+ dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
8
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 34, in get_coding_length
9
+ tokens = tokenizer.encode(word)
10
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 2600, in encode
11
+ encoded_inputs = self.encode_plus(
12
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 3008, in encode_plus
13
+ return self._encode_plus(
14
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 576, in _encode_plus
15
+ batched_output = self._batch_encode_plus(
16
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
17
+ encodings = self._tokenizer.encode_batch(
18
+ pyo3_runtime.PanicException: AddedVocabulary bad split
19
+ """
20
+
21
  from transformers import AutoTokenizer
22
 
23
+ tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True, use_fast=False)