eson committed on
Commit
c766a08
1 Parent(s): adcfb97

fix fastchat_t5_3b

Browse files
tokenizer/tiktoken_patch.py CHANGED
@@ -7,11 +7,19 @@ def decode(self, tokens, errors="replace", skip_special_tokens=False):
7
  默认的decode,可能会报错,详见 decode_test.py
8
  skip_special_tokens 是为了兼容 hf_tokenizer
9
 
10
- errors=replace, ignore, strict 有什么区别?
 
 
 
 
 
 
 
11
  """
12
  try:
13
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
14
- except:
 
15
  decode_str = "null"
16
  return decode_str
17
 
 
7
  默认的decode,可能会报错,详见 decode_test.py
8
  skip_special_tokens 是为了兼容 hf_tokenizer
9
 
10
+ errors:
11
+ decoded bytes are not guaranteed to be valid UTF-8.
12
+ "strict" Raise UnicodeError
13
+ "ignore" Ignore and continue
14
+ "replace" Replace with replacement character
15
+ "backslashreplace" Replace with backslashed escape sequence
16
+ "xmlcharrefreplace" Replace with XML character reference
17
+ "namereplace" Replace with \N{...} (named unicode character)
18
  """
19
  try:
20
  decode_str = self._core_bpe.decode_bytes(tokens).decode("utf-8", errors=errors)
21
+ except Exception as e:
22
+ logger.error(f"{e} -> return 'null'")
23
  decode_str = "null"
24
  return decode_str
25
 
vocab/fastchat_t5_3b/__init__.py CHANGED
@@ -1,3 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from transformers import AutoTokenizer
2
 
3
- tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True)
 
1
+ """
2
+
3
+ ## 默认 use_fast=True 报错
4
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 190, in <module>
5
+ print(iter_vocab(tokenizer, name=name))
6
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 144, in iter_vocab
7
+ dist_length, mean_length = get_coding_length(tokenizer, zh_tokens, filter=lambda k: not is_zh_char(k))
8
+ File "E:\workspace\common\vocab-zoo\tokenizer-arena\utils\zh_util.py", line 34, in get_coding_length
9
+ tokens = tokenizer.encode(word)
10
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 2600, in encode
11
+ encoded_inputs = self.encode_plus(
12
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_base.py", line 3008, in encode_plus
13
+ return self._encode_plus(
14
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 576, in _encode_plus
15
+ batched_output = self._batch_encode_plus(
16
+ File "C:\Users\xusong\Miniconda3\envs\py3.10-torch1.13-hf.latest\lib\site-packages\transformers\tokenization_utils_fast.py", line 504, in _batch_encode_plus
17
+ encodings = self._tokenizer.encode_batch(
18
+ pyo3_runtime.PanicException: AddedVocabulary bad split
19
+ """
20
+
21
  from transformers import AutoTokenizer
22
 
23
+ tokenizer = AutoTokenizer.from_pretrained("lmsys/fastchat-t5-3b-v1.0", trust_remote_code=True, use_fast=False)