Upload tokenization_deepseek_fast.py with huggingface_hub
tokenization_deepseek_fast.py
ADDED
@@ -0,0 +1,38 @@
from typing import List, Optional, Union

from transformers.models.llama import LlamaTokenizerFast


class DeepseekTokenizerFast(LlamaTokenizerFast):

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the
        vocabulary and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)
        tokens = []
        for index in ids:
            index = int(index)
            if skip_special_tokens and index in self.all_special_ids:
                continue
            # Look the id up directly in the backing `tokenizers` vocabulary.
            token = self._tokenizer.id_to_token(index)
            tokens.append(token if token is not None else "")
        return tokens

    def _convert_id_to_token(self, index: int) -> Optional[str]:
        # Return "" (rather than None) for ids missing from the vocabulary.
        token = self._tokenizer.id_to_token(int(index))
        return token if token is not None else ""
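For context, a minimal usage sketch (not part of the commit): the repo id below is a placeholder, and the sketch assumes the hosting repository's tokenizer_config.json registers DeepseekTokenizerFast under "auto_map" so that AutoTokenizer can load this Hub-hosted class when trust_remote_code=True is passed.

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    "your-org/your-deepseek-model",  # placeholder repo id
    trust_remote_code=True,          # required to load the custom class defined above
)

ids = tokenizer.encode("Hello, world!")
print(tokenizer.convert_ids_to_tokens(ids))
print(tokenizer.convert_ids_to_tokens(ids, skip_special_tokens=True))

As written, the practical effect of the override is that ids with no entry in the backing vocabulary decode to "" instead of None, so downstream consumers such as "".join(tokens) do not fail on unknown ids.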