DeepSeek-Coder-V2-Lite-Base / tokenization_deepseek_fast.py
msr2000's picture
Upload folder using huggingface_hub
5a3cf15 verified
raw
history blame
1.37 kB
from typing import List, Optional, Union
from transformers.models.llama import LlamaTokenizerFast
class DeepseekTokenizerFast(LlamaTokenizerFast):
    """`LlamaTokenizerFast` variant whose id-to-token lookups never yield `None`.

    Whenever the backend tokenizer returns `None` for an id, both
    `convert_ids_to_tokens` and `_convert_id_to_token` substitute the empty
    string `""`, so callers always receive `str` values.
    """

    def convert_ids_to_tokens(
        self, ids: Union[int, List[int]], skip_special_tokens: bool = False
    ) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding. Only applied when `ids` is a sequence; a
                single `int` is always converted.

        Returns:
            `str` or `List[str]`: The decoded token(s). Ids for which the backend tokenizer returns `None` are
            rendered as the empty string `""`.
        """
        if isinstance(ids, int):
            return self._convert_id_to_token(ids)

        # `all_special_ids` does not depend on the loop variable, so read it
        # once (and only when filtering is requested) instead of on every
        # iteration; a set also gives constant-time membership tests.
        special_ids = set(self.all_special_ids) if skip_special_tokens else frozenset()

        tokens = []
        for index in ids:
            # Normalise each element with int() before the membership test
            # and the backend lookup.
            index = int(index)
            if index in special_ids:
                continue
            # Reuse the single-id helper so the None -> "" fallback lives in
            # exactly one place.
            tokens.append(self._convert_id_to_token(index))
        return tokens

    def _convert_id_to_token(self, index: int) -> str:
        """Return the token for `index`, or `""` if the backend tokenizer returns `None`.

        Args:
            index (`int`):
                The token id to look up; it is passed through `int()` before the lookup.

        Returns:
            `str`: The token string, never `None`.
        """
        token = self._tokenizer.id_to_token(int(index))
        return token if token is not None else ""