"""Tokenization classes for IntermLM.""" |
|
from transformers.tokenization_utils import LlamaTokenizer |


class InternLMTokenizer(LlamaTokenizer):
    """Llama-style SentencePiece tokenizer for InternLM."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Cache for `no_prefix_space_tokens`, built lazily on first access.
        self._no_prefix_space_tokens = None

    @property
    def no_prefix_space_tokens(self):
        # Token strings that do not begin with SentencePiece's word-boundary
        # marker "▁": text decoded from such a leading token is a mid-word
        # fragment and should not receive a leading space.
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            self._no_prefix_space_tokens = {tok for tok in vocab if not tok.startswith("▁")}
        return self._no_prefix_space_tokens
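
    # Illustration (hypothetical vocabulary, not taken from a real model):
    # given the pieces ["▁Hello", "Hel", "lo"], only "Hel" and "lo" would
    # land in `no_prefix_space_tokens`; "▁Hello" carries the word-boundary
    # marker and is excluded.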

    def _maybe_add_prefix_space(self, tokens, decoded):
        # SentencePiece's decoder drops the leading space encoded by a
        # word-initial "▁" piece; restore it when the first token carries
        # the marker so word boundaries survive decoding.
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        return decoded
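
    # Example (hypothetical pieces): for ["▁world"], sp_model.decode yields
    # "world" and _maybe_add_prefix_space restores " world"; a fragment such
    # as ["lo"] passes through unchanged.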

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # Special tokens are not in the SentencePiece vocabulary, so
            # decode the accumulated pieces first and splice the special
            # token in verbatim.
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # A sentinel space may now sit at position 0 (added before a leading
        # special token, or restored by _maybe_add_prefix_space); strip it so
        # decoding does not introduce a spurious leading space.
        return out_string[1:] if out_string.startswith(" ") else out_string
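

# ---------------------------------------------------------------------------
# Minimal usage sketch (illustrative, not part of the module's public API).
# It assumes a Llama-style SentencePiece file at "./tokenizer.model"; that
# path is a placeholder, not a file shipped with this module.
if __name__ == "__main__":
    tokenizer = InternLMTokenizer(vocab_file="./tokenizer.model")
    pieces = tokenizer.tokenize("Hello world")  # e.g. ["▁Hello", "▁world"]
    print(pieces)
    # Round-trip back to text through the overridden convert_tokens_to_string.
    print(tokenizer.convert_tokens_to_string(pieces))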