# coding=utf-8
# Copyright 2023 Shanghai Artificial Intelligence Laboratory and the
# HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for InternLM."""

from transformers import LlamaTokenizer


class InternLMTokenizer(LlamaTokenizer):
    # Cache for `no_prefix_space_tokens`; built lazily on first access.
    _no_prefix_space_tokens = None

    @property
    def no_prefix_space_tokens(self):
        if self._no_prefix_space_tokens is None:
            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
            # Tokens without the SentencePiece word-boundary marker "▁" are
            # word-internal pieces and must not receive a prefix space. The set
            # holds token strings, since `_maybe_add_prefix_space` is called
            # with token strings rather than ids.
            self._no_prefix_space_tokens = {tok for tok in vocab if not tok.startswith("▁")}
        return self._no_prefix_space_tokens

    def _maybe_add_prefix_space(self, tokens, decoded):
        # SentencePiece's decode drops the leading space of a word-initial
        # first piece; restore it here so it can be handled uniformly below.
        if tokens and tokens[0] not in self.no_prefix_space_tokens:
            return " " + decoded
        else:
            return decoded

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings) into a single string."""
        current_sub_tokens = []
        out_string = ""
        prev_is_special = False
        for token in tokens:
            # make sure that special tokens are not decoded using sentencepiece model
            if token in self.all_special_tokens:
                if not prev_is_special:
                    out_string += " "
                out_string += self.sp_model.decode(current_sub_tokens) + token
                prev_is_special = True
                current_sub_tokens = []
            else:
                current_sub_tokens.append(token)
                prev_is_special = False
        out_string += self.sp_model.decode(current_sub_tokens)
        out_string = self.clean_up_tokenization(out_string)
        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
        # Strip the single artificial leading space introduced above (by the
        # special-token branch or by `_maybe_add_prefix_space`), if present,
        # without eating the first character of a word-internal first piece.
        if out_string.startswith(" "):
            out_string = out_string[1:]
        return out_string
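

# --- Usage sketch (illustrative addition, not part of the original module) ---
# Assumes a local SentencePiece model file "./tokenizer.model" (any InternLM
# checkpoint ships one) and that the `sentencepiece` package is installed.
if __name__ == "__main__":
    tokenizer = InternLMTokenizer(vocab_file="./tokenizer.model")
    # Round-trip a sentence: tokenize into SentencePiece pieces, then join
    # them back with the prefix-space handling implemented above.
    tokens = tokenizer.tokenize("Hello, InternLM!")
    print(tokens)
    print(tokenizer.convert_tokens_to_string(tokens))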