internlm
/

internlm-7b

Text Generation

feature-extraction

Model card Files Files and versions Community

internlm-7b / tokenization_internlm.py

Matt

Re-add custom tokenizer

f2847d8 10 months ago

2.31 kB

	# coding=utf-8
	# Copyright 2023 Shanghai Artificial Intelligence Laboratory and the
	# HuggingFace Inc. team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Tokenization classes for IntermLM."""
	from transformers.tokenization_utils import LlamaTokenizer


	class InternLMTokenizer(LlamaTokenizer):
	    """LlamaTokenizer subclass with a custom token-to-string conversion.

	    ``convert_tokens_to_string`` is overridden so that special tokens are
	    copied to the output verbatim while every other run of tokens is decoded
	    through ``self.sp_model``.
	    """

	    @property
	    def no_prefix_space_tokens(self):
	        """Return the set of vocabulary indices whose token does not start with "▁".

	        The set is built on first access from ``convert_ids_to_tokens`` over
	        ``range(self.vocab_size)`` and cached on the instance as
	        ``_no_prefix_space_tokens``.
	        """
	        # Nothing in this class creates ``_no_prefix_space_tokens``, so fall
	        # back to None when the attribute is missing instead of letting a
	        # plain attribute read fail on first use.
	        cached = getattr(self, "_no_prefix_space_tokens", None)
	        if cached is None:
	            vocab = self.convert_ids_to_tokens(list(range(self.vocab_size)))
	            cached = {i for i, tok in enumerate(vocab) if not tok.startswith("▁")}
	            self._no_prefix_space_tokens = cached
	        return cached

	    def _maybe_add_prefix_space(self, tokens, decoded):
	        """Prepend one space to ``decoded`` unless the first token is exempt.

	        Args:
	            tokens: The token sequence that ``decoded`` was produced from.
	            decoded: The decoded text.

	        Returns:
	            ``" " + decoded`` when ``tokens`` is non-empty and ``tokens[0]`` is
	            not a member of ``no_prefix_space_tokens``; otherwise ``decoded``
	            unchanged.
	        """
	        # NOTE(review): ``no_prefix_space_tokens`` holds integer vocabulary
	        # indices, while ``convert_tokens_to_string`` passes the same tokens
	        # it compares against ``all_special_tokens`` (presumably strings).
	        # If so, this membership test never matches and the space is always
	        # added. ``convert_tokens_to_string`` drops exactly one leading
	        # character afterwards, so confirm the intended behaviour before
	        # "fixing" the mismatch on its own.
	        if tokens and tokens[0] not in self.no_prefix_space_tokens:
	            return " " + decoded
	        return decoded

	    def convert_tokens_to_string(self, tokens):
	        """Convert a sequence of tokens (strings) into a single string.

	        Special tokens are emitted as-is; each run of non-special tokens
	        between them is decoded with ``self.sp_model.decode``. A single space
	        is inserted before a special token that does not directly follow
	        another special token.

	        Args:
	            tokens: Sequence of token strings.

	        Returns:
	            The assembled text after ``clean_up_tokenization`` and
	            ``_maybe_add_prefix_space``, with its first character removed.
	        """
	        # Read the special-token list once; it does not change while looping.
	        special_tokens = set(self.all_special_tokens)

	        pieces = []
	        current_sub_tokens = []
	        prev_is_special = False
	        for token in tokens:
	            # make sure that special tokens are not decoded using sentencepiece model
	            if token in special_tokens:
	                if not prev_is_special:
	                    pieces.append(" ")
	                # Flush the pending ordinary tokens, then the special token itself.
	                pieces.append(self.sp_model.decode(current_sub_tokens))
	                pieces.append(token)
	                prev_is_special = True
	                current_sub_tokens = []
	            else:
	                current_sub_tokens.append(token)
	                prev_is_special = False
	        # Decode whatever ordinary tokens remain after the last special token.
	        pieces.append(self.sp_model.decode(current_sub_tokens))

	        out_string = self.clean_up_tokenization("".join(pieces))
	        out_string = self._maybe_add_prefix_space(tokens=tokens, decoded=out_string)
	        # Drop the first character of the result (see the note in
	        # ``_maybe_add_prefix_space`` about the prepended space).
	        return out_string[1:]