# coding=utf-8
# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Testing suite for the PyTorch CpmBee tokenizer.
"""
import os
import unittest

from transformers.models.cpmbee.tokenization_cpmbee import VOCAB_FILES_NAMES, CpmBeeTokenizer
from transformers.tokenization_utils import AddedToken

from ...test_tokenization_common import TokenizerTesterMixin


class CPMBeeTokenizationTest(TokenizerTesterMixin, unittest.TestCase):
    tokenizer_class = CpmBeeTokenizer
    test_rust_tokenizer = False

    def setUp(self):
        super().setUp()
        # A small vocab fixture: the CpmBee special tokens, followed by characters
        # spelling "我是CPMBee" ("I am CPMBee").
        vocab_tokens = [
            "<d>",
            "</d>",
            "<s>",
            "</s>",
            "</_>",
            "<unk>",
            "<pad>",
            "</n>",
            "<mask>",
            "我",
            "是",
            "C",
            "P",
            "M",
            "B",
            "e",
            "e",
        ]
        self.vocab_file = os.path.join(self.tmpdirname, VOCAB_FILES_NAMES["vocab_file"])

        vocab_tokens = list(set(vocab_tokens))
        with open(self.vocab_file, "w", encoding="utf-8") as vocab_writer:
            vocab_writer.write("".join([x + "\n" for x in vocab_tokens]))

    # override test_add_tokens_tokenizer because <...> is a special token in CpmBeeTokenizer.
    def test_add_tokens_tokenizer(self):
        tokenizers = self.get_tokenizers(do_lower_case=False)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                vocab_size = tokenizer.vocab_size
                all_size = len(tokenizer)

                self.assertNotEqual(vocab_size, 0)

                # We usually have added tokens from the start in tests because our vocab fixtures are
                # smaller than the original vocabs - let's not assert this
                # self.assertEqual(vocab_size, all_size)

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd"]
                added_toks = tokenizer.add_tokens(new_toks)
                vocab_size_2 = tokenizer.vocab_size
                all_size_2 = len(tokenizer)

                self.assertNotEqual(vocab_size_2, 0)
                self.assertEqual(vocab_size, vocab_size_2)
                self.assertEqual(added_toks, len(new_toks))
                self.assertEqual(all_size_2, all_size + len(new_toks))

                tokens = tokenizer.encode("aaaaa bbbbbb low cccccccccdddddddd l", add_special_tokens=False)

                self.assertGreaterEqual(len(tokens), 4)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)

                new_toks_2 = {"eos_token": ">>>>|||<||<<|<<", "pad_token": "<<<<<|||;;;||;"}
                added_toks_2 = tokenizer.add_special_tokens(new_toks_2)
                vocab_size_3 = tokenizer.vocab_size
                all_size_3 = len(tokenizer)

                self.assertNotEqual(vocab_size_3, 0)
                self.assertEqual(vocab_size, vocab_size_3)
                self.assertEqual(added_toks_2, len(new_toks_2))
                self.assertEqual(all_size_3, all_size_2 + len(new_toks_2))

                tokens = tokenizer.encode(
                    ">>>>|||<||<<|<< aaaaabbbbbb low cccccccccdddddddd <<<<<|||;;;||; l", add_special_tokens=False
                )

                self.assertGreaterEqual(len(tokens), 6)
                self.assertGreater(tokens[0], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[0], tokens[1])
                self.assertGreater(tokens[-2], tokenizer.vocab_size - 1)
                self.assertGreater(tokens[-2], tokens[-3])
                self.assertEqual(tokens[0], tokenizer.eos_token_id)
                self.assertEqual(tokens[-2], tokenizer.pad_token_id)

    def test_added_tokens_do_lower_case(self):
        tokenizers = self.get_tokenizers(do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if not hasattr(tokenizer, "do_lower_case") or not tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                # Rust tokenizers don't lowercase added tokens at the time of calling `tokenizer.add_tokens`,
                # while Python tokenizers do, so new_toks 0 and 2 would be treated as the same, as would
                # new_toks 1 and 3.
                self.assertIn(added, [2, 4])

                self.assertListEqual(toks_after_adding, toks_after_adding2)
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
                )

                # Check that none of the special tokens are lowercased
                sequence_with_special_tokens = "A " + " yEs ".join(tokenizer.all_special_tokens) + " B"
                # Convert the tokenized list to str as some special tokens are tokenized like normal tokens
                # which have a prefix space, e.g. the mask token of Albert, and cannot match the original
                # special tokens exactly.
                tokenized_sequence = "".join(tokenizer.tokenize(sequence_with_special_tokens))

                for special_token in tokenizer.all_special_tokens:
                    self.assertTrue(special_token in tokenized_sequence)

        tokenizers = self.get_tokenizers(do_lower_case=True)
        for tokenizer in tokenizers:
            with self.subTest(f"{tokenizer.__class__.__name__}"):
                if hasattr(tokenizer, "do_lower_case") and tokenizer.do_lower_case:
                    continue

                special_token = tokenizer.all_special_tokens[0]

                text = special_token + " aaaaa bbbbbb low cccccccccdddddddd l " + special_token
                text2 = special_token + " AAAAA BBBBBB low CCCCCCCCCDDDDDDDD l " + special_token

                toks_before_adding = tokenizer.tokenize(text)  # toks before adding new_toks

                new_toks = ["aaaaa bbbbbb", "cccccccccdddddddd", "AAAAA BBBBBB", "CCCCCCCCCDDDDDDDD"]
                added = tokenizer.add_tokens([AddedToken(tok, lstrip=True, rstrip=True) for tok in new_toks])
                self.assertIn(added, [2, 4])

                toks_after_adding = tokenizer.tokenize(text)
                toks_after_adding2 = tokenizer.tokenize(text2)

                self.assertEqual(len(toks_after_adding), len(toks_after_adding2))  # Length should still be the same
                self.assertNotEqual(
                    toks_after_adding[1], toks_after_adding2[1]
                )  # But at least the first non-special tokens should differ
                self.assertTrue(
                    len(toks_before_adding) > len(toks_after_adding),  # toks_before_adding should be longer
                )

    def test_pre_tokenization(self):
        tokenizer = CpmBeeTokenizer.from_pretrained("openbmb/cpm-bee-10b")
        # CpmBee consumes structured (dict) inputs; the "<ans>" field is left empty
        # for the model to fill in.
        texts = {"input": "你好,", "<ans>": ""}
        tokens = tokenizer(texts)
        tokens = tokens["input_ids"][0]

        # Expected ids produced by the pretrained tokenizer for the input above.
        input_tokens = [6, 8, 7, 6, 65678, 7, 6, 10273, 246, 7, 6, 9, 7]
        self.assertListEqual(tokens, input_tokens)

        normalized_text = "input你好,"
        reconstructed_text = tokenizer.decode(tokens)
        self.assertEqual(reconstructed_text, normalized_text)