from unittest import TestCase

from tokenizers.pre_tokenizers import Whitespace

from recognizers.utils import DifferenceSample


class DifferenceSampleTestCase(TestCase):
    """Tests for ``DifferenceSample`` built from whitespace-pre-tokenized text."""

    def setUp(self) -> None:
        """Pre-tokenize two sentences and wrap them in a ``DifferenceSample``.

        Every token on both sides receives the same placeholder label (0.1);
        the label values themselves are not asserted on by the tests below.
        """
        self.text_a = "Chinese shares close higher Friday."
        # The second sentence deliberately is not plain ASCII (note the "ô").
        self.text_b = "Les actions chinoises clôturent en baisse mercredi."
        self.tokenizer = Whitespace()
        # ``pre_tokenize_str`` yields ``(token, (start, end))`` pairs.
        self.encoding_a = self.tokenizer.pre_tokenize_str(self.text_a)
        self.encoding_b = self.tokenizer.pre_tokenize_str(self.text_b)
        self.result = DifferenceSample(
            tokens_a=tuple(token for token, _offsets in self.encoding_a),
            tokens_b=tuple(token for token, _offsets in self.encoding_b),
            labels_a=(0.1,) * len(self.encoding_a),
            labels_b=(0.1,) * len(self.encoding_b),
        )

    def test_add_whitespace(self) -> None:
        """After ``add_whitespace``, joining the tokens reproduces each text."""
        self.result.add_whitespace(self.encoding_a, self.encoding_b)
        # The pre-tokenizer drops the spaces between words, so a plain
        # concatenation only equals the source sentence if ``add_whitespace``
        # has put that whitespace back into the tokens.
        self.assertEqual("".join(self.result.tokens_a), self.text_a)
        self.assertEqual("".join(self.result.tokens_b), self.text_b)