#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import unittest

from transformers import BertTokenizer, T5Tokenizer, AutoTokenizer

from pyserini.analysis import Analyzer, get_lucene_analyzer


class TestTokenization(unittest.TestCase):
def setUp(self):
pass
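# BERT tokenizers use WordPiece: words outside the vocabulary are split into
# subword pieces, and '##' marks a piece that continues the preceding one
# (e.g., 'GPU' becomes 'gp', '##u').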
def test_bert_base_uncased_demo(self):
# https://huggingface.co/transformers/tokenizer_summary.html
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('I have a new GPU!')
self.assertEqual(['i', 'have', 'a', 'new', 'gp', '##u', '!'], tokens)
def test_bert_base_uncased_en_book_examples(self):
# These are examples used in the ptr4tr book
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling')
self.assertEqual(['walking', 'talking', 'bal', '##king', 'biking', 'hiking', 'rolling', 'scrolling'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['bio', '##sta', '##tist', '##ics'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['ad', '##vers', '##aria', '##l'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
tokens = tokenizer.tokenize('walking talking balking biking hiking')
self.assertEqual(['walking', 'talking', 'b', '##alk', '##ing', 'bi', '##king', 'hiking'], tokens)
tokens = tokenizer.tokenize('rolling scrolling')
self.assertEqual(['rolling', 'scroll', '##ing'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['bio', '##sta', '##tist', '##ics'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['ad', '##vers', '##aria', '##l'], tokens)
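# XLM-R uses a SentencePiece vocabulary: '▁' marks the start of a new word,
# in contrast to the '##' continuation prefix used by WordPiece above.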
def test_xlm_roberta_base_en_book_examples(self):
# These are examples used in the ptr4tr book
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling')
self.assertEqual(['▁walking', '▁talking', '▁bal', 'king', '▁bi', 'king', '▁hi', 'king', '▁roll', 'ing', '▁scroll', 'ing'], tokens)
tokens = tokenizer.tokenize('rolling scrolling')
self.assertEqual(['▁roll', 'ing', '▁scroll', 'ing'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['▁bio', 'stat', 'istic', 's'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['▁adversari', 'al'], tokens)
def test_bert_base_multilingual_en_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('walking talking balking biking hiking rolling scrolling')
self.assertEqual(['walking', 'talking', 'bal', '##king', 'bi', '##king', 'hi', '##king', 'rolling', 'sc', '##roll', '##ing'], tokens)
tokens = tokenizer.tokenize('rolling scrolling')
self.assertEqual(['rolling', 'sc', '##roll', '##ing'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['bio', '##stat', '##istic', '##s'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['ad', '##versari', '##al'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('walking talking balking biking hiking')
self.assertEqual(['walking', 'talking', 'bal', '##king', 'bi', '##king', 'hi', '##king'], tokens)
tokens = tokenizer.tokenize('rolling scrolling')
self.assertEqual(['rolling', 's', '##cro', '##lling'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['bio', '##stati', '##stic', '##s'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['ad', '##versari', '##al'], tokens)
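# The default Lucene English analyzer lowercases, removes stopwords, and stems,
# so the expected tokens below are stems such as 'walk' and 'biostatist'.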
def test_lucene_analyzer_en_book_examples(self):
analyzer = Analyzer(get_lucene_analyzer())
tokens = analyzer.analyze('walking talking balking biking hiking rolling scrolling')
self.assertEqual(['walk', 'talk', 'balk', 'bike', 'hike', 'roll', 'scroll'], tokens)
tokens = analyzer.analyze('rolling scrolling')
self.assertEqual(['roll', 'scroll'], tokens)
tokens = analyzer.analyze('biostatistics')
self.assertEqual(['biostatist'], tokens)
tokens = analyzer.analyze('adversarial')
self.assertEqual(['adversari'], tokens)
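# The uncased multilingual model strips accents ('vélo' -> 'velo',
# 'défilement' -> 'def' + pieces), so its subword splits below differ from
# those of the cased model, which preserves the accents.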
def test_bert_base_multilingual_fr_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('marche parler vélo randonnée rouler défilement')
self.assertEqual(['marche', 'parler', 'velo', 'rand', '##onne', '##e', 'ro', '##uler', 'def', '##ile', '##ment'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('défilement roulant')
self.assertEqual(['def', '##ile', '##ment', 'ro', '##ulant'], tokens)
# biostatistics
tokens = tokenizer.tokenize('biostatistique')
self.assertEqual(['bio', '##stat', '##istique'], tokens)
# adversarial
tokens = tokenizer.tokenize('antagoniste')
self.assertEqual(['ant', '##ago', '##niste'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('marche parler vélo randonnée rouler défilement')
self.assertEqual(['marche', 'parler', 'v', '##él', '##o', 'rand', '##onnée', 'ro', '##uler', 'dé', '##file', '##ment'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('défilement roulant')
self.assertEqual(['dé', '##file', '##ment', 'ro', '##ulant'], tokens)
# biostatistics
tokens = tokenizer.tokenize('biostatistique')
self.assertEqual(['bio', '##stati', '##stique'], tokens)
# adversarial
tokens = tokenizer.tokenize('antagoniste')
self.assertEqual(['ant', '##agon', '##iste'], tokens)
def test_lucene_analyzer_fr_book_examples(self):
analyzer = Analyzer(get_lucene_analyzer(language='fr'))
tokens = analyzer.analyze('marche parler vélo randonnée rouler défilement')
self.assertEqual(['march', 'parl', 'vélo', 'randon', 'roul', 'defil'], tokens)
tokens = analyzer.analyze('défilement roulant')
self.assertEqual(['defil', 'roulant'], tokens)
tokens = analyzer.analyze('biostatistique')
self.assertEqual(['biostatist'], tokens)
tokens = analyzer.analyze('antagoniste')
self.assertEqual(['antagonist'], tokens)
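# mBERT splits CJK text into individual characters, so both the uncased and
# cased models below produce one token per Chinese character.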
def test_bert_base_multilingual_zh_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('走路说话骑自行车远足滚动滚动')
self.assertEqual(['走', '路', '说', '话', '骑', '自', '行', '车', '远', '足', '滚', '动', '滚', '动'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('滚动滚动')
self.assertEqual(['滚', '动', '滚', '动'], tokens)
# biostatistics
tokens = tokenizer.tokenize('生物统计学')
self.assertEqual(['生', '物', '统', '计', '学'], tokens)
# adversarial
tokens = tokenizer.tokenize('对抗的')
self.assertEqual(['对', '抗', '的'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('走路说话骑自行车远足滚动滚动')
self.assertEqual(['走', '路', '说', '话', '骑', '自', '行', '车', '远', '足', '滚', '动', '滚', '动'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('滚动滚动')
self.assertEqual(['滚', '动', '滚', '动'], tokens)
# biostatistics
tokens = tokenizer.tokenize('生物统计学')
self.assertEqual(['生', '物', '统', '计', '学'], tokens)
# adversarial
tokens = tokenizer.tokenize('对抗的')
self.assertEqual(['对', '抗', '的'], tokens)
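# As the expected tokens below show, Lucene's Chinese analyzer emits
# overlapping character bigrams rather than word-level segments.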
def test_lucene_analyzer_zh_book_examples(self):
analyzer = Analyzer(get_lucene_analyzer(language='zh'))
tokens = analyzer.analyze('走路说话骑自行车远足滚动滚动')
self.assertEqual(['走路', '路说', '说话', '话骑', '骑自', '自行', '行车', '车远', '远足', '足滚', '滚动', '动滚', '滚动'], tokens)
tokens = analyzer.analyze('滚动滚动')
self.assertEqual(['滚动', '动滚', '滚动'], tokens)
tokens = analyzer.analyze('生物统计学')
self.assertEqual(['生物', '物统', '统计', '计学'], tokens)
tokens = analyzer.analyze('对抗的')
self.assertEqual(['对抗', '抗的'], tokens)
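# In the Arabic checks below, the uncased model drops the hamza
# ('الإحصاء' appears as 'الاحصاء'), while the cased model keeps it.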
def test_bert_base_multilingual_ar_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('المشي الحديث ركوب الدراجات المشي لمسافات طويلة المتداول التمرير')
self.assertEqual(['ال', '##م', '##شي', 'الحديث', 'ر', '##كو', '##ب', 'ال', '##در', '##اج', '##ات', 'ال', '##م', '##شي', 'لم', '##سا', '##فات', 'طويلة', 'ال', '##مت', '##دا', '##ول', 'ال', '##تم', '##رير'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('المتداول التمرير')
self.assertEqual(['ال', '##مت', '##دا', '##ول', 'ال', '##تم', '##رير'], tokens)
# biostatistics
tokens = tokenizer.tokenize('الإحصاء الحيوي')
self.assertEqual(['الاحصاء', 'ال', '##حي', '##وي'], tokens)
# adversarial
tokens = tokenizer.tokenize('عدائي')
self.assertEqual(['ع', '##دا', '##يي'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('المشي الحديث ركوب الدراجات المشي لمسافات طويلة المتداول التمرير')
self.assertEqual(['ال', '##م', '##شي', 'الحديث', 'ر', '##كو', '##ب', 'ال', '##در', '##اجات', 'ال', '##م', '##شي', 'لم', '##سا', '##فات', 'طويلة', 'ال', '##مت', '##دا', '##ول', 'ال', '##تم', '##رير'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('المتداول التمرير')
self.assertEqual(['ال', '##مت', '##دا', '##ول', 'ال', '##تم', '##رير'], tokens)
# biostatistics
tokens = tokenizer.tokenize('الإحصاء الحيوي')
self.assertEqual(['الإحصاء', 'ال', '##حي', '##وي'], tokens)
# adversarial
tokens = tokenizer.tokenize('عدائي')
self.assertEqual(['ع', '##دا', '##ئي'], tokens)
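# Note in the uncased expected tokens below that Devanagari dependent vowel
# signs are dropped ('हुए' becomes 'हए'); the cased model preserves them.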
def test_bert_base_multilingual_hi_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('चलने की बात करते हुए बाइक चलाना लंबी पैदल यात्रा स्क्रॉल')
self.assertEqual(['चल', '##न', 'की', 'बात', 'करत', 'हए', 'ब', '##ा', '##इ', '##क', 'चल', '##ाना', 'ल', '##बी', 'पद', '##ल', 'यातरा', 'सक', '##र', '##ॉल'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('रोलिंग स्क्रॉल')
self.assertEqual(['र', '##ोल', '##िग', 'सक', '##र', '##ॉल'], tokens)
# biostatistics
tokens = tokenizer.tokenize('जैव सांख्यिकी')
self.assertEqual(['ज', '##व', 'स', '##ा', '##ख', '##यिक', '##ी'], tokens)
# adversarial
tokens = tokenizer.tokenize('विरोधात्मक')
self.assertEqual(['वि', '##रो', '##धा', '##तमक'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('चलने की बात करते हुए बाइक चलाना लंबी पैदल यात्रा स्क्रॉल')
self.assertEqual(['च', '##लन', '##े', 'की', 'बात', 'करते', 'हुए', 'ब', '##ा', '##इ', '##क', 'च', '##ला', '##ना', 'ल', '##ं', '##बी', 'प', '##ै', '##दल', 'यात्रा', 'स', '##्क', '##्र', '##ॉल'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('रोलिंग स्क्रॉल')
self.assertEqual(['र', '##ोल', '##िंग', 'स', '##्क', '##्र', '##ॉल'], tokens)
# biostatistics
tokens = tokenizer.tokenize('जैव सांख्यिकी')
self.assertEqual(['ज', '##ै', '##व', 'स', '##ा', '##ं', '##ख', '##्य', '##िकी'], tokens)
# adversarial
tokens = tokenizer.tokenize('विरोधात्मक')
self.assertEqual(['वि', '##रो', '##धा', '##त्मक'], tokens)
def test_bert_base_multilingual_bn_book_examples(self):
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('হাঁটাচলা বাইকিং হাইকিং রোলিং স্ক্রোলিং')
self.assertEqual(['হ', '##াট', '##া', '##চ', '##লা', 'বা', '##ই', '##কি', '##ং', 'হ', '##াই', '##কি', '##ং', 'র', '##ো', '##লি', '##ং', 'স', '##কর', '##ো', '##লি', '##ং'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('ঘূর্ণায়মান স্ক্রোলিং')
self.assertEqual(['ঘর', '##ণা', '##য', '##মান', 'স', '##কর', '##ো', '##লি', '##ং'], tokens)
# biostatistics
tokens = tokenizer.tokenize('বায়োস্টাটিক্স')
self.assertEqual(['বা', '##যে', '##াস', '##টা', '##টি', '##ক', '##স'], tokens)
# adversarial
tokens = tokenizer.tokenize('প্রতিকূল')
self.assertEqual(['পরতি', '##ক', '##ল'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
# walking talking biking hiking rolling scrolling
tokens = tokenizer.tokenize('হাঁটাচলা বাইকিং হাইকিং রোলিং স্ক্রোলিং')
self.assertEqual(['হ', '##া', '##ঁ', '##টা', '##চ', '##লা', 'বা', '##ই', '##কি', '##ং', 'হ', '##াই', '##কি', '##ং', 'র', '##োল', '##িং', 'স', '##্ক', '##্র', '##োল', '##িং'], tokens)
# rolling scrolling
tokens = tokenizer.tokenize('ঘূর্ণায়মান স্ক্রোলিং')
self.assertEqual(['ঘ', '##ূর্ণ', '##ায়', '##মান', 'স', '##্ক', '##্র', '##োল', '##িং'], tokens)
# biostatistics
tokens = tokenizer.tokenize('বায়োস্টাটিক্স')
self.assertEqual(['বা', '##য়', '##ো', '##স্ট', '##াট', '##িক', '##্স'], tokens)
# adversarial
tokens = tokenizer.tokenize('প্রতিকূল')
self.assertEqual(['প্রতি', '##ক', '##ূ', '##ল'], tokens)
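# Amharic is written in the Ge'ez (Ethiopic) script, which is not covered by
# mBERT's WordPiece vocabulary, so every word falls back to '[UNK]'.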
def test_bert_base_multilingual_am(self):
"""
Amharic
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ')
self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens)
tokens = tokenizer.tokenize('የሽፋኑ')
self.assertEqual(['[UNK]'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ')
self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens)
tokens = tokenizer.tokenize('የሽፋኑ')
self.assertEqual(['[UNK]'], tokens)
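# XLM-R's SentencePiece vocabulary does cover the Ethiopic script, so the same
# Amharic text yields real subword tokens instead of '[UNK]'.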
def test_xlmr_base_multilingual_am(self):
"""
Amharic
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('የሽፋኑ ርዕሰ ጉዳይ የሞቱ ሰዎች ይነሳሉ')
self.assertEqual(['▁የ', 'ሽ', 'ፋ', 'ኑ', '▁ርዕሰ', '▁ጉዳይ', '▁የ', 'ሞቱ', '▁ሰዎች', '▁ይ', 'ነሳ', 'ሉ'], tokens)
tokens = tokenizer.tokenize('የሽፋኑ')
self.assertEqual(['▁የ', 'ሽ', 'ፋ', 'ኑ'], tokens)
def test_bert_base_multilingual_ha(self):
"""
Hausa
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar')
self.assertEqual(['ya', 'san', 'kung', '##iya', '##r', ',', 'ya', 'san', 'koma', '##i', 'game', 'da', 'kung', '##iya', '##r'], tokens)
tokens = tokenizer.tokenize('kungiyar')
self.assertEqual(['kung', '##iya', '##r'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar')
self.assertEqual(['Ya', 'san', 'kung', '##iya', '##r', ',', 'ya', 'san', 'koma', '##i', 'game', 'da', 'kung', '##iya', '##r'], tokens)
tokens = tokenizer.tokenize('kungiyar')
self.assertEqual(['kung', '##iya', '##r'], tokens)
def test_xlmr_base_multilingual_ha(self):
"""
Hausa
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Ya san kungiyar, ya san komai game da kungiyar')
self.assertEqual(['▁Ya', '▁san', '▁kungiyar', ',', '▁ya', '▁san', '▁koma', 'i', '▁game', '▁da', '▁kungiyar'], tokens)
tokens = tokenizer.tokenize('kungiyar')
self.assertEqual(['▁kungiyar'], tokens)
def test_bert_base_multilingual_ig(self):
"""
Igbo
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị')
self.assertEqual(['ok', '##e', 'onu', 'ada', '##a', 'dik', '##a', 'lo', '##olo', 'ezen', '##nek', '##a', 'gba', '##ra', 'ah', '##o', 'ot', '##u', 'nar', '##i'], tokens)
tokens = tokenizer.tokenize('Ezenneka')
self.assertEqual(['ezen', '##nek', '##a'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị')
self.assertEqual(['Ok', '##e', 'Ọ', '##ñ', '##ụ', 'Ada', '##a', 'D', '##ị', '##ka', 'L', '##ọ', '##ọ', '##l', '##ọ', 'Ezen', '##nek', '##a', 'g', '##bà', '##rà', 'Ah', '##ọ', 'O', '##tu', 'Na', '##r', '##ị'], tokens)
tokens = tokenizer.tokenize('Ezenneka')
self.assertEqual(['Ezen', '##nek', '##a'], tokens)
def test_xlmr_base_multilingual_ig(self):
"""
Igbo
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Oke Ọñụ Adaa Dịka Lọọlọ Ezenneka gbàrà Ahọ Otu Narị')
self.assertEqual(['▁O', 'ke', '▁', 'Ọ', 'ñ', 'ụ', '▁Ada', 'a', '▁D', 'ị', 'ka', '▁L', 'ọ', 'ọ', 'l', 'ọ', '▁Ezen', 'nek', 'a', '▁', 'gb', 'à', 'rà', '▁Ah', 'ọ', '▁O', 'tu', '▁Nar', 'ị'], tokens)
tokens = tokenizer.tokenize('Ezenneka')
self.assertEqual(['▁Ezen', 'nek', 'a'], tokens)
def test_bert_base_multilingual_om(self):
"""
Afaan Oromoo
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii')
self.assertEqual(['ani', 'ob', '##bola', '##a', 'ke', '##essa', '##n', ',', 'abd', '##ii', 'ba', '##ale', '##e', 'oro', '##mi', '##ya', '##atii'], tokens)
tokens = tokenizer.tokenize('Oromiyaatii')
self.assertEqual(['oro', '##mi', '##ya', '##atii'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii')
self.assertEqual(['Ani', 'ob', '##bola', '##a', 'ke', '##essa', '##n', ',', 'Abd', '##ii', 'Ba', '##ale', '##e', 'Oro', '##mi', '##ya', '##ati', '##i'], tokens)
tokens = tokenizer.tokenize('Oromiyaatii')
self.assertEqual(['Oro', '##mi', '##ya', '##ati', '##i'], tokens)
def test_xlmr_base_multilingual_om(self):
"""
Afaan Oromoo
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Ani obbolaa keessan, Abdii Baalee Oromiyaatii')
self.assertEqual(['▁Ani', '▁ob', 'bola', 'a', '▁keessa', 'n', ',', '▁Ab', 'dii', '▁Ba', 'ale', 'e', '▁Oromiyaa', 'tii'], tokens)
tokens = tokenizer.tokenize('Oromiyaatii')
self.assertEqual(['▁Oromiyaa', 'tii'], tokens)
def test_bert_base_multilingual_pcm(self):
"""
Nigerian Pidgin
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?')
self.assertEqual(['cru', '##de', 'oil', 'de', '##y', 'kill', 'pick', '##in', 'for', 'nigeria', '?'], tokens)
tokens = tokenizer.tokenize('wahala')
self.assertEqual(['wah', '##ala'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?')
self.assertEqual(['C', '##rude', 'oil', 'de', '##y', 'kill', 'pick', '##in', 'for', 'Nigeria', '?'], tokens)
tokens = tokenizer.tokenize('wahala')
self.assertEqual(['wa', '##hala'], tokens)
def test_xlmr_base_multilingual_pcm(self):
"""
Nigerian Pidgin
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Crude oil dey kill pickin for Nigeria?')
self.assertEqual(['▁Cru', 'de', '▁oil', '▁de', 'y', '▁kill', '▁pick', 'in', '▁for', '▁Nigeria', '?'], tokens)
tokens = tokenizer.tokenize('wahala')
self.assertEqual(['▁wa', 'hala'], tokens)
def test_bert_base_multilingual_so(self):
"""
Somali
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.')
self.assertEqual(['rabbi', '##gu', 'wu', '##xu', '##u', 'amar', 'ku', 'bi', '##xi', '##ye', '##y', 'in', 'la', 'dum', '##iy', '##o', 'qal', '##cada', '##ha', 'kan', '##ca', '##an', '.'], tokens)
tokens = tokenizer.tokenize('bixiyey')
self.assertEqual(['bi', '##xi', '##ye', '##y'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.')
self.assertEqual(['Rabbi', '##gu', 'w', '##ux', '##uu', 'amar', 'ku', 'bi', '##xi', '##ye', '##y', 'in', 'la', 'dum', '##iyo', 'q', '##al', '##cada', '##ha', 'Kan', '##ca', '##an', '.'], tokens)
tokens = tokenizer.tokenize('bixiyey')
self.assertEqual(['bi', '##xi', '##ye', '##y'], tokens)
def test_xlmr_base_multilingual_so(self):
"""
Somali
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Rabbigu wuxuu amar ku bixiyey in la dumiyo qalcadaha Kancaan.')
self.assertEqual(['▁Rabbi', 'gu', '▁wuxuu', '▁amar', '▁ku', '▁bixi', 'yey', '▁in', '▁la', '▁dum', 'iyo', '▁qal', 'cada', 'ha', '▁Kan', 'ca', 'an', '.'], tokens)
tokens = tokenizer.tokenize('bixiyey')
self.assertEqual(['▁bixi', 'yey'], tokens)
def test_bert_base_multilingual_sw(self):
"""
Swahili
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na mdogo')
self.assertEqual(['hu', '##dum', '##a', 'ya', 'up', '##asu', '##aji', 'mk', '##ubwa', 'na', 'md', '##ogo'], tokens)
tokens = tokenizer.tokenize('upasuaji')
self.assertEqual(['up', '##asu', '##aji'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na mdogo')
self.assertEqual(['Hu', '##dum', '##a', 'ya', 'up', '##asu', '##aji', 'mk', '##ub', '##wa', 'na', 'm', '##dogo'], tokens)
tokens = tokenizer.tokenize('upasuaji')
self.assertEqual(['up', '##asu', '##aji'], tokens)
def test_xlmr_base_multilingual_sw(self):
"""
Swahili
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Huduma ya upasuaji mkubwa na mdogo')
self.assertEqual(['▁Huduma', '▁ya', '▁up', 'asu', 'aji', '▁mkubwa', '▁na', '▁mdogo'], tokens)
tokens = tokenizer.tokenize('upasuaji')
self.assertEqual(['▁up', 'asu', 'aji'], tokens)
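# Tigrinya also uses the Ethiopic script, so mBERT maps it to '[UNK]' just as
# with Amharic above, while the XLM-R test that follows tokenizes it properly.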
def test_bert_base_multilingual_ti(self):
"""
Tigrinya
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ')
self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens)
tokens = tokenizer.tokenize('ኢንጂነር')
self.assertEqual(['[UNK]'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ')
self.assertEqual(['[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]'], tokens)
tokens = tokenizer.tokenize('ኢንጂነር')
self.assertEqual(['[UNK]'], tokens)
def test_xlmr_base_multilingual_ti(self):
"""
Tigrinya
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('ስርዓተ ቀብሪ ኢንጂነር ስመኘው በቀለ ትማሊ ተፈፂሙ')
self.assertEqual(['▁ስር', 'ዓ', 'ተ', '▁ቀ', 'ብሪ', '▁ኢን', 'ጂ', 'ነ', 'ር', '▁ስ', 'መ', 'ኘ', 'ው', '▁በቀለ', '▁ት', 'ማ', 'ሊ', '▁ተፈ', 'ፂ', 'ሙ'], tokens)
tokens = tokenizer.tokenize('ኢንጂነር')
self.assertEqual(['▁ኢን', 'ጂ', 'ነ', 'ር'], tokens)
def test_bert_base_multilingual_yo(self):
"""
Yoruba
"""
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased')
tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.')
self.assertEqual(['oru', '##ko', 'omo', '##bin', '##rin', 're', 'ag', '##ba', 'ni', 'mera', '##bu', ',', 'ti', 'e', '##yi', 'abu', '##ro', 'ni', 'mika', '##li', '.'], tokens)
tokens = tokenizer.tokenize('ọmọbinrin')
self.assertEqual(['omo', '##bin', '##rin'], tokens)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.')
self.assertEqual(['Or', '##ú', '##k', '##ọ', 'ọ', '##m', '##ọ', '##bin', '##rin', 'r', '##ẹ̀', 'à', '##g', '##bà', 'ni', 'Mer', '##abu', ',', 'ti', 'è', '##y', '##í', 'à', '##b', '##úr', '##ò', 'ni', 'Mika', '##li', '.'], tokens)
tokens = tokenizer.tokenize('ọmọbinrin')
self.assertEqual(['ọ', '##m', '##ọ', '##bin', '##rin'], tokens)
def test_xlmr_base_multilingual_yo(self):
"""
Yoruba
"""
tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
tokens = tokenizer.tokenize('Orúkọ ọmọbinrin rẹ̀ àgbà ni Merabu, ti èyí àbúrò ni Mikali.')
self.assertEqual(['▁O', 'rú', 'k', 'ọ', '▁', 'ọ', 'm', 'ọ', 'bin', 'rin', '▁r', 'ẹ', '̀', '▁à', 'gb', 'à', '▁ni', '▁Mera', 'bu', ',', '▁ti', '▁è', 'y', 'í', '▁à', 'bú', 'rò', '▁ni', '▁Mi', 'kali', '.'], tokens)
tokens = tokenizer.tokenize('ọmọbinrin')
self.assertEqual(['▁', 'ọ', 'm', 'ọ', 'bin', 'rin'], tokens)
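# doc2query-T5 is loaded with T5Tokenizer, which is SentencePiece-based; note
# the same '▁' word-start marker seen with XLM-R above.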
def test_doc2query(self):
tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
tokens = tokenizer.tokenize('I have a new GPU!')
self.assertEqual(['▁I', '▁have', '▁', 'a', '▁new', '▁GPU', '!'], tokens)
tokenizer = T5Tokenizer.from_pretrained('castorini/doc2query-t5-base-msmarco')
tokens = tokenizer.tokenize('walking talking biking scrolling')
self.assertEqual(['▁walking', '▁talking', '▁biking', '▁scroll', 'ing'], tokens)
tokens = tokenizer.tokenize('biostatistics')
self.assertEqual(['▁bio', 'stat', 'istic', 's'], tokens)
tokens = tokenizer.tokenize('adversarial')
self.assertEqual(['▁adversar', 'i', 'al'], tokens)
def tearDown(self):
pass
if __name__ == '__main__':
unittest.main()