# coding=utf-8 # The MIT License (MIT) # Copyright (c) Microsoft Corporation # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. """Tokenization classes for MiniLM.""" from __future__ import absolute_import, division, print_function, unicode_literals import collections import logging import os import unicodedata from io import open from transformers.tokenization_bert import BertTokenizer, whitespace_tokenize logger = logging.getLogger(__name__) VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'} PRETRAINED_VOCAB_FILES_MAP = { 'vocab_file': { 'minilm-l12-h384-uncased': "https://conversationhub.blob.core.windows.net/beit-share-public/ckpt/minilm-l12-h384-uncased-vocab.txt?sv=2021-10-04&st=2023-06-08T11%3A16%3A02Z&se=2033-06-09T11%3A16%3A00Z&sr=c&sp=r&sig=N4pfCVmSeq4L4tS8QbrFVsX6f6q844eft8xSuXdxU48%3D", } } PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { 'minilm-l12-h384-uncased': 512, } class MinilmTokenizer(BertTokenizer): r""" Constructs a MinilmTokenizer. :class:`~transformers.MinilmTokenizer` is identical to BertTokenizer and runs end-to-end tokenization: punctuation splitting + wordpiece Args: vocab_file: Path to a one-wordpiece-per-line vocabulary file do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False do_basic_tokenize: Whether to do basic tokenization before wordpiece. max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the minimum of this value (if specified) and the underlying BERT model's sequence length. never_split: List of tokens which will never be split during tokenization. Only has an effect when do_wordpiece_only=False """ vocab_files_names = VOCAB_FILES_NAMES pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES class WhitespaceTokenizer(object): def tokenize(self, text): return whitespace_tokenize(text)