Spaces:
Sleeping
Sleeping
# coding=utf-8 | |
# The MIT License (MIT) | |
# Copyright (c) Microsoft Corporation | |
# Permission is hereby granted, free of charge, to any person obtaining a copy | |
# of this software and associated documentation files (the "Software"), to deal | |
# in the Software without restriction, including without limitation the rights | |
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
# copies of the Software, and to permit persons to whom the Software is | |
# furnished to do so, subject to the following conditions: | |
# The above copyright notice and this permission notice shall be included in all | |
# copies or substantial portions of the Software. | |
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
# SOFTWARE. | |
"""Tokenization classes for MiniLM.""" | |
from __future__ import absolute_import, division, print_function, unicode_literals | |
import collections | |
import logging | |
import os | |
import unicodedata | |
from io import open | |
from transformers.tokenization_bert import BertTokenizer, whitespace_tokenize | |
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Maps the 'vocab_file' key to the default vocabulary file name.
VOCAB_FILES_NAMES = {'vocab_file': 'vocab.txt'}

# Download URL of the vocabulary file for each pretrained model name.
# NOTE(review): the URL embeds a signed query string (it looks like an Azure
# shared-access signature with `sp=r` and `se=2033-06-09...`), so downloads
# presumably stop working after that date -- confirm and plan a replacement.
PRETRAINED_VOCAB_FILES_MAP = {
    'vocab_file': {
        'minilm-l12-h384-uncased': "https://conversationhub.blob.core.windows.net/beit-share-public/ckpt/minilm-l12-h384-uncased-vocab.txt?sv=2021-10-04&st=2023-06-08T11%3A16%3A02Z&se=2033-06-09T11%3A16%3A00Z&sr=c&sp=r&sig=N4pfCVmSeq4L4tS8QbrFVsX6f6q844eft8xSuXdxU48%3D",
    }
}

# Size associated with each pretrained model name; exposed on the tokenizer
# class as `max_model_input_sizes`.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    'minilm-l12-h384-uncased': 512,
}
class MinilmTokenizer(BertTokenizer):
    r"""
    Constructs a MinilmTokenizer.

    :class:`~transformers.MinilmTokenizer` is identical to BertTokenizer and runs end-to-end tokenization:
    punctuation splitting + wordpiece. The subclass only overrides the class-level tables that name the
    vocabulary file, its download location and the per-model input size.

    Args:
        vocab_file: Path to a one-wordpiece-per-line vocabulary file
        do_lower_case: Whether to lower case the input. Only has an effect when do_wordpiece_only=False
        do_basic_tokenize: Whether to do basic tokenization before wordpiece.
        max_len: An artificial maximum length to truncate tokenized sequences to; Effective maximum length is always the
            minimum of this value (if specified) and the underlying BERT model's sequence length.
        never_split: List of tokens which will never be split during tokenization. Only has an effect when
            do_wordpiece_only=False
    """

    # Default vocabulary file name, keyed by 'vocab_file'.
    vocab_files_names = VOCAB_FILES_NAMES
    # Download URL of the vocabulary for each pretrained model name.
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Input size for each pretrained model name (512 for the one listed model).
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
class WhitespaceTokenizer(object):
    """Small tokenizer object whose only operation is ``tokenize``."""

    def tokenize(self, text):
        """Tokenize ``text`` by delegating to ``whitespace_tokenize``.

        Args:
            text: Input handed unchanged to ``whitespace_tokenize``.

        Returns:
            Whatever ``whitespace_tokenize`` returns for ``text``
            (presumably the whitespace-separated tokens -- confirm against
            the helper imported from ``transformers``).
        """
        pieces = whitespace_tokenize(text)
        return pieces