Spaces:

Tzktz
/

Dit-document-layout-analysis

Sleeping

App Files Files Community

Dit-document-layout-analysis / unilm /layoutreader /s2s_ft /tokenization_minilm.py

Tzktz

Upload 7664 files

6fc683c verified about 1 year ago

raw

history blame

3.06 kB

	# coding=utf-8
	# The MIT License (MIT)

	# Copyright (c) Microsoft Corporation

	# Permission is hereby granted, free of charge, to any person obtaining a copy
	# of this software and associated documentation files (the "Software"), to deal
	# in the Software without restriction, including without limitation the rights
	# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
	# copies of the Software, and to permit persons to whom the Software is
	# furnished to do so, subject to the following conditions:

	# The above copyright notice and this permission notice shall be included in all
	# copies or substantial portions of the Software.

	# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
	# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
	# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
	# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
	# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
	# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
	# SOFTWARE.
	"""Tokenization classes for MiniLM."""

	from __future__ import absolute_import, division, print_function, unicode_literals

	import collections
	import logging
	import os
	import unicodedata
	from io import open

	from transformers.tokenization_bert import BertTokenizer, whitespace_tokenize

	logger = logging.getLogger(__name__)

	# Name of the vocabulary file, keyed by the tokenizer's vocab-file argument.
	VOCAB_FILES_NAMES = {
	    'vocab_file': 'vocab.txt',
	}

	# For each vocab-file argument, the download URL of that file per
	# pretrained model name.
	PRETRAINED_VOCAB_FILES_MAP = {
	    'vocab_file': {
	        'minilm-l12-h384-uncased': "https://conversationhub.blob.core.windows.net/beit-share-public/ckpt/minilm-l12-h384-uncased-vocab.txt?sv=2021-10-04&st=2023-06-08T11%3A16%3A02Z&se=2033-06-09T11%3A16%3A00Z&sr=c&sp=r&sig=N4pfCVmSeq4L4tS8QbrFVsX6f6q844eft8xSuXdxU48%3D",
	    },
	}

	# Size associated with each pretrained model name; MinilmTokenizer exposes
	# this mapping as its ``max_model_input_sizes`` class attribute.
	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
	    'minilm-l12-h384-uncased': 512,
	}


	class MinilmTokenizer(BertTokenizer):
	    """Tokenizer for MiniLM checkpoints.

	    This class subclasses ``BertTokenizer`` without overriding any method;
	    it only swaps in the MiniLM-specific vocabulary metadata through the
	    three class attributes below. Tokenization is therefore whatever
	    ``BertTokenizer`` performs (documented upstream as punctuation
	    splitting followed by wordpiece).

	    Args:
	        vocab_file: Path to a one-wordpiece-per-line vocabulary file.
	        do_lower_case: Whether to lower-case the input.
	        do_basic_tokenize: Whether to run basic tokenization before
	            wordpiece.
	        max_len: Artificial maximum length to truncate tokenized
	            sequences to; the effective maximum is the minimum of this
	            value (if given) and the underlying model's sequence length.
	        never_split: Tokens that must never be split during tokenization.

	    NOTE(review): all arguments are inherited from ``BertTokenizer``;
	    their exact names and semantics depend on the installed
	    ``transformers`` version -- confirm against that version's docs.
	    """

	    # Vocabulary file name expected for this tokenizer.
	    vocab_files_names = VOCAB_FILES_NAMES
	    # Download locations of the vocabulary for each pretrained model name.
	    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
	    # Per-model size table defined at module level.
	    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES


	class WhitespaceTokenizer(object):
	    """Minimal tokenizer object that delegates to ``whitespace_tokenize``."""

	    def tokenize(self, text):
	        """Return the result of ``whitespace_tokenize`` applied to ``text``.

	        Args:
	            text: Input passed through unchanged to ``whitespace_tokenize``.

	        Returns:
	            Whatever ``whitespace_tokenize`` returns for ``text``.
	        """
	        tokens = whitespace_tokenize(text)
	        return tokens