# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from functools import lru_cache


class BaseTokenizer:
"""A base dummy tokenizer to derive from."""
def signature(self):
"""
Returns a signature for the tokenizer.
:return: signature string
"""
return "none"
def __call__(self, line):
"""
Tokenizes an input line with the tokenizer.
:param line: a segment to tokenize
:return: the tokenized line
"""
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
self._re = [
# language-dependent part (assuming Western languages)
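            # i.e. pad most ASCII punctuation (and the space character)
            # with spaces; apostrophe is left alone, and period, comma
            # and hyphen are handled by the rules below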
(re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
# tokenize period and comma unless preceded by a digit
(re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
# tokenize period and comma unless followed by a digit
(re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
# tokenize dash when preceded by a digit
(re.compile(r"([0-9])(-)"), r"\1 \2 "),
# one space only between words
# NOTE: Doing this in Python (below) is faster
# (re.compile(r'\s+'), r' '),
        ]

    @lru_cache(maxsize=2**16)
def __call__(self, line):
"""Common post-processing tokenizer for `13a` and `zh` tokenizers.
:param line: a segment to tokenize
:return: the tokenized line
"""
for (_re, repl) in self._re:
line = _re.sub(repl, line)
        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # NOTE: this differs from the original sacrebleu tokenizer, which
        # returns a re-joined string (commented line above); this variant
        # returns the list of individual tokens instead.
        return line.split()
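

# Illustration (not part of the original source): given the patterns above,
# TokenizerRegexp splits most punctuation into separate tokens while keeping
# decimal numbers like "3.5" intact:
#
#     TokenizerRegexp()("Hello, world!")  # -> ['Hello', ',', 'world', '!']
#     TokenizerRegexp()("3.5-4")          # -> ['3.5', '-', '4']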


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
def __call__(self, line):
"""Tokenizes an input line using a relatively minimal tokenization
that is however equivalent to mteval-v13a, used by WMT.
:param line: a segment to tokenize
:return: the tokenized line
"""
# language-independent part:
line = line.replace("<skipped>", "")
line = line.replace("-\n", "")
line = line.replace("\n", " ")
if "&" in line:
line = line.replace("&quot;", '"')
line = line.replace("&amp;", "&")
line = line.replace("&lt;", "<")
line = line.replace("&gt;", ">")
return self._post_tokenizer(f" {line} ")
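

if __name__ == "__main__":
    # Minimal usage sketch (not part of the original sacrebleu file):
    # runs the full 13a pipeline, including HTML entity unescaping,
    # on a single sample segment.
    tokenizer = Tokenizer13a()
    print(tokenizer("The &quot;quick&quot; fox, version 3.5-beta!"))
    # -> ['The', '"', 'quick', '"', 'fox', ',', 'version', '3.5', '-', 'beta', '!']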