# Source: https://github.com/mjpost/sacrebleu/blob/master/sacrebleu/tokenizers/tokenizer_13a.py
# Copyright 2020 SacreBLEU Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import re
from functools import lru_cache


class BaseTokenizer:
    """A base dummy tokenizer to derive from."""

    def signature(self):
        """
        Returns a signature for the tokenizer.
        :return: signature string
        """
        return "none"

    def __call__(self, line):
        """
        Tokenizes an input line with the tokenizer.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        return line


class TokenizerRegexp(BaseTokenizer):
    def signature(self):
        return "re"

    def __init__(self):
        self._re = [
            # language-dependent part (assuming Western languages)
            (re.compile(r"([\{-\~\[-\` -\&\(-\+\:-\@\/])"), r" \1 "),
            # tokenize period and comma unless preceded by a digit
            (re.compile(r"([^0-9])([\.,])"), r"\1 \2 "),
            # tokenize period and comma unless followed by a digit
            (re.compile(r"([\.,])([^0-9])"), r" \1 \2"),
            # tokenize dash when preceded by a digit
            (re.compile(r"([0-9])(-)"), r"\1 \2 "),
            # one space only between words
            # NOTE: Doing this in Python (below) is faster
            # (re.compile(r'\s+'), r' '),
        ]
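
    # Worked example (illustrative; not part of the upstream file):
    # " Hello, world! " passes through the rules above as follows:
    #   rule 1 pads "!" (and the spaces) with spaces; "," and "." are excluded,
    #   rule 2 splits "Hello," into "Hello ," since no digit precedes the comma,
    #   rule 3 only adds more whitespace here, and rule 4 does not fire,
    # so __call__ below returns ['Hello', ',', 'world', '!'].
    # The digit guards keep numbers intact: " 3.5-2 " yields ['3.5', '-', '2']
    # because only the digit-dash rule matches.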

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Common post-processing tokenizer for `13a` and `zh` tokenizers.
        :param line: a segment to tokenize
        :return: the tokenized line
        """
        for (_re, repl) in self._re:
            line = _re.sub(repl, line)

        # no leading or trailing spaces, single space within words
        # return ' '.join(line.split())
        # NOTE: changed from the original tokenizer (commented out above) to
        # return a list of individual tokens instead of a joined string
        return line.split()


class Tokenizer13a(BaseTokenizer):
    def signature(self):
        return "13a"

    def __init__(self):
        self._post_tokenizer = TokenizerRegexp()

    @lru_cache(maxsize=2**16)
    def __call__(self, line):
        """Tokenizes an input line using a relatively minimal tokenization
        that is however equivalent to mteval-v13a, used by WMT.
        :param line: a segment to tokenize
        :return: the tokenized line
        """

        # language-independent part:
        line = line.replace("<skipped>", "")
        line = line.replace("-\n", "")
        line = line.replace("\n", " ")
| if "&" in line: | |
| line = line.replace(""", '"') | |
| line = line.replace("&", "&") | |
| line = line.replace("<", "<") | |
| line = line.replace(">", ">") | |
| return self._post_tokenizer(f" {line} ") | |
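

# Minimal usage sketch (illustrative; not part of the upstream file):
if __name__ == "__main__":
    tokenizer = Tokenizer13a()
    print(tokenizer("Hello, world!"))  # ['Hello', ',', 'world', '!']
    print(tokenizer("It costs 3.5-4 euros."))  # ['It', 'costs', '3.5', '-', '4', 'euros', '.']
    # HTML entities from WMT's SGML data are unescaped before tokenizing:
    print(tokenizer("&quot;ok&quot;"))  # ['"', 'ok', '"']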