File size: 1,660 Bytes
6a62ffb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field

from fairseq.data.encoders import register_tokenizer
from fairseq.dataclass import FairseqDataclass


@dataclass
class MosesTokenizerConfig(FairseqDataclass):
    source_lang: str = field(default="en", metadata={"help": "source language"})
    target_lang: str = field(default="en", metadata={"help": "target language"})
    moses_no_dash_splits: bool = field(
        default=False, metadata={"help": "don't apply dash split rules"}
    )
    moses_no_escape: bool = field(
        default=False,
        metadata={"help": "don't perform HTML escaping on apostrophe, quotes, etc."},
    )


@register_tokenizer("moses", dataclass=MosesTokenizerConfig)
class MosesTokenizer(object):
    def __init__(self, cfg: MosesTokenizerConfig):
        self.cfg = cfg

        try:
            from sacremoses import MosesTokenizer, MosesDetokenizer

            self.tok = MosesTokenizer(cfg.source_lang)
            self.detok = MosesDetokenizer(cfg.target_lang)
        except ImportError:
            raise ImportError(
                "Please install Moses tokenizer with: pip install sacremoses"
            )

    def encode(self, x: str) -> str:
        return self.tok.tokenize(
            x,
            aggressive_dash_splits=(not self.cfg.moses_no_dash_splits),
            return_str=True,
            escape=(not self.cfg.moses_no_escape),
        )

    def decode(self, x: str) -> str:
        return self.detok.detokenize(x.split())