File size: 3,266 Bytes
ee21b96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env python3
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import re
import sys


class OOVIndexError(IndexError):
    def __init__(self, pos, source_seq, target_seq):
        super(OOVIndexError, self).__init__(
            "A <unk-N> tag in the target sequence refers to a position that is "
            "outside the source sequence. Most likely there was a mismatch in "
            "provided source and target sequences. Otherwise this would mean that "
            "the pointing mechanism somehow attended to a position that is past "
            "the actual sequence end."
        )
        self.source_pos = pos
        self.source_seq = source_seq
        self.target_seq = target_seq


def replace_oovs(source_in, target_in, target_out):
    """Replaces <unk-N> tokens in the target text with the corresponding word in
    the source text.
    """

    oov_re = re.compile("^<unk-([0-9]+)>$")

    for source_seq, target_seq in zip(source_in, target_in):
        target_seq_out = []

        pos_to_word = source_seq.strip().split()
        for token in target_seq.strip().split():
            m = oov_re.match(token)
            if m:
                pos = int(m.group(1))
                if pos >= len(pos_to_word):
                    raise OOVIndexError(pos, source_seq, target_seq)
                token_out = pos_to_word[pos]
            else:
                token_out = token
            target_seq_out.append(token_out)
        target_out.write(" ".join(target_seq_out) + "\n")


def main():
    parser = argparse.ArgumentParser(
        description="Replaces <unk-N> tokens in target sequences with words from "
        "the corresponding position in the source sequence."
    )
    parser.add_argument(
        "--source", type=str, help="text file with source sequences", required=True
    )
    parser.add_argument(
        "--target", type=str, help="text file with target sequences", required=True
    )
    parser.add_argument(
        "--target-out",
        type=str,
        help="where to write target sequences without <unk-N> " "entries",
        required=True,
    )
    args = parser.parse_args()

    target_in = (
        open(args.target, "r", encoding="utf-8") if args.target is not None else None
    )
    target_out = (
        open(args.target_out, "w", encoding="utf-8")
        if args.target_out is not None
        else None
    )
    with open(args.source, "r", encoding="utf-8") as source_in, open(
        args.target, "r", encoding="utf-8"
    ) as target_in, open(args.target_out, "w", encoding="utf-8") as target_out:
        replace_oovs(source_in, target_in, target_out)


if __name__ == "__main__":
    try:
        main()
    except OOVIndexError as e:
        print(e, file=sys.stderr)
        print("Source sequence:", e.source_seq.strip(), file=sys.stderr)
        print("Target sequence:", e.target_seq.strip(), file=sys.stderr)
        print(
            "Source sequence length:",
            len(e.source_seq.strip().split()),
            file=sys.stderr,
        )
        print("The offending tag points to:", e.source_pos)
        sys.exit(2)