File size: 853 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
import sys


def add_token(sent, tag_infos):
    """Prepend special tag tokens to a sentence.

    sent: the sentence (a string) to be prefixed
    tag_infos: iterable of tuples (tag_type, tag), both strings

    Each tag_info results in a token of the form: __{tag_type}__{tag}__
    The tokens are placed before the sentence in the order given, with a
    single space between consecutive tokens and between the last token
    and the sentence.

    Returns the prefixed sentence. If tag_infos is empty, the sentence is
    returned unchanged.

    """

    tokens = ['__' + tag_type + '__' + tag + '__'
              for tag_type, tag in tag_infos]

    # Without any tags there is nothing to prepend; returning early avoids
    # emitting a stray leading space in front of the sentence.
    if not tokens:
        return sent

    return ' '.join(tokens) + ' ' + sent


if __name__ == '__main__':

    # Expected invocation:
    #   <script> <infname> <outfname> <src_lang> <tgt_lang>
    # Fail with a readable usage message instead of a bare IndexError when
    # arguments are missing. Extra arguments are ignored, as before.
    if len(sys.argv) < 5:
        sys.exit(
            'Usage: ' + sys.argv[0]
            + ' <infname> <outfname> <src_lang> <tgt_lang>')

    infname = sys.argv[1]
    outfname = sys.argv[2]
    src_lang = sys.argv[3]
    tgt_lang = sys.argv[4]

    # The tag list is the same for every line, so build it once.
    tag_infos = [('src', src_lang), ('tgt', tgt_lang)]

    with open(infname, 'r', encoding='utf-8') as infile, \
            open(outfname, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # Strip surrounding whitespace (including the newline) before
            # prefixing, then write exactly one newline per output line.
            outstr = add_token(line.strip(), tag_infos)
            outfile.write(outstr + '\n')