File size: 4,024 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
INDIC_NLP_LIB_HOME = "indic_nlp_library"
INDIC_NLP_RESOURCES = "indic_nlp_resources"
import sys

from indicnlp import transliterate

sys.path.append(r"{}".format(INDIC_NLP_LIB_HOME))
from indicnlp import common

common.set_resources_path(INDIC_NLP_RESOURCES)
from indicnlp import loader

loader.load()
from sacremoses import MosesPunctNormalizer
from sacremoses import MosesTokenizer
from sacremoses import MosesDetokenizer
from collections import defaultdict

import indicnlp
from indicnlp.tokenize import indic_tokenize
from indicnlp.tokenize import indic_detokenize
from indicnlp.normalize import indic_normalize
from indicnlp.transliterate import unicode_transliterate


def postprocess(
    infname, outfname, input_size, lang, common_lang="hi", transliterate=False
):
    """
    parse fairseq interactive output, convert script back to native Indic script (in case of Indic languages) and detokenize.

    infname: fairseq log file
    outfname: output file of translation (sentences not translated contain the dummy string 'DUMMY_OUTPUT'
    input_size: expected number of output sentences
    lang: language
    """

    consolidated_testoutput = []
    # with open(infname,'r',encoding='utf-8') as infile:
    # consolidated_testoutput= list(map(lambda x: x.strip(), filter(lambda x: x.startswith('H-'),infile) ))
    # consolidated_testoutput.sort(key=lambda x: int(x.split('\t')[0].split('-')[1]))
    # consolidated_testoutput=[ x.split('\t')[2] for x in consolidated_testoutput ]

    consolidated_testoutput = [(x, 0.0, "") for x in range(input_size)]
    temp_testoutput = []
    with open(infname, "r", encoding="utf-8") as infile:
        temp_testoutput = list(
            map(
                lambda x: x.strip().split("\t"),
                filter(lambda x: x.startswith("H-"), infile),
            )
        )
        temp_testoutput = list(
            map(lambda x: (int(x[0].split("-")[1]), float(x[1]), x[2]), temp_testoutput)
        )
        for sid, score, hyp in temp_testoutput:
            consolidated_testoutput[sid] = (sid, score, hyp)
        consolidated_testoutput = [x[2] for x in consolidated_testoutput]

    if lang == "en":
        en_detok = MosesDetokenizer(lang="en")
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                outfile.write(en_detok.detokenize(sent.split(" ")) + "\n")
    else:
        xliterator = unicode_transliterate.UnicodeIndicTransliterator()
        with open(outfname, "w", encoding="utf-8") as outfile:
            for sent in consolidated_testoutput:
                if transliterate:
                    outstr = indic_detokenize.trivial_detokenize(
                        xliterator.transliterate(sent, common_lang, lang), lang
                    )
                else:
                    outstr = indic_detokenize.trivial_detokenize(sent, lang)
                outfile.write(outstr + "\n")


if __name__ == "__main__":
    #     # The path to the local git repo for Indic NLP library
    # INDIC_NLP_LIB_HOME="indic_nlp_library"
    # INDIC_NLP_RESOURCES = "indic_nlp_resources"
    # sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
    # common.set_resources_path(INDIC_NLP_RESOURCES)
    #     # The path to the local git repo for Indic NLP Resources
    #     INDIC_NLP_RESOURCES=""

    #     sys.path.append('{}'.format(INDIC_NLP_LIB_HOME))
    #     common.set_resources_path(INDIC_NLP_RESOURCES)

    # loader.load()

    infname = sys.argv[1]
    outfname = sys.argv[2]
    input_size = int(sys.argv[3])
    lang = sys.argv[4]
    if len(sys.argv) == 5:
        transliterate = False
    elif len(sys.argv) == 6:
        transliterate = sys.argv[5]
        if transliterate.lower() == "true":
            transliterate = True
        else:
            transliterate = False
    else:
        print(f"Invalid arguments: {sys.argv}")
        exit()

    postprocess(
        infname, outfname, input_size, lang, common_lang="hi", transliterate=transliterate
    )