File size: 7,336 Bytes
9fa4f9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#    MTUOC_truecaser
#    v. 07/06/2023
#    Copyright (C) 2021  Antoni Oliver
#
#    This program is free software: you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation, either version 3 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.

#    You should have received a copy of the GNU General Public License
#    along with this program.  If not, see <https://www.gnu.org/licenses/>.

import sys
import codecs
import pickle
import argparse
import importlib

class Truecaser():
    def __init__(self, MTUOCPath=".", tokenizer=None, tc_model=None):
        
        self.initchars=["¡","¿","-","*","+","'",'"',"«","»","—","‘","’","“","”","„",]
        
        if tokenizer==None:
            self.tokenizer=None
        else:
            sys.path.append(MTUOCPath)
            if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","")
            self.module = importlib.import_module(tokenizer)
            self.tokenizer=self.module.Tokenizer()
        if tc_model:
            self.tc_model = pickle.load(open(tc_model, "rb" ) )
        else:
            self.tc_model={}
    def set_tc_model(self, tc_model):
        self.tc_model = pickle.load(open(tc_model, "rb" ) )
        
    def set_tokenizer(self, tokenizer):
        if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","")
        self.module = importlib.import_module(tokenizer)
        self.tokenizer=self.module.Tokenizer()
    
    def set_MTUOCPath(self, path):
        sys.path.append(path)
        
    def isinitsymbol(self, token):
        if len(token)==1 and token in self.initchars:
            return(True)
        else:
            return(False)
        
    def detect_type(self, segment):
        tipus="unknown"
        if self.tokenizer==None:
            tokens=segment.split(" ")
        else:
            tokens=self.tokenizer.tokenize(segment)
        ntok=0
        utokens=0
        ltokens=0
        leadingsplitter=False
        trailingsplitter=False
        leadingjoiner=False
        trailingsplitter=False
        for token in tokens:
            token=token.replace("▁","")
            token=token.replace("■","")
            if token.isalpha():ntok+=1
            if token.isalpha() and token==token.lower():
                ltokens+=1
            elif token.isalpha() and not token==token.lower():
                utokens+=1
        if ntok>=5 and utokens>=ntok/2 and not segment==segment.upper():
            tipus="titled"
        elif segment==segment.upper():
            tipus="uppercased"
        else:
            tipus="regular"
        return(tipus)
        
    def truecase(self, line, ucf=False, restoreCase=False):
        if self.tokenizer:
            tokens=self.tokenizer.tokenize_s(line).split(" ")
        else:
            tokens=line.split(" ")
        nsegment=[]
        for token in tokens:
            
            try:
                leadingsplitter=False
                trailingsplitter=False
                leadingjoiner=False
                trailingjoiner=False
                if token.startswith("▁"):leadingsplitter=True
                if token.endswith("▁"):trailingsplitter=True
                if token.startswith("■"):leadingjoiner=True
                if token.endswith("■"):trailingjoiner=True
                token=token.replace("▁","")
                token=token.replace("■","")
                try:
                    nlc=self.tc_model[token.lower()]["lc"]
                except:
                    nlc=0
                try:
                    nu1=self.tc_model[token.lower()]["u1"]
                except:
                    nu1=0
                try:
                    nuc=self.tc_model[token.lower()]["uc"]
                except:
                    nuc=0
                proceed=False
                if not token==token.lower(): proceed=True
                if restoreCase: proceed=True
                if proceed:
                    if nlc>0 and nlc>=nu1 and nlc>=nuc:
                        token=token.lower()
                    elif nu1>0 and nu1>nlc and nu1>nuc:
                        token=token.lower().capitalize()
                    elif nuc>0 and nuc>nlc and nuc>nu1:
                        token=token.upper()

                if leadingsplitter:token="▁"+token
                if trailingsplitter:token=token+"▁"
                if leadingjoiner:token="■"+token
                if trailingjoiner:token=token+"■"
                nsegment.append(token)
            except:
                print("ERROR",sys.exc_info())
                nsegment.append(token)
        if self.tokenizer:
            nsegment=self.tokenizer.detokenize_s(" ".join(nsegment))     
        else:
            nsegment=" ".join(nsegment)
        if ucf:
            if self.isinitsymbol(nsegment[0]):firstchar=1
            else: firstchar=0
            try:
                if firstchar==0:
                    nsegment=nsegment[firstchar].upper()+"".join(nsegment[1:])
                elif firstchar==1:
                    nsegment=nsegment[0]+nsegment[firstchar].upper()+"".join(nsegment[2:])
            except:
                pass
        return(nsegment)
        
    def detruecase_old(self,line,tokenizer):
        tokens=line.split(" ")
        new=[]
        yet=False
        if tokenizer:
            tokens=tokenizer.tokenize_j(line).split(" ")
        else:
            tokens=line.split(" ")
        for token in tokens:
            if not yet and token.isalpha():
                yet=True
                new.append(token[0].upper()+token[1:])
            else:
                new.append(token)
        line=" ".join(new)
        detrue=tokenizer.detokenize_j(line)
        return(line) 

    def detruecase(self,line):
        detruecased=line.capitalize()
        return(line) 
    

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='MTUOC program for truecasing.')
    parser.add_argument('-m','--model', action="store", dest="model", help='The truecasing model to use.',required=True)
    parser.add_argument('-t','--tokenizer', action="store", dest="tokenizer", help='The tokenizer to used',required=False)
    parser.add_argument('-u','--ucf', action="store_true", dest="ucf", help='Set if you want first word capitalized',required=False)
    parser.add_argument('-r','--restore', action="store_true", dest="restore", help='Set if you want to restore case (uppercase lower cased)',required=False)
    parser.add_argument('--mtuoc','--MTUOC', action="store", dest="MTUOC", help='The path to the MTUOC components',required=False)
    
    args = parser.parse_args()
    model=args.model
    ucf=args.ucf
    restore=args.restore
        

    if args.MTUOC:
        MTUOCPath=args.MTUOC
    else:
        MTUOCPath=""

    truecaser=Truecaser(MTUOCPath, args.tokenizer, args.model)
    for line in sys.stdin:
        line=line.strip()
        tcline=truecaser.truecase(line, ucf, restore)
        print(tcline)