# MTUOC_truecaser # v. 07/06/2023 # Copyright (C) 2021 Antoni Oliver # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # You should have received a copy of the GNU General Public License # along with this program. If not, see . import sys import codecs import pickle import argparse import importlib class Truecaser(): def __init__(self, MTUOCPath=".", tokenizer=None, tc_model=None): self.initchars=["¡","¿","-","*","+","'",'"',"«","»","—","‘","’","“","”","„",] if tokenizer==None: self.tokenizer=None else: sys.path.append(MTUOCPath) if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","") self.module = importlib.import_module(tokenizer) self.tokenizer=self.module.Tokenizer() if tc_model: self.tc_model = pickle.load(open(tc_model, "rb" ) ) else: self.tc_model={} def set_tc_model(self, tc_model): self.tc_model = pickle.load(open(tc_model, "rb" ) ) def set_tokenizer(self, tokenizer): if tokenizer.endswith(".py"):tokenizer=tokenizer.replace(".py","") self.module = importlib.import_module(tokenizer) self.tokenizer=self.module.Tokenizer() def set_MTUOCPath(self, path): sys.path.append(path) def isinitsymbol(self, token): if len(token)==1 and token in self.initchars: return(True) else: return(False) def detect_type(self, segment): tipus="unknown" if self.tokenizer==None: tokens=segment.split(" ") else: tokens=self.tokenizer.tokenize(segment) ntok=0 utokens=0 ltokens=0 leadingsplitter=False trailingsplitter=False leadingjoiner=False trailingsplitter=False for token in tokens: token=token.replace("▁","") token=token.replace("■","") if token.isalpha():ntok+=1 if token.isalpha() and token==token.lower(): ltokens+=1 elif token.isalpha() and not token==token.lower(): utokens+=1 if ntok>=5 and utokens>=ntok/2 and not segment==segment.upper(): tipus="titled" elif segment==segment.upper(): tipus="uppercased" else: tipus="regular" return(tipus) def truecase(self, line, ucf=False, restoreCase=False): if self.tokenizer: tokens=self.tokenizer.tokenize_s(line).split(" ") else: tokens=line.split(" ") nsegment=[] for token in tokens: try: leadingsplitter=False trailingsplitter=False leadingjoiner=False trailingjoiner=False if token.startswith("▁"):leadingsplitter=True if token.endswith("▁"):trailingsplitter=True if token.startswith("■"):leadingjoiner=True if token.endswith("■"):trailingjoiner=True token=token.replace("▁","") token=token.replace("■","") try: nlc=self.tc_model[token.lower()]["lc"] except: nlc=0 try: nu1=self.tc_model[token.lower()]["u1"] except: nu1=0 try: nuc=self.tc_model[token.lower()]["uc"] except: nuc=0 proceed=False if not token==token.lower(): proceed=True if restoreCase: proceed=True if proceed: if nlc>0 and nlc>=nu1 and nlc>=nuc: token=token.lower() elif nu1>0 and nu1>nlc and nu1>nuc: token=token.lower().capitalize() elif nuc>0 and nuc>nlc and nuc>nu1: token=token.upper() if leadingsplitter:token="▁"+token if trailingsplitter:token=token+"▁" if leadingjoiner:token="■"+token if trailingjoiner:token=token+"■" nsegment.append(token) except: print("ERROR",sys.exc_info()) nsegment.append(token) if self.tokenizer: nsegment=self.tokenizer.detokenize_s(" ".join(nsegment)) else: nsegment=" ".join(nsegment) if ucf: if self.isinitsymbol(nsegment[0]):firstchar=1 else: firstchar=0 try: if firstchar==0: nsegment=nsegment[firstchar].upper()+"".join(nsegment[1:]) elif firstchar==1: nsegment=nsegment[0]+nsegment[firstchar].upper()+"".join(nsegment[2:]) except: pass return(nsegment) def detruecase_old(self,line,tokenizer): tokens=line.split(" ") new=[] yet=False if tokenizer: tokens=tokenizer.tokenize_j(line).split(" ") else: tokens=line.split(" ") for token in tokens: if not yet and token.isalpha(): yet=True new.append(token[0].upper()+token[1:]) else: new.append(token) line=" ".join(new) detrue=tokenizer.detokenize_j(line) return(line) def detruecase(self,line): detruecased=line.capitalize() return(line) if __name__ == "__main__": parser = argparse.ArgumentParser(description='MTUOC program for truecasing.') parser.add_argument('-m','--model', action="store", dest="model", help='The truecasing model to use.',required=True) parser.add_argument('-t','--tokenizer', action="store", dest="tokenizer", help='The tokenizer to used',required=False) parser.add_argument('-u','--ucf', action="store_true", dest="ucf", help='Set if you want first word capitalized',required=False) parser.add_argument('-r','--restore', action="store_true", dest="restore", help='Set if you want to restore case (uppercase lower cased)',required=False) parser.add_argument('--mtuoc','--MTUOC', action="store", dest="MTUOC", help='The path to the MTUOC components',required=False) args = parser.parse_args() model=args.model ucf=args.ucf restore=args.restore if args.MTUOC: MTUOCPath=args.MTUOC else: MTUOCPath="" truecaser=Truecaser(MTUOCPath, args.tokenizer, args.model) for line in sys.stdin: line=line.strip() tcline=truecaser.truecase(line, ucf, restore) print(tcline)