import config import sys import re import string as stringmodule import srx_segmenter from MTUOC_misc import printLOG from MTUOC_misc import capitalizeMTUOC if config.MTUOCServer_MTengine=="GoogleTranslate": from MTUOC_GoogleTranslate import Google_translate if config.MTUOCServer_MTengine=="DeepL": from MTUOC_DeepL import DeepL_translate if config.MTUOCServer_MTengine=="Lucy": from MTUOC_Lucy import Lucy_translate from MTUOC_Marian import translate_segment_Marian from MTUOC_Moses import translate_segment_Moses from MTUOC_preprocess import preprocess_segment from MTUOC_preprocess import postprocess_segment from MTUOC_preprocess import tokenizationSL from MTUOC_preprocess import tokenizationTL from MTUOC_preprocess import detokenizationSL from MTUOC_preprocess import detokenizationTL def segmenta(cadena): segmenter = srx_segmenter.SrxSegmenter(config.rules[config.SRXlang],cadena) segments=segmenter.extract() resposta=[] return(segments) def is_first_letter_upper(segment): for character in segment: if character.isalpha() and character.isupper(): return(True) elif character.isalpha() and character.islower(): return(False) return(False) def upper_case_first_letter(segment): pos=0 for character in segment: if character.isalpha() and character.islower(): llista=list(segment) llista[pos]=llista[pos].upper() segment="".join(llista) return(segment) elif character.isalpha() and character.isupper(): return(segment) pos+=1 return(segment) ###URLs EMAILs def findEMAILs(string): email=re.findall('\S+@\S+', string) email2=[] for em in email: if em[-1] in stringmodule.punctuation: em=em[0:-1] email2.append(em) return email2 def findURLs(string): regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))" url = re.findall(regex,string) return [x[0] for x in url] def replace_EMAILs(string,code="@EMAIL@"): EMAILs=findEMAILs(string) for EMAIL in EMAILs: string=string.replace(EMAIL,code) return(string) def replace_URLs(string,code="@URL@"): URLs=findURLs(string) for URL in URLs: string=string.replace(URL,code) return(string) re_num = re.compile(r'[\d,\.]+') def replace_NUMs(segment,code="@NUM@"): trobatsEXPRNUM=re.finditer(re_num,segment) for trobat in trobatsEXPRNUM: if not trobat.group(0) in [".",","]: segment=segment.replace(trobat.group(0),code,1) return(segment) def splitnumbers(segment,joiner=""): joiner=joiner+" " xifres = re.findall(re_num,segment) equil={} for xifra in xifres: xifrastr=str(xifra) xifrasplit=xifra.split() xifra2=joiner.join(xifra) segment=segment.replace(xifra,xifra2) if xifra2.find(" ")>-1: equil[xifra2]=xifra return(segment,equil) def desplitnumbers(segment,equil): for xifra2 in equil: segment=segment.replace(xifra2,equil[xifra2]) return(segment) def restore_EMAILs(stringA,stringB,code="@EMAIL@"): EMAILs=findEMAILs(stringA) for email in EMAILs: stringB=stringB.replace(code,email,1) return(stringB) def restore_URLs(stringA,stringB,code="@URL@"): URLs=findURLs(stringA) for url in URLs: stringB=stringB.replace(code,url,1) return(stringB) def restore_NUMs(segmentSL,segmentTL,code="@NUM@"): trobatsEXPRNUM=re.finditer(re_num,segmentSL) position=0 for trobat in trobatsEXPRNUM: if not trobat.group(0) in [".",","]: segmentTL=segmentTL.replace(code,trobat.group(0),1) return(segmentTL) def translate_para(paragraph): if config.segment_input: (segments,separators)=segmenta(paragraph) translations=[] for segment in segments: translation=translate_segment(segment) if config.fix_xml: translation=config.tagrestorer.fix_xml_tags(translation) translations.append(translation) resultat=[] for i in range(0,len(separators)): resultat.append(separators[i]) try: resultat.append(translations[i]) except: pass translation="".join(resultat) else: translation=translate_segment(paragraph) return(translation) def restore_tags_translation_candidates(translation_candidates): hastags=config.tagrestorer.has_tags(translation_candidates["segmentTAGS"]) if hastags: (translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"]) printLOG(3,"replace_tags",translation_candidates["segmentTAGS"]) printLOG(3,"equil",equil) (translation_candidates["segmentTAGS"],tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(translation_candidates["segmentOrig"]) printLOG(3,"remove_start_end_tag",translation_candidates["segmentTAGS"]) printLOG(3,"TAG initial:",tagInici) printLOG(3,"TAG final:",tagFinal) translation_candidates["segmentNOTAGS"]=config.tagrestorer.remove_tags(translation_candidates["segmentTAGS"]) originaltags=config.tagrestorer.get_tags(translation_candidates["segmentTAGS"]) segmentNOTAGSTOK=tokenizationSL(translation_candidates["segmentNOTAGS"]) segmentTAGSTOK=tokenizationSL(translation_candidates["segmentPreTAGS"]) translation_candidates["translationTAGS"]=[None] * len(translation_candidates["translationNOTAGSPre"]) for i in range(0,len(translation_candidates["translationNOTAGSPre"])): try: if hastags and config.tag_restoration: try: alignment=translation_candidates["alignments"][i] translationNOTAGSTOK=tokenizationTL(translation_candidates["translationNOTAGSPre"][i]) translation_candidates["translationTAGS"][i]=config.tagrestorer.restore_tags(segmentNOTAGSTOK, segmentTAGSTOK, alignment, translationNOTAGSTOK) ''' if tagInici: translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i] if tagFinal: translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal printLOG(3,"SELECTED TRANSLATION SIMPLE TAGS",translation_candidates["translationTAGS"][i]) for t in equil: translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1) ''' except: printLOG(3,"ERROR restoring tags:",sys.exc_info()) translation_candidates["translationTAGS"][i]=translationNOTAGSTOK else: translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"] printLOG(3,"translationTAGS:",translation_candidates["translationTAGS"][i]) except: pass else: translation_candidates["segmentNOTAGS"]=translation_candidates["segmentTAGS"] translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"] return(translation_candidates) def translate_segment(segment): printLOG(3,"translate_segment",segment) if config.MTUOCServer_MTengine=="GoogleTranslate": translation=Google_translate(segment) return(translation) elif config.MTUOCServer_MTengine=="DeepL": translation=DeepL_translate(segment) return(translation) elif config.MTUOCServer_MTengine=="Lucy": translation=Lucy_translate(segment) return(translation) segmentOrig=segment if not config.change_input_files[0]=="None": printLOG(3,"CHANGES INPUT:") printLOG(3,"ORIGINAL:",segmentOrig) for change in config.changes_input: tofind=change[0] tochange=change[1] regexp="\\b"+tofind+"\\b" trobat=re.findall(regexp,segment) if trobat: segment=re.sub(regexp, tochange, segment) printLOG(3,tofind,tochange) printLOG(3,"CHANGED:",segment) hastags=config.tagrestorer.has_tags(segment) originaltags=config.tagrestorer.get_tags(segment) printLOG(3,"hastags",hastags) printLOG(3,"originaltags",originaltags) #truecasing totruecase=False toupperfinal=False if not config.truecase==None and config.truecase=="all": totruecase=True segmentnotags=config.tagrestorer.remove_tags(segment) if not config.truecase==None and config.truecase in ["upper","all"] and segmentnotags.isupper() and not segment=="@URL@" and not segment=="@EMAIL@": totruecase=True toupperfinal=True if config.checkistranslatable: segmentNOTAGS=replace_URLs(segment,config.code_URLs) segmentNOTAGS=replace_EMAILs(segment,config.code_EMAILs) tokens=tokenizationSL(segmentNOTAGS) if not is_translatable(tokens): return(segment) if totruecase: segment=config.truecaser.truecase(segment) if config.tokenizerSL: segment=config.tokenizerSL.tokenize(segment) if hastags: segmentTAGS=segment (segmentTAGS,equil)=config.tagrestorer.replace_tags(segmentTAGS) printLOG(3,"segmentTAGS:",segmentTAGS) printLOG(3,"equil:",equil) printLOG(3,"segmentTAGS:",segmentTAGS) (segmentTAGS,tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(segmentTAGS) printLOG(3,"TAG initial:",tagInici) printLOG(3,"TAG final:",tagFinal) segmentNOTAGS=config.tagrestorer.remove_tags(segment) else: segmentTAGS=segment segmentNOTAGS=segment if len(segmentNOTAGS)