MTUOC-paracrawl6.0-spa-eng / MTUOC_translate.py
aoliverg's picture
Upload 39 files
9fa4f9e verified
raw
history blame contribute delete
No virus
18.2 kB
import config
import sys
import re
import string as stringmodule
import srx_segmenter
from MTUOC_misc import printLOG
from MTUOC_misc import capitalizeMTUOC
if config.MTUOCServer_MTengine=="GoogleTranslate":
from MTUOC_GoogleTranslate import Google_translate
if config.MTUOCServer_MTengine=="DeepL":
from MTUOC_DeepL import DeepL_translate
if config.MTUOCServer_MTengine=="Lucy":
from MTUOC_Lucy import Lucy_translate
from MTUOC_Marian import translate_segment_Marian
from MTUOC_Moses import translate_segment_Moses
from MTUOC_preprocess import preprocess_segment
from MTUOC_preprocess import postprocess_segment
from MTUOC_preprocess import tokenizationSL
from MTUOC_preprocess import tokenizationTL
from MTUOC_preprocess import detokenizationSL
from MTUOC_preprocess import detokenizationTL
def segmenta(cadena):
segmenter = srx_segmenter.SrxSegmenter(config.rules[config.SRXlang],cadena)
segments=segmenter.extract()
resposta=[]
return(segments)
def is_first_letter_upper(segment):
for character in segment:
if character.isalpha() and character.isupper():
return(True)
elif character.isalpha() and character.islower():
return(False)
return(False)
def upper_case_first_letter(segment):
pos=0
for character in segment:
if character.isalpha() and character.islower():
llista=list(segment)
llista[pos]=llista[pos].upper()
segment="".join(llista)
return(segment)
elif character.isalpha() and character.isupper():
return(segment)
pos+=1
return(segment)
###URLs EMAILs
def findEMAILs(string):
email=re.findall('\S+@\S+', string)
email2=[]
for em in email:
if em[-1] in stringmodule.punctuation: em=em[0:-1]
email2.append(em)
return email2
def findURLs(string):
regex = r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
url = re.findall(regex,string)
return [x[0] for x in url]
def replace_EMAILs(string,code="@EMAIL@"):
EMAILs=findEMAILs(string)
for EMAIL in EMAILs:
string=string.replace(EMAIL,code)
return(string)
def replace_URLs(string,code="@URL@"):
URLs=findURLs(string)
for URL in URLs:
string=string.replace(URL,code)
return(string)
re_num = re.compile(r'[\d,\.]+')
def replace_NUMs(segment,code="@NUM@"):
trobatsEXPRNUM=re.finditer(re_num,segment)
for trobat in trobatsEXPRNUM:
if not trobat.group(0) in [".",","]:
segment=segment.replace(trobat.group(0),code,1)
return(segment)
def splitnumbers(segment,joiner=""):
joiner=joiner+" "
xifres = re.findall(re_num,segment)
equil={}
for xifra in xifres:
xifrastr=str(xifra)
xifrasplit=xifra.split()
xifra2=joiner.join(xifra)
segment=segment.replace(xifra,xifra2)
if xifra2.find(" ")>-1:
equil[xifra2]=xifra
return(segment,equil)
def desplitnumbers(segment,equil):
for xifra2 in equil:
segment=segment.replace(xifra2,equil[xifra2])
return(segment)
def restore_EMAILs(stringA,stringB,code="@EMAIL@"):
EMAILs=findEMAILs(stringA)
for email in EMAILs:
stringB=stringB.replace(code,email,1)
return(stringB)
def restore_URLs(stringA,stringB,code="@URL@"):
URLs=findURLs(stringA)
for url in URLs:
stringB=stringB.replace(code,url,1)
return(stringB)
def restore_NUMs(segmentSL,segmentTL,code="@NUM@"):
trobatsEXPRNUM=re.finditer(re_num,segmentSL)
position=0
for trobat in trobatsEXPRNUM:
if not trobat.group(0) in [".",","]:
segmentTL=segmentTL.replace(code,trobat.group(0),1)
return(segmentTL)
def translate_para(paragraph):
if config.segment_input:
(segments,separators)=segmenta(paragraph)
translations=[]
for segment in segments:
translation=translate_segment(segment)
if config.fix_xml:
translation=config.tagrestorer.fix_xml_tags(translation)
translations.append(translation)
resultat=[]
for i in range(0,len(separators)):
resultat.append(separators[i])
try:
resultat.append(translations[i])
except:
pass
translation="".join(resultat)
else:
translation=translate_segment(paragraph)
return(translation)
def restore_tags_translation_candidates(translation_candidates):
hastags=config.tagrestorer.has_tags(translation_candidates["segmentTAGS"])
if hastags:
(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
printLOG(3,"replace_tags",translation_candidates["segmentTAGS"])
printLOG(3,"equil",equil)
(translation_candidates["segmentTAGS"],tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(translation_candidates["segmentOrig"])
printLOG(3,"remove_start_end_tag",translation_candidates["segmentTAGS"])
printLOG(3,"TAG initial:",tagInici)
printLOG(3,"TAG final:",tagFinal)
translation_candidates["segmentNOTAGS"]=config.tagrestorer.remove_tags(translation_candidates["segmentTAGS"])
originaltags=config.tagrestorer.get_tags(translation_candidates["segmentTAGS"])
segmentNOTAGSTOK=tokenizationSL(translation_candidates["segmentNOTAGS"])
segmentTAGSTOK=tokenizationSL(translation_candidates["segmentPreTAGS"])
translation_candidates["translationTAGS"]=[None] * len(translation_candidates["translationNOTAGSPre"])
for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
try:
if hastags and config.tag_restoration:
try:
alignment=translation_candidates["alignments"][i]
translationNOTAGSTOK=tokenizationTL(translation_candidates["translationNOTAGSPre"][i])
translation_candidates["translationTAGS"][i]=config.tagrestorer.restore_tags(segmentNOTAGSTOK, segmentTAGSTOK, alignment, translationNOTAGSTOK)
'''
if tagInici:
translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
if tagFinal:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
printLOG(3,"SELECTED TRANSLATION SIMPLE TAGS",translation_candidates["translationTAGS"][i])
for t in equil:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
'''
except:
printLOG(3,"ERROR restoring tags:",sys.exc_info())
translation_candidates["translationTAGS"][i]=translationNOTAGSTOK
else:
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
printLOG(3,"translationTAGS:",translation_candidates["translationTAGS"][i])
except:
pass
else:
translation_candidates["segmentNOTAGS"]=translation_candidates["segmentTAGS"]
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
return(translation_candidates)
def translate_segment(segment):
printLOG(3,"translate_segment",segment)
if config.MTUOCServer_MTengine=="GoogleTranslate":
translation=Google_translate(segment)
return(translation)
elif config.MTUOCServer_MTengine=="DeepL":
translation=DeepL_translate(segment)
return(translation)
elif config.MTUOCServer_MTengine=="Lucy":
translation=Lucy_translate(segment)
return(translation)
segmentOrig=segment
if not config.change_input_files[0]=="None":
printLOG(3,"CHANGES INPUT:")
printLOG(3,"ORIGINAL:",segmentOrig)
for change in config.changes_input:
tofind=change[0]
tochange=change[1]
regexp="\\b"+tofind+"\\b"
trobat=re.findall(regexp,segment)
if trobat:
segment=re.sub(regexp, tochange, segment)
printLOG(3,tofind,tochange)
printLOG(3,"CHANGED:",segment)
hastags=config.tagrestorer.has_tags(segment)
originaltags=config.tagrestorer.get_tags(segment)
printLOG(3,"hastags",hastags)
printLOG(3,"originaltags",originaltags)
#truecasing
totruecase=False
toupperfinal=False
if not config.truecase==None and config.truecase=="all": totruecase=True
segmentnotags=config.tagrestorer.remove_tags(segment)
if not config.truecase==None and config.truecase in ["upper","all"] and segmentnotags.isupper() and not segment=="@URL@" and not segment=="@EMAIL@":
totruecase=True
toupperfinal=True
if config.checkistranslatable:
segmentNOTAGS=replace_URLs(segment,config.code_URLs)
segmentNOTAGS=replace_EMAILs(segment,config.code_EMAILs)
tokens=tokenizationSL(segmentNOTAGS)
if not is_translatable(tokens):
return(segment)
if totruecase:
segment=config.truecaser.truecase(segment)
if config.tokenizerSL:
segment=config.tokenizerSL.tokenize(segment)
if hastags:
segmentTAGS=segment
(segmentTAGS,equil)=config.tagrestorer.replace_tags(segmentTAGS)
printLOG(3,"segmentTAGS:",segmentTAGS)
printLOG(3,"equil:",equil)
printLOG(3,"segmentTAGS:",segmentTAGS)
(segmentTAGS,tagInici,tagFinal)=config.tagrestorer.remove_start_end_tag(segmentTAGS)
printLOG(3,"TAG initial:",tagInici)
printLOG(3,"TAG final:",tagFinal)
segmentNOTAGS=config.tagrestorer.remove_tags(segment)
else:
segmentTAGS=segment
segmentNOTAGS=segment
if len(segmentNOTAGS)<config.min_chars_segment:
return(segment)
if config.MTUOCServer_EMAILs:
segmentTAGS=replace_EMAILs(segmentTAGS)
segmentNOTAGS=replace_EMAILs(segmentNOTAGS)
printLOG(3,"Replace EMAILs:",segmentTAGS)
if config.MTUOCServer_URLs:
segmentTAGS=replace_URLs(segmentTAGS)
segmentNOTAGS=replace_URLs(segmentNOTAGS)
printLOG(3,"Replace URLs:",segmentTAGS)
if config.pre_replace_NUMs:
segmentTAGS=replace_NUMs(segmentTAGS,code=config.code_NUMs)
segmentNOTAGS=replace_NUMs(segmentNOTAGS,code=config.code_NUMs)
printLOG(3,"Replace NUMs:",segmentTAGS)
if config.pre_split_NUMs:
(segmentTAGS,equilSplitNum)=splitnumbers(segmentTAGS)
(segmentNOTAGS,equilSplitNum2)=splitnumbers(segmentNOTAGS)
printLOG(3,"Split NUMs:",segmentTAGS)
#leading and trailing spaces
leading_spaces=len(segment)-len(segment.lstrip())
trailing_spaces=len(segment)-len(segment.rstrip())-1
segmentPre=preprocess_segment(segmentNOTAGS)
segmentPreTAGS=preprocess_segment(segmentTAGS)
if config.MTUOCServer_MTengine=="Marian":
translation_candidates=translate_segment_Marian(segmentPre)
elif config.MTUOCServer_MTengine=="Moses":
translation_candidates=translate_segment_Moses(segmentPre)
translation_candidates["segment"]=segment
translation_candidates["segmentOrig"]=segmentOrig
translation_candidates["segmentTAGS"]=segmentTAGS
translation_candidates["segmentPre"]=segmentPre
translation_candidates["segmentPreTAGS"]=segmentPreTAGS
translation_candidates["segmentNOTAGS"]=segmentNOTAGS
print("*****1",translation_candidates)
if hastags:
translation_candidates=restore_tags_translation_candidates(translation_candidates)
#(translation_candidates["segmentTAGS"],equil)=config.tagrestorer.replace_tags(translation_candidates["segmentOrig"])
else:
translation_candidates["translationTAGS"]=translation_candidates["translationNOTAGSPre"]
print("*****2",translation_candidates)
for i in range(0,len(translation_candidates["translationNOTAGSPre"])):
translation_candidates["translationTAGS"][i]=postprocess_segment(translation_candidates["translationTAGS"][i])
if hastags:
if tagInici:
translation_candidates["translationTAGS"][i]=tagInici+translation_candidates["translationTAGS"][i]
if tagFinal:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i]+tagFinal
for t in equil:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(t,equil[t],1)
#if not config.truecaser==None and is_first_letter_upper(segmentOrig):
# translation_candidates["translationTAGS"][i]=upper_case_first_letter(translation_candidates["translationTAGS"][i])
print("*****3",translation_candidates)
if totruecase:
translation_candidates["translationTAGS"][i]=capitalizeMTUOC(translation_candidates["translationTAGS"][i])
if toupperfinal:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].upper()
###LOWERCASE UPPERCASED TAGS
hastagstranslation=config.tagrestorer.has_tags(translation_candidates["translationTAGS"][i])
if hastagstranslation:
translationtags=config.tagrestorer.get_tags(translation_candidates["translationTAGS"][i])
for tt in translationtags:
if not tt in originaltags and tt.lower() in originaltags:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i].replace(tt,tt.lower())
if config.MTUOCServer_EMAILs:
translation_candidates["translationTAGS"][i]=restore_EMAILs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_EMAILs)
if config.MTUOCServer_URLs:
translation_candidates["translationTAGS"][i]=restore_URLs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_URLs)
'''
#config.pre_replace_NUMs
if config.pre_replace_NUMs:
translation_candidates["translationTAGS"][i]=restore_NUMs(segmentOrig,translation_candidates["translationTAGS"][i],code=config.code_NUMs)
#config.pre_split_NUMs
if config.pre_split_NUMs:
translation_candidates["translationTAGS"][i]=desplitnumbers(translation_candidates["translationTAGS"][i],equilSplitNum)
#detruecase
if totruecase:
translation_candidates["translationTAGS"][i]=translation_candidates["translationTAGS"][i][0].upper()+translation_candidates["translationTAGS"][i][1:]
#detokenize
'''
if config.tokenizerSL and not config.tokenizerTL==None:
translation_candidates["translationTAGS"][i]=config.tokenizerTL.detokenize(translation_candidates["translationTAGS"][i])
translation_candidates["translationTAGS"][i]=config.tagrestorer.repairSpacesTags(translation_candidates["segmentOrig"],translation_candidates["translationTAGS"][i])
printLOG(3,"SELECTED TRANSLATION REAL TAGS",translation_candidates["translationTAGS"][i])
best_translation=select_best_candidate(translation_candidates,config.translation_selection_strategy)
translation=best_translation
if not config.change_output_files[0]=="None":
printLOG(3,"CHANGES OUTPUT:")
printLOG(3,"ORIGINAL:",translation)
for change in config.changes_output:
tofind=change[0]
tochange=change[1]
regexp="\\b"+tofind+"\\b"
trobat=re.findall(regexp,translation)
if trobat:
translation=re.sub(regexp, tochange, translation)
printLOG(3,tofind,tochange)
printLOG(3,"CHANGED:",translation)
if not config.change_translation_files[0]=="None":
printLOG(3,"CHANGES TRANSLATION:")
printLOG(3,"ORIGINAL SOURCE:",segmentOrig)
printLOG(3,"ORIGINAL TARGET:",translation)
for change in config.changes_translation:
tofindSOURCE=change[0]
tofindTARGET=change[1]
tochange=change[2]
regexpSOURCE="\\b"+tofindSOURCE+"\\b"
regexpTARGET="\\b"+tofindTARGET+"\\b"
trobatSOURCE=re.findall(regexpSOURCE,segmentOrig)
trobatTARGET=re.findall(regexpTARGET,translation)
if trobatSOURCE and trobatTARGET:
translation=re.sub(regexpTARGET, tochange, translation)
printLOG(3,tofindTARGET,tochange)
printLOG(3,"CHANGED TARGET:",translation)
return(translation)
def is_translatable_old(tokens):
tokens=tokens.split(" ")
translatable=False
for token in tokens:
if token.isalpha():
translatable=True
break
return(translatable)
def is_translatable(tokens):
translatable=False
for token in tokens.split():
transtoken=True
for character in token:
if str(character) in ["0","1","2","3","4","5","6","7","8","9"]:
transtoken=False
break
if transtoken:
translatable=True
return(translatable)
def select_best_candidate(translation_candidates,strategy):
'''To implement several strategies to select the best candidate. Now it r,eturns the first one.'''
if strategy=="First":
best_translation=translation_candidates["translationTAGS"][0]
return(best_translation)