# # Copyright (c) 2013-present, Anoop Kunchukuttan # All rights reserved. # # This source code is licensed under the MIT license found in the # LICENSE file in the root directory of this source tree. # #Program for detokenizing Indian language input # # @author Anoop Kunchukuttan # """ De-tokenizer for Indian languages. """ import string, re, sys from indicnlp.common import IndicNlpException ## detokenizer patterns left_attach=r'!%)\]},.:;>?\u0964\u0965' pat_la=re.compile(r'[ ](['+left_attach+r'])') right_attach=r'#$(\[{<@' pat_ra=re.compile(r'(['+right_attach+r'])[ ]') lr_attach=r'-/\\' pat_lra=re.compile(r'[ ](['+lr_attach+r'])[ ]') #donknow=u'&*+=^_|~' ## date, numbers, section/article numbering ## TODO: handle indic numbers pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+') ### e-mail address #pat_num=re.compile(ur'[a-zA-Z]+[ ]? def trivial_detokenize_indic(text): """detokenize string for Indian language scripts using Brahmi-derived scripts A trivial detokenizer which: - decides whether punctuation attaches to left/right or both - handles number sequences - handles quotes smartly (deciding left or right attachment) Args: text (str): tokenized text to process Returns: str: detokenized string """ s=text ### some normalizations #numbers and dates new_s='' prev=0 for m in pat_num_seq.finditer(s): start=m.start() end=m.end() if start>prev: new_s=new_s+s[prev:start] new_s=new_s+s[start:end].replace(' ','') prev=end new_s=new_s+s[prev:] s=new_s ### consective single quotes or backslashes become double quotes #s=s.replace("' '", "''") #s=s.replace("` `", '``') s=pat_lra.sub('\\1',s) s=pat_la.sub('\\1',s) s=pat_ra.sub('\\1',s) # assumes well formedness of quotes and alternates between right and left attach alt_attach='\'"`' for punc in alt_attach: cnt=0 out_str=[] for c in s: if c == punc: if cnt%2==0: out_str.append('@RA') else: out_str.append('@LA') cnt+=1 else: out_str.append(c) s=''.join(out_str).replace('@RA ',punc).replace(' @LA',punc ).replace('@RA',punc).replace('@LA',punc) return s def trivial_detokenize(text,lang='hi'): """detokenize string for languages of the Indian subcontinent A trivial detokenizer which: - decides whether punctuation attaches to left/right or both - handles number sequences - handles quotes smartly (deciding left or right attachment) Args: text (str): tokenized text to process Returns: str: detokenized string Raises: IndicNlpException: If language is not supported """ if lang=='ur': raise IndicNlpException('No detokenizer available for Urdu') else: return trivial_detokenize_indic(text) # if __name__ == '__main__': # if len(sys.argv)<4: # print("Usage: python indic_detokenize.py ") # sys.exit(1) # with open(sys.argv[1],'r', encoding='utf-8') as ifile: # with open(sys.argv[2],'w', encoding='utf-8') as ofile: # for line in ifile: # detokenized_line=trivial_detokenize(line,sys.argv[3]) # ofile.write(detokenized_line)