harveen
Adding code
9bbf386
raw history blame
No virus
3.53 kB
#
# Copyright (c) 2013-present, Anoop Kunchukuttan
# All rights reserved.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
#
#Program for detokenizing Indian language input
#
# @author Anoop Kunchukuttan
#
"""
De-tokenizer for Indian languages.
"""
import string, re, sys
from indicnlp.common import IndicNlpException
## detokenizer patterns

# Punctuation that attaches to the token on its left: the single space in
# front of it is dropped. \u0964 and \u0965 (danda, double danda) are written
# as escapes that the `re` module resolves inside the character class.
left_attach=r'!%)\]},.:;>?\u0964\u0965'
pat_la=re.compile(r'[ ](['+left_attach+r'])')

# Punctuation that attaches to the token on its right: the single space
# following it is dropped.
right_attach=r'#$(\[{<@'
pat_ra=re.compile(r'(['+right_attach+r'])[ ]')

# Punctuation that attaches on both sides: the spaces around it are dropped.
lr_attach=r'-/\\'
pat_lra=re.compile(r'[ ](['+lr_attach+r'])[ ]')

# No attachment rule has been decided for these characters; no pattern uses them.
#donknow=u'&*+=^_|~'

## date, numbers, section/article numbering
## TODO: handle indic numbers
pat_num_seq=re.compile(r'([0-9]+ [,.:/] )+[0-9]+')

### e-mail address
## TODO: e-mail addresses are not handled; no pattern is defined for them yet.


def _attach_quotes(s, quote):
    """Remove the spaces inside alternating open/close occurrences of *quote*.

    Occurrences of *quote* are counted from the start of *s*: the 1st, 3rd, ...
    are treated as opening quotes (one space directly after them is dropped)
    and the 2nd, 4th, ... as closing quotes (one space directly before them is
    dropped). The text is edited directly rather than through placeholder
    markers, so no substring of the input can be mistaken for a marker.

    Args:
        s (str): text to process
        quote (str): the single quote character to handle

    Returns:
        str: *s* with the quote occurrences attached
    """
    out = []
    seen = 0
    drop_next_space = False
    for ch in s:
        if ch == quote:
            if seen % 2 == 0:
                # opening quote: swallow the space that follows, if any
                drop_next_space = True
            else:
                # closing quote: discard the space just emitted, if any
                if out and out[-1] == ' ':
                    out.pop()
                drop_next_space = False
            seen += 1
            out.append(ch)
        elif drop_next_space and ch == ' ':
            # the one space right after an opening quote
            drop_next_space = False
        else:
            drop_next_space = False
            out.append(ch)
    return ''.join(out)


def trivial_detokenize_indic(text):
    """detokenize string for Indian language scripts using Brahmi-derived scripts

    A trivial detokenizer which:

     - decides whether punctuation attaches to left/right or both
     - handles number sequences
     - handles quotes smartly (deciding left or right attachment)

    Quote handling assumes the quotes are well formed: for each of the
    characters ' " and ` the occurrences are taken to alternate between an
    opening quote and a closing quote.

    Args:
        text (str): tokenized text to process

    Returns:
        str: detokenized string
    """

    ### some normalizations

    # numbers and dates: drop the spaces inside sequences such as "12 / 01 / 2020"
    s = pat_num_seq.sub(lambda m: m.group(0).replace(' ', ''), text)

    ### consecutive single quotes or backslashes become double quotes
    #s=s.replace("' '", "''")
    #s=s.replace("` `", '``')

    # order matters: two-sided attachment first, then left, then right
    s = pat_lra.sub(r'\1', s)
    s = pat_la.sub(r'\1', s)
    s = pat_ra.sub(r'\1', s)

    # assumes well formedness of quotes and alternates between right and left attach
    alt_attach = '\'"`'
    for punc in alt_attach:
        s = _attach_quotes(s, punc)

    return s
def trivial_detokenize(text,lang='hi'):
    """Detokenize text written in a language of the Indian subcontinent.

    Every language code except 'ur' is handed to the Indic detokenizer, which:

     - attaches punctuation to the left, to the right, or on both sides
     - joins number sequences
     - attaches quotes, alternating between right and left attachment

    Args:
        text (str): tokenized text to process
        lang (str): language code of the text (default 'hi'); 'ur' is rejected

    Returns:
        str: detokenized string

    Raises:
        IndicNlpException: If language is not supported
    """
    # Guard clause: anything that is not Urdu goes through the Indic detokenizer.
    if lang != 'ur':
        return trivial_detokenize_indic(text)

    raise IndicNlpException('No detokenizer available for Urdu')
# if __name__ == '__main__':
# if len(sys.argv)<4:
# print("Usage: python indic_detokenize.py <infile> <outfile> <language>")
# sys.exit(1)
# with open(sys.argv[1],'r', encoding='utf-8') as ifile:
# with open(sys.argv[2],'w', encoding='utf-8') as ofile:
# for line in ifile:
# detokenized_line=trivial_detokenize(line,sys.argv[3])
# ofile.write(detokenized_line)