micahg's picture
Initial file upload
609216a
raw
history blame
1.21 kB
# -*- coding: utf-8 -*-
import pkg_resources
import unicodecsv as csv
class PuncNorm(object):
    """Character-level punctuation normalizer backed by a packaged CSV table.

    The table is read from ``data/puncnorm.csv`` once, when the object is
    constructed, and stored in ``self.puncnorm``. Iterating the object
    yields the table's keys; indexing it returns the replacement for a key.
    """

    def __init__(self):
        """Build the normalizer by loading its replacement table."""
        self.puncnorm = self._load_punc_norm_map()

    def _load_punc_norm_map(self):
        """Read the punctuation table that ships alongside this module.

        Returns:
            dict: maps the first column of each row to its second column,
                for every row after the first one.
        """
        csv_path = pkg_resources.resource_filename(__name__, 'data/puncnorm.csv')
        # The reader is handed the encoding itself, so the file is opened
        # in binary mode rather than text mode.
        with open(csv_path, 'rb') as handle:
            rows = csv.reader(handle, encoding='utf-8',
                              delimiter=str(','), quotechar=str('"'))
            # The first row is discarded (presumably a header line --
            # confirm against the data file).
            next(rows)
            table = {}
            for source, target in rows:
                table[source] = target
        return table

    def norm(self, text):
        """Substitute every character of ``text`` that has a table entry.

        Args:
            text (unicode): string whose punctuation should be normalized

        Returns:
            unicode: ``text`` with each mapped character replaced by its
                table value and every other character left unchanged
        """
        table = self.puncnorm
        # Characters without an entry fall back to themselves.
        return ''.join(table.get(ch, ch) for ch in text)

    def __iter__(self):
        """Iterate over the keys of the replacement table."""
        return iter(self.puncnorm)

    def __getitem__(self, key):
        """Return the replacement stored for ``key`` in the table."""
        return self.puncnorm[key]