Spaces:
Sleeping
Sleeping
# -*- coding: utf-8 -*- | |
import pkg_resources | |
import unicodecsv as csv | |
class PuncNorm(object):
    """Map punctuation characters to their normalized replacements.

    The mapping is read from the ``data/puncnorm.csv`` resource that ships
    alongside this module and is stored in ``self.puncnorm`` as a dict keyed
    by the original character. Iterating over an instance yields those keys,
    and indexing an instance returns the replacement for a key.
    """

    def __init__(self):
        """Construct a punctuation normalization object."""
        self.puncnorm = self._load_punc_norm_map()

    def _load_punc_norm_map(self):
        """Load the map table for normalizing 'down' punctuation.

        Returns:
            dict: maps each entry of the first CSV column to the matching
                entry of the second column.

        Raises:
            ValueError: if a data row does not contain exactly two fields.
        """
        path = pkg_resources.resource_filename(__name__, 'data/puncnorm.csv')
        # The file is opened in binary mode because the reader (unicodecsv,
        # imported as ``csv``) is handed the encoding and decodes itself.
        with open(path, 'rb') as f:
            reader = csv.reader(f, encoding='utf-8',
                                delimiter=str(','), quotechar=str('"'))
            # Skip the header row; the default keeps an empty file from
            # surfacing as a bare StopIteration.
            next(reader, None)
            return {punc: replacement for (punc, replacement) in reader}

    def norm(self, text):
        """Apply punctuation normalization to a string of text.

        Args:
            text (unicode): text whose punctuation should be normalized.

        Returns:
            unicode: `text` with every character found in the map replaced
                by its mapped value; all other characters are kept as-is.
        """
        # Bind the lookup once; falling back to the character itself leaves
        # unmapped characters unchanged.
        lookup = self.puncnorm.get
        return ''.join(lookup(c, c) for c in text)

    def __iter__(self):
        """Iterate over the characters that have a normalized form."""
        return iter(self.puncnorm)

    def __getitem__(self, key):
        """Return the normalized form of `key`; raise KeyError if unmapped."""
        return self.puncnorm[key]