|
|
|
|
|
|
|
__author__ = "Daulet N." |
|
__email__ = "daulet.nurmanbetov@gmail.com" |
|
|
|
def prepare_unpunct_text(text):
    """
    Given a text, normalizes it to subsequently restore punctuation.

    The text is lower-cased and split into words on any run of whitespace
    (spaces, tabs, newlines). Every word has its leading and trailing
    non-alphanumeric characters removed with ``strip_punct``; words that
    end up empty are dropped and the rest are joined by single spaces.

    :param text: raw input string
    :return: the normalized, lower-cased, punctuation-stripped string
    """
    # Split on whitespace rather than deleting "\n": removing newlines
    # outright would glue the last word of a line to the first word of
    # the next one (e.g. "hello\nworld" -> "helloworld").
    stripped_words = (strip_punct(token) for token in text.lower().split())
    return " ".join(word for word in stripped_words if word)
|
|
|
def strip_punct(wrd):
    """
    Trim a word down to the span between its first and last alphanumeric
    characters.

    Non-alphanumeric characters inside the word are kept. A falsy input is
    returned unchanged, and a word with no alphanumeric characters at all
    comes back empty.
    """
    if not wrd:
        return wrd

    first, last = 0, len(wrd)

    # Walk the right edge inwards past trailing non-alphanumeric characters.
    # One-character slices are used instead of indexing so that the check
    # works on any sliceable input exactly like a slice of the word itself.
    while last > first and not wrd[last - 1:last].isalnum():
        last -= 1

    # Then walk the left edge inwards past leading non-alphanumeric ones.
    while first < last and not wrd[first:first + 1].isalnum():
        first += 1

    return wrd[first:last]
|
|