File size: 890 Bytes
837fdb6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 |
# -*- coding: utf-8 -*-
# 💾⚙️🔮
__author__ = "Daulet N."
__email__ = "daulet.nurmanbetov@gmail.com"
def prepare_unpunct_text(text):
    """Normalize raw text so that punctuation can subsequently be restored.

    The text is lower-cased and split into whitespace-separated words; each
    word then has its leading and trailing non-alphanumeric characters
    removed with ``strip_punct``. Words that become empty (for example a
    stand-alone ``"--"``) are dropped.

    Args:
        text: The raw input string.

    Returns:
        The normalized words joined by single spaces.
    """
    # Split on any run of whitespace so that line breaks, tabs and repeated
    # spaces all act as word separators. Simply deleting newlines would glue
    # the last word of one line to the first word of the next
    # ("hello\nworld" -> "helloworld").
    stripped_words = (strip_punct(word) for word in text.lower().split())
    return " ".join(word for word in stripped_words if word)
def strip_punct(wrd):
    """Strip non-alphanumeric characters that precede and follow a word.

    Characters inside the word are left untouched, so ``"don't"`` keeps its
    apostrophe while ``"(hello)!"`` becomes ``"hello"``.

    Args:
        wrd: The word to clean. Falsy values (``""``, ``None``) are returned
            unchanged.

    Returns:
        The word without leading/trailing non-alphanumeric characters; an
        empty string when the word contains no alphanumeric character.
    """
    if not wrd:
        return wrd

    # Move two indices inward and slice once at the end instead of
    # re-slicing the string for every stripped character.
    start, end = 0, len(wrd)
    while start < end and not wrd[start].isalnum():
        start += 1
    while end > start and not wrd[end - 1].isalnum():
        end -= 1
    return wrd[start:end]
|