tubifier / myrpunct /utils.py
wldmr's picture
app file
837fdb6
raw
history blame
890 Bytes
# -*- coding: utf-8 -*-
# 💾⚙️🔮
__author__ = "Daulet N."
__email__ = "daulet.nurmanbetov@gmail.com"
def prepare_unpunct_text(text):
    """
    Given a text, normalizes it to subsequently restore punctuation.

    The text is lowercased and split into words on any run of whitespace.
    Each word is passed through ``strip_punct`` to drop the
    non-alphanumeric characters around it, words that end up empty are
    discarded, and the remaining words are joined with single spaces.

    :param text: raw input string
    :return: lowercased, punctuation-stripped, single-space separated text
    """
    # split() with no argument treats spaces, tabs and line breaks alike.
    # Deleting "\n" outright (as was done before) glued the last word of a
    # line to the first word of the next one, e.g. "hello\nworld" became
    # "helloworld".
    words = text.lower().split()
    stripped_words = (strip_punct(word) for word in words)
    # Tokens made only of punctuation collapse to "" and are dropped here.
    return " ".join(word for word in stripped_words if word)
def strip_punct(wrd):
    """
    Given a word, strips non-alphanumeric characters that precede and follow it.

    Characters inside the word are left untouched, so ``"don't"`` stays
    ``"don't"`` while ``"(hello),"`` becomes ``"hello"``. A word with no
    alphanumeric character at all is reduced to an empty value.

    :param wrd: the word to clean; falsy values are returned unchanged
    :return: ``wrd`` without its leading/trailing non-alphanumeric characters
    """
    if not wrd:
        return wrd

    start, end = 0, len(wrd)
    # Move the bounds inwards and slice once at the end, instead of copying
    # the whole word for every character that gets removed.
    # One-element slices (rather than indexing) keep this working for any
    # sliceable value whose slices provide isalnum().
    while end > start and not wrd[end - 1:end].isalnum():
        end -= 1
    while start < end and not wrd[start:start + 1].isalnum():
        start += 1
    return wrd[start:end]