File size: 890 Bytes
837fdb6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
# -*- coding: utf-8 -*-
# 💾⚙️🔮

# Module authorship metadata.
__author__ = "Daulet N."
__email__ = "daulet.nurmanbetov@gmail.com"

def prepare_unpunct_text(text):
    """
    Given a text, normalizes it to subsequently restore punctuation.

    The text is lowercased and split into words on any run of whitespace
    (spaces, tabs, newlines). Each word is passed through ``strip_punct`` to
    drop the non-alphanumeric characters that precede and follow it; words
    that end up empty are discarded and the rest are joined with single
    spaces.

    :param text: raw input string
    :return: lowercased, punctuation-stripped, single-space-separated string
    """
    # str.split() with no separator treats newlines and tabs as word
    # boundaries, so the last word of one line is not glued to the first
    # word of the next (deleting '\n' outright used to merge them).
    words = text.lower().split()
    stripped_words = (strip_punct(word) for word in words)
    return " ".join(word for word in stripped_words if word)

def strip_punct(wrd):
    """
    Given a word, strips the non-alphanumeric characters that precede and
    follow it. Characters inside the word are left untouched; a falsy input
    is returned unchanged.
    """
    if not wrd:
        return wrd

    start, end = 0, len(wrd)

    # Pull the right edge inwards past trailing non-alphanumeric characters.
    while end > start and not wrd[end - 1:end].isalnum():
        end -= 1

    # Push the left edge forwards past leading non-alphanumeric characters.
    while start < end and not wrd[start:start + 1].isalnum():
        start += 1

    return wrd[start:end]