File size: 297 Bytes
0379fdb
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
import re


def clean_text(text):
    """Put spaces around parentheses and collapse whitespace in *text*.

    The input is split into tokens, where a token is either a single
    ``(`` or ``)`` or a maximal run of characters that are neither
    whitespace nor parentheses. The tokens are then joined with single
    spaces, so every parenthesis ends up as its own space-delimited token
    and any run of whitespace (including leading/trailing whitespace and
    newlines) is reduced to one space or dropped.

    Per the original note in this file, the purpose is to ensure
    parentheses are properly separated by the spaCy tokenizer for the
    CNN/DailyMail dataset.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; an empty string when *text* has no
        non-whitespace characters.
    """
    # Escaped so the characters are taken literally inside the pattern.
    split_punct = re.escape("()")
    # First alternative: a run of "ordinary" characters; second alternative:
    # exactly one parenthesis, so "((" yields two separate tokens.
    tokens = re.findall(rf"[^\s{split_punct}]+|[{split_punct}]", text)
    return " ".join(tokens)