Spaces:
Sleeping
Sleeping
import re | |
import random | |
import string | |
import nltk | |
nltk.download('punkt') | |
def replacement1(review, regex_list):
    """Mask every regex match in *review* with a random placeholder token.

    Each pattern in *regex_list* is applied in order, case-insensitively.
    Every non-empty match is replaced by a fresh 10-character alphanumeric
    token so the masked text can be processed without the matched spans
    getting in the way; the mapping needed to undo the masking is returned
    alongside the masked text.

    Args:
        review: Text to mask.
        regex_list: Iterable of regular-expression pattern strings.

    Returns:
        A ``(masked_review, replaced_dict)`` tuple where ``replaced_dict``
        maps each placeholder token to the exact text it replaced, in the
        order the placeholders were created.
    """
    replaced_dict = {}
    alphabet = string.ascii_letters + string.digits

    def _mask(match):
        # Always mask the WHOLE match. re.findall() would hand back only the
        # capture group for patterns that contain one, which may be an empty
        # string (and str.replace('', token) inserts the token between every
        # character) or a tuple when there are several groups.
        original = match.group(0)
        if not original:
            # A zero-width match has nothing to protect.
            return original
        placeholder = ''.join(random.choices(alphabet, k=10))
        # Extremely unlikely, but never reuse a token: the mapping must stay
        # one-to-one for the masking to be reversible.
        while placeholder in replaced_dict:
            placeholder = ''.join(random.choices(alphabet, k=10))
        replaced_dict[placeholder] = original
        return placeholder

    for regex in regex_list:
        review = re.sub(regex, _mask, review, flags=re.IGNORECASE)
    return review, replaced_dict
def replacement2(sentences, replaced_dict):
    """Put the original text back in place of placeholder tokens.

    Args:
        sentences: List of strings that may contain placeholder tokens.
            The list is updated in place.
        replaced_dict: Mapping of placeholder token -> original text, in the
            order the placeholders were created.

    Returns:
        The same *sentences* list object, with every placeholder replaced by
        its original text.
    """
    # Undo the masking in reverse creation order. A later placeholder can
    # stand for text that itself contains an earlier placeholder (for
    # example an abbreviation that was masked before the quotation around
    # it); restoring oldest-first would leave that inner token behind.
    restore_order = list(replaced_dict.items())
    restore_order.reverse()

    for index, sentence in enumerate(sentences):
        for randomized, original in restore_order:
            sentence = sentence.replace(randomized, original)
        sentences[index] = sentence
    return sentences
def parse_sentences(review):
    """Split *review* into sentences without breaking on abbreviations.

    Quoted spans and a fixed set of abbreviations are temporarily swapped
    for placeholder tokens (``replacement1``), the masked text is split with
    ``nltk.sent_tokenize``, and the placeholders are then swapped back
    (``replacement2``).

    Args:
        review: The text to split.

    Returns:
        The list of sentence strings produced by ``nltk.sent_tokenize`` with
        the masked text restored.
    """
    regex_list = [
        # Quoted spans go first so an abbreviation inside a quotation is
        # masked together with the quotation instead of producing a
        # placeholder nested inside another placeholder's original text.
        r'"(.*?)"',
        # Only treat an apostrophe as an opening quote when it does not
        # follow a word character, and as a closing quote when it is not
        # followed by one; otherwise contractions ("don't ... isn't") would
        # mask everything between them, sentence boundaries included.
        r"(?<!\w)'(.*?)'(?!\w)",
        # Dots are escaped so they match a literal period only: an unescaped
        # 'e.g.' also matches "egg." and would swallow a sentence-ending
        # period, and 'Sec.' would match "Sec " and shadow the numbered
        # section pattern below. \b keeps matches from starting mid-word.
        r'\bet al\.',
        r'\be\.g\.',
        r'\bSec\.',
        # Non-capturing group: the whole "Sec 3.1." is the match.
        r'\bSec \d+(?:\.\d+)?\.',
        r'\bw\.r\.t\.',
        # NOTE(review): kept as the literal text "e.q"; possibly meant
        # "eq." or "e.g." - confirm the intended abbreviation.
        r'\be\.q',
        r'\bfig\.',
    ]
    review, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(review)
    return replacement2(sentences, replaced_dict)