File size: 1,051 Bytes
8cb156b
 
 
 
 
26d80b9
8cb156b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
import re
import random
import string
import nltk

# Fetch the Punkt tokenizer data; presumably this is what nltk.sent_tokenize
# (called in parse_sentences) relies on. The call sits at module level, so it
# runs every time this module is imported.
# NOTE(review): recent NLTK releases appear to load the 'punkt_tab' resource
# instead of 'punkt' -- confirm against the installed NLTK version.
nltk.download('punkt')

def replacement1(review, regex_list):
    """Mask every regex match in ``review`` with a random placeholder.

    The patterns in ``regex_list`` are applied in order, case-insensitively,
    each one to the text produced by the previous patterns. Every non-empty
    match is swapped for a fresh 10-character alphanumeric token, and the
    token is recorded so the original text can be put back afterwards.

    Args:
        review: Text to mask.
        regex_list: Iterable of regular-expression pattern strings.

    Returns:
        A ``(masked_review, replaced_dict)`` tuple. ``replaced_dict`` maps
        each placeholder token to the exact text it replaced, in the order
        the replacements were made.
    """
    alphabet = string.ascii_letters + string.digits
    replaced_dict = {}

    def _mask(match):
        # Always work with the whole match. ``re.findall`` returns only the
        # capture-group contents when a pattern has groups; for an optional
        # group that can be '' and ``str.replace('', token)`` would then
        # inject the token between every character of the review.
        original = match.group(0)
        if not original:
            # Zero-width match: nothing to protect, leave the text alone.
            return original
        token = ''.join(random.choices(alphabet, k=10))
        replaced_dict[token] = original
        return token

    for regex in regex_list:
        review = re.sub(regex, _mask, review, flags=re.IGNORECASE)
    return review, replaced_dict

def replacement2(sentences, replaced_dict):
    """Put the original text back in place of placeholder tokens.

    Args:
        sentences: List of sentence strings; it is updated in place.
        replaced_dict: Mapping of placeholder token -> original text, in
            the order the placeholders were created.

    Returns:
        The same ``sentences`` list, with every placeholder replaced by the
        text it stands for.
    """
    # Undo the newest replacement first. A later placeholder may stand for
    # text that itself contains an earlier placeholder (e.g. a quoted span
    # wrapping an already-masked abbreviation); restoring in creation order
    # would look for the inner token before it has reappeared and leave it
    # in the output.
    newest_first = list(replaced_dict.items())[::-1]
    for i, sentence in enumerate(sentences):
        for randomized, original in newest_first:
            sentence = sentence.replace(randomized, original)
        sentences[i] = sentence
    return sentences

def parse_sentences(review):
    """Split ``review`` into sentences without breaking on abbreviations.

    Abbreviations and quoted spans are masked with ``replacement1`` so the
    periods inside them are hidden, the masked text is split with
    ``nltk.sent_tokenize``, and ``replacement2`` then restores the masked
    text in every sentence.

    Args:
        review: Text to split.

    Returns:
        List of sentence strings with the original text restored.
    """
    # Dots are escaped and patterns anchored with \b so they match only the
    # abbreviation itself: an unescaped 'Sec.' also matches "Sec " (which
    # pre-empted the numbered-section pattern) and 'fig.' swallowed the
    # sentence-ending period of words such as "config.".
    # No capture groups are used, so the whole match is what gets masked
    # regardless of how the matches are collected.
    regex_list = [
        r'\bet al\.',
        r'".*?"',
        r"'.*?'",
        r'\be\.g\.',
        r'\bSec\.',
        r'\bSec \d+(?:\.\d+)?\.',
        r'\bw\.r\.t\.',
        # NOTE(review): the original pattern was 'e.q', which looks like a
        # typo for "eq." (equation) -- confirm the intended abbreviation.
        r'\beq\.',
        r'\bfig\.',
    ]
    masked, replaced_dict = replacement1(review, regex_list)
    sentences = nltk.sent_tokenize(masked)
    return replacement2(sentences, replaced_dict)