File size: 1,467 Bytes
7baf701
 
 
 
 
 
 
 
 
 
3b4471f
 
7baf701
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d814758
7baf701
 
d814758
 
3b4471f
d814758
 
 
81f410f
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
import re
from nltk.corpus import stopwords

def find_common_subsequences(sentence, str_list, max_n=5):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    The sentence and every candidate string are normalized first: lower-cased,
    stripped of every character that is neither a word character nor
    whitespace, and filtered of NLTK English stop words. N-grams are then
    taken from the normalized sentence, longest first, and kept when they
    appear (on whole-word boundaries) in all normalized candidate strings.
    An n-gram that is already covered, as a whole-word sub-phrase, by a
    previously accepted phrase is skipped.

    Args:
        sentence: The reference text (str) whose n-grams are searched for.
        str_list: Iterable of strings that must each contain an n-gram for
            it to be reported.
        max_n: Largest n-gram length (in words) to consider. Defaults to 5.
            Values below 1 yield an empty result.

    Returns:
        A list of ``(rank, phrase)`` tuples. Phrases are ordered by their
        first position in the normalized sentence and ``rank`` counts from 1.
        The list is empty when ``str_list`` is empty or nothing is shared.
    """
    stop_words = set(stopwords.words('english'))

    def normalize(text):
        """Lower-case, drop punctuation and stop words, join with single spaces."""
        text = re.sub(r'[^\w\s]', '', text.lower())
        return " ".join(word for word in text.split() if word not in stop_words)

    def contains_phrase(text, phrase):
        """Return True if ``phrase`` occurs in ``text`` on whole-word boundaries.

        Both arguments are normalized (word characters separated by single
        spaces), so padding with spaces turns a plain substring test into a
        whole-word test.
        """
        return (" " + phrase + " ") in (" " + text + " ")

    words = normalize(sentence).split()
    targets = [normalize(s) for s in str_list]

    # With nothing to compare against, all() would be vacuously true and every
    # window of the sentence would be reported as "common".
    if not targets:
        return []

    found = []          # (start index in ``words``, phrase)
    accepted = set()    # phrases already reported, used to skip sub-phrases

    # Longest n-grams first so shorter ones nested inside them can be skipped.
    for n in range(min(max_n, len(words)), 0, -1):
        for start in range(len(words) - n + 1):
            phrase = " ".join(words[start:start + n])
            # Whole-word containment: "cat" must not be hidden by "concatenate".
            if any(contains_phrase(longer, phrase) for longer in accepted):
                continue
            if all(contains_phrase(target, phrase) for target in targets):
                found.append((start, phrase))
                accepted.add(phrase)

    # Order by first appearance in the normalized sentence (stable sort).
    found.sort(key=lambda item: item[0])

    # Replace the positional index with a 1-based rank.
    return [(rank, phrase) for rank, (_, phrase) in enumerate(found, start=1)]