|
import re |
|
from nltk.corpus import stopwords |
|
|
|
def find_common_subsequences(sentence, str_list, *, max_n=5, stop_words=None):
    """Find word n-grams of ``sentence`` that occur in every string of ``str_list``.

    All text is lower-cased, stripped of every character that is neither a
    word character nor whitespace, and filtered of stop words before matching.
    N-grams are then tried from longest (``max_n`` words) down to single
    words; an n-gram is reported when it appears, on word boundaries, in every
    cleaned string of ``str_list`` and is not already contained (again on word
    boundaries) in a phrase that was reported earlier.

    Args:
        sentence: The text whose n-grams are looked up.
        str_list: Strings that must all contain a reported n-gram.
        max_n: Largest n-gram length, in words, to consider. Values below 1
            yield an empty result.
        stop_words: Optional iterable of words to drop before matching. When
            ``None`` the NLTK English stop-word list is used.

    Returns:
        A list of ``(rank, phrase)`` tuples ordered by the position at which
        the phrase starts in the cleaned sentence; ``rank`` counts from 1.
        The list is empty when ``str_list`` is empty or when nothing is left
        of ``sentence`` after cleaning.
    """
    if stop_words is None:
        stop_words = stopwords.words('english')

    non_word_chars = re.compile(r'[^\w\s]')

    def strip_special_chars(text):
        # Lower-case first, then drop punctuation -- same order for all inputs.
        return non_word_chars.sub('', text.lower())

    # Stop words go through the same clean-up as the text. Otherwise entries
    # such as "don't" could never match the stripped token "dont".
    stop_set = {strip_special_chars(word) for word in stop_words}

    def normalize(text):
        kept = [w for w in strip_special_chars(text).split() if w not in stop_set]
        return " ".join(kept)

    words = normalize(sentence).split()
    targets = [normalize(s) for s in str_list]

    # With no comparison strings, all() would be vacuously true and every
    # n-gram of the sentence would be reported as "common".
    if not words or not targets:
        return []

    found = []  # (start index in `words`, phrase), in discovery order
    for n in range(min(max_n, len(words)), 0, -1):
        for start in range(len(words) - n + 1):
            phrase = " ".join(words[start:start + n])
            pattern = re.compile(r'\b' + re.escape(phrase) + r'\b')

            # Skip phrases already covered by a reported one. Matching on word
            # boundaries keeps "cat" distinct from "category theory".
            if any(pattern.search(kept) for _, kept in found):
                continue

            if all(pattern.search(target) for target in targets):
                found.append((start, phrase))

    found.sort(key=lambda item: item[0])
    return [(rank, phrase) for rank, (_, phrase) in enumerate(found, start=1)]
|
|
|
|