Spaces:
Sleeping
Sleeping
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.util import ngrams | |
| from collections import Counter | |
| import re | |
class NgramProcessor:
    """Find content-word n-grams that are shared by every sentence in a list.

    Sentences are lowercased, split into ``\\w+`` tokens and stripped of
    English stopwords before comparison. All matching is done on whole
    tokens, so "fox" never matches inside "foxes".
    """

    def __init__(self):
        """Load the NLTK English stopword list, downloading it if missing."""
        try:
            nltk.data.find('corpora/stopwords')
        except LookupError:
            nltk.download('stopwords')
        self.stop_words = set(stopwords.words('english'))

    def remove_stopwords(self, text):
        """
        Remove stopwords using NLTK's stopword list
        Args:
            text (str): Input text
        Returns:
            str: Lowercased ``\\w+`` tokens of ``text`` that are not stopwords,
                joined by single spaces (punctuation is dropped)
        """
        words = re.findall(r'\w+', text.lower())
        filtered_words = [word for word in words if word not in self.stop_words]
        return ' '.join(filtered_words)

    @staticmethod
    def _window_starts(tokens, pattern):
        """Return every index where ``pattern`` occurs as a contiguous run of ``tokens``.

        Both arguments are lists of strings. An empty ``pattern`` matches at
        every position, mirroring how the empty string is found in any string.
        """
        width = len(pattern)
        return [
            start
            for start in range(len(tokens) - width + 1)
            if tokens[start:start + width] == pattern
        ]

    def _content_words(self, sentence):
        """Tokenise ``sentence`` and remember where each content token came from.

        Returns:
            tuple: ``(tokens, word_positions)`` - two parallel lists. ``tokens``
                holds the lowercased non-stopword tokens in order;
                ``word_positions[i]`` is the index, within ``sentence.split()``,
                of the whitespace-delimited word that produced ``tokens[i]``.
        """
        tokens = []
        word_positions = []
        for position, word in enumerate(sentence.split()):
            # A single whitespace word can yield several tokens ("don't" ->
            # "don", "t") or none at all ("--"); all map to the same position.
            for token in re.findall(r'\w+', word.lower()):
                if token not in self.stop_words:
                    tokens.append(token)
                    word_positions.append(position)
        return tokens, word_positions

    def is_exact_match(self, ngram, sentences):
        """
        Check if the given n-gram occurs, token for token, in all sentences
        Args:
            ngram (str): The n-gram to search for (tokens separated by whitespace)
            sentences (list): List of sentences to search in
        Returns:
            bool: True if n-gram has exact match in all sentences, False otherwise
        """
        pattern = ngram.split()
        # Compare token sequences rather than raw substrings so that a word
        # is not "found" inside a longer word.
        return all(
            self._window_starts(sentence.split(), pattern)
            for sentence in sentences
        )

    def is_substring_of_any(self, ngram, common_ngrams):
        """
        Check if the given n-gram is contained, token for token, in any
        previously found common n-gram other than itself
        Args:
            ngram (str): The n-gram to check
            common_ngrams (list): List of previously found common n-grams
        Returns:
            bool: True if ngram is part of any of common_ngrams, False otherwise
        """
        pattern = ngram.split()
        return any(
            self._window_starts(other_ngram.split(), pattern)
            for other_ngram in common_ngrams
            if ngram != other_ngram
        )

    def find_filtered_ngrams(self, sentences):
        """
        Find all n-grams (length 4 down to 1) that have exact matches across
        all sentences, excluding those that are part of larger common n-grams
        Args:
            sentences (list): List of sentences to analyze
        Returns:
            list: List of ``(ngram, indices)`` tuples. ``indices`` has one entry
                per input sentence: a list of ``(start, end)`` pairs giving the
                inclusive positions, within ``sentence.split()``, of the first
                and last word of each occurrence. Punctuation attached to a
                word and stopwords lying between the n-gram's words do not
                prevent a match. An empty input list yields an empty result.
        """
        if not sentences:
            return []

        tokenised = [self._content_words(sentence) for sentence in sentences]
        cleaned = [' '.join(tokens) for tokens, _ in tokenised]
        first_tokens = tokenised[0][0]

        common_ngrams = []
        # Longest first, so shorter n-grams can be discarded when they are
        # already covered by a longer common n-gram.
        for n in (4, 3, 2, 1):  # Quadgram, trigram, bigram, unigram
            # Candidates come from the first sentence only: an n-gram common
            # to all sentences necessarily occurs there. dict.fromkeys drops
            # repeats while keeping first-appearance order.
            candidates = dict.fromkeys(
                tuple(first_tokens[i:i + n])
                for i in range(len(first_tokens) - n + 1)
            )
            for candidate in candidates:
                ngram_str = ' '.join(candidate)
                if not self.is_exact_match(ngram_str, cleaned):
                    continue
                already_found = [found for found, _ in common_ngrams]
                if self.is_substring_of_any(ngram_str, already_found):
                    continue

                pattern = list(candidate)
                indices = [
                    [
                        (positions[start], positions[start + n - 1])
                        for start in self._window_starts(tokens, pattern)
                    ]
                    for tokens, positions in tokenised
                ]
                common_ngrams.append((ngram_str, indices))
        return common_ngrams

    def find_relative_order(self, sentence, common_ngrams):
        """
        Find the relative order of the common n-grams in the sentence
        Args:
            sentence (str): Sentence in which to find the relative order
            common_ngrams (list): List of ``(ngram, indices)`` tuples as
                returned by ``find_filtered_ngrams``
        Returns:
            list: Sorted list of ``(offset, ngram)`` tuples, where ``offset`` is
                the character offset of the first occurrence of the n-gram's
                first word. N-grams that do not occur are omitted.
        """
        # Tokenise exactly like remove_stopwords, but keep character offsets.
        # Offsets refer to the lowercased sentence, which lines up with the
        # original except for the rare characters whose lowercase form has a
        # different length.
        tokens = []
        offsets = []
        for match in re.finditer(r'\w+', sentence.lower()):
            if match.group() not in self.stop_words:
                tokens.append(match.group())
                offsets.append(match.start())

        relative_order = []
        for ngram, _ in common_ngrams:
            pattern = ngram.split()
            if not pattern:
                continue  # an empty n-gram has no position to report
            starts = self._window_starts(tokens, pattern)
            if starts:
                relative_order.append((offsets[starts[0]], ngram))
        return sorted(relative_order)
# Example usage: report the n-grams shared by three sample sentences, then
# show the order in which those n-grams appear in each sentence.
if __name__ == "__main__":
    demo_sentences = [
        "The quick brown fox jumps over the lazy dog.",
        "A quick brown dog outpaces a lazy fox.",
        "Quick brown animals leap over lazy obstacles.",
    ]

    ngram_processor = NgramProcessor()
    shared_ngrams = ngram_processor.find_filtered_ngrams(demo_sentences)

    print("Common n-grams and their indices:")
    for entry in shared_ngrams:
        ngram, indices = entry
        print(f"{ngram}: {indices}")

    for sentence in demo_sentences:
        print(
            f"Relative order in sentence '{sentence}':",
            ngram_processor.find_relative_order(sentence, shared_ngrams),
        )
| # import nltk | |
| # from nltk.corpus import stopwords | |
| # from nltk.util import ngrams | |
| # from collections import Counter | |
| # import re | |
| # class NgramProcessor: | |
| # def __init__(self): | |
| # try: | |
| # nltk.data.find('corpora/stopwords') | |
| # except LookupError: | |
| # nltk.download('stopwords') | |
| # self.stop_words = set(stopwords.words('english')) | |
| # def remove_stopwords(self, text): | |
| # """ | |
| # Remove stopwords using NLTK's stopword list | |
| # Args: | |
| # text (str): Input text | |
| # Returns: | |
| # str: Cleaned text with stopwords removed | |
| # """ | |
| # words = re.findall(r'\w+', text.lower()) | |
| # filtered_words = [word for word in words if word not in self.stop_words] | |
| # return ' '.join(filtered_words) | |
| # def is_exact_match(self, ngram, sentences): | |
| # """ | |
| # Check if the given n-gram has an exact match in all sentences | |
| # Args: | |
| # ngram (str): The n-gram to search for | |
| # sentences (list): List of sentences to search in | |
| # Returns: | |
| # bool: True if n-gram has exact match in all sentences, False otherwise | |
| # """ | |
| # return all(ngram in sentence for sentence in sentences) | |
| # def is_substring_of_any(self, ngram, common_ngrams): | |
| # """ | |
| # Check if the given n-gram is an exact substring of any previously found common n-grams | |
| # Args: | |
| # ngram (str): The n-gram to check | |
| # common_ngrams (list): List of previously found common n-grams | |
| # Returns: | |
| # bool: True if ngram is a substring of any common_ngrams, False otherwise | |
| # """ | |
| # return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram) | |
| # def find_filtered_ngrams(self, sentences): | |
| # """ | |
| # Find all n-grams that have exact matches across all sentences, | |
| # excluding those that are part of larger common n-grams | |
| # Args: | |
| # sentences (list): List of sentences to analyze | |
| # Returns: | |
| # list: List of all common n-grams in order of their appearance in the first sentence | |
| # """ | |
| # sentences = [self.remove_stopwords(sentence) for sentence in sentences] | |
| # ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram | |
| # common_ngrams = [] | |
| # for n in ngram_lengths: | |
| # ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences] | |
| # ngrams_counter = Counter(ngrams_list[0]) | |
| # for ngram in ngrams_counter: | |
| # ngram_str = ' '.join(ngram) | |
| # if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams): | |
| # common_ngrams.append(ngram_str) | |
| # return common_ngrams | |
| # def find_relative_order(self, sentence, common_ngrams): | |
| # """ | |
| # Find the relative order of the common n-grams in the sentence | |
| # Args: | |
| # sentence (str): Sentence in which to find the relative order | |
| # common_ngrams (list): List of common n-grams | |
| # Returns: | |
| # list: List of tuples with the relative position and the n-gram | |
| # """ | |
| # relative_order = [] | |
| # for ngram in common_ngrams: | |
| # index = sentence.find(ngram) | |
| # if index != -1: | |
| # relative_order.append((index, ngram)) | |
| # return sorted(relative_order) | |
| # # Example usage | |
| # if __name__ == "__main__": | |
| # sentences = [ | |
| # "The quick brown fox jumps over the lazy dog.", | |
| # "A quick brown dog outpaces a lazy fox.", | |
| # "Quick brown animals leap over lazy obstacles." | |
| # ] | |
| # processor = NgramProcessor() | |
| # common_ngrams = processor.find_filtered_ngrams(sentences) | |
| # print("Common n-grams:", common_ngrams) | |
| # for sentence in sentences: | |
| # relative_order = processor.find_relative_order(sentence, common_ngrams) | |
| # print(f"Relative order in sentence '{sentence}':", relative_order) | |