Spaces:

text-peccavi
/

TEXT_PECCAVI

Sleeping

App Files Files Community

TEXT_PECCAVI / utils / old / non_melting_points_v1.py

text-peccavi

Upload 54 files

c8b9192 verified 7 months ago

raw

history blame contribute delete

8.62 kB

	import nltk
	from nltk.corpus import stopwords
	from nltk.util import ngrams
	from collections import Counter
	import re

	class NgramProcessor:
	    """
	    Find word n-grams shared by every sentence in a group.

	    Sentences are lowercased, split into ``\\w+`` tokens and stripped of
	    NLTK English stopwords before n-grams are compared. All matching is done
	    on whole tokens, so "row" is never considered part of "brown".
	    """

	    def __init__(self):
	        # Download the stopword corpus only when it is not available locally.
	        try:
	            nltk.data.find('corpora/stopwords')
	        except LookupError:
	            nltk.download('stopwords')

	        self.stop_words = set(stopwords.words('english'))

	    @staticmethod
	    def _search_phrase(phrase, text):
	        """
	        Search for phrase in text without matching inside a longer word

	        Args:
	            phrase (str): Text to look for
	            text (str): Text to search in

	        Returns:
	            re.Match or None: First match, or None when there is none
	        """
	        # A boundary is only required on an edge of the phrase that is itself
	        # a word character; an empty phrase therefore matches at offset 0,
	        # exactly like ``'' in text`` / ``text.find('')``.
	        prefix = r'(?<!\w)' if re.match(r'\w', phrase) else ''
	        suffix = r'(?!\w)' if re.search(r'\w\Z', phrase) else ''
	        return re.search(prefix + re.escape(phrase) + suffix, text)

	    @staticmethod
	    def _token_runs(tokens, target):
	        """
	        Locate every occurrence of the token list target inside tokens

	        Args:
	            tokens (list): Token list to scan
	            target (list): Consecutive tokens to look for

	        Returns:
	            list: Start positions in tokens; empty when target is empty
	        """
	        width = len(target)
	        if width == 0:
	            return []
	        return [
	            start for start in range(len(tokens) - width + 1)
	            if tokens[start:start + width] == target
	        ]

	    def _content_tokens(self, text):
	        """
	        Tokenize text and remember where each kept token came from

	        Args:
	            text (str): Input text

	        Returns:
	            list: Tuples (token, word_index, char_offset) for every lowercased
	            ``\\w+`` token that is not a stopword. word_index is the position
	            of the whitespace-separated word holding the token and
	            char_offset is where the token starts in text.
	        """
	        located = []
	        for word_index, word in enumerate(re.finditer(r'\S+', text)):
	            for piece in re.finditer(r'\w+', word.group()):
	                token = piece.group().lower()
	                if token not in self.stop_words:
	                    located.append(
	                        (token, word_index, word.start() + piece.start())
	                    )
	        return located

	    def remove_stopwords(self, text):
	        """
	        Remove stopwords using NLTK's stopword list

	        Args:
	            text (str): Input text

	        Returns:
	            str: Lowercased ``\\w+`` tokens of text that are not stopwords,
	            joined by single spaces
	        """
	        words = re.findall(r'\w+', text.lower())
	        return ' '.join(word for word in words if word not in self.stop_words)

	    def is_exact_match(self, ngram, sentences):
	        """
	        Check if the given n-gram occurs, on word boundaries, in all sentences

	        Args:
	            ngram (str): The n-gram to search for
	            sentences (list): List of sentences to search in

	        Returns:
	            bool: True if every sentence contains the n-gram as whole words
	            (case-sensitive), False otherwise
	        """
	        return all(
	            self._search_phrase(ngram, sentence) is not None
	            for sentence in sentences
	        )

	    def is_substring_of_any(self, ngram, common_ngrams):
	        """
	        Check if the given n-gram is contained, as whole words, in any other
	        previously found common n-gram

	        Args:
	            ngram (str): The n-gram to check
	            common_ngrams (list): List of previously found common n-grams

	        Returns:
	            bool: True if ngram is part of a different entry of
	            common_ngrams, False otherwise
	        """
	        return any(
	            self._search_phrase(ngram, other_ngram) is not None
	            for other_ngram in common_ngrams
	            if ngram != other_ngram
	        )

	    def find_filtered_ngrams(self, sentences):
	        """
	        Find all n-grams (length 4 down to 1) of the first sentence that occur
	        in every sentence, excluding those that are part of larger common
	        n-grams

	        Args:
	            sentences (list): List of sentences to analyze

	        Returns:
	            list: List of tuples (ngram, indices). indices holds one list per
	            sentence with an inclusive (start, end) pair of whitespace-word
	            positions in the original sentence for every occurrence. The span
	            can be wider than the n-gram when stopwords sit between its
	            tokens. An empty input yields an empty list.
	        """
	        if not sentences:
	            return []

	        located = [self._content_tokens(sentence) for sentence in sentences]
	        token_lists = [[token for token, _, _ in entry] for entry in located]
	        reference = token_lists[0]
	        common_ngrams = []

	        # Longest first, so shorter n-grams can be dropped when they are
	        # already covered by a longer common one.
	        for n in (4, 3, 2, 1):  # Quadgram, trigram, bigram, unigram
	            seen = set()
	            for start in range(len(reference) - n + 1):
	                candidate = reference[start:start + n]
	                key = tuple(candidate)
	                if key in seen:
	                    continue
	                seen.add(key)

	                runs = [self._token_runs(tokens, candidate) for tokens in token_lists]
	                if not all(runs):
	                    continue  # missing from at least one sentence

	                ngram_str = ' '.join(candidate)
	                if self.is_substring_of_any(ngram_str, [found for found, _ in common_ngrams]):
	                    continue

	                # Translate token positions back to word positions in the
	                # untouched sentence.
	                indices = [
	                    [(entry[hit][1], entry[hit + n - 1][1]) for hit in hits]
	                    for entry, hits in zip(located, runs)
	                ]
	                common_ngrams.append((ngram_str, indices))

	        return common_ngrams

	    def find_relative_order(self, sentence, common_ngrams):
	        """
	        Find the relative order of the common n-grams in the sentence

	        Args:
	            sentence (str): Sentence in which to find the relative order
	            common_ngrams (list): List of (ngram, indices) tuples; only the
	                n-gram string is used

	        Returns:
	            list: Sorted list of tuples (character offset of the first
	            occurrence, n-gram); n-grams not present are left out
	        """
	        located = self._content_tokens(sentence)
	        tokens = [token for token, _, _ in located]
	        lowered = sentence.lower()

	        relative_order = []
	        for ngram, _ in common_ngrams:
	            # Preferred: match on stopword-free tokens, which is how the
	            # n-grams were built (ignores case, punctuation and stopwords).
	            hits = self._token_runs(tokens, ngram.lower().split())
	            if hits:
	                relative_order.append((located[hits[0]][2], ngram))
	                continue

	            # Fallback for n-grams that themselves contain stopwords or
	            # punctuation: plain case-insensitive whole-word search.
	            found = self._search_phrase(ngram.lower(), lowered)
	            if found is not None:
	                relative_order.append((found.start(), ngram))

	        return sorted(relative_order)

	# Demo: report the n-grams shared by three sample sentences, then show the
	# order in which those n-grams appear inside each sentence.
	if __name__ == "__main__":
	    processor = NgramProcessor()

	    sentences = [
	        "The quick brown fox jumps over the lazy dog.",
	        "A quick brown dog outpaces a lazy fox.",
	        "Quick brown animals leap over lazy obstacles."
	    ]
	    common_ngrams = processor.find_filtered_ngrams(sentences)

	    print("Common n-grams and their indices:")
	    for shared, positions in common_ngrams:
	        print(f"{shared}: {positions}")

	    for text in sentences:
	        ordering = processor.find_relative_order(text, common_ngrams)
	        print(f"Relative order in sentence '{text}':", ordering)



	# import nltk
	# from nltk.corpus import stopwords
	# from nltk.util import ngrams
	# from collections import Counter
	# import re

	# class NgramProcessor:
	# def __init__(self):
	# try:
	# nltk.data.find('corpora/stopwords')
	# except LookupError:
	# nltk.download('stopwords')

	# self.stop_words = set(stopwords.words('english'))

	# def remove_stopwords(self, text):
	# """
	# Remove stopwords using NLTK's stopword list

	# Args:
	# text (str): Input text

	# Returns:
	# str: Cleaned text with stopwords removed
	# """
	# words = re.findall(r'\w+', text.lower())
	# filtered_words = [word for word in words if word not in self.stop_words]
	# return ' '.join(filtered_words)

	# def is_exact_match(self, ngram, sentences):
	# """
	# Check if the given n-gram has an exact match in all sentences

	# Args:
	# ngram (str): The n-gram to search for
	# sentences (list): List of sentences to search in

	# Returns:
	# bool: True if n-gram has exact match in all sentences, False otherwise
	# """
	# return all(ngram in sentence for sentence in sentences)

	# def is_substring_of_any(self, ngram, common_ngrams):
	# """
	# Check if the given n-gram is an exact substring of any previously found common n-grams

	# Args:
	# ngram (str): The n-gram to check
	# common_ngrams (list): List of previously found common n-grams

	# Returns:
	# bool: True if ngram is a substring of any common_ngrams, False otherwise
	# """
	# return any(ngram in other_ngram for other_ngram in common_ngrams if ngram != other_ngram)

	# def find_filtered_ngrams(self, sentences):
	# """
	# Find all n-grams that have exact matches across all sentences,
	# excluding those that are part of larger common n-grams

	# Args:
	# sentences (list): List of sentences to analyze

	# Returns:
	# list: List of all common n-grams in order of their appearance in the first sentence
	# """
	# sentences = [self.remove_stopwords(sentence) for sentence in sentences]
	# ngram_lengths = [4, 3, 2, 1] # Quadgram, trigram, bigram, unigram
	# common_ngrams = []

	# for n in ngram_lengths:
	# ngrams_list = [list(ngrams(sentence.split(), n)) for sentence in sentences]
	# ngrams_counter = Counter(ngrams_list[0])

	# for ngram in ngrams_counter:
	# ngram_str = ' '.join(ngram)
	# if self.is_exact_match(ngram_str, sentences) and not self.is_substring_of_any(ngram_str, common_ngrams):
	# common_ngrams.append(ngram_str)

	# return common_ngrams

	# def find_relative_order(self, sentence, common_ngrams):
	# """
	# Find the relative order of the common n-grams in the sentence

	# Args:
	# sentence (str): Sentence in which to find the relative order
	# common_ngrams (list): List of common n-grams

	# Returns:
	# list: List of tuples with the relative position and the n-gram
	# """
	# relative_order = []
	# for ngram in common_ngrams:
	# index = sentence.find(ngram)
	# if index != -1:
	# relative_order.append((index, ngram))

	# return sorted(relative_order)

	# # Example usage
	# if __name__ == "__main__":
	# sentences = [
	# "The quick brown fox jumps over the lazy dog.",
	# "A quick brown dog outpaces a lazy fox.",
	# "Quick brown animals leap over lazy obstacles."
	# ]

	# processor = NgramProcessor()
	# common_ngrams = processor.find_filtered_ngrams(sentences)
	# print("Common n-grams:", common_ngrams)

	# for sentence in sentences:
	# relative_order = processor.find_relative_order(sentence, common_ngrams)
	# print(f"Relative order in sentence '{sentence}':", relative_order)