Spaces:
Sleeping
Sleeping
from sklearn.feature_extraction.text import CountVectorizer | |
from Pinpoint_Internal.Logger import * | |
# Shared, module-level vectorizer configured with ngram_range=(1, 5).
# n_gram_aggregator._get_all_ngrams only keeps n-grams of one, two or three
# space-separated parts from its vocabulary.
_NGRAM_SIZE_BOUNDS = (1, 5)
c_vec = CountVectorizer(ngram_range=_NGRAM_SIZE_BOUNDS)
class n_gram_aggregator():
    """
    This class is used to retrieve the most common NGrams for a given dataset corpus.

    An n-gram is treated as "popular" when its count is greater than or equal
    to the average count of all n-grams of the same size (uni, bi or tri).
    """

    def _get_average_ngram_count(self, n_grams_dict):
        """
        Takes a dict of Ngrams and identifies the average weighting.

        :param n_grams_dict: mapping of n-gram text to its count
        :return: the mean of all counts, or 0 when the dict is empty
        """
        if not n_grams_dict:
            # A text may contain no n-grams of a given size (e.g. no tri-grams
            # in a two word text); avoid dividing by zero in that case.
            return 0
        return sum(n_grams_dict.values()) / len(n_grams_dict)

    def _get_all_ngrams(self, data):
        """
        Returns all ngrams (tri, bi, and uni) for a given piece of text.

        The module-level vectorizer ``c_vec`` is (re)fitted on ``data``; any
        n-gram it produces that is not made of one, two or three
        space-separated parts is discarded.

        :param data: a single document, or a list of documents
        :return: tuple of three dicts (uni_grams, bi_grams, tri_grams), each
            mapping n-gram text to its total count, inserted in descending
            (count, text) order
        """
        if not isinstance(data, list):
            data = [data]

        # input to fit_transform() should be an iterable with strings
        ngrams = c_vec.fit_transform(data)

        # needs to happen after fit_transform()
        vocab = c_vec.vocabulary_

        # Summing over axis 0 gives one total per vocabulary column.
        count_values = ngrams.toarray().sum(axis=0)

        # Buckets keyed by the number of space-separated parts in the n-gram.
        grams_by_size = {1: {}, 2: {}, 3: {}}

        ranked = sorted(
            ((count_values[index], text) for text, index in vocab.items()),
            reverse=True,
        )
        for ng_count, ng_text in ranked:
            bucket = grams_by_size.get(len(ng_text.split(" ")))
            if bucket is not None:
                bucket[ng_text] = ng_count

        return grams_by_size[1], grams_by_size[2], grams_by_size[3]

    def _get_popular_ngrams(self, ngrams_dict):
        """
        Returns ngrams for a given piece of text that are the most popular (i.e. their weighting is
        greater than or equal to the average ngram weighting).

        :param ngrams_dict: mapping of n-gram text to its count
        :return: dict with the popular n-grams only, in the same order as
            ``ngrams_dict``; empty when ``ngrams_dict`` is empty
        """
        average_count = self._get_average_ngram_count(ngrams_dict)
        return {
            n_gram: ng_count
            for n_gram, ng_count in ngrams_dict.items()
            if ng_count >= average_count
        }

    def get_ngrams(self, data=None, file_name_to_read=None):
        """
        Wrapper function for returning uni, bi, and tri grams that are the most popular (above the average weighting in
        a given piece of text).

        :param data: text (or list of texts) to analyse; takes precedence over
            ``file_name_to_read`` when both are supplied
        :param file_name_to_read: path of a text file that is read when
            ``data`` is None
        :return: tuple of three lists
            (popular_uni_grams, popular_bi_grams, popular_tri_grams)
        :raises ValueError: when neither ``data`` nor ``file_name_to_read``
            is supplied
        """
        logger().print_message("Getting Ngrams")

        if data is None and file_name_to_read is None:
            raise ValueError("No data supplied to retrieve n_grams")

        if data is None:
            # Only a file name was supplied, so load the text from disk.
            with open(file_name_to_read, 'r') as file_to_read:
                data = file_to_read.read()

        uni_grams, bi_grams, tri_grams = self._get_all_ngrams(data)

        popular_uni_grams = list(self._get_popular_ngrams(uni_grams))
        popular_bi_grams = list(self._get_popular_ngrams(bi_grams))
        popular_tri_grams = list(self._get_popular_ngrams(tri_grams))

        return popular_uni_grams, popular_bi_grams, popular_tri_grams