Spaces:

User1342
/

Pinpoint-Web

Sleeping

Pinpoint-Web / Pinpoint_Internal /Aggregator_NGram.py

James Stevenson

added lib

246df79 over 2 years ago

3.34 kB

	from sklearn.feature_extraction.text import CountVectorizer

	from Pinpoint_Internal.Logger import *

	# Module-level vectorizer shared by every n_gram_aggregator instance; it is re-fitted on each
	# _get_all_ngrams() call. ngram_range=(1, 5) requests n-grams of length 1 to 5, although
	# _get_all_ngrams() only keeps those of length 1, 2 and 3.
	# NOTE(review): an upper bound of 3 looks sufficient for this file - confirm that no other
	# module relies on c_vec before narrowing the range.
	c_vec = CountVectorizer(ngram_range=(1, 5))


	class n_gram_aggregator:
	    """
	    Retrieves the most common n-grams (uni-, bi- and tri-grams) for a given dataset corpus.

	    "Most common" means an occurrence count that is greater than or equal to the average
	    count of all n-grams of the same length.
	    """

	    def _get_average_ngram_count(self, n_grams_dict):
	        """
	        Takes a dict of n-grams and computes the average occurrence count.

	        :param n_grams_dict: mapping of n-gram text to its occurrence count
	        :return: the mean of all counts, or 0 when the dict is empty
	        """
	        # An empty dict is a normal situation (e.g. a one-word text has no bi-grams),
	        # so report an average of 0 rather than dividing by zero.
	        if not n_grams_dict:
	            return 0

	        return sum(n_grams_dict.values()) / len(n_grams_dict)

	    def _get_all_ngrams(self, data):
	        """
	        Returns all uni-, bi- and tri-grams for a given piece of text.

	        :param data: a single document (str or bytes) or an iterable of documents
	        :return: tuple ``(uni_grams, bi_grams, tri_grams)``; each is a dict mapping the
	            n-gram text to its total count, inserted in descending (count, text) order
	        """
	        # fit_transform() needs an iterable of documents, so a lone document is wrapped.
	        # Lists, tuples and other iterables of documents are passed through untouched.
	        if isinstance(data, (str, bytes)):
	            data = [data]

	        ngrams = c_vec.fit_transform(data)

	        # vocabulary_ (n-gram text -> column index) only exists after fit_transform().
	        vocab = c_vec.vocabulary_

	        # Total occurrences of every n-gram, summed over all documents.
	        count_values = ngrams.toarray().sum(axis=0)

	        # One bucket per n-gram length that is kept; longer n-grams are discarded.
	        buckets = {1: {}, 2: {}, 3: {}}

	        ranked = sorted(
	            ((count_values[index], text) for text, index in vocab.items()),
	            reverse=True,
	        )
	        for ng_count, ng_text in ranked:
	            bucket = buckets.get(len(ng_text.split(" ")))
	            if bucket is not None:
	                bucket[ng_text] = ng_count

	        return buckets[1], buckets[2], buckets[3]

	    def _get_popular_ngrams(self, ngrams_dict):
	        """
	        Returns the n-grams that are the most popular, i.e. whose count is greater than
	        or equal to the average count of the supplied n-grams.

	        :param ngrams_dict: mapping of n-gram text to its occurrence count
	        :return: dict with the popular entries only, in the same order as the input;
	            an empty dict when the input is empty
	        """
	        average_count = self._get_average_ngram_count(ngrams_dict)

	        return {
	            n_gram: ng_count
	            for n_gram, ng_count in ngrams_dict.items()
	            if ng_count >= average_count
	        }

	    def get_ngrams(self, data=None, file_name_to_read=None):
	        """
	        Wrapper function returning the uni-, bi- and tri-grams that are the most popular
	        (at or above the average count) in a given piece of text.

	        :param data: the text, or iterable of texts, to analyse; takes precedence over
	            ``file_name_to_read`` when both are supplied
	        :param file_name_to_read: path of a text file to read when ``data`` is None
	        :return: tuple of three lists ``(popular_uni_grams, popular_bi_grams,
	            popular_tri_grams)`` holding the n-gram texts
	        :raises ValueError: when neither ``data`` nor ``file_name_to_read`` is supplied
	        """
	        logger().print_message("Getting Ngrams")

	        if data is None:
	            if file_name_to_read is None:
	                raise ValueError("No data supplied to retrieve n_grams")

	            with open(file_name_to_read, 'r') as file_to_read:
	                data = file_to_read.read()

	        uni_grams, bi_grams, tri_grams = self._get_all_ngrams(data)

	        popular_uni_grams = list(self._get_popular_ngrams(uni_grams))
	        popular_bi_grams = list(self._get_popular_ngrams(bi_grams))
	        popular_tri_grams = list(self._get_popular_ngrams(tri_grams))

	        return popular_uni_grams, popular_bi_grams, popular_tri_grams