# Notebook-export artifact (not code) — commented out so the module parses:
# Spaces:
# Runtime error
# Runtime error
import numpy as np | |
import pandas as pd | |
import language_tool_python | |
import readability | |
import enchant | |
from enchant.checker import SpellChecker | |
from collections import OrderedDict | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import word_tokenize | |
class FeatureGenerator:
    '''
    Hand-crafted feature extraction for English texts.

    For every text in a dataframe's `full_text` column, produces
    readability measures (`readability` package), misspelling counts
    (pyenchant), non-common/rare word counts, profanity counts and
    LanguageTool grammar-error counts, plus per-word / per-sentence
    ratio versions of most counts.

    Expects these data files in the working directory:
    ./1-1000.txt, ./google-10000-english-no-swears.txt,
    ./profanity.txt and ./count_1w.txt (word<TAB>frequency).
    '''

    def __init__(self):
        # enchant-based spell checking (US English)
        self.spell_checker = SpellChecker('en_US')
        self.enchant_dict = enchant.Dict("en_US")
        # top-1000 common English words, one per line
        common_words_1k_filename = './1-1000.txt'
        with open(common_words_1k_filename) as f:
            self.common_words_1k = set(x.strip() for x in f.readlines())
        common_words_filename_10k = './google-10000-english-no-swears.txt'
        with open(common_words_filename_10k) as f:
            self.common_words_10k = set(x.strip() for x in f.readlines())
        # make sure common_words are a subset as well
        self.common_words_10k.update(self.common_words_1k)
        profanity_filename = './profanity.txt'
        with open(profanity_filename) as f:
            self.profanity_set = set(x.strip() for x in f.readlines())
        # word -> corpus frequency; keep_default_na=False so words like
        # "nan"/"null" stay strings instead of becoming NaN index keys
        words_freq_filename = "./count_1w.txt"
        self.words_freq = pd.read_csv(words_freq_filename,
                                      names=['word', 'freq'],
                                      sep='\t',
                                      header=None,
                                      dtype={'word': str, 'freq': int},
                                      keep_default_na=False,
                                      na_values=[''])
        self.words_freq = self.words_freq.set_index('word')
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def text_preprocess(self, text: str):
        '''
        Transform text to be processed by readability
        :param text: input text
        :return: str preprocessed text
        '''
        text = text.strip()
        # new paragraph starts with \n\n
        # readability also requires each sentence to end with \n
        paragraphs = [p.strip()
                       .replace('. ', '.\n')
                       .replace('? ', '?\n')
                       .replace('! ', '!\n') for p in text.split('\n\n')]
        return "\n\n".join(paragraphs)

    def misspelled_count(self, text: str):
        '''
        Get count of misspelled words by enchant SpellChecker
        :param text: input text
        :return: count of misspelled words in text
        '''
        self.spell_checker.set_text(text)
        # iterating the checker yields one item per spelling error
        return sum(1 for _ in self.spell_checker)

    def flatten_readability(self, r: OrderedDict):
        '''
        Flatten readability output by adding prefixes
        :param r: OrderedDict of readability output
        :return: dict mapping (possibly prefixed) metric name -> value
        '''
        prefixes = {'readability grades': '',
                    'sentence info': '',
                    'word usage': 'wu_',
                    'sentence beginnings': 'sb_'}
        out = {}
        for group_name, group in r.items():
            prefix = prefixes[group_name]
            for var_name, value in group.items():
                out[prefix + var_name] = value
        return out

    def get_noncommon_words_count(self, text, common_words_dict):
        '''
        Count tokens that are real words but not "common".

        A token counts when it:
        a. is longer than 2 characters,
        b. is not in common_words_dict,
        c. contains no underscore '_',
        d. is spelled correctly according to enchant.
        '''
        def is_noncommon_word(w):
            return len(w) > 2 and \
                   w not in common_words_dict and \
                   '_' not in w and \
                   self.enchant_dict.check(w)
        return sum(is_noncommon_word(x) for x in word_tokenize(text.lower()))

    def get_noncommon_words_count_1k(self, text):
        # non-common relative to the top-1000 word list
        return self.get_noncommon_words_count(text, self.common_words_1k)

    def get_noncommon_words_count_10k(self, text):
        # non-common relative to the top-10000 word list
        return self.get_noncommon_words_count(text, self.common_words_10k)

    def get_profanity_count(self, text):
        # number of tokens found in the profanity word list
        return sum(x in self.profanity_set for x in word_tokenize(text.lower()))

    def get_uncommon_words_counts(self, text):
        '''
        Count valid words rarer than each corpus-frequency threshold.
        :param text: input text
        :return: np.ndarray of 4 counts — words with frequency below
                 1e8, 1e7, 1e6 and 1e5 respectively
        '''
        word_freq_thresholds = np.array([1e8, 1e7, 1e6, 1e5], dtype=int)
        counts = np.zeros(len(word_freq_thresholds), dtype=int)
        for w in word_tokenize(text):
            # skip short tokens, tokens with '_', misspellings and words
            # missing from the frequency table
            if len(w) < 3 or '_' in w or not self.enchant_dict.check(w) \
                    or w not in self.words_freq.index:
                continue
            w_freq = self.words_freq.loc[w].values[0]
            # a word counts toward every threshold it is rarer than
            counts += (word_freq_thresholds > w_freq)
        return counts

    # LanguageTool rule categories to count.
    # I have commented out categories giving 0 columns in train dataset.
    # SEMANTICS is mostly 0, we drop it as well.
    LT_categories = ['CASING',
                     #'COLLOQUIALISMS',
                     'COMPOUNDING',
                     'CONFUSED_WORDS',
                     #'FALSE_FRIENDS',
                     #'GENDER_NEUTRALITY',
                     'GRAMMAR',
                     'MISC',
                     'PUNCTUATION',
                     'REDUNDANCY',
                     #'REGIONALISMS',
                     #'REPETITIONS',
                     #'REPETITIONS_STYLE',
                     #'SEMANTICS',
                     'STYLE',
                     'TYPOGRAPHY',
                     'TYPOS',
                     'TOTAL',
                     ]

    def get_LT_features(self, text):
        '''
        Generates LanguageTool features: each category count and a total number
        '''
        matches = self.language_tool.check(text)
        # single pass over matches instead of one scan per category
        cat_counts = {}
        for m in matches:
            cat_counts[m.category] = cat_counts.get(m.category, 0) + 1
        return [cat_counts.get(cat, 0) for cat in self.LT_categories[:-1]] + [len(matches)]

    def generate_features(self, df: pd.DataFrame):
        '''
        Generate features from a dataframe with `full_text` column containing english text
        :param df: input dataframe
        :return: pd.DataFrame with features and possibly updated `full_text` column
        '''
        res_df = df.copy()
        # in-place preprocessing of the stored text is intentionally disabled;
        # readability below receives a preprocessed copy instead
        #res_df['full_text'] = res_df['full_text'].apply(self.text_preprocess)
        # readability measures, flattened into one column per metric
        # (row.iloc[0] — positional access; plain row[0] is deprecated
        # label-based lookup in modern pandas)
        features_df = res_df[['full_text']].apply(
            lambda row: self.flatten_readability(
                readability.getmeasures(
                    self.text_preprocess(row.iloc[0]), lang='en')),
            axis='columns',
            result_type='expand')
        features_df['text_len'] = res_df['full_text'].apply(len)
        features_df['misspelled'] = res_df['full_text'].apply(self.misspelled_count)
        features_df['noncommon_words_1k'] = res_df['full_text'].apply(self.get_noncommon_words_count_1k)
        features_df['noncommon_words_10k'] = res_df['full_text'].apply(self.get_noncommon_words_count_10k)
        features_df['profanity_count'] = res_df['full_text'].apply(self.get_profanity_count)
        features_df[['uwc1e8', 'uwc1e7', 'uwc1e6', 'uwc1e5']] = \
            res_df[['full_text']].apply(lambda row: self.get_uncommon_words_counts(row.iloc[0]),
                                        axis='columns',
                                        result_type='expand')
        # per-word ratio features
        words_ratio_features = ['wordtypes',
                                'long_words',
                                'complex_words',
                                'complex_words_dc',
                                'wu_tobeverb',
                                'wu_auxverb',
                                'wu_conjunction',
                                'wu_pronoun',
                                'wu_preposition',
                                'wu_nominalization',
                                'misspelled',
                                'noncommon_words_1k',
                                'noncommon_words_10k',
                                'uwc1e8',
                                'uwc1e7',
                                'uwc1e6',
                                'uwc1e5',
                                ]
        features_df[[x + "_ratio" for x in words_ratio_features]] = \
            features_df[words_ratio_features].div(features_df['words'], axis=0)
        # per-sentence ratio features
        sentences_ratio_features = ['sb_pronoun',
                                    'sb_interrogative',
                                    'sb_article',
                                    'sb_subordination',
                                    'sb_conjunction',
                                    'sb_preposition',
                                    ]
        features_df[[x + "_ratio" for x in sentences_ratio_features]] = \
            features_df[sentences_ratio_features].div(features_df['sentences'], axis=0)
        # LanguageTool category counts and their per-word ratios
        features_df[['LT_' + x for x in self.LT_categories]] = res_df[['full_text']] \
            .apply(lambda row: self.get_LT_features(row.iloc[0]),
                   axis=1,
                   result_type='expand')
        features_df[['LT_' + x + '_ratio' for x in self.LT_categories]] = \
            features_df[['LT_' + x for x in self.LT_categories]].div(features_df['words'], axis=0)
        features_df = features_df.sort_index(axis=1)
        return pd.concat([res_df, features_df], axis='columns')