# Notebook-export artifact (not code) — commented out so the module parses:
# Spaces:
# Runtime error
# Runtime error
import numpy as np | |
import pandas as pd | |
import language_tool_python | |
import readability | |
import enchant | |
from enchant.checker import SpellChecker | |
from collections import OrderedDict | |
import nltk | |
nltk.download('punkt') | |
from nltk.tokenize import word_tokenize | |
class FeatureGenerator:
    '''
    Hand-crafted feature extraction for English texts.

    For every text in a dataframe's `full_text` column, produces
    readability measures (`readability` package), misspelling counts
    (pyenchant), non-common/rare word counts, profanity counts and
    LanguageTool grammar-error counts, plus per-word / per-sentence
    ratio versions of most counts.

    Expects these data files in the working directory:
    ./1-1000.txt, ./google-10000-english-no-swears.txt,
    ./profanity.txt and ./count_1w.txt (word<TAB>frequency).
    '''

    def __init__(self):
        # enchant-based spell checking (US English)
        self.spell_checker = SpellChecker('en_US')
        self.enchant_dict = enchant.Dict("en_US")
        # top-1000 common English words, one per line
        common_words_1k_filename = './1-1000.txt'
        with open(common_words_1k_filename) as f:
            self.common_words_1k = set(x.strip() for x in f.readlines())
        common_words_filename_10k = './google-10000-english-no-swears.txt'
        with open(common_words_filename_10k) as f:
            self.common_words_10k = set(x.strip() for x in f.readlines())
        # make sure common_words are a subset as well
        self.common_words_10k.update(self.common_words_1k)
        profanity_filename = './profanity.txt'
        with open(profanity_filename) as f:
            self.profanity_set = set(x.strip() for x in f.readlines())
        # word -> corpus frequency; keep_default_na=False so words like
        # "nan"/"null" stay strings instead of becoming NaN index keys
        words_freq_filename = "./count_1w.txt"
        self.words_freq = pd.read_csv(words_freq_filename,
                                      names=['word', 'freq'],
                                      sep='\t',
                                      header=None,
                                      dtype={'word': str, 'freq': int},
                                      keep_default_na=False,
                                      na_values=[''])
        self.words_freq = self.words_freq.set_index('word')
        self.language_tool = language_tool_python.LanguageTool('en-US')

    def text_preprocess(self, text: str):
        '''
        Transform text to be processed by readability
        :param text: input text
        :return: str preprocessed text
        '''
        text = text.strip()
        # new paragraph starts with \n\n
        # readability also requires each sentence to end with \n
        paragraphs = [p.strip()
                       .replace('. ', '.\n')
                       .replace('? ', '?\n')
                       .replace('! ', '!\n') for p in text.split('\n\n')]
        return "\n\n".join(paragraphs)

    def misspelled_count(self, text: str):
        '''
        Get count of misspelled words by enchant SpellChecker
        :param text: input text
        :return: count of misspelled words in text
        '''
        self.spell_checker.set_text(text)
        # iterating the checker yields one item per spelling error
        return sum(1 for _ in self.spell_checker)

    def flatten_readability(self, r: OrderedDict):
        '''
        Flatten readability output by adding prefixes
        :param r: OrderedDict of readability output
        :return: dict mapping (possibly prefixed) metric name -> value
        '''
        prefixes = {'readability grades': '',
                    'sentence info': '',
                    'word usage': 'wu_',
                    'sentence beginnings': 'sb_'}
        out = {}
        for group_name, group in r.items():
            prefix = prefixes[group_name]
            for var_name, value in group.items():
                out[prefix + var_name] = value
        return out

    def get_noncommon_words_count(self, text, common_words_dict):
        '''
        Count tokens that are real words but not "common".

        A token counts when it:
        a. is longer than 2 characters,
        b. is not in common_words_dict,
        c. contains no underscore '_',
        d. is spelled correctly according to enchant.
        '''
        def is_noncommon_word(w):
            return len(w) > 2 and \
                   w not in common_words_dict and \
                   '_' not in w and \
                   self.enchant_dict.check(w)
        return sum(is_noncommon_word(x) for x in word_tokenize(text.lower()))

    def get_noncommon_words_count_1k(self, text):
        # non-common relative to the top-1000 word list
        return self.get_noncommon_words_count(text, self.common_words_1k)

    def get_noncommon_words_count_10k(self, text):
        # non-common relative to the top-10000 word list
        return self.get_noncommon_words_count(text, self.common_words_10k)

    def get_profanity_count(self, text):
        # number of tokens found in the profanity word list
        return sum(x in self.profanity_set for x in word_tokenize(text.lower()))

    def get_uncommon_words_counts(self, text):
        '''
        Count valid words rarer than each corpus-frequency threshold.
        :param text: input text
        :return: np.ndarray of 4 counts — words with frequency below
                 1e8, 1e7, 1e6 and 1e5 respectively
        '''
        word_freq_thresholds = np.array([1e8, 1e7, 1e6, 1e5], dtype=int)
        counts = np.zeros(len(word_freq_thresholds), dtype=int)
        for w in word_tokenize(text):
            # skip short tokens, tokens with '_', misspellings and words
            # missing from the frequency table
            if len(w) < 3 or '_' in w or not self.enchant_dict.check(w) \
                    or w not in self.words_freq.index:
                continue
            w_freq = self.words_freq.loc[w].values[0]
            # a word counts toward every threshold it is rarer than
            counts += (word_freq_thresholds > w_freq)
        return counts

    # LanguageTool rule categories to count.
    # I have commented out categories giving 0 columns in train dataset.
    # SEMANTICS is mostly 0, we drop it as well.
    LT_categories = ['CASING',
                     #'COLLOQUIALISMS',
                     'COMPOUNDING',
                     'CONFUSED_WORDS',
                     #'FALSE_FRIENDS',
                     #'GENDER_NEUTRALITY',
                     'GRAMMAR',
                     'MISC',
                     'PUNCTUATION',
                     'REDUNDANCY',
                     #'REGIONALISMS',
                     #'REPETITIONS',
                     #'REPETITIONS_STYLE',
                     #'SEMANTICS',
                     'STYLE',
                     'TYPOGRAPHY',
                     'TYPOS',
                     'TOTAL',
                     ]

    def get_LT_features(self, text):
        '''
        Generates LanguageTool features: each category count and a total number
        '''
        matches = self.language_tool.check(text)
        # single pass over matches instead of one scan per category
        cat_counts = {}
        for m in matches:
            cat_counts[m.category] = cat_counts.get(m.category, 0) + 1
        return [cat_counts.get(cat, 0) for cat in self.LT_categories[:-1]] + [len(matches)]

    def generate_features(self, df: pd.DataFrame):
        '''
        Generate features from a dataframe with `full_text` column containing english text
        :param df: input dataframe
        :return: pd.DataFrame with features and possibly updated `full_text` column
        '''
        res_df = df.copy()
        # in-place preprocessing of the stored text is intentionally disabled;
        # readability below receives a preprocessed copy instead
        #res_df['full_text'] = res_df['full_text'].apply(self.text_preprocess)
        # readability measures, flattened into one column per metric
        # (row.iloc[0] — positional access; plain row[0] is deprecated
        # label-based lookup in modern pandas)
        features_df = res_df[['full_text']].apply(
            lambda row: self.flatten_readability(
                readability.getmeasures(
                    self.text_preprocess(row.iloc[0]), lang='en')),
            axis='columns',
            result_type='expand')
        features_df['text_len'] = res_df['full_text'].apply(len)
        features_df['misspelled'] = res_df['full_text'].apply(self.misspelled_count)
        features_df['noncommon_words_1k'] = res_df['full_text'].apply(self.get_noncommon_words_count_1k)
        features_df['noncommon_words_10k'] = res_df['full_text'].apply(self.get_noncommon_words_count_10k)
        features_df['profanity_count'] = res_df['full_text'].apply(self.get_profanity_count)
        features_df[['uwc1e8', 'uwc1e7', 'uwc1e6', 'uwc1e5']] = \
            res_df[['full_text']].apply(lambda row: self.get_uncommon_words_counts(row.iloc[0]),
                                        axis='columns',
                                        result_type='expand')
        # per-word ratio features
        words_ratio_features = ['wordtypes',
                                'long_words',
                                'complex_words',
                                'complex_words_dc',
                                'wu_tobeverb',
                                'wu_auxverb',
                                'wu_conjunction',
                                'wu_pronoun',
                                'wu_preposition',
                                'wu_nominalization',
                                'misspelled',
                                'noncommon_words_1k',
                                'noncommon_words_10k',
                                'uwc1e8',
                                'uwc1e7',
                                'uwc1e6',
                                'uwc1e5',
                                ]
        features_df[[x + "_ratio" for x in words_ratio_features]] = \
            features_df[words_ratio_features].div(features_df['words'], axis=0)
        # per-sentence ratio features
        sentences_ratio_features = ['sb_pronoun',
                                    'sb_interrogative',
                                    'sb_article',
                                    'sb_subordination',
                                    'sb_conjunction',
                                    'sb_preposition',
                                    ]
        features_df[[x + "_ratio" for x in sentences_ratio_features]] = \
            features_df[sentences_ratio_features].div(features_df['sentences'], axis=0)
        # LanguageTool category counts and their per-word ratios
        features_df[['LT_' + x for x in self.LT_categories]] = res_df[['full_text']] \
            .apply(lambda row: self.get_LT_features(row.iloc[0]),
                   axis=1,
                   result_type='expand')
        features_df[['LT_' + x + '_ratio' for x in self.LT_categories]] = \
            features_df[['LT_' + x for x in self.LT_categories]].div(features_df['words'], axis=0)
        features_df = features_df.sort_index(axis=1)
        return pd.concat([res_df, features_df], axis='columns')