import numpy as np
import pandas as pd
import language_tool_python
import readability
import enchant
from enchant.checker import SpellChecker
from collections import OrderedDict

import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize


class FeatureGenerator:
    def __init__(self):
        self.spell_checker = SpellChecker('en_US')
        self.enchant_dict = enchant.Dict("en_US")

        common_words_1k_filename = './1-1000.txt'
        with open(common_words_1k_filename) as f:
            self.common_words_1k = set(x.strip() for x in f.readlines())

        common_words_filename_10k = './google-10000-english-no-swears.txt'
        with open(common_words_filename_10k) as f:
            self.common_words_10k = set(x.strip() for x in f.readlines())
        # make sure the 1k common words are a subset of the 10k set as well
        self.common_words_10k.update(self.common_words_1k)

        profanity_filename = './profanity.txt'
        with open(profanity_filename) as f:
            self.profanity_set = set(x.strip() for x in f.readlines())

        words_freq_filename = "./count_1w.txt"
        self.words_freq = pd.read_csv(words_freq_filename, names=['word', 'freq'],
                                      sep='\t', header=None,
                                      dtype={'word': str, 'freq': int},
                                      keep_default_na=False, na_values=[''])
        self.words_freq = self.words_freq.set_index('word')

        self.language_tool = language_tool_python.LanguageTool('en-US')

    def text_preprocess(self, text: str):
        '''
        Transform text to be processed by readability
        :param text: input text
        :return: str preprocessed text
        '''
        text = text.strip()
        # a new paragraph starts with \n\n;
        # readability also requires each sentence to end with \n
        paragraphs = [p.strip()
                       .replace('. ', '.\n')
                       .replace('? ', '?\n')
                       .replace('! ', '!\n')
                      for p in text.split('\n\n')]
        return "\n\n".join(paragraphs)
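    # Illustrative example (behavior inferred from the replaces above):
    # text_preprocess("Hi there. How are you?\n\nFine!") returns
    # "Hi there.\nHow are you?\n\nFine!", i.e. one sentence per line,
    # with paragraphs still separated by a blank line, as readability expects.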
    def misspelled_count(self, text: str):
        '''
        Get the count of misspelled words found by the enchant SpellChecker
        :param text: input text
        :return: count of misspelled words in text
        '''
        self.spell_checker.set_text(text)
        return len(list(self.spell_checker))

    def flatten_readability(self, r: OrderedDict):
        '''
        Flatten readability output by adding prefixes
        :param r: OrderedDict of readability output
        :return: dict
        '''
        out = {}
        for k, group in r.items():
            prefix = {'readability grades': '',
                      'sentence info': '',
                      'word usage': 'wu_',
                      'sentence beginnings': 'sb_'}[k]
            for var_name, value in group.items():
                out[prefix + var_name] = value
        return out

    def get_noncommon_words_count(self, text, common_words_dict):
        # first, tokenize the text;
        # second, iterate over tokens and count a token when
        # a. it is a word (longer than 2 characters)
        # b. it is not in common_words
        # c. it is correctly spelled
        # d. it does not contain an underscore '_'
        def is_noncommon_word(w):
            return len(w) > 2 and \
                   w not in common_words_dict and \
                   '_' not in w and \
                   self.enchant_dict.check(w)

        return sum(is_noncommon_word(x) for x in word_tokenize(text.lower()))

    def get_noncommon_words_count_1k(self, text):
        return self.get_noncommon_words_count(text, self.common_words_1k)

    def get_noncommon_words_count_10k(self, text):
        return self.get_noncommon_words_count(text, self.common_words_10k)

    def get_profanity_count(self, text):
        return sum(x in self.profanity_set for x in word_tokenize(text.lower()))

    def get_uncommon_words_counts(self, text):
        word_freq_thresholds = np.array([1e8, 1e7, 1e6, 1e5], dtype=int)
        counts = np.zeros(len(word_freq_thresholds), dtype=int)
        for w in word_tokenize(text):
            if len(w) < 3 or '_' in w or not self.enchant_dict.check(w) \
                    or w not in self.words_freq.index:
                continue
            w_freq = self.words_freq.loc[w].values[0]
            counts += (word_freq_thresholds > w_freq)
        return counts

    # I have commented out categories giving 0 columns in the train dataset;
    # SEMANTICS is mostly 0, so we drop it as well
    LT_categories = ['CASING',
                     #'COLLOQUIALISMS',
                     'COMPOUNDING',
                     'CONFUSED_WORDS',
                     #'FALSE_FRIENDS',
                     #'GENDER_NEUTRALITY',
                     'GRAMMAR',
                     'MISC',
                     'PUNCTUATION',
                     'REDUNDANCY',
                     #'REGIONALISMS',
                     #'REPETITIONS',
                     #'REPETITIONS_STYLE',
                     #'SEMANTICS',
                     'STYLE',
                     'TYPOGRAPHY',
                     'TYPOS',
                     'TOTAL',
                     ]

    def get_LT_features(self, text):
        '''
        Generates LanguageTool features: a count for each category plus the total number of matches
        '''
        matches = self.language_tool.check(text)
        cat_counts = [sum(m.category == cat for m in matches)
                      for cat in self.LT_categories[:-1]]
        return cat_counts + [len(matches)]
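    # Illustrative example (assumed LanguageTool output, not verified against
    # the train data): for a text where LanguageTool reports one PUNCTUATION
    # match and two TYPOS matches, get_LT_features returns
    # [0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 3]: one count per entry of
    # LT_categories[:-1], with the total number of matches appended.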
    def generate_features(self, df: pd.DataFrame):
        '''
        Generate features from a dataframe with a `full_text` column containing English text
        :param df: input dataframe
        :return: pd.DataFrame with features and a possibly updated `full_text` column
        '''
        res_df = df.copy()
        #res_df['full_text'] = res_df['full_text'].apply(self.text_preprocess)
        features_df = res_df[['full_text']].apply(
            lambda row: self.flatten_readability(
                readability.getmeasures(self.text_preprocess(row[0]), lang='en')),
            axis='columns', result_type='expand')
        features_df['text_len'] = res_df['full_text'].apply(len)
        features_df['misspelled'] = res_df['full_text'].apply(self.misspelled_count)
        features_df['noncommon_words_1k'] = res_df['full_text'].apply(self.get_noncommon_words_count_1k)
        features_df['noncommon_words_10k'] = res_df['full_text'].apply(self.get_noncommon_words_count_10k)
        features_df['profanity_count'] = res_df['full_text'].apply(self.get_profanity_count)
        features_df[['uwc1e8', 'uwc1e7', 'uwc1e6', 'uwc1e5']] = \
            res_df[['full_text']].apply(lambda x: self.get_uncommon_words_counts(x[0]),
                                        axis='columns', result_type='expand')

        # Generate ratio features normalized by word count
        words_ratio_features = ['wordtypes', 'long_words', 'complex_words', 'complex_words_dc',
                                'wu_tobeverb', 'wu_auxverb', 'wu_conjunction', 'wu_pronoun',
                                'wu_preposition', 'wu_nominalization',
                                'misspelled', 'noncommon_words_1k', 'noncommon_words_10k',
                                'uwc1e8', 'uwc1e7', 'uwc1e6', 'uwc1e5',
                                ]
        features_df[[x + "_ratio" for x in words_ratio_features]] = \
            features_df[words_ratio_features].div(features_df['words'], axis=0)

        # Ratio features normalized by sentence count
        sentences_ratio_features = ['sb_pronoun', 'sb_interrogative', 'sb_article',
                                    'sb_subordination', 'sb_conjunction', 'sb_preposition',
                                    ]
        features_df[[x + "_ratio" for x in sentences_ratio_features]] = \
            features_df[sentences_ratio_features].div(features_df['sentences'], axis=0)

        features_df[['LT_' + x for x in self.LT_categories]] = res_df[['full_text']] \
            .apply(lambda x: self.get_LT_features(x[0]), axis=1, result_type='expand')
        features_df[['LT_' + x + '_ratio' for x in self.LT_categories]] = \
            features_df[['LT_' + x for x in self.LT_categories]].div(features_df['words'], axis=0)

        features_df = features_df.sort_index(axis=1)
        return pd.concat([res_df, features_df], axis='columns')
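
# Minimal usage sketch (illustrative; assumes the word-list files referenced in
# __init__, e.g. './1-1000.txt', exist in the working directory; the sample
# texts below are made up):
if __name__ == '__main__':
    fg = FeatureGenerator()
    sample_df = pd.DataFrame({'full_text': [
        "This is a short essay. It has two sentences!",
        "A first paragraph with a question?\n\nAnd a second, short paragraph.",
    ]})
    out_df = fg.generate_features(sample_df)
    # generate_features keeps the original columns and appends the
    # alphabetically sorted feature columns
    print(out_df.filter(like='_ratio').head())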