import argparse
import os
import sys
import pickle
import time
from datetime import datetime
import pytz
import re
import string

import pandas as pd
from bs4 import BeautifulSoup
import nltk
import liwc
import catboost as cb
from empath import Empath
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from typing import Dict

# NLTK resources used by the pipeline (uncomment on first run):
# nltk.download("wordnet")
# nltk.download("omw-1.4")
# nltk.download("vader_lexicon")
# nltk.download("stopwords")
# nltk.download("punkt")

sys.path.append(os.getcwd())

SEED = 9103

# path = "../data/"
# df_train_00 = pd.read_csv(path + 'dreaddit-train.csv')
# df_test_00 = pd.read_csv(path + 'dreaddit-test.csv')

DF_TRAIN = None
DF_TEST = None

COL_NAMES = [
    "subreddit",
    "text",
    "label",
    "social_timestamp",
    "social_karma",
    "social_num_comments",
]

# Negation and negative-sentiment words that should NOT be removed as stopwords.
negative_words = [
    "no", "not", "none", "neither", "never", "nobody", "nothing", "nowhere",
    "doesn't", "isn't", "wasn't", "shouldn't", "won't", "can't", "couldn't",
    "don't", "haven't", "hasn't", "hadn't", "aren't", "weren't", "wouldn't",
    "daren't", "needn't", "didn't", "without", "against", "negative", "deny",
    "reject", "refuse", "decline", "unhappy", "sad", "miserable", "hopeless",
    "worthless", "useless", "futile", "disagree", "oppose", "contrary",
    "contradict", "disapprove", "dissatisfied", "objection", "unsatisfactory",
    "unpleasant", "regret", "resent", "lament", "mourn", "grieve", "bemoan",
    "despise", "loathe", "detract", "abhor", "dread", "fear", "worry",
    "anxiety", "sorrow", "gloom", "melancholy", "dismay", "disheartened",
    "despair", "dislike", "aversion", "antipathy", "hate", "disdain",
    "mustn't", "mustn", "mightn", "mightn't", "ain", "aren", "couldn",
    "didn", "doesn", "hadn", "hasn", "haven", "shan", "shan't", "shouldn",
    "wasn", "weren", "won", "wouldn", "isn", "doesnt", "doesn't", "does", "do",
]


def get_args():
    parser = argparse.ArgumentParser(
        description="Train a CatBoost stress classifier on the Dreaddit dataset."
    )
    parser.add_argument(
        "-tr",
        "--train-data",
        type=str,
        default="dreaddit-train.csv",
        required=False,
        help="Path to training data",
    )
    parser.add_argument(
        "-ts",
        "--test-data",
        type=str,
        default="dreaddit-test.csv",
        required=False,
        help="Path to test data",
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default="../data/",
        required=False,
        help="Path to the data directory",
    )
    return parser.parse_args()


def read_dfs(path="", f_train_df="", f_test_df=""):
    # DF_TRAIN = pd.read_csv(path + f_train_df)
    # DF_TEST = pd.read_csv(path + f_test_df)
    return (pd.read_csv(path + f_train_df), pd.read_csv(path + f_test_df))


def prepro_timestamps(df=DF_TRAIN, timezone=pytz.timezone("America/New_York")):
    """Derive calendar features from the Unix `social_timestamp` column."""
    df_train_01 = df
    df_train_01["n_w_day"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).weekday()
    )
    df_train_01["n_month"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).month
    )
    df_train_01["n_day"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).day
    )
    df_train_01["n_hour"] = df_train_01["social_timestamp"].apply(
        lambda x: int(datetime.fromtimestamp(x, tz=timezone).strftime("%H"))
    )

    def c_hour(row):
        # Bucket the hour of day into four six-hour blocks.
        if 0 <= row["n_hour"] <= 6:
            return 0
        elif 6 < row["n_hour"] <= 12:
            return 1
        elif 12 < row["n_hour"] <= 18:
            return 2
        return 3

    df_train_01["c_hour"] = df_train_01.apply(c_hour, axis=1)
    print("p_timestamps", df_train_01.shape)
    return df_train_01
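# Hedged usage sketch (not part of the original pipeline and never called):
# shows what prepro_timestamps() derives from a couple of arbitrary example
# Unix timestamps. The toy frame and this helper name are illustration-only
# assumptions.
def _demo_prepro_timestamps():
    toy = pd.DataFrame({"social_timestamp": [1514764800, 1546300800]})
    out = prepro_timestamps(toy)
    # Expected derived columns: n_w_day, n_month, n_day, n_hour, c_hour
    print(out[["n_w_day", "n_month", "n_day", "n_hour", "c_hour"]])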
def prepro_cleaning(df, negative_words=negative_words):
    """Clean the raw `text` column into a `denoise_text` column."""
    df_train_01 = df
    eng_stopwords = nltk.corpus.stopwords.words("english")
    # We want to leave the negative words in the text
    eng_stopwords_no_negative = [w for w in eng_stopwords if w not in negative_words]

    # Removing stopwords
    def remove_eng_stopwords(text):
        token_text = nltk.word_tokenize(text)
        remove_stop = [
            word for word in token_text if word not in eng_stopwords_no_negative
        ]
        join_text = " ".join(remove_stop)
        return join_text

    # Removing punctuation, URLs, digits and other noise
    def word_pt(text):
        text = text.lower()
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"\W", " ", text)
        text = re.sub(r"https?://\S+|www\.\S+", "", text)
        text = re.sub(r"<.*?>+", "", text)
        text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
        text = re.sub(r"\n", "", text)
        text = re.sub(r"\w*\d\w*", "", text)
        return text

    # Removing html encodes
    def strip_html(text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    # Removing the square brackets
    def remove_between_square_brackets(text):
        return re.sub(r"\[[^]]*\]", "", text)

    # Removing URL's
    def remove_urls_brackets(text):
        return re.sub(r"http\S+", "", text)

    # Removing the noisy text
    def denoise_text(text):
        text = remove_eng_stopwords(text)
        text = word_pt(text)
        text = strip_html(text)
        text = remove_between_square_brackets(text)
        text = remove_urls_brackets(text)
        return text

    df_train_01["denoise_text"] = df_train_01.text.apply(lambda x: denoise_text(x))
    print("p_denoise", df_train_01.shape)
    return df_train_01


def prepro_lema(df):
    """Lemmatize `denoise_text` into `lem_dn_text`."""
    df_train_01 = df
    lemm = WordNetLemmatizer()

    def word_lemmatizer(text):
        token_text = nltk.word_tokenize(text)
        remove_stop = [lemm.lemmatize(w) for w in token_text]
        join_text = " ".join(remove_stop)
        return join_text if join_text else None

    df_train_01["lem_dn_text"] = df_train_01.denoise_text.apply(
        lambda x: word_lemmatizer(x)
    )
    print("p_lema", df_train_01.shape)
    return df_train_01


def meta_liwc(df, parse):
    """TF-IDF features over the LIWC category names matched for each token."""
    df_train_01 = df
    # LIWC Features Extraction
    corpus = []
    words = []
    for index, row in df_train_01.iterrows():
        review = re.sub("[^a-zA-Z0-9]", " ", row["lem_dn_text"])
        # review = review.lower()
        review = review.split()
        review = [category for token in review for category in parse(token)]
        statements = " ".join(review)
        corpus.append(statements)
        words.append(review)

    vectorizer = TfidfVectorizer(max_features=6000)
    X_fit = vectorizer.fit(corpus)
    X_transformed = X_fit.transform(corpus)
    features = vectorizer.get_feature_names_out()
    df_train_02 = pd.DataFrame(X_transformed.toarray(), columns=features).add_prefix(
        "liwc_"
    )
    print("meta_liwc", df_train_02.shape)
    return df_train_02


def meta_empath(df):
    """Empath lexical-category features for each post."""
    df_train_01 = df
    lexicon = Empath()

    # Empath
    def get_empath_categories(df, normalize=False):
        cats = []
        for index, row in df.iterrows():
            dict_lexicon = lexicon.analyze(row["lem_dn_text"], normalize=normalize)
            # Empath returns None for texts it cannot analyze (e.g. empty
            # strings); fall back to a dummy sentence so the row still yields
            # a full dictionary of categories.
            if dict_lexicon is None:
                dict_lexicon = lexicon.analyze(
                    "hola que mas parse o parce?", normalize=normalize
                )
            cats.append(dict_lexicon)
        return pd.DataFrame(cats)

    df_empath_00 = get_empath_categories(df_train_01, normalize=True).add_prefix(
        "empath_"
    )
    print("meta_empath", df_empath_00.shape)
    return df_empath_00
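# Hedged usage sketch (not part of the original pipeline and never called):
# runs the cleaning and lemmatization steps on a single toy post. It needs the
# NLTK downloads listed at the top of the file; the example text and this
# helper name are made up for illustration.
def _demo_text_pipeline():
    toy = pd.DataFrame(
        {
            "text": [
                "I can't sleep, I'm worried about https://example.com and <b>everything</b>."
            ]
        }
    )
    toy = prepro_cleaning(toy)
    toy = prepro_lema(toy)
    print(toy[["denoise_text", "lem_dn_text"]].iloc[0].to_dict())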
def meta_vader(df):
    """VADER sentiment scores (neg/neu/pos/compound) for each post."""
    df_train_01 = df
    sid = SIA()
    # series_vader = df_train_01["lem_dn_text"].apply(
    #     lambda x: sid.polarity_scores(
    #         " ".join(re.findall(r"\w+", x.lower()))
    #     ).values()
    # )
    # # print((series_vader[3]))
    # df_vader_03 = pd.DataFrame(
    #     columns=["neg", "neu", "pos", "compound"], data=series_vader[:]
    # )
    # data = {"data": series_vader, "column_names": ["neg", "neu", "pos", "compound"]}
    # df_vader_03 = pd.DataFrame.from_dict(data, orient="index")
    # print(df_vader_03.shape)
    df_vader_03 = pd.DataFrame.from_records(
        df_train_01["lem_dn_text"].apply(
            lambda x: sid.polarity_scores(" ".join(re.findall(r"\w+", x.lower())))
        )
    ).add_prefix("vader_")
    print("meta_vader", df_vader_03.shape)
    return df_vader_03


def meta_ohe(df):
    """One-hot encode the subreddit column."""
    df_train_01 = df
    subreddit_dummies = pd.get_dummies(df_train_01.subreddit).add_prefix("subreddit_")
    print("meta_subreddit", subreddit_dummies.shape)
    return subreddit_dummies


def preprocess_dfs(
    df_train=DF_TRAIN,
    df_test=DF_TEST,
    timezone=pytz.timezone("America/New_York"),
    col_names=COL_NAMES,
):
    """Run the full preprocessing / feature-extraction chain and return (X, y)."""
    # df_train = pd.concat([df_train, df_test])
    print("init", df_train.shape)

    # preprocessing
    df_train_01 = df_train[col_names]
    df_train_01 = df_train_01[df_train_01["text"].notna()]
    df_train_01 = df_train_01.dropna()
    df_train_02 = prepro_timestamps(df_train_01)
    df_train_03 = prepro_cleaning(df_train_02)
    df_train_04 = prepro_lema(df_train_03)

    # metadata generation
    # liwc (NOTE: `path` is the module-level variable set in the __main__ block)
    parse, category_names = liwc.load_token_parser(path + "LIWC2007_English080730.dic")
    df_liwc_05 = meta_liwc(df_train_04, parse)
    # empath
    df_empath_06 = meta_empath(df_train_04)
    # vader
    df_vader_07 = meta_vader(df_train_04)
    # ohe
    df_ohe_08 = meta_ohe(df_train_04)

    # return result = pd.concat(frames)
    # The metadata frames carry a fresh RangeIndex, so reset the index here to
    # keep the column-wise concat row-aligned even if dropna() removed rows.
    df_temp = df_train_02.drop(
        columns=["subreddit", "social_timestamp", "denoise_text", "text", "lem_dn_text"]
    ).reset_index(drop=True)
    df_train_09 = pd.concat(
        [df_temp, df_liwc_05, df_empath_06, df_vader_07, df_ohe_08],
        axis=1,
        join="inner",
    )
    print("final", df_train_09.shape)
    # print(df_train_09.head(6))
    df_train_10 = df_train_09.drop(columns=["label"])
    return df_train_10, df_train_09["label"]


def fit_model(df_train, df_label):
    # Best Parameters for CatBoostClassifier
    def best_params_CBClassifier(
        df_train,
        df_label,
        param_grid: Dict[str, list] = {},
        scoring: str = "accuracy",
        cv: int = 5,
    ) -> dict:
        # cat_features = None  # All numeric  # [0, 1]
        model = cb.CatBoostClassifier(
            learning_rate=1,
            task_type="GPU",
            devices="0:1",
            verbose=False,
            random_seed=9103,
        )
        # Instantiate GridSearchCV
        gscv = GridSearchCV(
            estimator=model, param_grid=param_grid, scoring=scoring, cv=cv
        )
        time_0 = time.time()
        # fit the model
        gscv.fit(df_train, df_label)
        # returns the best score
        print(gscv.best_score_)
        # returns the best parameters
        best_parameters = gscv.best_params_
        print(best_parameters)
        print(f"Tuning time: {round(time.time() - time_0, 3)} sec")
        return best_parameters

    # Alternative, wider search grids (commented out; only the last grid is used):
    # param_grid = {
    #     "max_depth": [3, 6, 9],
    #     "n_estimators": [100, 300, 900],
    #     "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "Poisson", "No"],
    # }
    # param_grid = {
    #     "max_depth": [3, 6, 9],
    #     "n_estimators": [300, 350, 400, 450],
    #     "bootstrap_type": ["Bayesian", "MVS", "Poisson"],
    # }
    param_grid = {
        "max_depth": [5],
        "n_estimators": [300],
        "bootstrap_type": ["Bayesian"],
    }
    # param_grid = {"max_depth": [3], "n_estimators": [300], "bootstrap_type": ["MVS"]}
    # Candidate scoring metrics; only "f1" is passed below.
    scorings = ["f1", "accuracy", "precision", "recall"]

    best_parameters = best_params_CBClassifier(
        df_train, df_label, param_grid, scoring="f1"
    )
    print(best_parameters)

    model = cb.CatBoostClassifier(
        learning_rate=1,
        task_type="GPU",
        devices="0:1",
        verbose=False,
        max_depth=best_parameters.get("max_depth"),
        n_estimators=best_parameters.get("n_estimators"),
        bootstrap_type=best_parameters.get("bootstrap_type"),
        random_seed=9103,
    )
    time_0 = time.time()
    model.fit(df_train, df_label)
    print(f"In training df: {model.score(df_train, df_label)}")
    print(f"Training time: {round(time.time() - time_0, 3)} sec")
    return model, best_parameters
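# Hedged sketch (an assumption, not something the original script does, and it
# is never called): one way the held-out test split loaded in __main__ could be
# scored. Because preprocess_dfs() refits TF-IDF and one-hot encodings on
# whatever frame it receives, the resulting columns may not match the training
# columns exactly, so the frame is reindexed onto the model's feature names.
def _sketch_evaluate_on_test(model, df_test):
    X_test, y_test = preprocess_dfs(df_test, None)
    X_test = X_test.reindex(columns=model.feature_names_, fill_value=0)
    print(f"On test df: {model.score(X_test, y_test)}")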
def save_best_model(path, model, best_parameters):
    # Pickle the fitted model; the file name encodes the selected hyperparameters.
    file_name = (
        f"{path}best_CBClassifier_{best_parameters.get('max_depth')}_"
        f"{best_parameters.get('n_estimators')}_"
        f"{best_parameters.get('bootstrap_type')}.model"
    )
    with open(file_name, "wb") as file:
        pickle.dump(model, file)


if __name__ == "__main__":
    args = get_args()
    f_train_df = args.train_data
    f_test_df = args.test_data
    path = args.path
    DF_TRAIN, DF_TEST = read_dfs(path, f_train_df, f_test_df)
    df_prepro, df_label = preprocess_dfs(DF_TRAIN, DF_TEST)
    model, best_parameters = fit_model(df_prepro, df_label)
    save_best_model(path, model, best_parameters)
    print("All good!")
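# Example invocation (illustrative; the script name is a placeholder, and the
# data directory is assumed to also contain LIWC2007_English080730.dic, since
# preprocess_dfs() loads it from `path`):
#   python train_dreaddit_catboost.py -p ../data/ \
#       -tr dreaddit-train.csv -ts dreaddit-test.csv
# A CUDA-capable GPU is assumed because CatBoost is configured with task_type="GPU".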