import argparse
import os
import sys
import pickle
import time
from datetime import datetime
import pytz
import re
import string

import pandas as pd
from bs4 import BeautifulSoup
import nltk
import liwc
import catboost as cb
from empath import Empath
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from typing import Dict

# NLTK resources used by the pipeline (uncomment on first run):
# nltk.download("wordnet")
# nltk.download("omw-1.4")
# nltk.download("vader_lexicon")
# nltk.download("stopwords")
# nltk.download("punkt")

sys.path.append(os.getcwd())

SEED = 9103

# path = "../data/"
# df_train_00 = pd.read_csv(path + 'dreaddit-train.csv')
# df_test_00 = pd.read_csv(path + 'dreaddit-test.csv')

DF_TRAIN = None
DF_TEST = None

COL_NAMES = [
    "subreddit",
    "text",
    "label",
    "social_timestamp",
    "social_karma",
    "social_num_comments",
]

# Negation and negative-sentiment words that should NOT be removed as stopwords.
negative_words = [
    "no", "not", "none", "neither", "never", "nobody", "nothing", "nowhere",
    "doesn't", "isn't", "wasn't", "shouldn't", "won't", "can't", "couldn't",
    "don't", "haven't", "hasn't", "hadn't", "aren't", "weren't", "wouldn't",
    "daren't", "needn't", "didn't", "without", "against", "negative", "deny",
    "reject", "refuse", "decline", "unhappy", "sad", "miserable", "hopeless",
    "worthless", "useless", "futile", "disagree", "oppose", "contrary",
    "contradict", "disapprove", "dissatisfied", "objection", "unsatisfactory",
    "unpleasant", "regret", "resent", "lament", "mourn", "grieve", "bemoan",
    "despise", "loathe", "detract", "abhor", "dread", "fear", "worry",
    "anxiety", "sorrow", "gloom", "melancholy", "dismay", "disheartened",
    "despair", "dislike", "aversion", "antipathy", "hate", "disdain",
    "mustn't", "mustn", "mightn", "mightn't", "ain", "aren", "couldn",
    "didn", "doesn", "hadn", "hasn", "haven", "shan", "shan't", "shouldn",
    "wasn", "weren", "won", "wouldn", "isn", "doesnt", "doesn't", "does", "do",
]


def get_args():
    parser = argparse.ArgumentParser(
        description="Train a CatBoost stress classifier on the Dreaddit dataset."
    )
    parser.add_argument(
        "-tr",
        "--train-data",
        type=str,
        default="dreaddit-train.csv",
        required=False,
        help="Path to training data",
    )
    parser.add_argument(
        "-ts",
        "--test-data",
        type=str,
        default="dreaddit-test.csv",
        required=False,
        help="Path to test data",
    )
    parser.add_argument(
        "-p",
        "--path",
        type=str,
        default="../data/",
        required=False,
        help="Path to the data directory",
    )
    return parser.parse_args()


def read_dfs(path="", f_train_df="", f_test_df=""):
    # DF_TRAIN = pd.read_csv(path + f_train_df)
    # DF_TEST = pd.read_csv(path + f_test_df)
    return (pd.read_csv(path + f_train_df), pd.read_csv(path + f_test_df))


def prepro_timestamps(df=DF_TRAIN, timezone=pytz.timezone("America/New_York")):
    """Derive calendar features from the Unix `social_timestamp` column."""
    df_train_01 = df
    df_train_01["n_w_day"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).weekday()
    )
    df_train_01["n_month"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).month
    )
    df_train_01["n_day"] = df_train_01["social_timestamp"].apply(
        lambda x: datetime.fromtimestamp(x, tz=timezone).day
    )
    df_train_01["n_hour"] = df_train_01["social_timestamp"].apply(
        lambda x: int(datetime.fromtimestamp(x, tz=timezone).strftime("%H"))
    )

    def c_hour(row):
        # Bucket the hour of day into four six-hour blocks.
        if 0 <= row["n_hour"] <= 6:
            return 0
        elif 6 < row["n_hour"] <= 12:
            return 1
        elif 12 < row["n_hour"] <= 18:
            return 2
        return 3

    df_train_01["c_hour"] = df_train_01.apply(c_hour, axis=1)
    print("p_timestamps", df_train_01.shape)
    return df_train_01
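# Hedged usage sketch (not part of the original pipeline and never called):
# shows what prepro_timestamps() derives from a couple of arbitrary example
# Unix timestamps. The toy frame and this helper name are illustration-only
# assumptions.
def _demo_prepro_timestamps():
    toy = pd.DataFrame({"social_timestamp": [1514764800, 1546300800]})
    out = prepro_timestamps(toy)
    # Expected derived columns: n_w_day, n_month, n_day, n_hour, c_hour
    print(out[["n_w_day", "n_month", "n_day", "n_hour", "c_hour"]])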
def prepro_cleaning(df, negative_words=negative_words):
    """Clean the raw `text` column into a `denoise_text` column."""
    df_train_01 = df
    eng_stopwords = nltk.corpus.stopwords.words("english")
    # We want to leave the negative words in the text
    eng_stopwords_no_negative = [w for w in eng_stopwords if w not in negative_words]

    # Removing stopwords
    def remove_eng_stopwords(text):
        token_text = nltk.word_tokenize(text)
        remove_stop = [
            word for word in token_text if word not in eng_stopwords_no_negative
        ]
        join_text = " ".join(remove_stop)
        return join_text

    # Removing punctuation, URLs, digits and other noise
    def word_pt(text):
        text = text.lower()
        text = re.sub(r"\[.*?\]", "", text)
        text = re.sub(r"\W", " ", text)
        text = re.sub(r"https?://\S+|www\.\S+", "", text)
        text = re.sub(r"<.*?>+", "", text)
        text = re.sub("[%s]" % re.escape(string.punctuation), "", text)
        text = re.sub(r"\n", "", text)
        text = re.sub(r"\w*\d\w*", "", text)
        return text

    # Removing html encodes
    def strip_html(text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    # Removing the square brackets
    def remove_between_square_brackets(text):
        return re.sub(r"\[[^]]*\]", "", text)

    # Removing URL's
    def remove_urls_brackets(text):
        return re.sub(r"http\S+", "", text)

    # Removing the noisy text
    def denoise_text(text):
        text = remove_eng_stopwords(text)
        text = word_pt(text)
        text = strip_html(text)
        text = remove_between_square_brackets(text)
        text = remove_urls_brackets(text)
        return text

    df_train_01["denoise_text"] = df_train_01.text.apply(lambda x: denoise_text(x))
    print("p_denoise", df_train_01.shape)
    return df_train_01


def prepro_lema(df):
    """Lemmatize `denoise_text` into `lem_dn_text`."""
    df_train_01 = df
    lemm = WordNetLemmatizer()

    def word_lemmatizer(text):
        token_text = nltk.word_tokenize(text)
        remove_stop = [lemm.lemmatize(w) for w in token_text]
        join_text = " ".join(remove_stop)
        return join_text if join_text else None

    df_train_01["lem_dn_text"] = df_train_01.denoise_text.apply(
        lambda x: word_lemmatizer(x)
    )
    print("p_lema", df_train_01.shape)
    return df_train_01


def meta_liwc(df, parse):
    """TF-IDF features over the LIWC category names matched for each token."""
    df_train_01 = df
    # LIWC Features Extraction
    corpus = []
    words = []
    for index, row in df_train_01.iterrows():
        review = re.sub("[^a-zA-Z0-9]", " ", row["lem_dn_text"])
        # review = review.lower()
        review = review.split()
        review = [category for token in review for category in parse(token)]
        statements = " ".join(review)
        corpus.append(statements)
        words.append(review)

    vectorizer = TfidfVectorizer(max_features=6000)
    X_fit = vectorizer.fit(corpus)
    X_transformed = X_fit.transform(corpus)
    features = vectorizer.get_feature_names_out()
    df_train_02 = pd.DataFrame(X_transformed.toarray(), columns=features).add_prefix(
        "liwc_"
    )
    print("meta_liwc", df_train_02.shape)
    return df_train_02


def meta_empath(df):
    """Empath lexical-category features for each post."""
    df_train_01 = df
    lexicon = Empath()

    # Empath
    def get_empath_categories(df, normalize=False):
        cats = []
        for index, row in df.iterrows():
            dict_lexicon = lexicon.analyze(row["lem_dn_text"], normalize=normalize)
            # Empath returns None for texts it cannot analyze (e.g. empty
            # strings); fall back to a dummy sentence so the row still yields
            # a full dictionary of categories.
            if dict_lexicon is None:
                dict_lexicon = lexicon.analyze(
                    "hola que mas parse o parce?", normalize=normalize
                )
            cats.append(dict_lexicon)
        return pd.DataFrame(cats)

    df_empath_00 = get_empath_categories(df_train_01, normalize=True).add_prefix(
        "empath_"
    )
    print("meta_empath", df_empath_00.shape)
    return df_empath_00
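# Hedged usage sketch (not part of the original pipeline and never called):
# runs the cleaning and lemmatization steps on a single toy post. It needs the
# NLTK downloads listed at the top of the file; the example text and this
# helper name are made up for illustration.
def _demo_text_pipeline():
    toy = pd.DataFrame(
        {
            "text": [
                "I can't sleep, I'm worried about https://example.com and <b>everything</b>."
            ]
        }
    )
    toy = prepro_cleaning(toy)
    toy = prepro_lema(toy)
    print(toy[["denoise_text", "lem_dn_text"]].iloc[0].to_dict())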
def meta_vader(df):
    """VADER sentiment scores (neg/neu/pos/compound) for each post."""
    df_train_01 = df
    sid = SIA()
    # series_vader = df_train_01["lem_dn_text"].apply(
    #     lambda x: sid.polarity_scores(
    #         " ".join(re.findall(r"\w+", x.lower()))
    #     ).values()
    # )
    # # print((series_vader[3]))
    # df_vader_03 = pd.DataFrame(
    #     columns=["neg", "neu", "pos", "compound"], data=series_vader[:]
    # )
    # data = {"data": series_vader, "column_names": ["neg", "neu", "pos", "compound"]}
    # df_vader_03 = pd.DataFrame.from_dict(data, orient="index")
    # print(df_vader_03.shape)
    df_vader_03 = pd.DataFrame.from_records(
        df_train_01["lem_dn_text"].apply(
            lambda x: sid.polarity_scores(" ".join(re.findall(r"\w+", x.lower())))
        )
    ).add_prefix("vader_")
    print("meta_vader", df_vader_03.shape)
    return df_vader_03


def meta_ohe(df):
    """One-hot encode the subreddit column."""
    df_train_01 = df
    subreddit_dummies = pd.get_dummies(df_train_01.subreddit).add_prefix("subreddit_")
    print("meta_subreddit", subreddit_dummies.shape)
    return subreddit_dummies


def preprocess_dfs(
    df_train=DF_TRAIN,
    df_test=DF_TEST,
    timezone=pytz.timezone("America/New_York"),
    col_names=COL_NAMES,
):
    """Run the full preprocessing / feature-extraction chain and return (X, y)."""
    # df_train = pd.concat([df_train, df_test])
    print("init", df_train.shape)

    # preprocessing
    df_train_01 = df_train[col_names]
    df_train_01 = df_train_01[df_train_01["text"].notna()]
    df_train_01 = df_train_01.dropna()
    df_train_02 = prepro_timestamps(df_train_01)
    df_train_03 = prepro_cleaning(df_train_02)
    df_train_04 = prepro_lema(df_train_03)

    # metadata generation
    # liwc (NOTE: `path` is the module-level variable set in the __main__ block)
    parse, category_names = liwc.load_token_parser(path + "LIWC2007_English080730.dic")
    df_liwc_05 = meta_liwc(df_train_04, parse)
    # empath
    df_empath_06 = meta_empath(df_train_04)
    # vader
    df_vader_07 = meta_vader(df_train_04)
    # ohe
    df_ohe_08 = meta_ohe(df_train_04)

    # return result = pd.concat(frames)
    # The metadata frames carry a fresh RangeIndex, so reset the index here to
    # keep the column-wise concat row-aligned even if dropna() removed rows.
    df_temp = df_train_02.drop(
        columns=["subreddit", "social_timestamp", "denoise_text", "text", "lem_dn_text"]
    ).reset_index(drop=True)
    df_train_09 = pd.concat(
        [df_temp, df_liwc_05, df_empath_06, df_vader_07, df_ohe_08],
        axis=1,
        join="inner",
    )
    print("final", df_train_09.shape)
    # print(df_train_09.head(6))
    df_train_10 = df_train_09.drop(columns=["label"])
    return df_train_10, df_train_09["label"]


def fit_model(df_train, df_label):
    # Best Parameters for CatBoostClassifier
    def best_params_CBClassifier(
        df_train,
        df_label,
        param_grid: Dict[str, list] = {},
        scoring: str = "accuracy",
        cv: int = 5,
    ) -> dict:
        # cat_features = None  # All numeric  # [0, 1]
        model = cb.CatBoostClassifier(
            learning_rate=1,
            task_type="GPU",
            devices="0:1",
            verbose=False,
            random_seed=9103,
        )
        # Instantiate GridSearchCV
        gscv = GridSearchCV(
            estimator=model, param_grid=param_grid, scoring=scoring, cv=cv
        )
        time_0 = time.time()
        # fit the model
        gscv.fit(df_train, df_label)
        # returns the best score
        print(gscv.best_score_)
        # returns the best parameters
        best_parameters = gscv.best_params_
        print(best_parameters)
        print(f"Tuning time: {round(time.time() - time_0, 3)} sec")
        return best_parameters

    # Alternative, wider search grids (commented out; only the last grid is used):
    # param_grid = {
    #     "max_depth": [3, 6, 9],
    #     "n_estimators": [100, 300, 900],
    #     "bootstrap_type": ["Bayesian", "Bernoulli", "MVS", "Poisson", "No"],
    # }
    # param_grid = {
    #     "max_depth": [3, 6, 9],
    #     "n_estimators": [300, 350, 400, 450],
    #     "bootstrap_type": ["Bayesian", "MVS", "Poisson"],
    # }
    param_grid = {
        "max_depth": [5],
        "n_estimators": [300],
        "bootstrap_type": ["Bayesian"],
    }
    # param_grid = {"max_depth": [3], "n_estimators": [300], "bootstrap_type": ["MVS"]}
    # Candidate scoring metrics; only "f1" is passed below.
    scorings = ["f1", "accuracy", "precision", "recall"]

    best_parameters = best_params_CBClassifier(
        df_train, df_label, param_grid, scoring="f1"
    )
    print(best_parameters)

    model = cb.CatBoostClassifier(
        learning_rate=1,
        task_type="GPU",
        devices="0:1",
        verbose=False,
        max_depth=best_parameters.get("max_depth"),
        n_estimators=best_parameters.get("n_estimators"),
        bootstrap_type=best_parameters.get("bootstrap_type"),
        random_seed=9103,
    )
    time_0 = time.time()
    model.fit(df_train, df_label)
    print(f"In training df: {model.score(df_train, df_label)}")
    print(f"Training time: {round(time.time() - time_0, 3)} sec")
    return model, best_parameters
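# Hedged sketch (an assumption, not something the original script does, and it
# is never called): one way the held-out test split loaded in __main__ could be
# scored. Because preprocess_dfs() refits TF-IDF and one-hot encodings on
# whatever frame it receives, the resulting columns may not match the training
# columns exactly, so the frame is reindexed onto the model's feature names.
def _sketch_evaluate_on_test(model, df_test):
    X_test, y_test = preprocess_dfs(df_test, None)
    X_test = X_test.reindex(columns=model.feature_names_, fill_value=0)
    print(f"On test df: {model.score(X_test, y_test)}")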
def save_best_model(path, model, best_parameters):
    # Pickle the fitted model; the file name encodes the selected hyperparameters.
    file_name = (
        f"{path}best_CBClassifier_{best_parameters.get('max_depth')}_"
        f"{best_parameters.get('n_estimators')}_"
        f"{best_parameters.get('bootstrap_type')}.model"
    )
    with open(file_name, "wb") as file:
        pickle.dump(model, file)


if __name__ == "__main__":
    args = get_args()
    f_train_df = args.train_data
    f_test_df = args.test_data
    path = args.path
    DF_TRAIN, DF_TEST = read_dfs(path, f_train_df, f_test_df)
    df_prepro, df_label = preprocess_dfs(DF_TRAIN, DF_TEST)
    model, best_parameters = fit_model(df_prepro, df_label)
    save_best_model(path, model, best_parameters)
    print("All good!")
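# Example invocation (illustrative; the script name is a placeholder, and the
# data directory is assumed to also contain LIWC2007_English080730.dic, since
# preprocess_dfs() loads it from `path`):
#   python train_dreaddit_catboost.py -p ../data/ \
#       -tr dreaddit-train.csv -ts dreaddit-test.csv
# A CUDA-capable GPU is assumed because CatBoost is configured with task_type="GPU".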