Spaces:
Runtime error
Runtime error
import numpy as np | |
import matplotlib.pyplot as plt | |
import xgboost as xgb | |
import pickle | |
import os | |
import pandas as pd | |
# EDA | |
def check_dates(df, end_date):
    """
    Checks that each row's interval matches the length of its calendar month.

    Rows whose "month_str" equals end_date are skipped. For every other row the
    two-digit month is read from characters 5-6 of "month_str" and the year from
    characters 0-3 (e.g. "2023-01-01"); the row's "interval" must then equal the
    number of days of that month. February uses the full Gregorian leap-year
    rule, so century years such as 1900 or 2100 are treated as non-leap years.

    Args:
        df: DataFrame with a "month_str" string column and an "interval"
            column comparable to np.timedelta64.
        end_date: "month_str" value of the row(s) that must not be checked.

    Returns:
        True when every checked row matches, False at the first mismatch.
    """
    # Local import: this is the only function that needs the calendar module.
    import calendar

    fixed_month_lengths = {
        "01": 31, "03": 31, "05": 31, "07": 31, "08": 31, "10": 31, "12": 31,
        "04": 30, "06": 30, "09": 30, "11": 30,
    }

    for _, row in df.iterrows():
        month_str = row["month_str"]
        if month_str == end_date:
            continue

        month_code = month_str[5:7]
        if month_code == "02":
            # A plain "year % 4" test is wrong for years like 1900 and 2100.
            year = int(month_str[:4])
            expected_days = 29 if calendar.isleap(year) else 28
        elif month_code in fixed_month_lengths:
            expected_days = fixed_month_lengths[month_code]
        else:
            # Unrecognised month codes are left unchecked.
            continue

        if row["interval"] != np.timedelta64(expected_days, "D"):
            return False
    return True
# EDA | |
def plot_cohort(df, cohort_first_month, product):
    """
    Draws the "percentage" curve of one cohort/product sequence.

    The rows are selected with get_sequence and plotted against their position
    in the sequence (0, 1, 2, ...). The y axis is fixed to [0, 1] and the title
    shows the cohort month and the product.
    """
    sequence = get_sequence(df, cohort_first_month, product)
    n_points = sequence.shape[0]
    positions = np.array(list(range(n_points)))

    fig = plt.figure()
    ax = fig.gca()
    ax.plot(positions, sequence["percentage"])
    ax.grid()
    ax.set_xlim([0, n_points])
    ax.set_ylim([0, 1])
    ax.set_xlabel("Months")
    ax.set_ylabel("Percentage")
    ax.set_title(f"{cohort_first_month} | {product}")
    plt.show()
########################################################################################## | |
# train | |
def plot_feature_importance(model, feature_names):
    """
    Shows a bar chart of a fitted model's feature importances.

    The bars are sorted from the most to the least important feature, using
    the model's feature_importances_ attribute, and labelled with the matching
    entries of feature_names.
    """
    scores = model.feature_importances_
    # argsort is ascending; reverse it to get the most important feature first.
    order = np.argsort(scores)[::-1]
    labels = [feature_names[position] for position in order]
    bar_slots = range(len(scores))

    plt.figure(figsize=(10, 6))
    plt.title("Feature Importance")
    plt.bar(bar_slots, scores[order])
    plt.xticks(bar_slots, labels, rotation=90)
    plt.show()
########################################################################################## | |
# evaluate | |
def get_sequence(df, cohort_first_month, product):
    """
    Returns the rows of df that belong to one cohort and one product.

    A row is kept when its "cohort_first_month" equals cohort_first_month and
    its "cohort_first_product" equals product. Row order and index are kept.
    """
    same_cohort = df["cohort_first_month"] == cohort_first_month
    same_product = df["cohort_first_product"] == product
    return df[same_cohort & same_product]
# evaluate | |
def plot_true_and_predicted(y_true, y_pred, cohort, product):
    """
    Plots a true series and a predicted series on the same axes.

    Both series are drawn against the positions 0..len(y_true)-1, with the
    y axis fixed to [0, 1]. The title shows the cohort and the product.

    As noted by the original author, each predicted point is expected to be a
    one-step prediction made from the true previous datapoint, not a sequence
    rolled out from earlier predictions.
    """
    n_points = y_true.shape[0]
    positions = np.array(list(range(n_points)))

    fig = plt.figure()
    ax = fig.gca()
    ax.plot(positions, y_true, label="Y True")
    ax.plot(positions, y_pred, label="Y Pred")
    ax.grid()
    ax.set_xlim([0, n_points])
    ax.set_ylim([0, 1])
    ax.set_xlabel("Months")
    ax.set_ylabel("Percentage")
    ax.set_title(f"{cohort} | {product}")
    ax.legend()
    plt.show()
# evaluate | |
def get_product_one_hot_encode(product):
    """
    Builds a single-row one-hot DataFrame for a product.

    The columns are "product_1m", "product_3m" and "product_4m"; the column of
    the given product holds 1 and the others 0. A product outside these three
    adds an extra column named after the product itself.
    """
    known_products = ("1m", "3m", "4m")
    flags = dict.fromkeys(known_products, 0)
    flags[product] = 1
    column_names = {key: f"product_{key}" for key in known_products}
    return pd.DataFrame([flags]).rename(columns=column_names)
# evaluate | |
def get_month_one_hot_encode(month):
    """
    Builds a single-row one-hot DataFrame for a month number.

    The columns are "month_1" .. "month_12"; the column at position month - 1
    holds 1 and all others 0.
    """
    column_names = [f"month_{number}" for number in range(1, 13)]
    flags = [0] * 12
    flags[month - 1] = 1
    return pd.DataFrame([flags], columns=column_names)
# evaluate | |
def generate_new_data(df, date, cohort, product, model, columns_to_drop, n_points):
    """
    Rolls a cohort/product sequence forward by n_points predicted steps.

    The starting point is the row(s) of df matching the cohort, the product and
    "month" == date, with columns_to_drop removed. At each step the model
    predicts from the latest datapoint, and a new row is built from the
    incremented months-since-acquisition counter, the prediction, the product
    one-hot columns and the one-hot columns of the next calendar month. That
    new row is appended to the result and becomes the input of the next step,
    so later predictions are based on earlier predictions.

    Args:
        df: source DataFrame with "cohort_first_month", "cohort_first_product",
            "month" and "months_since_acquisition" columns.
        date: start date string; characters 5-6 are read as the month number.
        cohort: value matched against "cohort_first_month".
        product: value matched against "cohort_first_product".
        model: object with a predict method.
        columns_to_drop: columns removed before the rows are fed to the model.
        n_points: number of steps to generate.

    Returns:
        The starting row(s) followed by the generated rows.
    """
    start_mask = (
        (df["cohort_first_month"] == cohort)
        & (df["cohort_first_product"] == product)
        & (df["month"] == date)
    )
    df_ = df[start_mask]

    month_number = int(date[5:7])
    months_since_acq = df_["months_since_acquisition"].values[0]
    df_ = df_.drop(columns=columns_to_drop)

    # NOTE(review): the first remaining column receives the months-since-
    # acquisition counter and the second receives the prediction - confirm the
    # column order left after dropping matches this.
    feature_columns = df_.columns
    product_ohe = get_product_one_hot_encode(product)

    model_input = df_.copy()
    generated = 0
    while generated < n_points:
        predicted = model.predict(model_input)

        # Advance one calendar month, wrapping from December (12) to January (1).
        month_number = (month_number % 12) + 1
        month_ohe = get_month_one_hot_encode(month_number)
        months_since_acq += 1

        next_row = pd.DataFrame([months_since_acq], columns=[feature_columns[0]])
        next_row[feature_columns[1]] = predicted[0]
        next_row = next_row.join(product_ohe).join(month_ohe)

        df_ = pd.concat([df_, next_row], ignore_index=True)
        # The freshly generated row is the input of the next prediction.
        model_input = next_row.copy()
        generated += 1
    return df_
# evaluate | |
def plot_example_from_case(historical, predicted, x_lim, product):
    """
    Plots historical data as a solid line and predicted data as a dashed line.

    The predicted series starts at the position of the last historical point,
    so the first predicted point shares its x position with it. The title uses
    the "cohort_first_month" of the first historical row, formatted with
    strftime, so that value must be date-like.

    Args:
        historical: DataFrame with "percentage" and "cohort_first_month" columns.
        predicted: DataFrame with a "percentage" column.
        x_lim: upper limit of the x axis.
        product: product label shown in the title.

    Returns:
        The matplotlib figure that was drawn.
    """
    n_historical = historical.shape[0]
    historical_positions = np.array(list(range(n_historical)))
    # Shift by n_historical - 1 so the dashed line begins at the last solid point.
    predicted_positions = np.array(
        [step + n_historical - 1 for step in range(predicted.shape[0])]
    )
    historical_values = historical["percentage"]
    predicted_values = predicted["percentage"]
    cohort_date = historical.iloc[0]["cohort_first_month"].strftime('%Y-%m-%d')

    fig = plt.figure()
    ax = fig.gca()
    ax.plot(historical_positions, historical_values,
            label="historical", color="blue", linestyle="-")
    ax.plot(predicted_positions, predicted_values,
            label="predicted", color="blue", linestyle="--")
    ax.grid()
    ax.set_xlim([0, x_lim])
    ax.set_ylim([0, 1])
    ax.set_xlabel("Months")
    ax.set_ylabel("Percentage")
    ax.set_title(f" Cohort {cohort_date} | Product {product}")
    ax.legend()
    plt.show()
    return fig