Spaces:

jsra2
/

cohort_predictor

Runtime error

App Files Files Community

Santiago Roman committed on Jun 7, 2023

Commit

3059673

•

1 Parent(s): 7276223

gradio app

Browse files

Files changed (3) hide show

app.py +63 -0
app_funcs.py +193 -0
requirements.txt +7 -0

app.py ADDED Viewed

	@@ -0,0 +1,63 @@

+import gradio as gr
+import numpy as np
+from PIL import Image
+import requests
+import xgboost
+import pandas as pd
+from app_funcs import *
+import hopsworks
+import joblib
+project = hopsworks.login()
+fs = project.get_feature_store()
+X_columns_to_drop = ["percentage_future", "cohort_first_month", "month",
+                     "cohort_first_product"]
+feature_view = fs.get_feature_view(name="cohorts_fv", version=1)
+mr = project.get_model_registry()
+model = mr.get_model("cohort_model", version=1)
+model_dir = model.download()
+model = joblib.load(model_dir + "/xgb_.pkl")
+# Get feature view in a dataframe
+data, labels = feature_view.get_training_data(training_dataset_version=1)
+data["percentage_future"] = labels
+# Convert it to pandas datetime
+data["cohort_first_month"] = pd.to_datetime(data["cohort_first_month"])
+data["month"] = pd.to_datetime(data["month"])
+# Sort and assert
+data = data.sort_values(by=["cohort_first_product", "cohort_first_month", "month"])
+def cohort_predict(start_date, cohort_start, product):
+    input_list = []
+    input_list.append(start_date, cohort_start, product)
+    new_seq = generate_new_data(data, cohort_start, start_date, product, model, X_columns_to_drop, 12)
+    hist_seq = get_sequence(data, cohort_start, product)
+    fig = plot_example_from_case(hist_seq, new_seq, 25, product)
+    return fig
+demo = gr.Interface(
+    fn=cohort_predict,
+    title="Cohort Active Percentage Prediction",
+    description="Predicts active user percentage in a future month for a cohort that started in specific date with specific product",
+    allow_flagging="never",
+    inputs=[
+        gr.Textbox(default='2022-04-01', label="Cohort Start Date"),
+        gr.Textbox(default='2022-04-01', label="Prediction Start Date"),
+        gr.Textbox(default="3m", label="Product (1m, 3m, 4m)"),
+        ],
+    outputs=gr.Image(type="pil"))
+demo.launch()

app_funcs.py ADDED Viewed

	@@ -0,0 +1,193 @@

+import numpy as np
+import matplotlib.pyplot as plt
+import xgboost as xgb
+import pickle
+import os
+import pandas as pd
+# EDA
+def check_dates(df, end_date):
+    """
+    Checks that the dataframe is in correct order
+    """
+    months_31 = {"01", "03", "05", "07", "08", "10", "12"}
+    months_30 = {"04", "06", "09", "11"}
+    months_28 = {"02"}
+    for idx, row in df.iterrows():
+        if(row["month_str"] == end_date):
+            continue
+        if(row["month_str"][5:7] in months_31):
+            if (row["interval"] != np.timedelta64(31, "D")):
+                return False
+        if(row["month_str"][5:7] in months_30):
+            if (row["interval"] != np.timedelta64(30, "D")):
+                return False
+        if(row["month_str"][5:7] in months_28):
+            if (row["interval"] != np.timedelta64(28, "D") and int(row["month_str"][:4]) % 4 != 0):
+                return False
+            # Leap Year
+            if (row["interval"] != np.timedelta64(29, "D") and int(row["month_str"][:4]) % 4 == 0):
+                return False
+    return True
+# EDA
+def plot_cohort(df, cohort_first_month, product):
+    """
+    Plots the specified cohort given product and the date of the cohort
+    """
+    df_ = get_sequence(df, cohort_first_month, product)
+    x = np.array([x for x in range(df_.shape[0])])
+    fig = plt.figure()
+    ax = fig.gca()
+    plt.plot(x, df_["percentage"])
+    plt.grid()
+    ax.set_xlim([0, df_.shape[0]])
+    ax.set_ylim([0, 1])
+    ax.set_xlabel("Months")
+    ax.set_ylabel("Percentage")
+    ttle = f"{cohort_first_month} | {product}"
+    ax.set_title(ttle)
+    plt.show()
+##########################################################################################
+# train
+def plot_feature_importance(model, feature_names):
+    """
+    Plots the importance of the features of a XGB model
+    """
+    importances = model.feature_importances_
+    indices = np.argsort(importances)[::-1]
+    names = [feature_names[i] for i in indices]
+    plt.figure(figsize=(10, 6))
+    plt.title("Feature Importance")
+    plt.bar(range(len(importances)), importances[indices])
+    plt.xticks(range(len(importances)), names, rotation=90)
+    plt.show()
+##########################################################################################
+# evaluate
+def get_sequence(df, cohort_first_month, product):
+    """
+    Gets the dataframe of a sequence given the product and the date of the cohort
+    """
+    df_ = df[df["cohort_first_month"] == cohort_first_month]
+    df_ = df_[df_["cohort_first_product"] == product]
+    return df_
+# evaluate
+def plot_true_and_predicted(y_true, y_pred, cohort, product):
+    """
+    Plots the true and predicted time-series given a cohort and a product
+    Every step is of the predicted is given the true t-1 datapoint. Its does not
+    create an entire sequence from predictions.
+    """
+    x = np.array([x for x in range(y_true.shape[0])])
+    fig = plt.figure()
+    ax = fig.gca()
+    plt.plot(x, y_true, label="Y True")
+    plt.plot(x, y_pred, label="Y Pred")
+    plt.grid()
+    ax.set_xlim([0, y_true.shape[0]])
+    ax.set_ylim([0, 1])
+    ax.set_xlabel("Months")
+    ax.set_ylabel("Percentage")
+    ttle = f"{cohort} | {product}"
+    ax.set_title(ttle)
+    ax.legend()
+    plt.show()
+# evaluate
+def get_product_one_hot_encode(product):
+    """
+    Gets a one hot encoded dataframe of the possible products for a row
+    """
+    products = {"1m":0,"3m":0,"4m":0}
+    columns = ["product_1m", "product_3m", "product_4m"]
+    products[product] = 1
+    df = pd.DataFrame([products])
+    df = df.rename(columns = {"1m": columns[0],
+                              "3m": columns[1],
+                              "4m": columns[2]})
+    # print(df)
+    return df
+# evaluate
+def get_month_one_hot_encode(month):
+    """
+    Gets a one hot encoded dataframe of the months
+    """
+    months = [0 for x in range(12)]
+    columns = [f"month_{x}" for x in range(1,13)]
+    months[month-1] = 1
+    df = pd.DataFrame([months], columns=columns)
+    # print(df)
+    return df
+# evaluate
+def generate_new_data(df, date, cohort, product, model, columns_to_drop, n_points):
+    """
+    This function generates data for a cohort of a product, from a specified date.
+    It will use the predicion model, to generate the n consequent time steps of a cohort.
+    The datapoints will be generated given the previously generated datapoints, in an iterative
+    fashion
+    """
+    df_ = df[df["cohort_first_month"] == cohort]
+    df_ = df_[df_["cohort_first_product"] == product]
+    df_ = df_[df_["month"] == date]
+    current_month = int(date[5:7])
+    current_msa = df_["months_since_acquisition"].values[0]
+    df_ = df_.drop(columns=columns_to_drop)
+    columns = df_.columns
+    product_ohe = get_product_one_hot_encode(product)
+    datapoint = df_.copy()
+    counter = 0
+    while(counter < n_points):
+        prediction = model.predict(datapoint)
+        # print(prediction)
+        current_month = (current_month%12)+1
+        month_ohe = get_month_one_hot_encode(current_month)
+        current_msa += 1
+        new_row = pd.DataFrame([current_msa], columns=[columns[0]])
+        new_row[columns[1]] = prediction[0]
+        new_row = new_row.join(product_ohe)
+        new_row = new_row.join(month_ohe)
+        df_ = pd.concat([df_,new_row], ignore_index=True)
+        datapoint = new_row.copy()
+        counter +=1
+    return df_
+# evaluate
+def plot_example_from_case(historical, predicted, x_lim, product):
+    """
+    With the generated data, it plots the historical true data, and in a dotted line
+    the data that was predicted by the model for the subsequent datapoints.
+    """
+    x_historical = np.array([x for x in range(historical.shape[0])])
+    x_predicted= np.array([x + historical.shape[0]-1 for x in range(predicted.shape[0])])
+    y_historical = historical["percentage"]
+    y_predicted = predicted["percentage"]
+    cohort_date = historical.iloc[0]["cohort_first_month"].strftime('%Y-%m-%d')
+    fig = plt.figure()
+    ax = fig.gca()
+    plt.plot(x_historical, y_historical, label="historical", color="blue", linestyle="-")
+    plt.plot(x_predicted, y_predicted, label="predicted", color="blue", linestyle="--")
+    plt.grid()
+    ax.set_xlim([0, x_lim])
+    ax.set_ylim([0, 1])
+    ax.set_xlabel("Months")
+    ax.set_ylabel("Percentage")
+    ttle = f" Cohort {cohort_date} | Product {product}"
+    ax.set_title(ttle)
+    ax.legend()
+    plt.show()
+    return fig

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+hopsworks
+joblib
+scikit-learn
+numpy
+xgboost
+pandas
+matplotlib