Santiago Roman committed on
Commit
3059673
1 Parent(s): 7276223

gradio app

Browse files
Files changed (3) hide show
  1. app.py +63 -0
  2. app_funcs.py +193 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ from PIL import Image
4
+ import requests
5
+ import xgboost
6
+ import pandas as pd
7
+
8
+ from app_funcs import *
9
+
10
+ import hopsworks
11
+ import joblib
12
+
13
# Log in to Hopsworks and open the project's feature store.
project = hopsworks.login()
fs = project.get_feature_store()

# Columns dropped from a row before it is passed to the model.
X_columns_to_drop = [
    "percentage_future",
    "cohort_first_month",
    "month",
    "cohort_first_product",
]

feature_view = fs.get_feature_view(name="cohorts_fv", version=1)

# Look up the registered model, download its files and load the pickled
# estimator. `model` is first the registry entry and is then rebound to the
# loaded estimator.
mr = project.get_model_registry()
model = mr.get_model("cohort_model", version=1)
model_dir = model.download()
model = joblib.load(f"{model_dir}/xgb_.pkl")

# Load the training dataset and put the label back next to the features.
data, labels = feature_view.get_training_data(training_dataset_version=1)
data["percentage_future"] = labels

# Parse both date columns as pandas datetimes.
data["cohort_first_month"] = pd.to_datetime(data["cohort_first_month"])
data["month"] = pd.to_datetime(data["month"])

# Order rows by product, then cohort, then calendar month.
data = data.sort_values(
    by=["cohort_first_product", "cohort_first_month", "month"]
)
36
+
37
+
38
def cohort_predict(start_date, cohort_start, product):
    """
    Gradio callback: forecast a cohort's active percentage and render the plot.

    Gradio passes the textbox values positionally, so the parameter names do
    not match the UI labels: ``start_date`` receives the "Cohort Start Date"
    field and ``cohort_start`` receives the "Prediction Start Date" field.
    NOTE(review): this function follows the UI labels; confirm that is the
    intended meaning.

    Args:
        start_date: value of the first textbox (the cohort's first month).
        cohort_start: value of the second textbox (the month from which the
            forecast starts).
        product: value of the third textbox (product identifier).

    Returns:
        PIL.Image.Image: the rendered plot, matching the ``gr.Image(type="pil")``
        output declared for the interface.
    """
    # Local imports: neither name is imported explicitly by this module.
    import io

    import matplotlib.pyplot as plt

    cohort_first_month = start_date
    prediction_start = cohort_start

    # 12 forecast steps, generated iteratively from the row at prediction_start.
    new_seq = generate_new_data(
        data, prediction_start, cohort_first_month, product,
        model, X_columns_to_drop, 12,
    )
    # Historical curve of the SAME cohort that the forecast was seeded from.
    hist_seq = get_sequence(data, cohort_first_month, product)

    fig = plot_example_from_case(hist_seq, new_seq, 25, product)

    # Rasterise the figure to a PIL image, then release the figure so repeated
    # requests do not accumulate open matplotlib figures.
    buffer = io.BytesIO()
    try:
        fig.savefig(buffer, format="png")
    finally:
        plt.close(fig)
    buffer.seek(0)
    image = Image.open(buffer)
    image.load()  # read the pixels now, while the buffer is at hand
    return image
50
+
51
# Build the web UI. Gradio passes the three textbox values to cohort_predict
# positionally, in the order they are listed under `inputs`: the first textbox
# binds to `start_date`, the second to `cohort_start`, the third to `product`.
demo = gr.Interface(
    fn=cohort_predict,
    inputs=[
        gr.Textbox(default='2022-04-01', label="Cohort Start Date"),
        gr.Textbox(default='2022-04-01', label="Prediction Start Date"),
        gr.Textbox(default="3m", label="Product (1m, 3m, 4m)"),
    ],
    outputs=gr.Image(type="pil"),
    title="Cohort Active Percentage Prediction",
    description="Predicts active user percentage in a future month for a cohort that started in specific date with specific product",
    allow_flagging="never",
)

# Start serving the interface.
demo.launch()
app_funcs.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import matplotlib.pyplot as plt
3
+ import xgboost as xgb
4
+ import pickle
5
+ import os
6
+ import pandas as pd
7
+
8
+ # EDA
9
def check_dates(df, end_date):
    """
    Check that each row's interval equals the length of its calendar month.

    Rows are read with ``df.iterrows()``. A row whose ``month_str`` equals
    ``end_date`` is skipped. For every other row the month is taken from
    characters 5-6 of ``month_str`` and the year (February only) from
    characters 0-3, i.e. a ``YYYY-MM...`` layout is assumed.

    Args:
        df: DataFrame with a ``month_str`` column (strings) and an
            ``interval`` column (timedeltas).
        end_date: ``month_str`` value of the row to skip.

    Returns:
        bool: False as soon as one row's interval differs from the number of
        days in its month, True otherwise (including for an empty frame).
        Rows whose month digits are not "01".."12" are not checked.
    """
    # Local import: `calendar` is not among this module's top-level imports.
    import calendar

    days_in_fixed_month = {
        "01": 31, "03": 31, "05": 31, "07": 31, "08": 31, "10": 31, "12": 31,
        "04": 30, "06": 30, "09": 30, "11": 30,
    }

    for _, row in df.iterrows():
        month_str = row["month_str"]
        if month_str == end_date:
            continue

        month_digits = month_str[5:7]
        if month_digits == "02":
            # Full Gregorian rule, so century years (1900, 2100, ...) are
            # handled correctly, unlike a plain `year % 4` test.
            year = int(month_str[:4])
            expected_days = 29 if calendar.isleap(year) else 28
        else:
            expected_days = days_in_fixed_month.get(month_digits)
            if expected_days is None:
                continue

        if row["interval"] != np.timedelta64(expected_days, "D"):
            return False

    return True
33
+
34
+
35
+ # EDA
36
def plot_cohort(df, cohort_first_month, product):
    """
    Show the "percentage" curve of a single cohort.

    The cohort's rows are selected with ``get_sequence`` and plotted against
    their position (0, 1, 2, ...) in that selection. The figure is displayed
    with ``plt.show()``; nothing is returned.
    """
    sequence = get_sequence(df, cohort_first_month, product)
    n_rows = sequence.shape[0]
    positions = np.array(range(n_rows))

    figure = plt.figure()
    axes = figure.gca()
    plt.plot(positions, sequence["percentage"])
    plt.grid()

    # x spans the whole sequence; y is fixed to the [0, 1] range.
    axes.set_xlim([0, n_rows])
    axes.set_ylim([0, 1])
    axes.set_xlabel("Months")
    axes.set_ylabel("Percentage")
    axes.set_title(f"{cohort_first_month} | {product}")

    plt.show()
53
+
54
+ ##########################################################################################
55
+
56
+ # train
57
def plot_feature_importance(model, feature_names):
    """
    Show a bar chart of a model's feature importances, largest first.

    Args:
        model: object exposing a ``feature_importances_`` array.
        feature_names: names indexable by feature position.
    """
    importances = model.feature_importances_
    # argsort is ascending; reverse it to rank from most to least important.
    ranking = np.argsort(importances)[::-1]
    ranked_names = [feature_names[position] for position in ranking]
    bar_positions = range(len(importances))

    plt.figure(figsize=(10, 6))
    plt.title("Feature Importance")
    plt.bar(bar_positions, importances[ranking])
    plt.xticks(bar_positions, ranked_names, rotation=90)
    plt.show()
69
+
70
+ ##########################################################################################
71
+
72
+ # evaluate
73
def get_sequence(df, cohort_first_month, product):
    """
    Return the rows of ``df`` that belong to one cohort of one product.

    A row is kept when its ``cohort_first_month`` equals ``cohort_first_month``
    and its ``cohort_first_product`` equals ``product``. Row order and index
    labels are those of ``df``.
    """
    in_cohort = df["cohort_first_month"] == cohort_first_month
    has_product = df["cohort_first_product"] == product
    return df[in_cohort & has_product]
80
+
81
+
82
+
83
+ # evaluate
84
def plot_true_and_predicted(y_true, y_pred, cohort, product):
    """
    Show the true and the predicted series of a cohort on one chart.

    Per the original author's note, each predicted point is produced from the
    true previous (t-1) datapoint; the predictions are not chained into a
    self-generated sequence. Both series are plotted against positions
    0..len(y_true)-1 and the figure is displayed with ``plt.show()``.
    """
    n_points = y_true.shape[0]
    positions = np.array(range(n_points))

    figure = plt.figure()
    axes = figure.gca()
    for series, series_label in ((y_true, "Y True"), (y_pred, "Y Pred")):
        plt.plot(positions, series, label=series_label)
    plt.grid()

    axes.set_xlim([0, n_points])
    axes.set_ylim([0, 1])
    axes.set_xlabel("Months")
    axes.set_ylabel("Percentage")
    axes.set_title(f"{cohort} | {product}")
    axes.legend()

    plt.show()
104
+
105
+
106
+ # evaluate
107
def get_product_one_hot_encode(product):
    """
    Build a one-row, one-hot encoded DataFrame for a product.

    Args:
        product: one of "1m", "3m" or "4m".

    Returns:
        pandas.DataFrame: a single row with the integer columns
        ``product_1m``, ``product_3m`` and ``product_4m``; the column of
        ``product`` holds 1 and the others hold 0.

    Raises:
        ValueError: if ``product`` is not one of the known products. Without
            this check an unknown value would be added as an extra column and
            none of the three expected columns would be set.
    """
    known_products = ("1m", "3m", "4m")
    if product not in known_products:
        raise ValueError(
            f"unknown product {product!r}; expected one of {known_products}"
        )

    encoded = {f"product_{name}": int(name == product) for name in known_products}
    return pd.DataFrame([encoded])
120
+
121
+ # evaluate
122
def get_month_one_hot_encode(month):
    """
    Build a one-row, one-hot encoded DataFrame for a calendar month.

    Args:
        month: month number, 1 (January) to 12 (December).

    Returns:
        pandas.DataFrame: a single row with the integer columns ``month_1``
        to ``month_12``; the column of ``month`` holds 1, the others 0.

    Raises:
        ValueError: if ``month`` is outside 1..12. Without this check, 0 and
            negative values would wrap around through negative list indexing
            (0 would silently encode December).
    """
    if not 1 <= month <= 12:
        raise ValueError(f"month must be in 1..12, got {month!r}")

    month_numbers = range(1, 13)
    column_names = [f"month_{number}" for number in month_numbers]
    flags = [int(number == month) for number in month_numbers]
    return pd.DataFrame([flags], columns=column_names)
132
+
133
+ # evaluate
134
def generate_new_data(df, date, cohort, product, model, columns_to_drop, n_points):
    """
    Generate ``n_points`` future rows for one cohort by iterated prediction.

    The row of ``df`` matching ``cohort``, ``product`` and ``date`` is the
    seed. At each step the model predicts from the latest row, and a new row
    is built from that prediction, the incremented months-since-acquisition
    counter, the product one-hot columns and the next calendar month's one-hot
    columns. The new row is then fed to the model at the next step.

    Args:
        df: DataFrame with at least the columns ``cohort_first_month``,
            ``cohort_first_product``, ``month`` and
            ``months_since_acquisition``.
        date: month of the seed row; anything ``pd.Timestamp`` can parse
            (e.g. a "YYYY-MM-DD" string).
        cohort: value of ``cohort_first_month`` identifying the cohort.
        product: product identifier ("1m", "3m" or "4m").
        model: object with a ``predict`` method accepting a DataFrame.
        columns_to_drop: columns removed from the seed row before predicting.
        n_points: number of rows to generate.

    Returns:
        pandas.DataFrame: the seed row (without ``columns_to_drop``) followed
        by the generated rows, with a fresh 0..n index.

    Raises:
        IndexError: if no row of ``df`` matches cohort, product and date.
    """
    seed_mask = (
        (df["cohort_first_month"] == cohort)
        & (df["cohort_first_product"] == product)
        & (df["month"] == date)
    )
    df_ = df[seed_mask]

    msa_values = df_["months_since_acquisition"].values
    if len(msa_values) == 0:
        # Kept as IndexError so existing callers see the same exception type.
        raise IndexError(
            f"no row for cohort {cohort!r}, product {product!r} at month {date!r}"
        )
    current_msa = msa_values[0]
    # Parse the month the same way pandas parsed `date` for the filter above.
    current_month = pd.Timestamp(date).month

    df_ = df_.drop(columns=columns_to_drop)
    # NOTE(review): the first two remaining columns are filled with the
    # months-since-acquisition counter and the prediction respectively;
    # presumably they are `months_since_acquisition` and `percentage` -
    # confirm against the feature view's column order.
    columns = df_.columns
    product_ohe = get_product_one_hot_encode(product)
    datapoint = df_.copy()

    generated = 0
    while generated < n_points:
        prediction = model.predict(datapoint)

        # Advance one calendar month, wrapping December -> January.
        current_month = (current_month % 12) + 1
        month_ohe = get_month_one_hot_encode(current_month)
        current_msa += 1

        new_row = pd.DataFrame([current_msa], columns=[columns[0]])
        new_row[columns[1]] = prediction[0]
        # All three frames have a single row with index 0, so the joins are
        # plain column concatenations.
        new_row = new_row.join(product_ohe).join(month_ohe)

        df_ = pd.concat([df_, new_row], ignore_index=True)
        datapoint = new_row.copy()
        generated += 1

    return df_
167
+
168
+ # evaluate
169
def plot_example_from_case(historical, predicted, x_lim, product):
    """
    Plot a cohort's historical curve (solid) followed by its forecast (dashed).

    The forecast's first point is placed on the last historical position, so
    the two lines join. The figure is shown with ``plt.show()`` and returned.

    Args:
        historical: DataFrame with ``percentage`` and ``cohort_first_month``
            columns; the title date is read from its first row.
        predicted: DataFrame with a ``percentage`` column.
        x_lim: upper bound of the x axis.
        product: product identifier shown in the title.

    Returns:
        The matplotlib figure that was drawn.
    """
    n_historical = historical.shape[0]
    historical_positions = np.array(range(n_historical))
    # Shift the forecast so it starts at the last historical position.
    predicted_positions = np.array(
        [step + n_historical - 1 for step in range(predicted.shape[0])]
    )
    historical_values = historical["percentage"]
    predicted_values = predicted["percentage"]
    cohort_date = historical.iloc[0]["cohort_first_month"].strftime('%Y-%m-%d')

    fig = plt.figure()
    axes = fig.gca()
    plt.plot(historical_positions, historical_values,
             label="historical", color="blue", linestyle="-")
    plt.plot(predicted_positions, predicted_values,
             label="predicted", color="blue", linestyle="--")
    plt.grid()

    axes.set_xlim([0, x_lim])
    axes.set_ylim([0, 1])
    axes.set_xlabel("Months")
    axes.set_ylabel("Percentage")
    axes.set_title(f" Cohort {cohort_date} | Product {product}")
    axes.legend()
    plt.show()

    return fig
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ hopsworks
2
+ joblib
3
+ scikit-learn
4
+ numpy
5
+ xgboost
6
+ pandas
7
+ matplotlib