Spaces:

jsra2
/

cohort_predictor

Runtime error

cohort_predictor / app_funcs.py

Santiago Roman

gradio app

3059673 about 1 year ago

6.46 kB

	import numpy as np
	import matplotlib.pyplot as plt
	import xgboost as xgb
	import pickle
	import os
	import pandas as pd

	# EDA
	def check_dates(df, end_date):
	    """
	    Check that every row's interval matches the length of its calendar month.

	    Each row's ``month_str`` is sliced as text: characters 0-3 are read as the
	    year and characters 5-6 as the two-digit month. The row's ``interval``
	    must equal that month's number of days as a day-resolution timedelta.
	    February uses the full Gregorian leap-year rule, so century years such as
	    1900 or 2100 are treated as having 28 days.

	    Args:
	        df: DataFrame with a ``month_str`` column (strings) and an
	            ``interval`` column comparable to ``np.timedelta64``.
	        end_date: ``month_str`` value to skip; rows equal to it are not
	            checked (presumably the last, still-open month -- confirm with
	            the caller).

	    Returns:
	        ``False`` as soon as one row has an unexpected interval, ``True``
	        otherwise (including for an empty frame).
	    """
	    import calendar  # local import: not part of the module's import header

	    days_by_month = {
	        "01": 31, "03": 31, "05": 31, "07": 31, "08": 31, "10": 31, "12": 31,
	        "04": 30, "06": 30, "09": 30, "11": 30,
	    }
	    for _, row in df.iterrows():
	        month_str = row["month_str"]
	        if month_str == end_date:
	            continue
	        month = month_str[5:7]
	        if month == "02":
	            year = int(month_str[:4])
	            expected_days = 29 if calendar.isleap(year) else 28
	        else:
	            expected_days = days_by_month.get(month)
	            if expected_days is None:
	                # Unrecognised month text is not validated, only skipped.
	                continue
	        if row["interval"] != np.timedelta64(expected_days, "D"):
	            return False

	    return True


	# EDA
	def plot_cohort(df, cohort_first_month, product):
	    """
	    Plot the ``percentage`` series of one cohort/product pair.

	    The rows are selected with ``get_sequence`` and drawn against their
	    positional index (0, 1, 2, ...), which the x-axis labels as months.
	    The y-axis is fixed to the range [0, 1].

	    Args:
	        df: DataFrame accepted by ``get_sequence``; the selected rows must
	            contain a ``percentage`` column.
	        cohort_first_month: cohort value to select.
	        product: product value to select.

	    Returns:
	        None. The figure is displayed with ``plt.show()``.
	    """
	    cohort_df = get_sequence(df, cohort_first_month, product)
	    n_points = cohort_df.shape[0]
	    fig = plt.figure()
	    ax = fig.gca()
	    ax.plot(np.arange(n_points), cohort_df["percentage"])
	    ax.grid()
	    ax.set_xlim([0, n_points])
	    ax.set_ylim([0, 1])
	    ax.set_xlabel("Months")
	    ax.set_ylabel("Percentage")
	    # Plain "|" separator: a backslash in front of it would be an invalid
	    # escape sequence and would show up literally in the title.
	    ax.set_title(f"{cohort_first_month} | {product}")
	    plt.show()

	##########################################################################################

	# train
	def plot_feature_importance(model, feature_names):
	    """
	    Draw a bar chart of a model's feature importances, largest first.

	    Args:
	        model: object exposing a ``feature_importances_`` array
	            (e.g. a fitted XGB model).
	        feature_names: names indexable by feature position.

	    Returns:
	        None. The chart is displayed with ``plt.show()``.
	    """
	    scores = model.feature_importances_
	    # Feature positions ordered from most to least important.
	    ranking = np.argsort(scores)[::-1]
	    bar_positions = range(len(scores))
	    tick_labels = [feature_names[position] for position in ranking]
	    plt.figure(figsize=(10, 6))
	    plt.title("Feature Importance")
	    plt.bar(bar_positions, scores[ranking])
	    plt.xticks(bar_positions, tick_labels, rotation=90)
	    plt.show()

	##########################################################################################

	# evaluate
	def get_sequence(df, cohort_first_month, product):
	    """
	    Return the rows of ``df`` that belong to one cohort and one product.

	    Args:
	        df: DataFrame with ``cohort_first_month`` and
	            ``cohort_first_product`` columns.
	        cohort_first_month: value matched against ``cohort_first_month``.
	        product: value matched against ``cohort_first_product``.

	    Returns:
	        The matching rows, in their original order and with their original
	        index labels.
	    """
	    same_cohort = df["cohort_first_month"] == cohort_first_month
	    same_product = df["cohort_first_product"] == product
	    return df[same_cohort & same_product]



	# evaluate
	def plot_true_and_predicted(y_true, y_pred, cohort, product):
	    """
	    Plot the true and the predicted time series of a cohort/product pair.

	    Both series are drawn against the positional index of ``y_true``
	    (0, 1, 2, ...), labelled as months, with the y-axis fixed to [0, 1].

	    Note carried over from the original documentation: each predicted step
	    is expected to have been produced from the true t-1 datapoint, not from
	    a sequence rolled out on earlier predictions. This function only plots
	    the values it is given.

	    Args:
	        y_true: array-like with a ``shape`` attribute holding the true values.
	        y_pred: predicted values, plotted against the same x positions.
	        cohort: cohort label used in the title.
	        product: product label used in the title.

	    Returns:
	        None. The figure is displayed with ``plt.show()``.
	    """
	    n_points = y_true.shape[0]
	    x = np.arange(n_points)
	    fig = plt.figure()
	    ax = fig.gca()
	    ax.plot(x, y_true, label="Y True")
	    ax.plot(x, y_pred, label="Y Pred")
	    ax.grid()
	    ax.set_xlim([0, n_points])
	    ax.set_ylim([0, 1])
	    ax.set_xlabel("Months")
	    ax.set_ylabel("Percentage")
	    # Plain "|" separator: a backslash in front of it would be an invalid
	    # escape sequence and would show up literally in the title.
	    ax.set_title(f"{cohort} | {product}")
	    ax.legend()
	    plt.show()


	# evaluate
	def get_product_one_hot_encode(product):
	    """
	    Build a one-row DataFrame that one-hot encodes ``product``.

	    Args:
	        product: product key; the known keys are ``"1m"``, ``"3m"`` and
	            ``"4m"``.

	    Returns:
	        A single-row DataFrame with columns ``product_1m``, ``product_3m``
	        and ``product_4m`` holding 0/1 flags. A key outside the known set
	        is appended as an extra column under its own name.
	    """
	    known_products = ("1m", "3m", "4m")
	    flags = dict.fromkeys(known_products, 0)
	    flags[product] = 1
	    prefixed = {key: f"product_{key}" for key in known_products}
	    return pd.DataFrame([flags]).rename(columns=prefixed)

	# evaluate
	def get_month_one_hot_encode(month):
	    """
	    Build a one-row DataFrame that one-hot encodes a month number.

	    Args:
	        month: month number; ``month - 1`` is used as a list index into the
	            twelve flags, so 1 maps to ``month_1`` and 12 to ``month_12``.

	    Returns:
	        A single-row DataFrame with columns ``month_1`` .. ``month_12``
	        where only the selected month is 1.
	    """
	    column_names = [f"month_{number}" for number in range(1, 13)]
	    flags = [0] * len(column_names)
	    flags[month - 1] = 1
	    return pd.DataFrame([flags], columns=column_names)

	# evaluate
	def generate_new_data(df, date, cohort, product, model, columns_to_drop, n_points):
	    """
	    Roll a model forward to generate the next datapoints of a cohort.

	    The row(s) of ``df`` matching ``cohort``, ``product`` and ``date`` are
	    the starting point. Each iteration predicts from the most recent
	    datapoint, then builds the next row from the incremented
	    months-since-acquisition counter, the prediction, the product one-hot
	    columns and the one-hot columns of the following calendar month. Every
	    generated row becomes the model input of the next iteration.

	    Args:
	        df: DataFrame with ``cohort_first_month``, ``cohort_first_product``,
	            ``month`` and ``months_since_acquisition`` columns.
	        date: starting month; characters 5-6 are parsed as the month number.
	        cohort: value matched against ``cohort_first_month``.
	        product: value matched against ``cohort_first_product``.
	        model: object whose ``predict`` accepts the feature DataFrame.
	        columns_to_drop: columns removed before the rows are fed to the model.
	        n_points: number of datapoints to generate.

	    Returns:
	        DataFrame holding the starting row(s) followed by the generated rows,
	        re-indexed from 0.

	    Raises:
	        IndexError: if no row matches ``cohort``, ``product`` and ``date``.
	    """
	    generated = df
	    filters = (
	        ("cohort_first_month", cohort),
	        ("cohort_first_product", product),
	        ("month", date),
	    )
	    for column, wanted in filters:
	        generated = generated[generated[column] == wanted]

	    month_number = int(date[5:7])
	    msa = generated["months_since_acquisition"].values[0]
	    generated = generated.drop(columns=columns_to_drop)
	    # NOTE(review): the first remaining column receives the month counter and
	    # the second receives the prediction -- confirm the column order matches
	    # what the model was trained on.
	    feature_columns = generated.columns
	    product_ohe = get_product_one_hot_encode(product)
	    model_input = generated.copy()

	    produced = 0
	    while produced < n_points:
	        prediction = model.predict(model_input)
	        # Advance to the next calendar month, wrapping December to January.
	        month_number = month_number % 12 + 1
	        msa += 1
	        next_row = pd.DataFrame([msa], columns=[feature_columns[0]])
	        next_row[feature_columns[1]] = prediction[0]
	        next_row = next_row.join(product_ohe)
	        next_row = next_row.join(get_month_one_hot_encode(month_number))
	        generated = pd.concat([generated, next_row], ignore_index=True)

	        model_input = next_row.copy()
	        produced += 1

	    return generated

	# evaluate
	def plot_example_from_case(historical, predicted, x_lim, product):
	    """
	    Plot a historical series followed by its model-generated continuation.

	    The historical ``percentage`` values are drawn as a solid blue line and
	    the predicted ones as a dashed blue line. Predicted x positions start at
	    ``len(historical) - 1``, so the first predicted point shares its
	    x position with the last historical point.

	    Args:
	        historical: non-empty DataFrame with ``percentage`` and
	            ``cohort_first_month`` columns; the first row's
	            ``cohort_first_month`` must support ``strftime``.
	        predicted: DataFrame with a ``percentage`` column.
	        x_lim: upper limit of the x-axis.
	        product: product label used in the title.

	    Returns:
	        The matplotlib figure, which is also displayed with ``plt.show()``.
	    """
	    n_historical = historical.shape[0]
	    x_historical = np.arange(n_historical)
	    x_predicted = np.arange(predicted.shape[0]) + (n_historical - 1)
	    y_historical = historical["percentage"]
	    y_predicted = predicted["percentage"]
	    cohort_date = historical.iloc[0]["cohort_first_month"].strftime('%Y-%m-%d')
	    fig = plt.figure()
	    ax = fig.gca()
	    ax.plot(x_historical, y_historical, label="historical", color="blue", linestyle="-")
	    ax.plot(x_predicted, y_predicted, label="predicted", color="blue", linestyle="--")
	    ax.grid()
	    ax.set_xlim([0, x_lim])
	    ax.set_ylim([0, 1])
	    ax.set_xlabel("Months")
	    ax.set_ylabel("Percentage")
	    # Plain "|" separator: a backslash in front of it would be an invalid
	    # escape sequence and would show up literally in the title.
	    ax.set_title(f" Cohort {cohort_date} | Product {product}")
	    ax.legend()
	    plt.show()

	    return fig