Spaces:
Running
Running
andrewicus
committed on
Commit
•
a62d129
1
Parent(s):
a8ac252
Draft of all Pages
Browse files- Introduction.py +98 -0
- ifood-data.csv +0 -0
- pages/01 📊 Data Visualization.py +4 -1
- pages/02 🤖 Model Prediction.py +47 -20
- requirements.txt +1 -0
Introduction.py
ADDED
@@ -0,0 +1,98 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
from PIL import Image
|
4 |
+
|
5 |
+
url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
|
6 |
+
st.image(url, output_format="PNG", width=300)
|
7 |
+
|
8 |
+
st.title("Doordash CRM Data Analysis")
|
9 |
+
|
10 |
+
st.markdown("##### Context")
|
11 |
+
st.markdown("Doordash is one of the leading food delivery apps in the United States, present in over seven thousand cities.")
|
12 |
+
st.markdown("Keeping a high customer engagement is key for growing and consolidating the companyβs position as the market leader.")
|
13 |
+
st.markdown("To expand their product offering, the company is currently looking to launch a physical button that customers can press to automatically place an order for their favorite meal. Doordash is looking to maximize marketing efforts for this new product.")
|
14 |
+
|
15 |
+
st.image('amazon-dash.png', caption="DoorDash's new Insta-Order Button", width = 300)
|
16 |
+
|
17 |
+
st.markdown("##### Objectives")
|
18 |
+
st.markdown("The objective of the team is to build a predictive model that will produce the highest profit for the next direct marketing campaign, scheduled for next month. The new campaign, sixth, aims at selling a new gadget to the Customer Database. The team is set on developing a model that predicts customer response to various marketing tactics, which will then be applied to the rest of the customer base.")
|
19 |
+
st.markdown("Hopefully the model will allow the company to cherry pick the customers that are most likely to purchase the offer while leaving out the non-respondents, making the sixth campaign highly profitable. Moreover, other than maximizing the profit of this new campaign while reducing expenses, the CMO is interested in understanding the characteristic features of those customers who are responsive to purchasing the gadget.")
|
20 |
+
|
21 |
+
st.markdown("### Key Goals:")
|
22 |
+
st.markdown("1. Propose and describe a customer segmentation based on customers behaviors.")
|
23 |
+
st.markdown("2. Create a predictive model which allows the company to maximize the profits while reducing expenses of the sixth marketing campaign.")
|
24 |
+
st.markdown("3. By examining which past campaigns were the most responsive, the team can implement the most successful strategies into the sixth campaign to increase customer retention.")
|
25 |
+
|
26 |
+
st.markdown("##### Data Source")
|
27 |
+
st.markdown("The data set contains socio-demographic and firmographic features from about 2.240 customers who were contacted. Additionally, it contains a flag for those customers who responded to the campaign by purchasing the product.")
|
28 |
+
|
29 |
+
df = pd.read_csv("ifood-data.csv")
|
30 |
+
|
31 |
+
num = st.number_input('No. of Rows', 5, 10)
|
32 |
+
|
33 |
+
head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
|
34 |
+
if head == 'Head':
|
35 |
+
st.dataframe(df.head(num))
|
36 |
+
else:
|
37 |
+
st.dataframe(df.tail(num))
|
38 |
+
|
39 |
+
st.text('(Rows,Columns)')
|
40 |
+
st.write(df.shape)
|
41 |
+
|
42 |
+
st.markdown("### Fields")
|
43 |
+
st.markdown("- AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise")
|
44 |
+
st.markdown("- AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise")
|
45 |
+
st.markdown("- AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise")
|
46 |
+
st.markdown("- AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise")
|
47 |
+
st.markdown("- AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise")
|
48 |
+
st.markdown("- Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise")
|
49 |
+
st.markdown("- Complain - 1 if customer complained in the last 2 years")
|
50 |
+
st.markdown("- DtCustomer - date of customerβs enrolment with the company")
|
51 |
+
st.markdown("- Education - customerβs level of education")
|
52 |
+
st.markdown("- Marital - customerβs marital status")
|
53 |
+
st.markdown("- Kidhome - number of small children in customerβs household")
|
54 |
+
st.markdown("- Teenhome - number of teenagers in customerβs household")
|
55 |
+
st.markdown("- Income - customerβs yearly household income")
|
56 |
+
st.markdown("- MntFishProducts - amount spent on fish products in the last 2 years")
|
57 |
+
st.markdown("- MntMeatProducts - amount spent on meat products in the last 2 years")
|
58 |
+
st.markdown("- MntFruits - amount spent on fruits products in the last 2 years")
|
59 |
+
st.markdown("- MntSweetProducts - amount spent on sweet products in the last 2 years")
|
60 |
+
st.markdown("- MntWines - amount spent on wine products in the last 2 years")
|
61 |
+
st.markdown("- MntGoldProds - amount spent on gold products in the last 2 years")
|
62 |
+
st.markdown("- NumDealsPurchases - number of purchases made with discount")
|
63 |
+
st.markdown("- NumCatalogPurchases - number of purchases made using catalogue")
|
64 |
+
st.markdown("- NumStorePurchases - number of purchases made directly in stores")
|
65 |
+
st.markdown("- NumWebPurchases - number of purchases made through companyβs web site")
|
66 |
+
st.markdown("- NumWebVisitsMonth - number of visits to companyβs web site in the last month")
|
67 |
+
st.markdown("- Recency - number of days since the last purchase")
|
68 |
+
|
69 |
+
|
70 |
+
st.markdown("### Description of Data")
|
71 |
+
st.dataframe(df.describe())
|
72 |
+
|
73 |
+
st.markdown("### Missing Values")
|
74 |
+
st.markdown("Null or NaN values.")
|
75 |
+
|
76 |
+
dfnull = df.isnull().sum()/len(df)*100
|
77 |
+
totalmiss = dfnull.sum().round(2)
|
78 |
+
totalmiss = round(totalmiss/len(df.columns),2)
|
79 |
+
st.write("Percentage of total missing values: ",totalmiss)
|
80 |
+
st.write(dfnull)
|
81 |
+
if totalmiss <= 30:
|
82 |
+
st.success("We have less then 30 percent of missing values, which is good. This provides us with more accurate data as the null values will not significantly affect the outcomes of our conclusions. And no bias will steer towards misleading results. ")
|
83 |
+
else:
|
84 |
+
st.warning("Poor data quality due to greater than 30 percent of missing value.")
|
85 |
+
st.markdown(" > Theoretically, 25 to 30 percent is the maximum missing values are allowed, there's no hard and fast rule to decide this threshold. It can vary from problem to problem.")
|
86 |
+
|
87 |
+
st.markdown("### Completeness")
|
88 |
+
st.markdown(" The ratio of non-missing values to total records in dataset and how comprehensive the data is.")
|
89 |
+
|
90 |
+
st.write("Total data length:", len(df))
|
91 |
+
nonmissing = (df.notnull().sum().round(2))
|
92 |
+
completeness= round(sum(nonmissing)/df.size,2)
|
93 |
+
st.write("Completeness ratio:",completeness)
|
94 |
+
st.write(nonmissing)
|
95 |
+
if completeness >= 0.80:
|
96 |
+
st.success("We have completeness ratio greater than 0.85, which is good. It shows that the vast majority of the data is available for us to use and analyze. ")
|
97 |
+
else:
|
98 |
+
st.success("Poor data quality due to low completeness ratio( less than 0.85).")
|
ifood-data.csv
CHANGED
The diff for this file is too large to render.
See raw diff
|
|
pages/01 📊 Data Visualization.py
CHANGED
@@ -26,15 +26,17 @@ st.metric(value = df_unclean.shape[0] - df.shape[0], label = "Difference")
|
|
26 |
|
27 |
# Education Levels
|
28 |
|
|
|
29 |
st.bar_chart(df.groupby("Education").size(), color = "#FF3008")
|
30 |
|
|
|
31 |
st.bar_chart(df.groupby("Year_Birth").size(), color = "#FF3008")
|
32 |
|
|
|
33 |
accepted_cmp_dataset = df[["AcceptedCmp1","AcceptedCmp2","AcceptedCmp3","AcceptedCmp4","AcceptedCmp5"]]
|
34 |
counts = accepted_cmp_dataset.sum()
|
35 |
counts = counts.reset_index()
|
36 |
counts.columns = ['Campaign', 'Frequency']
|
37 |
-
|
38 |
st.bar_chart(counts.set_index('Campaign'), color = "#FF3008")
|
39 |
|
40 |
df['Education'] = df['Education'].astype('category').cat.codes
|
@@ -43,6 +45,7 @@ df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
|
|
43 |
df = df.drop(["Dt_Customer"], axis = 1)
|
44 |
df = df.drop(["ID"], axis = 1)
|
45 |
|
|
|
46 |
heatmap = plt.figure(figsize=(18, 10))
|
47 |
sns.heatmap(df.corr().round(2), annot=True, cmap="Reds")
|
48 |
st.pyplot(heatmap)
|
|
|
26 |
|
27 |
# Education Levels
|
28 |
|
29 |
+
st.header("Education Distribution")
|
30 |
st.bar_chart(df.groupby("Education").size(), color = "#FF3008")
|
31 |
|
32 |
+
st.header("Birth Year Distribution")
|
33 |
st.bar_chart(df.groupby("Year_Birth").size(), color = "#FF3008")
|
34 |
|
35 |
+
st.header("Birth Year Distribution")
|
36 |
accepted_cmp_dataset = df[["AcceptedCmp1","AcceptedCmp2","AcceptedCmp3","AcceptedCmp4","AcceptedCmp5"]]
|
37 |
counts = accepted_cmp_dataset.sum()
|
38 |
counts = counts.reset_index()
|
39 |
counts.columns = ['Campaign', 'Frequency']
|
|
|
40 |
st.bar_chart(counts.set_index('Campaign'), color = "#FF3008")
|
41 |
|
42 |
df['Education'] = df['Education'].astype('category').cat.codes
|
|
|
45 |
df = df.drop(["Dt_Customer"], axis = 1)
|
46 |
df = df.drop(["ID"], axis = 1)
|
47 |
|
48 |
+
st.header("Heatmap")
|
49 |
heatmap = plt.figure(figsize=(18, 10))
|
50 |
sns.heatmap(df.corr().round(2), annot=True, cmap="Reds")
|
51 |
st.pyplot(heatmap)
|
pages/02 🤖 Model Prediction.py
CHANGED
@@ -11,6 +11,7 @@ import matplotlib.pyplot as plt
|
|
11 |
from sklearn.linear_model import LogisticRegression
|
12 |
from sklearn.metrics import classification_report
|
13 |
from codecarbon import EmissionsTracker
|
|
|
14 |
|
15 |
url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
|
16 |
st.image(url, output_format="PNG", width=300)
|
@@ -24,31 +25,57 @@ df['Education'] = df['Education'].astype('category').cat.codes
|
|
24 |
df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
|
25 |
df = df.drop(["Dt_Customer"], axis = 1)
|
26 |
df = df.drop(["ID"], axis = 1)
|
|
|
|
|
27 |
|
28 |
-
|
29 |
-
|
30 |
-
|
31 |
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
33 |
|
34 |
-
|
35 |
-
|
|
|
|
|
|
|
|
|
|
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
log_results = logmodel.predict(X_test)
|
40 |
|
41 |
-
clf
|
42 |
-
clf = clf.fit(X_train,y_train)
|
43 |
-
tree_results = clf.predict(X_test)
|
44 |
|
45 |
-
|
46 |
-
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
|
50 |
-
|
51 |
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
11 |
from sklearn.linear_model import LogisticRegression
|
12 |
from sklearn.metrics import classification_report
|
13 |
from codecarbon import EmissionsTracker
|
14 |
+
import time
|
15 |
|
16 |
url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
|
17 |
st.image(url, output_format="PNG", width=300)
|
|
|
25 |
df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
|
26 |
df = df.drop(["Dt_Customer"], axis = 1)
|
27 |
df = df.drop(["ID"], axis = 1)
|
28 |
+
params = st.multiselect("Select Parameters", df.columns, default = ["Year_Birth"])
|
29 |
+
model = st.selectbox("Select Model", ["Logistic Regression", "K-Nearest Neighbors", "Decision Tree"])
|
30 |
|
31 |
+
if not params:
|
32 |
+
st.warning("Please select at least one parameter.")
|
33 |
+
else:
|
34 |
|
35 |
+
X = df.drop(labels = ['Response'], axis = 1)
|
36 |
+
X = df[params]
|
37 |
+
y = df["Response"]
|
38 |
+
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.3, random_state = 42)
|
39 |
+
model_start_time = time.time()
|
40 |
+
tracker = EmissionsTracker()
|
41 |
+
tracker.start()
|
42 |
+
if(model == "Logistic Regression"):
|
43 |
+
logmodel = LogisticRegression()
|
44 |
+
logmodel.fit(X_train,y_train)
|
45 |
+
model_accuracy = logmodel.predict(X_test)
|
46 |
+
elif(model == "K-Nearest Neighbors"):
|
47 |
|
48 |
+
knn = KNeighborsClassifier()
|
49 |
+
knn.fit(X_train, y_train)
|
50 |
+
model_accuracy = knn.predict(X_test)
|
51 |
+
else:
|
52 |
+
clf = DecisionTreeClassifier(max_depth=3)
|
53 |
+
clf = clf.fit(X_train,y_train)
|
54 |
+
model_accuracy = clf.predict(X_test)
|
55 |
|
56 |
+
import graphviz
|
57 |
+
from sklearn.tree import export_graphviz
|
|
|
58 |
|
59 |
+
# Assuming `clf` and `X` are defined somewhere in your code
|
|
|
|
|
60 |
|
61 |
+
# Your code for exporting the decision tree graph
|
62 |
+
feature_names = X.columns
|
63 |
+
feature_cols = X.columns
|
64 |
+
dot_data = export_graphviz(clf, out_file=None,
|
65 |
+
feature_names=feature_cols,
|
66 |
+
class_names=['0', '1'],
|
67 |
+
filled=True, rounded=True,
|
68 |
+
special_characters=True)
|
69 |
|
70 |
+
# Display the graph using streamlit_graphviz
|
71 |
+
st.graphviz_chart(dot_data)
|
72 |
|
73 |
+
model_end_time = time.time()
|
74 |
+
model_execution_time = model_end_time - model_start_time
|
75 |
+
|
76 |
+
|
77 |
+
emissions = tracker.stop()
|
78 |
+
print(f"Estimated emissions for training the model: {emissions:.4f} kg of CO2")
|
79 |
+
|
80 |
+
st.metric(label = "Accuracy", value = str(round(metrics.accuracy_score(y_test, model_accuracy)*100, 2)) + "%")
|
81 |
+
st.metric(label = "Execution time:", value = str(model_execution_time) + "s")
|
requirements.txt
CHANGED
@@ -7,3 +7,4 @@ tensorflow
|
|
7 |
matplotlib
|
8 |
streamlit
|
9 |
seaborn
|
|
|
|
7 |
matplotlib
|
8 |
streamlit
|
9 |
seaborn
|
10 |
+
graphviz
|