ziyaforbes committed on
Commit
dc607dd
•
1 Parent(s): 889f021

Upload 10 files

Introduction.py ADDED
@@ -0,0 +1,98 @@
+ import streamlit as st
+ import pandas as pd
+
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+ st.image(url, output_format="PNG", width=300)
+
+ st.title("DoorDash CRM Data Analysis")
+
+ st.markdown("##### Context")
+ st.markdown("DoorDash is one of the leading food delivery apps in the United States, operating in over seven thousand cities.")
+ st.markdown("Keeping customer engagement high is key to growing and consolidating the company's position as the market leader.")
+ st.markdown("To expand its product offering, the company is currently looking to launch a physical button that customers can press to automatically place an order for their favorite meal. DoorDash wants to maximize the marketing effort behind this new product.")
+
+ st.image('amazon-dash.png', caption="DoorDash's new Insta-Order Button", width=300)
+
+ st.markdown("##### Objectives")
+ st.markdown("The team's objective is to build a predictive model that produces the highest profit for the next direct marketing campaign, scheduled for next month. The new campaign, the sixth, aims to sell the new gadget to the customer database. The team is set on developing a model that predicts customer response to various marketing tactics, which will then be applied to the rest of the customer base.")
+ st.markdown("The model should let the company cherry-pick the customers most likely to purchase the offer while leaving out the non-respondents, making the sixth campaign highly profitable. Beyond maximizing the profit of the new campaign while reducing expenses, the CMO wants to understand the characteristic features of the customers who respond by purchasing the gadget.")
+
+ st.markdown("### Key Goals:")
+ st.markdown("1. Propose and describe a customer segmentation based on customer behaviors.")
+ st.markdown("2. Create a predictive model that allows the company to maximize profits while reducing the expenses of the sixth marketing campaign.")
+ st.markdown("3. Examine which past campaigns drew the strongest response, so that the most successful strategies can be carried over into the sixth campaign to increase customer retention.")
+
+ st.markdown("##### Data Source")
+ st.markdown("The data set contains socio-demographic and firmographic features for about 2,240 customers who were contacted. Additionally, it contains a flag for those customers who responded to the campaign by purchasing the product.")
+
+ df = pd.read_csv("ifood-data.csv")
+
+ num = st.number_input('No. of Rows', 5, 10)
+
+ head = st.radio('View from top (head) or bottom (tail)', ('Head', 'Tail'))
+ if head == 'Head':
+     st.dataframe(df.head(num))
+ else:
+     st.dataframe(df.tail(num))
+
+ st.text('(Rows, Columns)')
+ st.write(df.shape)
+
+ st.markdown("### Fields")
+ st.markdown("- AcceptedCmp1 - 1 if customer accepted the offer in the 1st campaign, 0 otherwise")
+ st.markdown("- AcceptedCmp2 - 1 if customer accepted the offer in the 2nd campaign, 0 otherwise")
+ st.markdown("- AcceptedCmp3 - 1 if customer accepted the offer in the 3rd campaign, 0 otherwise")
+ st.markdown("- AcceptedCmp4 - 1 if customer accepted the offer in the 4th campaign, 0 otherwise")
+ st.markdown("- AcceptedCmp5 - 1 if customer accepted the offer in the 5th campaign, 0 otherwise")
+ st.markdown("- Response (target) - 1 if customer accepted the offer in the last campaign, 0 otherwise")
+ st.markdown("- Complain - 1 if customer complained in the last 2 years")
+ st.markdown("- Dt_Customer - date of customer's enrolment with the company")
+ st.markdown("- Education - customer's level of education")
+ st.markdown("- Marital_Status - customer's marital status")
+ st.markdown("- Kidhome - number of small children in customer's household")
+ st.markdown("- Teenhome - number of teenagers in customer's household")
+ st.markdown("- Income - customer's yearly household income")
+ st.markdown("- MntFishProducts - amount spent on fish products in the last 2 years")
+ st.markdown("- MntMeatProducts - amount spent on meat products in the last 2 years")
+ st.markdown("- MntFruits - amount spent on fruit products in the last 2 years")
+ st.markdown("- MntSweetProducts - amount spent on sweet products in the last 2 years")
+ st.markdown("- MntWines - amount spent on wine products in the last 2 years")
+ st.markdown("- MntGoldProds - amount spent on gold products in the last 2 years")
+ st.markdown("- NumDealsPurchases - number of purchases made with a discount")
+ st.markdown("- NumCatalogPurchases - number of purchases made using a catalogue")
+ st.markdown("- NumStorePurchases - number of purchases made directly in stores")
+ st.markdown("- NumWebPurchases - number of purchases made through the company's web site")
+ st.markdown("- NumWebVisitsMonth - number of visits to the company's web site in the last month")
+ st.markdown("- Recency - number of days since the last purchase")
+
+ st.markdown("### Description of Data")
+ st.dataframe(df.describe())
+
+ st.markdown("### Missing Values")
+ st.markdown("Null or NaN values.")
+
+ dfnull = df.isnull().sum() / len(df) * 100  # percentage missing per column
+ totalmiss = round(dfnull.sum() / len(df.columns), 2)  # average percentage missing
+ st.write("Percentage of total missing values: ", totalmiss)
+ st.write(dfnull)
+ if totalmiss <= 30:
+     st.success("Less than 30 percent of the values are missing, which is good. The null values will not significantly affect our conclusions or bias the results.")
+ else:
+     st.warning("Poor data quality: more than 30 percent of the values are missing.")
+ st.markdown("> Theoretically, 25 to 30 percent is the maximum share of missing values that can be tolerated, but there is no hard and fast rule for this threshold. It can vary from problem to problem.")
+
+ st.markdown("### Completeness")
+ st.markdown("The ratio of non-missing values to total records in the dataset, i.e. how comprehensive the data is.")
+
+ st.write("Total data length:", len(df))
+ nonmissing = df.notnull().sum()
+ completeness = round(sum(nonmissing) / df.size, 2)
+ st.write("Completeness ratio:", completeness)
+ st.write(nonmissing)
+ if completeness >= 0.80:
+     st.success("The completeness ratio is at least 0.80, which is good. It shows that the vast majority of the data is available for us to use and analyze.")
+ else:
+     st.warning("Poor data quality due to a low completeness ratio (less than 0.80).")
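The objectives above frame the problem as profit maximization rather than raw accuracy. A minimal sketch of that idea, assuming an illustrative cost per contact and revenue per sale (neither figure comes from the dataset) and reusing the cleaning steps the app's pages apply:

```python
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

COST_PER_CONTACT = 3.0   # assumed cost of contacting one customer (illustrative)
REVENUE_PER_SALE = 11.0  # assumed revenue per accepted offer (illustrative)

df = pd.read_csv("ifood-data.csv").dropna()
df["Education"] = df["Education"].astype("category").cat.codes
df["Marital_Status"] = df["Marital_Status"].astype("category").cat.codes
df = df.drop(["Dt_Customer", "ID"], axis=1)

X, y = df.drop("Response", axis=1), df["Response"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Predicted probability that each held-out customer responds
proba = LogisticRegression(max_iter=1000).fit(X_train, y_train).predict_proba(X_test)[:, 1]

# Contact a customer only when expected revenue beats the contact cost:
# proba * REVENUE_PER_SALE > COST_PER_CONTACT
contact = proba > COST_PER_CONTACT / REVENUE_PER_SALE
profit = (y_test[contact] * REVENUE_PER_SALE - COST_PER_CONTACT).sum()
print(f"Contacted {contact.sum()} customers, simulated profit: {profit:.2f}")
```

The break-even rule `proba > cost / revenue` replaces the default 0.5 cutoff, which is what lets the model cherry-pick profitable customers; lowering the assumed revenue raises the bar for who gets contacted.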
README.md CHANGED
@@ -1,12 +1 @@
- ---
- title: Doordash CRM Data Analysis
- emoji: 🏒
- colorFrom: green
- colorTo: purple
- sdk: streamlit
- sdk_version: 1.33.0
- app_file: app.py
- pinned: false
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+ # doordash-data-analysis

amazon-dash.png ADDED
ifood-data.csv ADDED
The diff for this file is too large to render. See raw diff
 
pages/01 📊 Data Visualization.py ADDED
@@ -0,0 +1,79 @@
+ import pandas as pd
+ import streamlit as st
+ import seaborn as sns
+ import matplotlib.pyplot as plt
+
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+ st.image(url, output_format="PNG", width=300)
+
+ st.title("Data Visualization")
+
+ df_unclean = pd.read_csv("ifood-data.csv")
+
+ st.dataframe(df_unclean)
+
+ st.header("Cleaning the Data")
+ st.metric(value=df_unclean.shape[0], label="Rows")
+ st.write('After dropping NA values and cleaning up outliers:')
+ df = df_unclean.dropna()
+ df = df[df["Year_Birth"] > 1940].copy()  # drop implausible birth years
+
+ st.metric(value=df.shape[0], label="Rows")
+
+ st.metric(value=df_unclean.shape[0] - df.shape[0], label="Difference")
+
+ # Education levels
+ st.bar_chart(df.groupby("Education").size(), color="#FF3008")
+
+ # Birth years
+ st.bar_chart(df.groupby("Year_Birth").size(), color="#FF3008")
+
+ # Acceptance counts for the five past campaigns
+ accepted_cmp_dataset = df[["AcceptedCmp1", "AcceptedCmp2", "AcceptedCmp3", "AcceptedCmp4", "AcceptedCmp5"]]
+ counts = accepted_cmp_dataset.sum().reset_index()
+ counts.columns = ['Campaign', 'Frequency']
+
+ st.bar_chart(counts.set_index('Campaign'), color="#FF3008")
+
+ # Encode categoricals and drop non-numeric columns so df.corr() works
+ df['Education'] = df['Education'].astype('category').cat.codes
+ df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
+ df = df.drop(["Dt_Customer", "ID"], axis=1)
+
+ heatmap = plt.figure(figsize=(18, 10))
+ sns.heatmap(df.corr().round(2), annot=True, cmap="Reds")
+ st.pyplot(heatmap)
+
+ # Limit the detailed plots to the variables most correlated with Response
+ recency_response = plt.figure()
+ sns.boxplot(x=df['Response'], y=df['Recency'], color="#ff3008")
+ plt.xlabel('Response')
+ plt.ylabel('Recency')
+ plt.title('Box Plot of Response vs Recency')
+ st.pyplot(recency_response)
+
+ response_income = plt.figure()
+ sns.barplot(x=df['Response'], y=df['Income'], color="#ff3008")
+ plt.xlabel('Response')
+ plt.ylabel('Income')
+ plt.title('Bar Plot of Response vs Income')
+ st.pyplot(response_income)
+
+ response_teenhome = plt.figure()
+ sns.boxplot(x=df['Response'], y=df['Teenhome'], color="#ff3008")
+ plt.xlabel('Response')
+ plt.ylabel('Teenhome')
+ plt.title('Box Plot of Response vs Teenhome')
+ st.pyplot(response_teenhome)
+
+ response_kidhome = plt.figure()
+ sns.boxplot(x=df['Response'], y=df['Kidhome'], color="#ff3008")
+ plt.xlabel('Response')
+ plt.ylabel('Kidhome')
+ plt.title('Box Plot of Response vs Kidhome')
+ st.pyplot(response_kidhome)
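The note about limiting the detailed plots to the most correlated variables can be made programmatic. A small sketch, using the same cleaning steps as the page above, that ranks features by absolute correlation with `Response`:

```python
import pandas as pd

df = pd.read_csv("ifood-data.csv").dropna()
df['Education'] = df['Education'].astype('category').cat.codes
df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
df = df.drop(["Dt_Customer", "ID"], axis=1)

# Rank features by the strength of their linear relationship with the target
top_corr = (df.corr()['Response']
              .drop('Response')  # exclude the target itself
              .abs()
              .sort_values(ascending=False))
print(top_corr.head(5))  # candidates for the detailed box/bar plots above
```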
pages/02 🤖 Model Prediction.py ADDED
@@ -0,0 +1,54 @@
+ import streamlit as st
+ import pandas as pd
+ from sklearn.model_selection import train_test_split
+ from sklearn.neighbors import KNeighborsClassifier
+ from sklearn.tree import DecisionTreeClassifier
+ from sklearn.linear_model import LogisticRegression
+ from sklearn import metrics
+ from codecarbon import EmissionsTracker
+
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+ st.image(url, output_format="PNG", width=300)
+
+ st.title("Model Prediction")
+
+ # Same cleaning steps as on the Data Visualization page
+ df_unclean = pd.read_csv("ifood-data.csv")
+ df = df_unclean.dropna()
+ df = df[df["Year_Birth"] > 1940].copy()
+ df['Education'] = df['Education'].astype('category').cat.codes
+ df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
+ df = df.drop(["Dt_Customer", "ID"], axis=1)
+
+ X = df.drop(labels=['Response'], axis=1)
+ y = df["Response"]
+
+ # Let the user choose which features the models are trained on
+ features = st.multiselect("Select Parameters", list(X.columns), default=list(X.columns))
+ if features:
+     X = X[features]
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+ # Track the carbon footprint of model training
+ tracker = EmissionsTracker()
+ tracker.start()
+
+ logmodel = LogisticRegression(max_iter=1000)
+ logmodel.fit(X_train, y_train)
+ log_results = logmodel.predict(X_test)
+
+ clf = DecisionTreeClassifier()
+ clf = clf.fit(X_train, y_train)
+ tree_results = clf.predict(X_test)
+
+ knn = KNeighborsClassifier()
+ knn.fit(X_train, y_train)
+ knn_results = knn.predict(X_test)
+
+ emissions = tracker.stop()
+ st.write(f"Estimated emissions for training the models: {emissions:.4f} kg of CO2")
+
+ st.metric(label="Log Accuracy", value=round(metrics.accuracy_score(y_test, log_results) * 100, 2))
+ st.metric(label="Tree Accuracy", value=round(metrics.accuracy_score(y_test, tree_results) * 100, 2))
+ st.metric(label="kNN Accuracy", value=round(metrics.accuracy_score(y_test, knn_results) * 100, 2))
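Since most customers in this dataset did not respond, accuracy alone can look flattering; `classification_report` (imported in the original version of this page but never called) breaks performance down per class. A sketch of how it could be surfaced, reusing `st`, `y_test` and `log_results` from the page above:

```python
from sklearn.metrics import classification_report, confusion_matrix

# Assumes st, y_test and log_results are defined as on the page above.
# Per-class precision/recall/F1 is more informative than overall accuracy
# when the positive class (Response == 1) is rare.
st.text(classification_report(y_test, log_results))
st.write(confusion_matrix(y_test, log_results))
```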
pages/03 🧑‍💻 Explainable AI.py ADDED
@@ -0,0 +1,9 @@
+ import streamlit as st
+
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+ st.image(url, output_format="PNG", width=300)
+
+ st.title("Explainable AI")
+
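This page is currently a stub. One plausible direction, given that the MLflow page already computes shapash feature importances for the decision tree, is to render that plot here. A hedged sketch, assuming the same cleaning steps as the other pages (shapash's plot methods return Plotly figures, which Streamlit can display directly):

```python
import pandas as pd
import streamlit as st
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from shapash.explainer.smart_explainer import SmartExplainer  # as used on the MLflow page

df = pd.read_csv("ifood-data.csv").dropna()
df = df[df["Year_Birth"] > 1940]
df['Education'] = df['Education'].astype('category').cat.codes
df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
df = df.drop(["Dt_Customer", "ID"], axis=1)

X = df.drop('Response', axis=1)
y = df['Response']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

clf = DecisionTreeClassifier(max_depth=3).fit(X_train, y_train)

xpl = SmartExplainer(model=clf)
xpl.compile(x=X_test.reset_index(drop=True),
            y_pred=pd.Series(clf.predict(X_test)))

# shapash returns a Plotly figure, which Streamlit renders natively
st.plotly_chart(xpl.plot.features_importance())
```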
pages/04 🦦 MLflow.py ADDED
@@ -0,0 +1,245 @@
+ import pandas as pd
+ import seaborn as sns
+ import graphviz
+ import matplotlib.pyplot as plt
+ from sklearn.tree import DecisionTreeClassifier, export_graphviz
+ from sklearn.model_selection import train_test_split
+ from sklearn import metrics
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import classification_report
+
+ df = pd.read_csv("ifood-data.csv")
+
+ df['Education'] = df['Education'].astype('category').cat.codes
+ df['Marital_Status'] = df['Marital_Status'].astype('category').cat.codes
+
+ df = df.drop(["Dt_Customer", "ID"], axis=1)
+ df = df.dropna()
+
+ plt.figure(figsize=(16, 10))
+ sns.heatmap(df.corr(), annot=True)
+ plt.show()
+
+ X = df.drop(labels=['Response'], axis=1)
+ y = df["Response"]
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+ logmodel = LogisticRegression(max_iter=1000)
+ logmodel.fit(X_train, y_train)
+ prediction = logmodel.predict(X_test)
+ print(classification_report(y_test, prediction))
+
+ # Create and train a Decision Tree classifier, then predict on the test set
+ clf = DecisionTreeClassifier()
+ clf = clf.fit(X_train, y_train)
+ y_pred = clf.predict(X_test)
+ print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+
+ feature_cols = X.columns
+
+ dot_data = export_graphviz(clf, out_file=None,
+                            feature_names=feature_cols,
+                            class_names=['0', '1'],
+                            filled=True, rounded=True,
+                            special_characters=True)
+ graph = graphviz.Source(dot_data)
+
+ # A shallower tree is easier to read and less prone to overfitting
+ clf = DecisionTreeClassifier(max_depth=3)
+ clf = clf.fit(X_train, y_train)
+ y_pred = clf.predict(X_test)
+ print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
+
+ dot_data = export_graphviz(clf, out_file=None,
+                            feature_names=feature_cols,
+                            class_names=['0', '1'],
+                            filled=True, rounded=True,
+                            special_characters=True)
+ graph = graphviz.Source(dot_data)
+
+ from shapash.explainer.smart_explainer import SmartExplainer
+
+ xpl = SmartExplainer(model=clf)
+
+ y_pred = pd.Series(y_pred)
+ X_test = X_test.reset_index(drop=True)
+ xpl.compile(x=X_test, y_pred=y_pred)
+
+ xpl.plot.features_importance()
+
+ from sklearn.neighbors import KNeighborsClassifier
+
+ knn = KNeighborsClassifier()
+ knn.fit(X_train, y_train)
+ results = knn.predict(X_test)
+ print("Accuracy:", metrics.accuracy_score(y_test, results))
+
+ from pycaret.classification import setup, compare_models, predict_model
+ import mlflow  # MLflow is an open-source platform for managing the ML lifecycle
+ from sklearn import metrics as sk_metrics
+
+ # Split data into training and testing sets
+ train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
+
+ # Initialize the PyCaret experiment with the training set
+ cls1 = setup(data=train_df, target='Response')
+
+ # Compare logistic regression, kNN and decision tree; keep all three
+ top3 = compare_models(include=['lr', 'knn', 'dt'], n_select=3)
+
+ # Log each model into MLflow separately
+ for i, model in enumerate(top3, 1):
+     with mlflow.start_run(run_name=f"Model: {model}"):
+         model_name = "model_" + str(i)
+
+         # Log the model itself
+         mlflow.sklearn.log_model(model, model_name)
+
+         # Log its hyperparameters
+         for key, value in model.get_params().items():
+             mlflow.log_param(key, value)
+
+         # Predict on the testing set
+         y_pred = predict_model(model, data=test_df.drop('Response', axis=1))
+         y_test = test_df['Response']
+
+         # Calculate and log metrics
+         accuracy = sk_metrics.accuracy_score(y_test, y_pred["prediction_label"])
+         precision = sk_metrics.precision_score(y_test, y_pred["prediction_label"], average='weighted')
+         recall = sk_metrics.recall_score(y_test, y_pred["prediction_label"], average='weighted')
+         f1 = sk_metrics.f1_score(y_test, y_pred["prediction_label"], average='weighted')
+
+         mlflow.log_metric("Accuracy", accuracy)
+         mlflow.log_metric("Precision", precision)
+         mlflow.log_metric("Recall", recall)
+         mlflow.log_metric("F1 Score", f1)
+
+ # Tune the decision tree depth, logging each setting as its own run
+ max_depth_values = [2, 4, 6, 8, 10]
+
+ for depth in max_depth_values:
+     with mlflow.start_run(run_name=f"Decision Tree (Max Depth: {depth})"):
+         # Initialize and train the decision tree model
+         model = DecisionTreeClassifier(max_depth=depth)
+         model.fit(train_df.drop('Response', axis=1), train_df['Response'])
+
+         # Log model parameters
+         mlflow.log_param("max_depth", depth)
+
+         # Predict on the testing set and log metrics
+         y_pred = model.predict(test_df.drop('Response', axis=1))
+         y_test = test_df['Response']
+
+         mlflow.log_metric("Accuracy", sk_metrics.accuracy_score(y_test, y_pred))
+         mlflow.log_metric("Precision", sk_metrics.precision_score(y_test, y_pred, average='weighted'))
+         mlflow.log_metric("Recall", sk_metrics.recall_score(y_test, y_pred, average='weighted'))
+         mlflow.log_metric("F1 Score", sk_metrics.f1_score(y_test, y_pred, average='weighted'))
+
+         # Log the trained model
+         mlflow.sklearn.log_model(model, "decision_tree_model")
+
+ # Tune kNN's n_neighbors the same way
+ n_neighbors_values = [3, 5, 7, 9, 11]
+
+ for n_neighbors in n_neighbors_values:
+     with mlflow.start_run(run_name=f"KNN (n_neighbors: {n_neighbors})"):
+         # Initialize and train the KNN model
+         model = KNeighborsClassifier(n_neighbors=n_neighbors)
+         model.fit(train_df.drop('Response', axis=1), train_df['Response'])
+
+         # Log model parameters
+         mlflow.log_param("n_neighbors", n_neighbors)
+
+         # Predict on the testing set and log metrics
+         y_pred = model.predict(test_df.drop('Response', axis=1))
+         y_test = test_df['Response']
+
+         mlflow.log_metric("Accuracy", sk_metrics.accuracy_score(y_test, y_pred))
+         mlflow.log_metric("Precision", sk_metrics.precision_score(y_test, y_pred, average='weighted'))
+         mlflow.log_metric("Recall", sk_metrics.recall_score(y_test, y_pred, average='weighted'))
+         mlflow.log_metric("F1 Score", sk_metrics.f1_score(y_test, y_pred, average='weighted'))
+
+         # Log the trained model
+         mlflow.sklearn.log_model(model, "knn_model")
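Once these runs are logged, they can be inspected with `mlflow ui` or programmatically. A minimal sketch, assuming the default local tracking store that the code above writes to:

```python
import mlflow

# search_runs returns the logged runs as a pandas DataFrame;
# metric columns follow the "metrics.<name>" convention used when logging above
runs = mlflow.search_runs()
best = runs.sort_values("metrics.F1 Score", ascending=False)
print(best[["run_id", "metrics.Accuracy", "metrics.F1 Score"]].head())
```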
pages/05 🧑‍💻 Insights.py ADDED
@@ -0,0 +1,23 @@
+ import streamlit as st
+ import pandas as pd
+
+ url = "https://upload.wikimedia.org/wikipedia/commons/6/6a/DoorDash_Logo.svg"
+ st.image(url, output_format="PNG", width=300)
+
+ df_unclean = pd.read_csv("ifood-data.csv")
+ df = df_unclean.dropna()
+ df = df[df["Year_Birth"] > 1940]
+
+ st.title("Insights")
+
+ st.header("Our Ideal Customer")
+
+ # Response rate (share of Response == 1) for each birth year
+ response_by_age = df.groupby('Year_Birth')['Response'].mean()
+
+ # Birth year with the highest response rate
+ ideal_birth_year = response_by_age.idxmax()
+ highest_response_proportion = response_by_age.max()
+
+ st.metric(value=ideal_birth_year, label="Ideal Birth Year")
+ st.metric(value=f"{highest_response_proportion:.0%}", label="Response Rate for That Year")
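A caveat on the metric above: single birth years can contain very few customers, so `idxmax` may land on a noisy outlier. A sketch of a more robust variant that bins birth years into decades before taking the maximum:

```python
import pandas as pd

df = pd.read_csv("ifood-data.csv").dropna()
df = df[df["Year_Birth"] > 1940]

# Bin birth years into decades so each group has a reasonable sample size
decade = (df["Year_Birth"] // 10) * 10
response_by_decade = df.groupby(decade)["Response"].mean()

print(response_by_decade)
print("Best-responding decade:", response_by_decade.idxmax())
```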
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ codecarbon
+ graphviz
+ matplotlib
+ mlflow
+ numpy
+ pandas
+ pycaret
+ scikit-learn
+ seaborn
+ shapash
+ streamlit