Spaces:
Running
Running
Add files to app
Browse files — This view is limited to 50 files because it contains too many changes.
See raw diff
- .gitignore +166 -0
- README.md +13 -12
- data/classification/credit_score/credit_score_cm_train +0 -0
- data/classification/credit_score/credit_score_test_pp.pkl +3 -0
- data/classification/credit_score/credit_score_test_raw.pkl +3 -0
- data/classification/credit_score/credit_score_train_raw.pkl +3 -0
- data/clustering/clean_marketing.pkl +3 -0
- data/clustering/results/results_2_clusters.pkl +3 -0
- data/clustering/results/results_3_clusters.pkl +3 -0
- data/clustering/results/results_4_clusters.pkl +3 -0
- data/clustering/results/results_5_clusters.pkl +3 -0
- data/clustering/results/results_6_clusters.pkl +3 -0
- data/hotels/booking_df.csv +0 -0
- data/household/household_power_consumption_clean.pkl +3 -0
- data/movies/csr_data_tf.pkl +3 -0
- data/movies/movies_dict2.pkl +3 -0
- data/movies/vote_info.pkl +3 -0
- data/pinterest/image1.jpg +0 -0
- data/pinterest/image2.jpg +0 -0
- data/pinterest/image3.jpg +0 -0
- data/pinterest/image4.jpg +0 -0
- data/sa_data/reviews_raw.pkl +3 -0
- data/sa_data/reviews_results.pkl +3 -0
- images/AI.jpg +0 -0
- images/clustering.webp +0 -0
- images/credit_score.jpg +0 -0
- images/cs.webp +0 -0
- images/energy_consumption.jpg +0 -0
- images/france.jpeg +0 -0
- images/group.png +0 -0
- images/hec.png +0 -0
- images/hi-paris.png +0 -0
- images/models/credit_score/EDA_numeric_credit.png +0 -0
- images/object_detection.png +0 -0
- images/od_fashion.jpg +0 -0
- images/reviews.jpg +0 -0
- images/room.jpg +0 -0
- images/rs.png +0 -0
- images/sentiment_analysis.png +0 -0
- images/singapore.jpg +0 -0
- images/spain-banner.jpg +0 -0
- images/spain.WebP +0 -0
- images/supervised_learner.png +0 -0
- images/thailand.jpeg +0 -0
- images/ts_patterns.png +0 -0
- images/unsupervised_learner.webp +0 -0
- main_page.py +84 -0
- notebooks/Supervised-Unsupervised/credit_score.ipynb +0 -0
- notebooks/Supervised-Unsupervised/customer_churn.ipynb +0 -0
- notebooks/Supervised-Unsupervised/customer_segmentation.ipynb +632 -0
.gitignore
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
|
6 |
+
# Streamlit secrets
|
7 |
+
.streamlit/
|
8 |
+
|
9 |
+
# C extensions
|
10 |
+
*.so
|
11 |
+
|
12 |
+
# Virtual Environment
|
13 |
+
venv-app-ai-ds/
|
14 |
+
|
15 |
+
# Distribution / packaging
|
16 |
+
.Python
|
17 |
+
build/
|
18 |
+
develop-eggs/
|
19 |
+
dist/
|
20 |
+
downloads/
|
21 |
+
eggs/
|
22 |
+
.eggs/
|
23 |
+
lib/
|
24 |
+
lib64/
|
25 |
+
parts/
|
26 |
+
sdist/
|
27 |
+
var/
|
28 |
+
wheels/
|
29 |
+
share/python-wheels/
|
30 |
+
*.egg-info/
|
31 |
+
.installed.cfg
|
32 |
+
*.egg
|
33 |
+
MANIFEST
|
34 |
+
|
35 |
+
# PyInstaller
|
36 |
+
# Usually these files are written by a python script from a template
|
37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
+
*.manifest
|
39 |
+
*.spec
|
40 |
+
|
41 |
+
# Installer logs
|
42 |
+
pip-log.txt
|
43 |
+
pip-delete-this-directory.txt
|
44 |
+
|
45 |
+
# Unit test / coverage reports
|
46 |
+
htmlcov/
|
47 |
+
.tox/
|
48 |
+
.nox/
|
49 |
+
.coverage
|
50 |
+
.coverage.*
|
51 |
+
.cache
|
52 |
+
nosetests.xml
|
53 |
+
coverage.xml
|
54 |
+
*.cover
|
55 |
+
*.py,cover
|
56 |
+
.hypothesis/
|
57 |
+
.pytest_cache/
|
58 |
+
cover/
|
59 |
+
|
60 |
+
# Translations
|
61 |
+
*.mo
|
62 |
+
*.pot
|
63 |
+
|
64 |
+
# Django stuff:
|
65 |
+
*.log
|
66 |
+
local_settings.py
|
67 |
+
db.sqlite3
|
68 |
+
db.sqlite3-journal
|
69 |
+
|
70 |
+
# Flask stuff:
|
71 |
+
instance/
|
72 |
+
.webassets-cache
|
73 |
+
|
74 |
+
# Scrapy stuff:
|
75 |
+
.scrapy
|
76 |
+
|
77 |
+
# Sphinx documentation
|
78 |
+
docs/_build/
|
79 |
+
|
80 |
+
# PyBuilder
|
81 |
+
.pybuilder/
|
82 |
+
target/
|
83 |
+
|
84 |
+
# Jupyter Notebook
|
85 |
+
.ipynb_checkpoints
|
86 |
+
|
87 |
+
# IPython
|
88 |
+
profile_default/
|
89 |
+
ipython_config.py
|
90 |
+
|
91 |
+
# pyenv
|
92 |
+
# For a library or package, you might want to ignore these files since the code is
|
93 |
+
# intended to run in multiple environments; otherwise, check them in:
|
94 |
+
# .python-version
|
95 |
+
|
96 |
+
# pipenv
|
97 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
98 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
99 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
100 |
+
# install all needed dependencies.
|
101 |
+
#Pipfile.lock
|
102 |
+
|
103 |
+
# poetry
|
104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
106 |
+
# commonly ignored for libraries.
|
107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
108 |
+
#poetry.lock
|
109 |
+
|
110 |
+
# pdm
|
111 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
112 |
+
#pdm.lock
|
113 |
+
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
114 |
+
# in version control.
|
115 |
+
# https://pdm.fming.dev/#use-with-ide
|
116 |
+
.pdm.toml
|
117 |
+
|
118 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
119 |
+
__pypackages__/
|
120 |
+
|
121 |
+
# Celery stuff
|
122 |
+
celerybeat-schedule
|
123 |
+
celerybeat.pid
|
124 |
+
|
125 |
+
# SageMath parsed files
|
126 |
+
*.sage.py
|
127 |
+
|
128 |
+
# Environments
|
129 |
+
.env
|
130 |
+
.venv
|
131 |
+
env/
|
132 |
+
venv/
|
133 |
+
ENV/
|
134 |
+
env.bak/
|
135 |
+
venv.bak/
|
136 |
+
|
137 |
+
# Spyder project settings
|
138 |
+
.spyderproject
|
139 |
+
.spyproject
|
140 |
+
|
141 |
+
# Rope project settings
|
142 |
+
.ropeproject
|
143 |
+
|
144 |
+
# mkdocs documentation
|
145 |
+
/site
|
146 |
+
|
147 |
+
# mypy
|
148 |
+
.mypy_cache/
|
149 |
+
.dmypy.json
|
150 |
+
dmypy.json
|
151 |
+
|
152 |
+
# Pyre type checker
|
153 |
+
.pyre/
|
154 |
+
|
155 |
+
# pytype static type analyzer
|
156 |
+
.pytype/
|
157 |
+
|
158 |
+
# Cython debug symbols
|
159 |
+
cython_debug/
|
160 |
+
|
161 |
+
# PyCharm
|
162 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
163 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
164 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
165 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
166 |
+
#.idea/
|
README.md
CHANGED
@@ -1,13 +1,14 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
emoji: 🐨
|
4 |
-
colorFrom: indigo
|
5 |
-
colorTo: purple
|
6 |
-
sdk: streamlit
|
7 |
-
sdk_version: 1.31.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
-
license: mit
|
11 |
-
---
|
12 |
|
13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AI and Data Science examples 🧠
|
2 |
+
Space for the Streamlit "AI and Data Science examples" HEC Paris app.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
|
4 |
+
The app is structured in 5 pages:
|
5 |
+
- Supervised vs Unsupervised
|
6 |
+
- Time Series Analysis
|
7 |
+
- Sentiment Analysis
|
8 |
+
- Object detection
|
9 |
+
- Recommendation system
|
10 |
+
|
11 |
+
Each page contains one or more real-life use cases of AI.
|
12 |
+
Some of these use cases include electrical power consumption forecasting and customer segmentation.
|
13 |
+
|
14 |
+
Other pages on image segmentation and topic modeling are currently being developed.
|
data/classification/credit_score/credit_score_cm_train
ADDED
Binary file (779 Bytes). View file
|
|
data/classification/credit_score/credit_score_test_pp.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e0fbd76a90f5289377d21c2d39f377cb73e13de1c71f3818c7b0ea71f46a29ac
|
3 |
+
size 241322
|
data/classification/credit_score/credit_score_test_raw.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c2659d847aeb751f63a49e15b6bdc501be32eaddfaba9b33ca86f279065559f2
|
3 |
+
size 103703
|
data/classification/credit_score/credit_score_train_raw.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7d218682f6e67a5c9f81d227fd90362976185aa78958ab432d6561b6dfd960a4
|
3 |
+
size 725729
|
data/clustering/clean_marketing.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7bde1e077f04583237c0029abf25f841d9100d9050375fbec00c328f14c5c1b2
|
3 |
+
size 284225
|
data/clustering/results/results_2_clusters.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1cd983411f4d5fa3e5e82db4058183af7d22683d8ce52cc909a9a5edb03a153c
|
3 |
+
size 1155
|
data/clustering/results/results_3_clusters.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ddb93b8758544c327ec2f3336db7dadb3ce013cbe54e82693c4d5cb2d0d441b
|
3 |
+
size 1279
|
data/clustering/results/results_4_clusters.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e6b483a60584781dc8667241b041c9c1962ea0bbd601dd98c3fbc19259e1be34
|
3 |
+
size 1403
|
data/clustering/results/results_5_clusters.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:853b059260cbdedf4ebb2f61f45c4a44d78195609c58797e5d4a47646b357ae2
|
3 |
+
size 1527
|
data/clustering/results/results_6_clusters.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a6c9dd3f0fcdeb05a15076c1e196d7748b79db981153489b0ac31f73a3b69518
|
3 |
+
size 1651
|
data/hotels/booking_df.csv
ADDED
The diff for this file is too large to render.
See raw diff
|
|
data/household/household_power_consumption_clean.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1c6207c16c7301dbd331706eac29c250a04da757a73a070d156c69c9f4e04d4c
|
3 |
+
size 81958
|
data/movies/csr_data_tf.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1740a81957cb480a43c02c956a64d2fb3be9213888279641cdbfea8b5ea9c60a
|
3 |
+
size 1632893
|
data/movies/movies_dict2.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fae593200377c6dbfa1b5bc00883234b5c5986fb8f21997431ef0fdd4a814ac8
|
3 |
+
size 1722011
|
data/movies/vote_info.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:0bcea5abd2b4f192b1db3c3dea541aeea36ddea662c22270fa186c3bcbf887cc
|
3 |
+
size 114227
|
data/pinterest/image1.jpg
ADDED
data/pinterest/image2.jpg
ADDED
data/pinterest/image3.jpg
ADDED
data/pinterest/image4.jpg
ADDED
data/sa_data/reviews_raw.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:11fa1be19b16bdd6e183991367670e735caf9b974dbbbd651b877786078aa557
|
3 |
+
size 19784
|
data/sa_data/reviews_results.pkl
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a304e138d8444653e3288434ac2b0804079a6de1271b1de3d27755bba3b25d07
|
3 |
+
size 20455
|
images/AI.jpg
ADDED
images/clustering.webp
ADDED
images/credit_score.jpg
ADDED
images/cs.webp
ADDED
images/energy_consumption.jpg
ADDED
images/france.jpeg
ADDED
images/group.png
ADDED
images/hec.png
ADDED
images/hi-paris.png
ADDED
images/models/credit_score/EDA_numeric_credit.png
ADDED
images/object_detection.png
ADDED
images/od_fashion.jpg
ADDED
images/reviews.jpg
ADDED
images/room.jpg
ADDED
images/rs.png
ADDED
images/sentiment_analysis.png
ADDED
images/singapore.jpg
ADDED
images/spain-banner.jpg
ADDED
images/spain.WebP
ADDED
images/supervised_learner.png
ADDED
images/thailand.jpeg
ADDED
images/ts_patterns.png
ADDED
images/unsupervised_learner.webp
ADDED
main_page.py
ADDED
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import streamlit as st
|
3 |
+
import pandas as pd
|
4 |
+
import numpy as np
|
5 |
+
|
6 |
+
from st_pages import Page, show_pages
|
7 |
+
from PIL import Image
|
8 |
+
#from utils import authenticate_drive
|
9 |
+
|
10 |
+
|
11 |
+
|
12 |
+
##################################################################################
|
13 |
+
# PAGE CONFIGURATION #
|
14 |
+
##################################################################################
|
15 |
+
|
16 |
+
st.set_page_config(layout="wide")
|
17 |
+
|
18 |
+
|
19 |
+
|
20 |
+
|
21 |
+
##################################################################################
|
22 |
+
# GOOGLE DRIVE CONNEXION #
|
23 |
+
##################################################################################
|
24 |
+
|
25 |
+
# if ["drive_oauth"] not in st.session_state:
|
26 |
+
# st.session_state["drive_oauth"] = authenticate_drive()
|
27 |
+
|
28 |
+
# drive_oauth = st.session_state["drive_oauth"]
|
29 |
+
|
30 |
+
|
31 |
+
|
32 |
+
|
33 |
+
##################################################################################
|
34 |
+
# TITLE #
|
35 |
+
##################################################################################
|
36 |
+
|
37 |
+
st.image("images/AI.jpg")
|
38 |
+
st.title("AI and Data Science Examples")
|
39 |
+
st.subheader("HEC Paris, 2023-2024")
|
40 |
+
st.markdown("Course provided by **Shirish C. SRIVASTAVA**")
|
41 |
+
|
42 |
+
st.markdown(" ")
|
43 |
+
st.info("""**About the app**: The AI and Data Science Examples app was created to introduce students to the field of Data Science by showcasing real-life applications of AI.
|
44 |
+
It includes use cases using traditional Machine Learning algorithms on structured data, as well as Deep Learning models run on unstructured data (text, images,...).""")
|
45 |
+
|
46 |
+
st.divider()
|
47 |
+
|
48 |
+
|
49 |
+
#Hi! PARIS collaboration mention
|
50 |
+
st.markdown(" ")
|
51 |
+
image_hiparis = Image.open('images/hi-paris.png')
|
52 |
+
st.image(image_hiparis, width=150)
|
53 |
+
url = "https://www.hi-paris.fr/"
|
54 |
+
st.markdown("**The app was made in collaboration with: [Hi! PARIS Engineering Team](%s)**" % url)
|
55 |
+
|
56 |
+
|
57 |
+
|
58 |
+
|
59 |
+
##################################################################################
|
60 |
+
# DASHBOARD/SIDEBAR #
|
61 |
+
##################################################################################
|
62 |
+
|
63 |
+
|
64 |
+
# AI use case pages
|
65 |
+
show_pages(
|
66 |
+
[
|
67 |
+
Page("main_page.py", "Home Page", "🏠"),
|
68 |
+
Page("pages/supervised_unsupervised_page.py", "Supervised vs Unsupervised", "🔍"),
|
69 |
+
Page("pages/timeseries_analysis.py", "Time Series Forecasting", "📈"),
|
70 |
+
Page("pages/sentiment_analysis.py", "Sentiment Analysis", "👍"),
|
71 |
+
#Page("pages/object_detection.py", "Object Detection", "📹"), #need to reduce RAM costs
|
72 |
+
Page("pages/recommendation_system.py", "Recommendation system", "🛒")
|
73 |
+
]
|
74 |
+
)
|
75 |
+
|
76 |
+
|
77 |
+
|
78 |
+
##################################################################################
|
79 |
+
# PAGE CONTENT #
|
80 |
+
##################################################################################
|
81 |
+
|
82 |
+
|
83 |
+
|
84 |
+
|
notebooks/Supervised-Unsupervised/credit_score.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/Supervised-Unsupervised/customer_churn.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
|
|
notebooks/Supervised-Unsupervised/customer_segmentation.ipynb
ADDED
@@ -0,0 +1,632 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 5,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import os\n",
|
10 |
+
"import pandas as pd\n",
|
11 |
+
"import numpy as np\n",
|
12 |
+
"import matplotlib.pyplot as plt \n",
|
13 |
+
"import seaborn as sns"
|
14 |
+
]
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"cell_type": "markdown",
|
18 |
+
"metadata": {},
|
19 |
+
"source": [
|
20 |
+
"## Customer segmentation for targeted marketing campaign\n",
|
21 |
+
"\n",
|
22 |
+
"https://www.kaggle.com/datasets/imakash3011/customer-personality-analysis\n",
|
23 |
+
"\n",
|
24 |
+
"**People**\n",
|
25 |
+
"- ID: Customer's unique identifier\n",
|
26 |
+
"- Year_Birth: Customer's birth year\n",
|
27 |
+
"- Education: Customer's education level\n",
|
28 |
+
"- Marital_Status: Customer's marital status\n",
|
29 |
+
"- Income: Customer's yearly household income\n",
|
30 |
+
"- Kidhome: Number of children in customer's household\n",
|
31 |
+
"- Teenhome: Number of teenagers in customer's household\n",
|
32 |
+
"- Dt_Customer: Date of customer's enrollment with the company\n",
|
33 |
+
"- Recency: Number of days since customer's last purchase\n",
|
34 |
+
"- Complain: 1 if the customer complained in the last 2 years, 0 otherwise\n",
|
35 |
+
"\n",
|
36 |
+
"**Products**\n",
|
37 |
+
"- MntWines: Amount spent on wine in last 2 years\n",
|
38 |
+
"- MntFruits: Amount spent on fruits in last 2 years\n",
|
39 |
+
"- MntMeatProducts: Amount spent on meat in last 2 years\n",
|
40 |
+
"- MntFishProducts: Amount spent on fish in last 2 years\n",
|
41 |
+
"- MntSweetProducts: Amount spent on sweets in last 2 years\n",
|
42 |
+
"- MntGoldProds: Amount spent on gold in last 2 years\n",
|
43 |
+
"\n",
|
44 |
+
"**Promotion**\n",
|
45 |
+
"- NumDealsPurchases: Number of purchases made with a discount\n",
|
46 |
+
"- AcceptedCmp1: 1 if customer accepted the offer in the 1st campaign, 0 otherwise\n",
|
47 |
+
"- AcceptedCmp2: 1 if customer accepted the offer in the 2nd campaign, 0 otherwise\n",
|
48 |
+
"- AcceptedCmp3: 1 if customer accepted the offer in the 3rd campaign, 0 otherwise\n",
|
49 |
+
"- AcceptedCmp4: 1 if customer accepted the offer in the 4th campaign, 0 otherwise\n",
|
50 |
+
"- AcceptedCmp5: 1 if customer accepted the offer in the 5th campaign, 0 otherwise\n",
|
51 |
+
"- Response: 1 if customer accepted the offer in the last campaign, 0 otherwise\n",
|
52 |
+
"\n",
|
53 |
+
"**Place**\n",
|
54 |
+
"- NumWebPurchases: Number of purchases made through the company’s website\n",
|
55 |
+
"- NumCatalogPurchases: Number of purchases made using a catalogue\n",
|
56 |
+
"- NumStorePurchases: Number of purchases made directly in stores\n",
|
57 |
+
"- NumWebVisitsMonth: Number of visits to company’s website in the last month"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "markdown",
|
62 |
+
"metadata": {},
|
63 |
+
"source": [
|
64 |
+
"### Data Cleaning"
|
65 |
+
]
|
66 |
+
},
|
67 |
+
{
|
68 |
+
"cell_type": "code",
|
69 |
+
"execution_count": 1363,
|
70 |
+
"metadata": {},
|
71 |
+
"outputs": [],
|
72 |
+
"source": [
|
73 |
+
"# Load dataset\n",
|
74 |
+
"path_data_marketing = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\clustering\\marketing_campaign.csv\"\n",
|
75 |
+
"marketing_data = pd.read_csv(path_data_marketing, sep=\";\")"
|
76 |
+
]
|
77 |
+
},
|
78 |
+
{
|
79 |
+
"cell_type": "code",
|
80 |
+
"execution_count": 1364,
|
81 |
+
"metadata": {},
|
82 |
+
"outputs": [],
|
83 |
+
"source": [
|
84 |
+
"# Delete columns\n",
|
85 |
+
"marketing_data.drop(columns=['ID','MntGoldProds','Response','Complain','AcceptedCmp3', 'AcceptedCmp4', 'AcceptedCmp5', 'AcceptedCmp1','AcceptedCmp2',\n",
|
86 |
+
" 'Z_CostContact', 'Z_Revenue'], inplace=True)\n",
|
87 |
+
"\n",
|
88 |
+
"#marketing_data = marketing_data.loc[marketing_data[\"Marital_Status\"].isin([\"Single\",\"Married\",\"Divorced\"])]\n",
|
89 |
+
"marketing_data.drop(columns=[\"Marital_Status\"], inplace=True)\n",
|
90 |
+
"\n",
|
91 |
+
"# marketing_data = marketing_data.loc[marketing_data[\"Education\"].isin([\"2n Cycle\",\"Graduation\",\"Master\",\"PhD\"])]\n",
|
92 |
+
"marketing_data.drop(columns=[\"Education\"],inplace=True)\n",
|
93 |
+
"\n",
|
94 |
+
"marketing_data = marketing_data[marketing_data[\"Income\"]>5000]"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": 1365,
|
100 |
+
"metadata": {},
|
101 |
+
"outputs": [],
|
102 |
+
"source": [
|
103 |
+
"# Change column names\n",
|
104 |
+
"new_columns = [col.replace(\"Mnt\",\"\").replace(\"Num\",\"\") for col in list(marketing_data.columns)]\n",
|
105 |
+
"new_columns = [col + \"Products\" if col in [\"Wines\",\"Fruits\"] else col for col in new_columns]\n",
|
106 |
+
"marketing_data.columns = new_columns"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "markdown",
|
111 |
+
"metadata": {},
|
112 |
+
"source": [
|
113 |
+
"### Data Preprocessing"
|
114 |
+
]
|
115 |
+
},
|
116 |
+
{
|
117 |
+
"cell_type": "code",
|
118 |
+
"execution_count": 1366,
|
119 |
+
"metadata": {},
|
120 |
+
"outputs": [],
|
121 |
+
"source": [
|
122 |
+
"# Proportion of a customer's income spent on wines, fruits, ...\n",
|
123 |
+
"products_col = [\"WinesProducts\",\"FruitsProducts\", \"MeatProducts\",\"FishProducts\",\"SweetProducts\"]\n",
|
124 |
+
"total_amount_spent = marketing_data[products_col].sum(axis=1)\n",
|
125 |
+
"\n",
|
126 |
+
"for col in products_col:\n",
|
127 |
+
" marketing_data[col] = (100*marketing_data[col] / total_amount_spent).round(1)"
|
128 |
+
]
|
129 |
+
},
|
130 |
+
{
|
131 |
+
"cell_type": "code",
|
132 |
+
"execution_count": 1367,
|
133 |
+
"metadata": {},
|
134 |
+
"outputs": [],
|
135 |
+
"source": [
|
136 |
+
"# Proportion of web, catalog and store purchases (based on total number of purchases)\n",
|
137 |
+
"purchases_col = [\"WebPurchases\", \"CatalogPurchases\", \"StorePurchases\"]\n",
|
138 |
+
"total_purchases = marketing_data[purchases_col].sum(axis=1)\n",
|
139 |
+
"\n",
|
140 |
+
"for col in purchases_col:\n",
|
141 |
+
" marketing_data[col] = (100*marketing_data[col] / total_purchases).round(1)"
|
142 |
+
]
|
143 |
+
},
|
144 |
+
{
|
145 |
+
"cell_type": "code",
|
146 |
+
"execution_count": 1368,
|
147 |
+
"metadata": {},
|
148 |
+
"outputs": [],
|
149 |
+
"source": [
|
150 |
+
"from datetime import datetime, date\n",
|
151 |
+
"\n",
|
152 |
+
"def get_number_days(input_date):\n",
|
153 |
+
" date1 = datetime.strptime(input_date, '%d/%m/%Y').date()\n",
|
154 |
+
" date2 = date(2022, 2, 13)\n",
|
155 |
+
" return (date2 - date1).days"
|
156 |
+
]
|
157 |
+
},
|
158 |
+
{
|
159 |
+
"cell_type": "code",
|
160 |
+
"execution_count": 1369,
|
161 |
+
"metadata": {},
|
162 |
+
"outputs": [],
|
163 |
+
"source": [
|
164 |
+
"# Compute a customer's age, based on year of birth\n",
|
165 |
+
"marketing_data.insert(0, \"Age\", marketing_data[\"Year_Birth\"].apply(lambda x: 2023-x))\n",
|
166 |
+
"\n",
|
167 |
+
"# Compute the number of days a customer has been subscribed \n",
|
168 |
+
"marketing_data.insert(1, \"Days_subscription\", marketing_data[\"Dt_Customer\"].apply(get_number_days))\n",
|
169 |
+
"\n",
|
170 |
+
"# Compute total number of kids (kids + teens)\n",
|
171 |
+
"marketing_data[\"Kids\"] = marketing_data[\"Kidhome\"] + marketing_data[\"Teenhome\"]\n",
|
172 |
+
"marketing_data.drop(columns=[\"Kidhome\",\"Teenhome\"], inplace=True)\n",
|
173 |
+
"\n",
|
174 |
+
"marketing_data.drop(columns=[\"Year_Birth\", \"Dt_Customer\"], inplace=True)\n",
|
175 |
+
"marketing_data.dropna(inplace=True)"
|
176 |
+
]
|
177 |
+
},
|
178 |
+
{
|
179 |
+
"cell_type": "code",
|
180 |
+
"execution_count": 1370,
|
181 |
+
"metadata": {},
|
182 |
+
"outputs": [],
|
183 |
+
"source": [
|
184 |
+
"path_cleandata = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\clustering\"\n",
|
185 |
+
"marketing_data.to_pickle(os.path.join(path_cleandata,\"clean_marketing.pkl\"))"
|
186 |
+
]
|
187 |
+
},
|
188 |
+
{
|
189 |
+
"cell_type": "code",
|
190 |
+
"execution_count": 1371,
|
191 |
+
"metadata": {},
|
192 |
+
"outputs": [
|
193 |
+
{
|
194 |
+
"data": {
|
195 |
+
"text/html": [
|
196 |
+
"<div>\n",
|
197 |
+
"<style scoped>\n",
|
198 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
199 |
+
" vertical-align: middle;\n",
|
200 |
+
" }\n",
|
201 |
+
"\n",
|
202 |
+
" .dataframe tbody tr th {\n",
|
203 |
+
" vertical-align: top;\n",
|
204 |
+
" }\n",
|
205 |
+
"\n",
|
206 |
+
" .dataframe thead th {\n",
|
207 |
+
" text-align: right;\n",
|
208 |
+
" }\n",
|
209 |
+
"</style>\n",
|
210 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
211 |
+
" <thead>\n",
|
212 |
+
" <tr style=\"text-align: right;\">\n",
|
213 |
+
" <th></th>\n",
|
214 |
+
" <th>Age</th>\n",
|
215 |
+
" <th>Days_subscription</th>\n",
|
216 |
+
" <th>Income</th>\n",
|
217 |
+
" <th>Recency</th>\n",
|
218 |
+
" <th>WinesProducts</th>\n",
|
219 |
+
" <th>FruitsProducts</th>\n",
|
220 |
+
" <th>MeatProducts</th>\n",
|
221 |
+
" <th>FishProducts</th>\n",
|
222 |
+
" <th>SweetProducts</th>\n",
|
223 |
+
" <th>DealsPurchases</th>\n",
|
224 |
+
" <th>WebPurchases</th>\n",
|
225 |
+
" <th>CatalogPurchases</th>\n",
|
226 |
+
" <th>StorePurchases</th>\n",
|
227 |
+
" <th>WebVisitsMonth</th>\n",
|
228 |
+
" <th>Kids</th>\n",
|
229 |
+
" </tr>\n",
|
230 |
+
" </thead>\n",
|
231 |
+
" <tbody>\n",
|
232 |
+
" <tr>\n",
|
233 |
+
" <th>0</th>\n",
|
234 |
+
" <td>66</td>\n",
|
235 |
+
" <td>3449</td>\n",
|
236 |
+
" <td>58138.0</td>\n",
|
237 |
+
" <td>58</td>\n",
|
238 |
+
" <td>41.5</td>\n",
|
239 |
+
" <td>5.8</td>\n",
|
240 |
+
" <td>35.7</td>\n",
|
241 |
+
" <td>11.2</td>\n",
|
242 |
+
" <td>5.8</td>\n",
|
243 |
+
" <td>3</td>\n",
|
244 |
+
" <td>36.4</td>\n",
|
245 |
+
" <td>45.5</td>\n",
|
246 |
+
" <td>18.2</td>\n",
|
247 |
+
" <td>7</td>\n",
|
248 |
+
" <td>0</td>\n",
|
249 |
+
" </tr>\n",
|
250 |
+
" <tr>\n",
|
251 |
+
" <th>1</th>\n",
|
252 |
+
" <td>69</td>\n",
|
253 |
+
" <td>2899</td>\n",
|
254 |
+
" <td>46344.0</td>\n",
|
255 |
+
" <td>38</td>\n",
|
256 |
+
" <td>52.4</td>\n",
|
257 |
+
" <td>4.8</td>\n",
|
258 |
+
" <td>28.6</td>\n",
|
259 |
+
" <td>9.5</td>\n",
|
260 |
+
" <td>4.8</td>\n",
|
261 |
+
" <td>2</td>\n",
|
262 |
+
" <td>25.0</td>\n",
|
263 |
+
" <td>25.0</td>\n",
|
264 |
+
" <td>50.0</td>\n",
|
265 |
+
" <td>5</td>\n",
|
266 |
+
" <td>2</td>\n",
|
267 |
+
" </tr>\n",
|
268 |
+
" <tr>\n",
|
269 |
+
" <th>2</th>\n",
|
270 |
+
" <td>58</td>\n",
|
271 |
+
" <td>3098</td>\n",
|
272 |
+
" <td>71613.0</td>\n",
|
273 |
+
" <td>26</td>\n",
|
274 |
+
" <td>58.0</td>\n",
|
275 |
+
" <td>6.7</td>\n",
|
276 |
+
" <td>17.3</td>\n",
|
277 |
+
" <td>15.1</td>\n",
|
278 |
+
" <td>2.9</td>\n",
|
279 |
+
" <td>1</td>\n",
|
280 |
+
" <td>40.0</td>\n",
|
281 |
+
" <td>10.0</td>\n",
|
282 |
+
" <td>50.0</td>\n",
|
283 |
+
" <td>4</td>\n",
|
284 |
+
" <td>0</td>\n",
|
285 |
+
" </tr>\n",
|
286 |
+
" <tr>\n",
|
287 |
+
" <th>3</th>\n",
|
288 |
+
" <td>39</td>\n",
|
289 |
+
" <td>2925</td>\n",
|
290 |
+
" <td>26646.0</td>\n",
|
291 |
+
" <td>26</td>\n",
|
292 |
+
" <td>22.9</td>\n",
|
293 |
+
" <td>8.3</td>\n",
|
294 |
+
" <td>41.7</td>\n",
|
295 |
+
" <td>20.8</td>\n",
|
296 |
+
" <td>6.2</td>\n",
|
297 |
+
" <td>2</td>\n",
|
298 |
+
" <td>33.3</td>\n",
|
299 |
+
" <td>0.0</td>\n",
|
300 |
+
" <td>66.7</td>\n",
|
301 |
+
" <td>6</td>\n",
|
302 |
+
" <td>1</td>\n",
|
303 |
+
" </tr>\n",
|
304 |
+
" <tr>\n",
|
305 |
+
" <th>4</th>\n",
|
306 |
+
" <td>42</td>\n",
|
307 |
+
" <td>2947</td>\n",
|
308 |
+
" <td>58293.0</td>\n",
|
309 |
+
" <td>94</td>\n",
|
310 |
+
" <td>42.5</td>\n",
|
311 |
+
" <td>10.6</td>\n",
|
312 |
+
" <td>29.0</td>\n",
|
313 |
+
" <td>11.3</td>\n",
|
314 |
+
" <td>6.6</td>\n",
|
315 |
+
" <td>5</td>\n",
|
316 |
+
" <td>35.7</td>\n",
|
317 |
+
" <td>21.4</td>\n",
|
318 |
+
" <td>42.9</td>\n",
|
319 |
+
" <td>5</td>\n",
|
320 |
+
" <td>1</td>\n",
|
321 |
+
" </tr>\n",
|
322 |
+
" <tr>\n",
|
323 |
+
" <th>...</th>\n",
|
324 |
+
" <td>...</td>\n",
|
325 |
+
" <td>...</td>\n",
|
326 |
+
" <td>...</td>\n",
|
327 |
+
" <td>...</td>\n",
|
328 |
+
" <td>...</td>\n",
|
329 |
+
" <td>...</td>\n",
|
330 |
+
" <td>...</td>\n",
|
331 |
+
" <td>...</td>\n",
|
332 |
+
" <td>...</td>\n",
|
333 |
+
" <td>...</td>\n",
|
334 |
+
" <td>...</td>\n",
|
335 |
+
" <td>...</td>\n",
|
336 |
+
" <td>...</td>\n",
|
337 |
+
" <td>...</td>\n",
|
338 |
+
" <td>...</td>\n",
|
339 |
+
" </tr>\n",
|
340 |
+
" <tr>\n",
|
341 |
+
" <th>2235</th>\n",
|
342 |
+
" <td>56</td>\n",
|
343 |
+
" <td>3167</td>\n",
|
344 |
+
" <td>61223.0</td>\n",
|
345 |
+
" <td>46</td>\n",
|
346 |
+
" <td>64.8</td>\n",
|
347 |
+
" <td>3.9</td>\n",
|
348 |
+
" <td>16.6</td>\n",
|
349 |
+
" <td>3.8</td>\n",
|
350 |
+
" <td>10.8</td>\n",
|
351 |
+
" <td>2</td>\n",
|
352 |
+
" <td>56.2</td>\n",
|
353 |
+
" <td>18.8</td>\n",
|
354 |
+
" <td>25.0</td>\n",
|
355 |
+
" <td>5</td>\n",
|
356 |
+
" <td>1</td>\n",
|
357 |
+
" </tr>\n",
|
358 |
+
" <tr>\n",
|
359 |
+
" <th>2236</th>\n",
|
360 |
+
" <td>77</td>\n",
|
361 |
+
" <td>2805</td>\n",
|
362 |
+
" <td>64014.0</td>\n",
|
363 |
+
" <td>56</td>\n",
|
364 |
+
" <td>93.1</td>\n",
|
365 |
+
" <td>0.0</td>\n",
|
366 |
+
" <td>6.9</td>\n",
|
367 |
+
" <td>0.0</td>\n",
|
368 |
+
" <td>0.0</td>\n",
|
369 |
+
" <td>7</td>\n",
|
370 |
+
" <td>53.3</td>\n",
|
371 |
+
" <td>13.3</td>\n",
|
372 |
+
" <td>33.3</td>\n",
|
373 |
+
" <td>7</td>\n",
|
374 |
+
" <td>3</td>\n",
|
375 |
+
" </tr>\n",
|
376 |
+
" <tr>\n",
|
377 |
+
" <th>2237</th>\n",
|
378 |
+
" <td>42</td>\n",
|
379 |
+
" <td>2941</td>\n",
|
380 |
+
" <td>56981.0</td>\n",
|
381 |
+
" <td>91</td>\n",
|
382 |
+
" <td>74.6</td>\n",
|
383 |
+
" <td>3.9</td>\n",
|
384 |
+
" <td>17.8</td>\n",
|
385 |
+
" <td>2.6</td>\n",
|
386 |
+
" <td>1.0</td>\n",
|
387 |
+
" <td>1</td>\n",
|
388 |
+
" <td>11.1</td>\n",
|
389 |
+
" <td>16.7</td>\n",
|
390 |
+
" <td>72.2</td>\n",
|
391 |
+
" <td>6</td>\n",
|
392 |
+
" <td>0</td>\n",
|
393 |
+
" </tr>\n",
|
394 |
+
" <tr>\n",
|
395 |
+
" <th>2238</th>\n",
|
396 |
+
" <td>67</td>\n",
|
397 |
+
" <td>2942</td>\n",
|
398 |
+
" <td>69245.0</td>\n",
|
399 |
+
" <td>8</td>\n",
|
400 |
+
" <td>54.7</td>\n",
|
401 |
+
" <td>3.8</td>\n",
|
402 |
+
" <td>27.4</td>\n",
|
403 |
+
" <td>10.2</td>\n",
|
404 |
+
" <td>3.8</td>\n",
|
405 |
+
" <td>2</td>\n",
|
406 |
+
" <td>28.6</td>\n",
|
407 |
+
" <td>23.8</td>\n",
|
408 |
+
" <td>47.6</td>\n",
|
409 |
+
" <td>3</td>\n",
|
410 |
+
" <td>1</td>\n",
|
411 |
+
" </tr>\n",
|
412 |
+
" <tr>\n",
|
413 |
+
" <th>2239</th>\n",
|
414 |
+
" <td>69</td>\n",
|
415 |
+
" <td>3408</td>\n",
|
416 |
+
" <td>52869.0</td>\n",
|
417 |
+
" <td>40</td>\n",
|
418 |
+
" <td>55.6</td>\n",
|
419 |
+
" <td>2.0</td>\n",
|
420 |
+
" <td>40.4</td>\n",
|
421 |
+
" <td>1.3</td>\n",
|
422 |
+
" <td>0.7</td>\n",
|
423 |
+
" <td>3</td>\n",
|
424 |
+
" <td>37.5</td>\n",
|
425 |
+
" <td>12.5</td>\n",
|
426 |
+
" <td>50.0</td>\n",
|
427 |
+
" <td>7</td>\n",
|
428 |
+
" <td>2</td>\n",
|
429 |
+
" </tr>\n",
|
430 |
+
" </tbody>\n",
|
431 |
+
"</table>\n",
|
432 |
+
"<p>2208 rows × 15 columns</p>\n",
|
433 |
+
"</div>"
|
434 |
+
],
|
435 |
+
"text/plain": [
|
436 |
+
" Age Days_subscription Income Recency WinesProducts FruitsProducts \\\n",
|
437 |
+
"0 66 3449 58138.0 58 41.5 5.8 \n",
|
438 |
+
"1 69 2899 46344.0 38 52.4 4.8 \n",
|
439 |
+
"2 58 3098 71613.0 26 58.0 6.7 \n",
|
440 |
+
"3 39 2925 26646.0 26 22.9 8.3 \n",
|
441 |
+
"4 42 2947 58293.0 94 42.5 10.6 \n",
|
442 |
+
"... ... ... ... ... ... ... \n",
|
443 |
+
"2235 56 3167 61223.0 46 64.8 3.9 \n",
|
444 |
+
"2236 77 2805 64014.0 56 93.1 0.0 \n",
|
445 |
+
"2237 42 2941 56981.0 91 74.6 3.9 \n",
|
446 |
+
"2238 67 2942 69245.0 8 54.7 3.8 \n",
|
447 |
+
"2239 69 3408 52869.0 40 55.6 2.0 \n",
|
448 |
+
"\n",
|
449 |
+
" MeatProducts FishProducts SweetProducts DealsPurchases WebPurchases \\\n",
|
450 |
+
"0 35.7 11.2 5.8 3 36.4 \n",
|
451 |
+
"1 28.6 9.5 4.8 2 25.0 \n",
|
452 |
+
"2 17.3 15.1 2.9 1 40.0 \n",
|
453 |
+
"3 41.7 20.8 6.2 2 33.3 \n",
|
454 |
+
"4 29.0 11.3 6.6 5 35.7 \n",
|
455 |
+
"... ... ... ... ... ... \n",
|
456 |
+
"2235 16.6 3.8 10.8 2 56.2 \n",
|
457 |
+
"2236 6.9 0.0 0.0 7 53.3 \n",
|
458 |
+
"2237 17.8 2.6 1.0 1 11.1 \n",
|
459 |
+
"2238 27.4 10.2 3.8 2 28.6 \n",
|
460 |
+
"2239 40.4 1.3 0.7 3 37.5 \n",
|
461 |
+
"\n",
|
462 |
+
" CatalogPurchases StorePurchases WebVisitsMonth Kids \n",
|
463 |
+
"0 45.5 18.2 7 0 \n",
|
464 |
+
"1 25.0 50.0 5 2 \n",
|
465 |
+
"2 10.0 50.0 4 0 \n",
|
466 |
+
"3 0.0 66.7 6 1 \n",
|
467 |
+
"4 21.4 42.9 5 1 \n",
|
468 |
+
"... ... ... ... ... \n",
|
469 |
+
"2235 18.8 25.0 5 1 \n",
|
470 |
+
"2236 13.3 33.3 7 3 \n",
|
471 |
+
"2237 16.7 72.2 6 0 \n",
|
472 |
+
"2238 23.8 47.6 3 1 \n",
|
473 |
+
"2239 12.5 50.0 7 2 \n",
|
474 |
+
"\n",
|
475 |
+
"[2208 rows x 15 columns]"
|
476 |
+
]
|
477 |
+
},
|
478 |
+
"execution_count": 1371,
|
479 |
+
"metadata": {},
|
480 |
+
"output_type": "execute_result"
|
481 |
+
}
|
482 |
+
],
|
483 |
+
"source": [
|
484 |
+
"pd.read_pickle(os.path.join(path_cleandata,\"clean_marketing.pkl\"))"
|
485 |
+
]
|
486 |
+
},
|
487 |
+
{
|
488 |
+
"cell_type": "code",
|
489 |
+
"execution_count": 1372,
|
490 |
+
"metadata": {},
|
491 |
+
"outputs": [],
|
492 |
+
"source": [
|
493 |
+
"from sklearn.compose import ColumnTransformer\n",
|
494 |
+
"from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler\n",
|
495 |
+
"\n",
|
496 |
+
"num_columns = marketing_data.select_dtypes(include=[\"int64\", \"float64\"]).columns\n",
|
497 |
+
"\n",
|
498 |
+
"# Build data processing pipeline\n",
|
499 |
+
"ct = ColumnTransformer(\n",
|
500 |
+
" [(\"numerical\", RobustScaler(), num_columns)])\n",
|
501 |
+
"\n",
|
502 |
+
"X = ct.fit_transform(marketing_data)"
|
503 |
+
]
|
504 |
+
},
|
505 |
+
{
|
506 |
+
"cell_type": "code",
|
507 |
+
"execution_count": 1373,
|
508 |
+
"metadata": {},
|
509 |
+
"outputs": [],
|
510 |
+
"source": [
|
511 |
+
"columns_transform = [col.split(\"__\")[1] for col in ct.get_feature_names_out()]\n",
|
512 |
+
"df_clean = pd.DataFrame(X, columns=columns_transform)"
|
513 |
+
]
|
514 |
+
},
|
515 |
+
{
|
516 |
+
"cell_type": "markdown",
|
517 |
+
"metadata": {},
|
518 |
+
"source": [
|
519 |
+
"### Clustering"
|
520 |
+
]
|
521 |
+
},
|
522 |
+
{
|
523 |
+
"cell_type": "code",
|
524 |
+
"execution_count": 1374,
|
525 |
+
"metadata": {},
|
526 |
+
"outputs": [],
|
527 |
+
"source": [
|
528 |
+
"from sklearn.cluster import KMeans\n",
|
529 |
+
"from sklearn.metrics import silhouette_score\n",
|
530 |
+
"\n",
|
531 |
+
"def clustering_model(X, list_nb_clusters):\n",
|
532 |
+
" dict_labels = dict()\n",
|
533 |
+
" list_scores = []\n",
|
534 |
+
"\n",
|
535 |
+
" for n in list_nb_clusters:\n",
|
536 |
+
" kmeans = KMeans(n_clusters=n, n_init=10)\n",
|
537 |
+
" labels = kmeans.fit_predict(X)\n",
|
538 |
+
" score = silhouette_score(X, labels)\n",
|
539 |
+
" dict_labels[f\"{n} clusters\"] = labels\n",
|
540 |
+
" list_scores.append(score)\n",
|
541 |
+
"\n",
|
542 |
+
" return list_scores, dict_labels"
|
543 |
+
]
|
544 |
+
},
|
545 |
+
{
|
546 |
+
"cell_type": "code",
|
547 |
+
"execution_count": 1375,
|
548 |
+
"metadata": {},
|
549 |
+
"outputs": [],
|
550 |
+
"source": [
|
551 |
+
"list_nb_clusters = np.arange(2,7)\n",
|
552 |
+
"scores_kmeans, labels_kmeans = clustering_model(X, list_nb_clusters)"
|
553 |
+
]
|
554 |
+
},
|
555 |
+
{
|
556 |
+
"cell_type": "code",
|
557 |
+
"execution_count": 1376,
|
558 |
+
"metadata": {},
|
559 |
+
"outputs": [
|
560 |
+
{
|
561 |
+
"data": {
|
562 |
+
"image/png": "",
|
563 |
+
"text/plain": [
|
564 |
+
"<Figure size 640x480 with 1 Axes>"
|
565 |
+
]
|
566 |
+
},
|
567 |
+
"metadata": {},
|
568 |
+
"output_type": "display_data"
|
569 |
+
}
|
570 |
+
],
|
571 |
+
"source": [
|
572 |
+
"marketing_data_results = pd.DataFrame({\"nb_clusters\":[str(i) for i in np.arange(2,7)], \"scores\":scores_kmeans})\n",
|
573 |
+
"\n",
|
574 |
+
"sns.lineplot(data=marketing_data_results, x=\"nb_clusters\", y=\"scores\", marker=\"o\")\n",
|
575 |
+
"plt.xlabel(\"number of clusters\")\n",
|
576 |
+
"plt.ylabel(\"silhouette score\")\n",
|
577 |
+
"plt.title(\"Silhouette score of Kmeans\")\n",
|
578 |
+
"plt.show()"
|
579 |
+
]
|
580 |
+
},
|
581 |
+
{
|
582 |
+
"cell_type": "markdown",
|
583 |
+
"metadata": {},
|
584 |
+
"source": [
|
585 |
+
"### Save results"
|
586 |
+
]
|
587 |
+
},
|
588 |
+
{
|
589 |
+
"cell_type": "code",
|
590 |
+
"execution_count": 1377,
|
591 |
+
"metadata": {},
|
592 |
+
"outputs": [],
|
593 |
+
"source": [
|
594 |
+
"import os\n",
|
595 |
+
"path_results = r\"C:\\Users\\LaurèneDAVID\\Documents\\Teaching\\Educational_apps\\app-hec-AI-DS\\data\\clustering\\results\"\n",
|
596 |
+
"\n",
|
597 |
+
"for nb_clusters in list_nb_clusters:\n",
|
598 |
+
" labels_ = labels_kmeans[f\"{nb_clusters} clusters\"] # chosen labels\n",
|
599 |
+
" marketing_data_labels = marketing_data.copy()\n",
|
600 |
+
" marketing_data_labels[\"Group\"] = labels_\n",
|
601 |
+
" marketing_data_labels[\"Group\"] = marketing_data_labels[\"Group\"].astype(int)\n",
|
602 |
+
"\n",
|
603 |
+
" df_mean_results = marketing_data_labels.groupby(\"Group\")[num_columns].mean().reset_index()\n",
|
604 |
+
" df_mean_results = df_mean_results.round(1).melt(id_vars=[\"Group\"])\n",
|
605 |
+
" df_mean_results = pd.pivot_table(df_mean_results, values='value', index=['variable'], columns=[\"Group\"])\n",
|
606 |
+
"\n",
|
607 |
+
" df_mean_results.to_pickle(os.path.join(path_results,f\"results_{nb_clusters}_clusters.pkl\"))"
|
608 |
+
]
|
609 |
+
}
|
610 |
+
],
|
611 |
+
"metadata": {
|
612 |
+
"kernelspec": {
|
613 |
+
"display_name": "venv",
|
614 |
+
"language": "python",
|
615 |
+
"name": "python3"
|
616 |
+
},
|
617 |
+
"language_info": {
|
618 |
+
"codemirror_mode": {
|
619 |
+
"name": "ipython",
|
620 |
+
"version": 3
|
621 |
+
},
|
622 |
+
"file_extension": ".py",
|
623 |
+
"mimetype": "text/x-python",
|
624 |
+
"name": "python",
|
625 |
+
"nbconvert_exporter": "python",
|
626 |
+
"pygments_lexer": "ipython3",
|
627 |
+
"version": "3.9.0"
|
628 |
+
}
|
629 |
+
},
|
630 |
+
"nbformat": 4,
|
631 |
+
"nbformat_minor": 2
|
632 |
+
}
|