{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import numpy as np\n", "\n", "from datetime import date\n", "from datetime import timedelta" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
InvoiceNoStockCodeDescriptionQuantityInvoiceDateUnitPriceCustomerIDCountry
053636585123AWHITE HANGING HEART T-LIGHT HOLDER612/1/2010 8:262.5517850.0United Kingdom
153636571053WHITE METAL LANTERN612/1/2010 8:263.3917850.0United Kingdom
253636584406BCREAM CUPID HEARTS COAT HANGER812/1/2010 8:262.7517850.0United Kingdom
353636584029GKNITTED UNION FLAG HOT WATER BOTTLE612/1/2010 8:263.3917850.0United Kingdom
453636584029ERED WOOLLY HOTTIE WHITE HEART.612/1/2010 8:263.3917850.0United Kingdom
\n", "
" ], "text/plain": [ " InvoiceNo StockCode Description Quantity \n", "0 536365 85123A WHITE HANGING HEART T-LIGHT HOLDER 6 \\\n", "1 536365 71053 WHITE METAL LANTERN 6 \n", "2 536365 84406B CREAM CUPID HEARTS COAT HANGER 8 \n", "3 536365 84029G KNITTED UNION FLAG HOT WATER BOTTLE 6 \n", "4 536365 84029E RED WOOLLY HOTTIE WHITE HEART. 6 \n", "\n", " InvoiceDate UnitPrice CustomerID Country \n", "0 12/1/2010 8:26 2.55 17850.0 United Kingdom \n", "1 12/1/2010 8:26 3.39 17850.0 United Kingdom \n", "2 12/1/2010 8:26 2.75 17850.0 United Kingdom \n", "3 12/1/2010 8:26 3.39 17850.0 United Kingdom \n", "4 12/1/2010 8:26 3.39 17850.0 United Kingdom " ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We load our data\n", "df = pd.read_csv('static/customer_segmentation.csv', encoding='latin1')\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# We change our dates feature into a datetime type\n", "df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'])" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/var/folders/sm/_8w8tt415w56rr72t8421wj00000gn/T/ipykernel_4478/4209121508.py:3: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " data_null['Quantity'] = - data_null['Quantity']\n", "/var/folders/sm/_8w8tt415w56rr72t8421wj00000gn/T/ipykernel_4478/4209121508.py:4: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " data_null['TotalAmount'] = - data_null['TotalAmount']\n", "/var/folders/sm/_8w8tt415w56rr72t8421wj00000gn/T/ipykernel_4478/4209121508.py:8: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " data_not_null['Quantity'] = - data_not_null['Quantity']\n", "/var/folders/sm/_8w8tt415w56rr72t8421wj00000gn/T/ipykernel_4478/4209121508.py:9: SettingWithCopyWarning: \n", "A value is trying to be set on a copy of a slice from a DataFrame.\n", "Try using .loc[row_indexer,col_indexer] = value instead\n", "\n", "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", " data_not_null['TotalAmount'] = - data_not_null['TotalAmount']\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CustomerIDDescriptionQuantityUnitPriceInvoiceNoStockCodeInvoiceDateCountryTotalAmount
4717850.0HAND WARMER RED POLKA DOT61.85536372226322010-12-01 09:01:00United Kingdom11.10
4817850.0HAND WARMER UNION JACK61.85536372226332010-12-01 09:01:00United Kingdom11.10
4917850.0WHITE HANGING HEART T-LIGHT HOLDER62.5553637385123A2010-12-01 09:02:00United Kingdom15.30
5017850.0WHITE METAL LANTERN63.39536373710532010-12-01 09:02:00United Kingdom20.34
5117850.0CREAM CUPID HEARTS COAT HANGER82.7553637384406B2010-12-01 09:02:00United Kingdom22.00
\n", "
" ], "text/plain": [ " CustomerID Description Quantity UnitPrice \n", "47 17850.0 HAND WARMER RED POLKA DOT 6 1.85 \\\n", "48 17850.0 HAND WARMER UNION JACK 6 1.85 \n", "49 17850.0 WHITE HANGING HEART T-LIGHT HOLDER 6 2.55 \n", "50 17850.0 WHITE METAL LANTERN 6 3.39 \n", "51 17850.0 CREAM CUPID HEARTS COAT HANGER 8 2.75 \n", "\n", " InvoiceNo StockCode InvoiceDate Country TotalAmount \n", "47 536372 22632 2010-12-01 09:01:00 United Kingdom 11.10 \n", "48 536372 22633 2010-12-01 09:01:00 United Kingdom 11.10 \n", "49 536373 85123A 2010-12-01 09:02:00 United Kingdom 15.30 \n", "50 536373 71053 2010-12-01 09:02:00 United Kingdom 20.34 \n", "51 536373 84406B 2010-12-01 09:02:00 United Kingdom 22.00 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df['TotalAmount'] = df['Quantity'] * df['UnitPrice']\n", "data_null = df[df['TotalAmount'] < 0]\n", "data_null['Quantity'] = - data_null['Quantity']\n", "data_null['TotalAmount'] = - data_null['TotalAmount']\n", "data_null = data_null[['CustomerID', 'Description', 'Quantity', 'UnitPrice', 'TotalAmount']]\n", "\n", "data_not_null = df[df['TotalAmount'] >= 0]\n", "data_not_null['Quantity'] = - data_not_null['Quantity']\n", "data_not_null['TotalAmount'] = - data_not_null['TotalAmount']\n", "data_not_null = data_not_null[['CustomerID', 'Description', 'Quantity', 'UnitPrice', 'TotalAmount']]\n", "\n", "df_concat = pd.concat([data_null, data_not_null])\n", "df_concat = df_concat.drop_duplicates()\n", "df_concat = df_concat.drop('TotalAmount', axis=1)\n", "\n", "data_concat = pd.concat([df_concat, df])\n", "null_quantity = data_concat[data_concat['Quantity'] < 0]\n", "data_concat = data_concat.drop(null_quantity.index)\n", "data_concat = data_concat.drop_duplicates(subset=['CustomerID', 'Description', 'Quantity', 'UnitPrice', 'TotalAmount'])\n", "data_concat.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "def data_subset(data, beginning='2010-12-01', end='2011-12-09'):\n", " \n", " beginning = pd.to_datetime(beginning)\n", " end = pd.to_datetime(end)\n", " \n", " # Subsetting\n", " data = data[(data['InvoiceDate'] >= beginning) & (data['InvoiceDate'] <= end)]\n", "\n", " return data" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "def recency(data):\n", " new_data = data.copy()\n", " last_day = new_data['InvoiceDate'].max()\n", "\n", " recency = []\n", " for value in new_data['InvoiceDate']:\n", " result = last_day - value\n", " recency.append(result.days)\n", " \n", " new_data['Recency'] = recency\n", "\n", " return new_data" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def frequency(data):\n", " new_data = data.copy()\n", "\n", " frequencies_series = new_data.groupby('CustomerID')['InvoiceNo'].unique()\n", "\n", " nb_orders = []\n", " for value in frequencies_series.values:\n", " nb_orders.append(len(value))\n", "\n", " indexes = frequencies_series.index\n", "\n", " df_freq = pd.DataFrame(nb_orders, columns=['NbOrder'])\n", " df_freq['CustomerID'] = indexes\n", "\n", " df_merge = new_data.merge(df_freq, on='CustomerID')\n", "\n", " df_merge['Frequency'] = df_merge['NbOrder']\n", "\n", " return df_merge" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "def monetary(data):\n", " new_data = data.copy()\n", " new_data['Monetary'] = new_data['Quantity'] * new_data['UnitPrice']\n", "\n", " return new_data" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [], "source": [ "def rfm(data):\n", " new_data = data.copy()\n", "\n", " new_data = recency(new_data)\n", " new_data = frequency(new_data)\n", " new_data = monetary(new_data)\n", "\n", " df_rfm = new_data[['CustomerID', 'Recency', 'Frequency', 'Monetary']]\n", "\n", " monetary_sum = df_rfm.groupby('CustomerID')['Monetary'].sum()\n", " monetary_sum = pd.DataFrame(monetary_sum)\n", " monetary_sum = monetary_sum.reset_index()\n", " monetary_sum = monetary_sum[monetary_sum['Monetary'] > 5]\n", "\n", " df_rfm = df_rfm.drop('Monetary', axis=1)\n", " df_rfm = df_rfm.merge(monetary_sum, on='CustomerID')\n", " df_rfm = df_rfm.sort_values(['CustomerID', 'Recency'])\n", " df_rfm = df_rfm.drop_duplicates(subset='CustomerID')\n", " df_rfm = df_rfm.reset_index()\n", " df_rfm = df_rfm.drop(['index'], axis=1)\n", "\n", " return df_rfm" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CustomerIDDescriptionQuantityUnitPriceInvoiceNoStockCodeInvoiceDateCountryTotalAmountRecencyNbOrderFrequencyMonetary
017850.0HAND WARMER RED POLKA DOT61.85536372226322010-12-01 09:01:00United Kingdom11.103739911.10
117850.0HAND WARMER UNION JACK61.85536372226332010-12-01 09:01:00United Kingdom11.103739911.10
217850.0WHITE HANGING HEART T-LIGHT HOLDER62.5553637385123A2010-12-01 09:02:00United Kingdom15.303739915.30
317850.0WHITE METAL LANTERN63.39536373710532010-12-01 09:02:00United Kingdom20.343739920.34
417850.0CREAM CUPID HEARTS COAT HANGER82.7553637384406B2010-12-01 09:02:00United Kingdom22.003739922.00
\n", "
" ], "text/plain": [ " CustomerID Description Quantity UnitPrice \n", "0 17850.0 HAND WARMER RED POLKA DOT 6 1.85 \\\n", "1 17850.0 HAND WARMER UNION JACK 6 1.85 \n", "2 17850.0 WHITE HANGING HEART T-LIGHT HOLDER 6 2.55 \n", "3 17850.0 WHITE METAL LANTERN 6 3.39 \n", "4 17850.0 CREAM CUPID HEARTS COAT HANGER 8 2.75 \n", "\n", " InvoiceNo StockCode InvoiceDate Country TotalAmount \n", "0 536372 22632 2010-12-01 09:01:00 United Kingdom 11.10 \\\n", "1 536372 22633 2010-12-01 09:01:00 United Kingdom 11.10 \n", "2 536373 85123A 2010-12-01 09:02:00 United Kingdom 15.30 \n", "3 536373 71053 2010-12-01 09:02:00 United Kingdom 20.34 \n", "4 536373 84406B 2010-12-01 09:02:00 United Kingdom 22.00 \n", "\n", " Recency NbOrder Frequency Monetary \n", "0 373 9 9 11.10 \n", "1 373 9 9 11.10 \n", "2 373 9 9 15.30 \n", "3 373 9 9 20.34 \n", "4 373 9 9 22.00 " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We apply our functions to get the new features we are interested in\n", "df_info = recency(data_concat)\n", "df_info = frequency(df_info)\n", "df_info = monetary(df_info)\n", "df_info.head()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CustomerIDDescriptionQuantityInvoiceNoInvoiceDateTotalAmountRecencyNbOrderFrequencyMonetary
017850.0HAND WARMER RED POLKA DOT65363722010-12-01 09:01:0011.103739911.10
117850.0HAND WARMER UNION JACK65363722010-12-01 09:01:0011.103739911.10
217850.0WHITE HANGING HEART T-LIGHT HOLDER65363732010-12-01 09:02:0015.303739915.30
317850.0WHITE METAL LANTERN65363732010-12-01 09:02:0020.343739920.34
417850.0CREAM CUPID HEARTS COAT HANGER85363732010-12-01 09:02:0022.003739922.00
\n", "
" ], "text/plain": [ " CustomerID Description Quantity InvoiceNo \n", "0 17850.0 HAND WARMER RED POLKA DOT 6 536372 \\\n", "1 17850.0 HAND WARMER UNION JACK 6 536372 \n", "2 17850.0 WHITE HANGING HEART T-LIGHT HOLDER 6 536373 \n", "3 17850.0 WHITE METAL LANTERN 6 536373 \n", "4 17850.0 CREAM CUPID HEARTS COAT HANGER 8 536373 \n", "\n", " InvoiceDate TotalAmount Recency NbOrder Frequency Monetary \n", "0 2010-12-01 09:01:00 11.10 373 9 9 11.10 \n", "1 2010-12-01 09:01:00 11.10 373 9 9 11.10 \n", "2 2010-12-01 09:02:00 15.30 373 9 9 15.30 \n", "3 2010-12-01 09:02:00 20.34 373 9 9 20.34 \n", "4 2010-12-01 09:02:00 22.00 373 9 9 22.00 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We drop features we won't use\n", "df_info = df_info.drop(['StockCode', 'Country', 'UnitPrice'], axis=1)\n", "df_info.head()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "We apply our rfm function to have the list of CustomerID values that have a \"Monetary\" feature which is positive." ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([12347., 12348., 12349., ..., 18282., 18283., 18287.])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_customers = rfm(df)\n", "customers = df_customers['CustomerID'].unique()\n", "customers" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
CustomerIDDescriptionQuantityInvoiceNoInvoiceDateTotalAmountRecencyNbOrderFrequencyMonetary
017850.0HAND WARMER RED POLKA DOT65363722010-12-01 09:01:0011.103739911.10
117850.0HAND WARMER UNION JACK65363722010-12-01 09:01:0011.103739911.10
217850.0WHITE HANGING HEART T-LIGHT HOLDER65363732010-12-01 09:02:0015.303739915.30
317850.0WHITE METAL LANTERN65363732010-12-01 09:02:0020.343739920.34
417850.0CREAM CUPID HEARTS COAT HANGER85363732010-12-01 09:02:0022.003739922.00
\n", "
" ], "text/plain": [ " CustomerID Description Quantity InvoiceNo \n", "0 17850.0 HAND WARMER RED POLKA DOT 6 536372 \\\n", "1 17850.0 HAND WARMER UNION JACK 6 536372 \n", "2 17850.0 WHITE HANGING HEART T-LIGHT HOLDER 6 536373 \n", "3 17850.0 WHITE METAL LANTERN 6 536373 \n", "4 17850.0 CREAM CUPID HEARTS COAT HANGER 8 536373 \n", "\n", " InvoiceDate TotalAmount Recency NbOrder Frequency Monetary \n", "0 2010-12-01 09:01:00 11.10 373 9 9 11.10 \n", "1 2010-12-01 09:01:00 11.10 373 9 9 11.10 \n", "2 2010-12-01 09:02:00 15.30 373 9 9 15.30 \n", "3 2010-12-01 09:02:00 20.34 373 9 9 20.34 \n", "4 2010-12-01 09:02:00 22.00 373 9 9 22.00 " ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# We apply to get only the interesting IDs\n", "df_info = df_info[df_info['CustomerID'].isin(customers)]\n", "df_info.head()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "# We save as a CSV file\n", "df_info.to_csv('static/customer_info.csv')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "artefact", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.4" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }