Commit c4b27ba
1 Parent(s): 9860d55
First Commit

Files changed:
- .gitattributes          +1   -0
- README.md               +5   -11
- app.py                  +64  -0
- create_object.py        +160 -0
- data/online_retail.csv  +3   -0
- requirements.txt        +5   -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/online_retail.csv filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,6 @@
----
-title: Streamlit ID-POS
-emoji: 🌍
-colorFrom: blue
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.35.0
-app_file: app.py
-pinned: false
----
+# ID-POS Analysis System
 
-
+## Dataset
+
+Online Retail Dataset
+https://www.kaggle.com/datasets/ulrikthygepedersen/online-retail-dataset
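The app only relies on a handful of columns from this dataset; the names below are taken from the queries in create_object.py (the Kaggle export may contain additional columns). A minimal sketch for sanity-checking a local copy of data/online_retail.csv:

import pandas as pd

# Columns referenced by app.py / create_object.py
expected = {"StockCode", "Description", "Quantity", "InvoiceDate",
            "UnitPrice", "CustomerID", "Country"}

df = pd.read_csv("data/online_retail.csv")
missing = expected - set(df.columns)
print("missing columns:", missing if missing else "none")
print(df.dtypes)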
app.py ADDED
@@ -0,0 +1,64 @@
import streamlit as st
import pandas as pd
import datetime

import create_object as co

import duckdb
df = pd.read_csv("data/online_retail.csv")

# Tidy up data types: treat the customer ID as a string-like object rather than a number
df = df.astype({'CustomerID': 'object'})
df = df[df["UnitPrice"] * df["Quantity"] > 0]  # keep only rows with a positive sales value

country_list = df["Country"].unique()

st.set_page_config(
    page_title="Purchase Data Analysis App",
    layout="wide",
)

def main():
    st.title("Purchase Data Analysis App")

    with st.sidebar.form(key="my_form"):
        analysis_menu = st.selectbox("Analysis menu", co.analysis_menu_list)

        country = st.multiselect("Select one or more countries.", country_list)
        if len(country) != 0:
            country = "','".join(country)
            country = f"Country in ('{country}')"
        else:
            country = "True"

        st.write("Enter dates in 2010.")
        start_date = st.date_input("Start date", datetime.date(2010, 1, 1))
        end_date = st.date_input("End date", datetime.date(2010, 12, 31))

        submit_button = st.form_submit_button(label="Start analysis")


    if submit_button:
        # Build and run the query for the selected analysis
        sql = co.create_sql(analysis_menu, country, start_date, end_date)
        df_output = duckdb.query(sql).to_df()

        try:
            fig = co.create_graph(analysis_menu, df_output)
            st.plotly_chart(fig)
        except Exception:
            print("No chart for this analysis menu")

        st.table(df_output.head(100))
        st.write("Only the top 100 rows are shown; download the CSV to see the full result.")

        st.download_button(
            "Press to Download",
            df_output.to_csv(index=False).encode('utf-8'),
            "file.csv",
            "text/csv",
            key='download-csv'
        )

if __name__ == '__main__':
    main()
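The query built by create_sql is executed with duckdb.query(sql), which resolves the table name "df" to the pandas DataFrame loaded at module scope (DuckDB's replacement scan over in-scope DataFrames). Below is a minimal sketch of that mechanism and of how the country multiselect becomes a SQL predicate; the sample DataFrame is made up for illustration only.

import duckdb
import pandas as pd

# Toy stand-in for the real dataset
df = pd.DataFrame({
    "Country": ["United Kingdom", "France", "France"],
    "UnitPrice": [2.5, 3.0, 1.0],
    "Quantity": [4, 2, 5],
})

# Same predicate-building trick as app.py: a selected list becomes an IN (...) clause,
# and an empty selection becomes the always-true filter "True".
selected = ["France"]
country = "Country in ('{}')".format("','".join(selected)) if selected else "True"

sql = f"""
    SELECT Country, SUM(UnitPrice * Quantity) AS SalesTotal
    FROM df
    WHERE {country}
    GROUP BY Country
"""
print(duckdb.query(sql).to_df())

Because the country names are interpolated directly into the SQL string, this only stays safe while the values come from a fixed multiselect; free-text input would call for parameterized queries instead.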
create_object.py ADDED
@@ -0,0 +1,160 @@
import plotly.graph_objects as go
import plotly.express as px

# Analysis menu options shown in the sidebar
analysis_menu_list = ["ABC Analysis", "Basket Analysis", "Time Series Analysis"]

# Build the SQL for the selected analysis menu
def create_sql(analysis_menu, country, start_date, end_date):
    if analysis_menu == "ABC Analysis":
        sql = f"""
        WITH
        t_base AS(
        -- Total sales (unit price x quantity) per stock code,
        -- restricted to the selected date range and countries
        SELECT
            StockCode, Description,
            SUM(UnitPrice * Quantity) AS SalesTotal
        FROM df
        WHERE CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'
        AND {country}
        GROUP BY StockCode, Description
        ),
        t_standard AS(
        -- Grand total of sales, used as the denominator for the 70% / 90% thresholds
        SELECT
            SUM(SalesTotal) AS Sum_SalesTotal
        FROM t_base
        ),
        t_cumulative AS(
        -- Sort sales in descending order and compute the running (cumulative) total
        SELECT
            StockCode,
            Description,
            SalesTotal,
            SUM(SalesTotal) OVER (ORDER BY SalesTotal DESC) AS SalesCumulative
        FROM t_base
        ORDER BY SalesTotal DESC
        )

        SELECT
            StockCode,
            Description,
            SalesTotal,
            SalesCumulative,
            SalesCumulative / Sum_SalesTotal AS Percentage_SalesCumulative,
            -- Rank A while the cumulative total is <= 70% of total sales, B while <= 90%, C otherwise
            CASE
                WHEN SalesCumulative / Sum_SalesTotal <= 0.7 THEN 'A'
                WHEN SalesCumulative / Sum_SalesTotal <= 0.9 THEN 'B'
                ELSE 'C'
            END AS SalesRank

        FROM t_cumulative
        FULL OUTER JOIN t_standard
        ON TRUE
        ORDER BY SalesTotal DESC
        """

    elif analysis_menu == "Basket Analysis":
        sql = f"""
        WITH
        t_all AS(
        -- Total number of distinct customers in the period
        SELECT
            COUNT(DISTINCT CustomerID) AS Num_of_All
        FROM df
        WHERE CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'
        AND {country}
        ),

        t_purchaser AS(
        -- Number of purchasers per product
        SELECT
            CAST(StockCode AS STRING) AS ProductID, Description, COUNT(DISTINCT CustomerID) AS Num_of_Purchaser
        FROM df
        WHERE CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'
        AND {country}
        GROUP BY ProductID, Description
        ),

        t_simultaneous_purchaser_pre AS(
        -- One row per product, purchase date and customer (basis for co-purchases)
        SELECT
            DISTINCT CAST(StockCode AS STRING) AS ProductID, Description, CAST(InvoiceDate AS DATE) Purchase_date, CustomerID
        FROM df
        WHERE CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'
        AND {country}
        ),

        t_simultaneous_purchaser AS(
        -- Number of customers who bought product A and product B on the same day
        SELECT
            t1.ProductID AS ProductID_A, t1.Description AS Description_A, t2.ProductID AS ProductID_B, t2.Description AS Description_B, COUNT(DISTINCT CustomerID) AS Num_of_Simultaneous_Purchaser
        FROM t_simultaneous_purchaser_pre AS t1
        INNER JOIN t_simultaneous_purchaser_pre AS t2
        USING(Purchase_date, CustomerID)
        WHERE t1.ProductID != t2.ProductID
        GROUP BY t1.ProductID, t1.Description, t2.ProductID, t2.Description
        )

        SELECT
            ProductID_A, Description_A,
            t_purchaser.Num_of_Purchaser AS Num_of_Purchaser_A,
            t_purchaser.Num_of_Purchaser / Num_of_All AS PurchaseRate_A,
            ProductID_B, Description_B,
            t2.Num_of_Purchaser AS Num_of_Purchaser_B,
            t2.Num_of_Purchaser / Num_of_All AS PurchaseRate_B,
            Num_of_Simultaneous_Purchaser,
            Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser AS CombinedSalesRate,
            (Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser) / (t2.Num_of_Purchaser / Num_of_All) AS Lift

        FROM t_purchaser
        LEFT OUTER JOIN t_simultaneous_purchaser
        ON t_purchaser.ProductID = t_simultaneous_purchaser.ProductID_A
        AND t_purchaser.Description = t_simultaneous_purchaser.Description_A

        LEFT OUTER JOIN t_purchaser AS t2
        ON t_simultaneous_purchaser.ProductID_B = t2.ProductID
        AND t_simultaneous_purchaser.Description_B = t2.Description

        FULL OUTER JOIN t_all
        ON TRUE

        -- Limit to combinations among the top 10 products so the output stays manageable
        WHERE
            Description_A IN (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
        AND
            Description_B IN (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
        """

    elif analysis_menu == "Time Series Analysis":
        sql = f"""
        SELECT
            CAST(InvoiceDate AS DATE) AS YearMonthDate,
            COUNT(DISTINCT CustomerID) AS Num_of_Purchaser,
            SUM(Quantity) AS Total_of_Amount,
            SUM(UnitPrice * Quantity) AS SalesTotal
        FROM df
        WHERE CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' AND DATETIME '{end_date}'
        AND {country}
        GROUP BY YearMonthDate
        ORDER BY YearMonthDate
        """

    return sql


# Build the chart for the selected analysis menu
def create_graph(analysis_menu, df):
    if analysis_menu == "Basket Analysis":
        # Heatmap of the co-purchase rate between product pairs
        df = df.sort_values(["Description_A", "Description_B"], ascending=[True, False]).reset_index()
        fig = go.Figure([go.Heatmap(z=df.CombinedSalesRate,
                                    x=df.Description_A.values,
                                    y=df.Description_B.values)])

    elif analysis_menu == "Time Series Analysis":
        # Line chart of total quantity sold per day
        fig = px.line(df, x='YearMonthDate', y='Total_of_Amount')
    # Note: "ABC Analysis" produces no chart, so fig is left unset and
    # the caller in app.py catches the resulting error.
    return fig
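To make the window-function logic in the ABC query easier to follow, here is a rough pandas equivalent of the ranking step (a sketch only; it assumes a DataFrame shaped like the t_base CTE, with one row per product and a SalesTotal column, and it breaks ties slightly differently than the SQL's cumulative window):

import pandas as pd

def abc_rank(t_base: pd.DataFrame) -> pd.DataFrame:
    # Sort products by sales, accumulate, and cut the cumulative share at 70% / 90%
    out = t_base.sort_values("SalesTotal", ascending=False).copy()
    out["SalesCumulative"] = out["SalesTotal"].cumsum()
    out["Percentage_SalesCumulative"] = out["SalesCumulative"] / out["SalesTotal"].sum()
    out["SalesRank"] = pd.cut(out["Percentage_SalesCumulative"],
                              bins=[0, 0.7, 0.9, 1.0], labels=["A", "B", "C"])
    return out

For the basket query, CombinedSalesRate is the share of A's buyers who also bought B on the same day, and Lift divides that by B's overall purchase rate, so values above 1 indicate that buying A makes buying B more likely than average.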
data/online_retail.csv ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c820e928a9cb01d05738b0c36b5033ef661eccfb82f09f2e5ce8542da73b0b99
size 48581636
requirements.txt ADDED
@@ -0,0 +1,5 @@
streamlit==1.22.0
pandas==2.0.3
datetime==5.5
duckdb==0.10.0
plotly