pax-dare-lab committed on
Commit
c4b27ba
·
1 Parent(s): 9860d55

First Commit

Browse files
Files changed (6) hide show
  1. .gitattributes +1 -0
  2. README.md +5 -11
  3. app.py +64 -0
  4. create_object.py +160 -0
  5. data/online_retail.csv +3 -0
  6. requirements.txt +5 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ data/online_retail.csv filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,6 @@
1
- ---
2
- title: Streamlit ID-POS
3
- emoji: 🌍
4
- colorFrom: blue
5
- colorTo: yellow
6
- sdk: streamlit
7
- sdk_version: 1.35.0
8
- app_file: app.py
9
- pinned: false
10
- ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
1
+ # ID-POS分析システム
 
 
 
 
 
 
 
 
 
2
 
3
+ ## 利用データ
4
+
5
+ Online Retail Dataset
6
+ https://www.kaggle.com/datasets/ulrikthygepedersen/online-retail-dataset
app.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import datetime
4
+
5
+ import create_object as co
6
+
7
+ import duckdb
8
# Load the transaction log once at import time. The SQL built in
# create_object refers to a relation named ``df``; presumably DuckDB resolves
# that name to this DataFrame -- confirm against the DuckDB Python docs.
df = pd.read_csv("data/online_retail.csv")

# Data-type clean-up: treat the customer ID as an opaque object, not a number.
df = df.astype({"CustomerID": "object"})

# Keep only rows whose line total (UnitPrice * Quantity) is strictly positive.
df = df[df["UnitPrice"].mul(df["Quantity"]).gt(0)]

# Distinct country values offered by the sidebar filter.
country_list = df["Country"].unique()

st.set_page_config(page_title="購買データ分析App", layout="wide")
20
+
21
def main():
    """Render the purchase-analysis page and run the analysis picked in the sidebar.

    The sidebar form collects an analysis menu, an optional country filter and
    a date range. Once the form is submitted, the SQL returned by
    ``co.create_sql`` is executed with DuckDB, a chart is drawn when one is
    available for the menu, the first 100 result rows are shown as a table,
    and the full result is offered as a CSV download.
    """
    st.title("購買データ分析App")

    with st.sidebar.form(key="my_form"):
        analysis_menu = st.selectbox("分析メニュー", co.analysis_menu_list)

        selected_countries = st.multiselect("国を選択してください。", country_list)
        if selected_countries:
            # The predicate is spliced into the SQL text, so double any single
            # quote inside a country name to keep the string literal intact.
            quoted = "','".join(
                str(name).replace("'", "''") for name in selected_countries
            )
            country = f"Country in ('{quoted}')"
        else:
            # No selection means no country restriction.
            country = "True"

        st.write("2010年の日付を入れてください。")
        start_date = st.date_input("開始日", datetime.date(2010, 1, 1))
        end_date = st.date_input("終了日", datetime.date(2010, 12, 31))

        submit_button = st.form_submit_button(label="分析開始")

    if submit_button:
        # Run the selected analysis.
        sql = co.create_sql(analysis_menu, country, start_date, end_date)
        df_output = duckdb.query(sql).to_df()

        # The chart is best-effort: some menus have no chart. Catch Exception
        # rather than using a bare ``except`` so that BaseException-derived
        # signals (e.g. KeyboardInterrupt, SystemExit) are not swallowed.
        try:
            fig = co.create_graph(analysis_menu, df_output)
            if fig is None:
                print("グラフ無し")
            else:
                st.plotly_chart(fig)
        except Exception:
            print("グラフ無し")

        st.table(df_output.head(100))
        st.write("上位100行まで、全体を見たい場合はCSVでダウンロードしてください。")

        st.download_button(
            "Press to Download",
            df_output.to_csv(index=False).encode("utf-8"),
            "file.csv",
            "text/csv",
            key="download-csv",
        )
62
+
63
+ if __name__ == '__main__':
64
+ main()
create_object.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import plotly.graph_objects as go
2
+ import plotly.express as px
3
+
4
# Analysis menus offered to the user; create_sql and create_graph branch on
# these exact strings.
analysis_menu_list = [
    "ABC分析",
    "バスケット分析",
    "時系列分析",
]
6
+
7
+ # 分析メニューごとのSQL
8
def create_sql(analysis_menu, country, start_date, end_date):
    """Return the SQL text for the requested analysis menu.

    The query reads from a relation named ``df`` and is assembled by string
    interpolation, so every argument ends up verbatim in the SQL text.

    Args:
        analysis_menu: One of "ABC分析", "バスケット分析" or "時系列分析".
        country: A complete SQL boolean expression that is AND-ed onto the
            date filter (e.g. ``"True"`` or ``"Country in ('Japan')"``).
            It is inserted as raw SQL, so it must be built by trusted code
            and never taken directly from free-form user text.
        start_date: Lower bound of the ``BETWEEN`` date filter; formatted
            into a ``DATETIME '...'`` literal.
        end_date: Upper bound of the ``BETWEEN`` date filter; formatted into
            a ``DATETIME '...'`` literal.

    Returns:
        The SQL query as a string.

    Raises:
        ValueError: If ``analysis_menu`` is not one of the supported menus.
    """
    # Every query restricts the source rows the same way, so build the
    # predicate once instead of repeating it in each CTE.
    row_filter = (
        f"CAST(InvoiceDate AS DATE) BETWEEN DATETIME '{start_date}' "
        f"AND DATETIME '{end_date}' AND {country}"
    )

    if analysis_menu == "ABC分析":
        sql = f"""
        WITH
        t_base AS (
            -- Total sales (unit price x quantity) per product code,
            -- restricted to the requested period and country filter.
            SELECT
                StockCode, Description,
                SUM(UnitPrice * Quantity) AS SalesTotal
            FROM df
            WHERE {row_filter}
            GROUP BY StockCode, Description
        ),
        t_standard AS (
            -- Grand total of sales; the 70% / 90% cut-offs in the final
            -- SELECT are expressed as ratios against this value.
            SELECT
                SUM(SalesTotal) AS Sum_SalesTotal
            FROM t_base
        ),
        t_cumulative AS (
            -- Sort by sales descending and compute the running total.
            SELECT
                StockCode,
                Description,
                SalesTotal,
                SUM(SalesTotal) OVER (ORDER BY SalesTotal DESC) AS SalesCumulative
            FROM t_base
            ORDER BY SalesTotal DESC
        )

        SELECT
            StockCode,
            Description,
            SalesTotal,
            SalesCumulative,
            SalesCumulative / Sum_SalesTotal AS Percentage_SalesCumulative,
            -- Rank A while the running total is within 70% of all sales,
            -- rank B within 90%, rank C for everything after that.
            CASE
                WHEN SalesCumulative / Sum_SalesTotal <= 0.7 THEN 'A'
                WHEN SalesCumulative / Sum_SalesTotal <= 0.9 THEN 'B'
                ELSE 'C'
            END AS SalesRank

        FROM t_cumulative
        FULL OUTER JOIN t_standard
            ON TRUE
        ORDER BY SalesTotal desc
        """

    elif analysis_menu == "バスケット分析":
        sql = f"""
        WITH
        t_all AS (
            -- Number of distinct customers in the filtered data.
            SELECT
                COUNT(DISTINCT CustomerID) AS Num_of_All
            FROM df
            WHERE {row_filter}
        ),

        t_purchaser AS (
            -- Number of distinct purchasers per product.
            SELECT
                CAST(StockCode AS STRING) AS ProductID, Description, COUNT(DISTINCT CustomerID) AS Num_of_Purchaser
            FROM df
            WHERE {row_filter}
            GROUP BY ProductID, Description
        ),

        t_simultaneous_purchaser_pre AS (
            -- One row per (product, purchase date, customer); input for
            -- the self-join that finds products bought together.
            SELECT
                DISTINCT CAST(StockCode AS STRING) AS ProductID, Description, CAST(InvoiceDate AS DATE) Purchase_date, CustomerID
            FROM df
            WHERE {row_filter}
        ),

        t_simultaneous_purchaser AS (
            -- Number of distinct customers who bought product A and
            -- product B on the same date.
            SELECT
                t1.ProductID as ProductID_A, t1.Description AS Description_A, t2.ProductID as ProductID_B, t2.Description AS Description_B, COUNT(DISTINCT CustomerID) as Num_of_Simultaneous_Purchaser
            FROM t_simultaneous_purchaser_pre as t1
            INNER JOIN t_simultaneous_purchaser_pre as t2
                USING(Purchase_date, CustomerID)
            WHERE t1.ProductID != t2.ProductID
            GROUP BY t1.ProductID, t1.Description, t2.ProductID, t2.Description
        )

        SELECT
            ProductID_A, Description_A,
            t_purchaser.Num_of_Purchaser AS Num_of_Purchaser_A,
            t_purchaser.Num_of_Purchaser / Num_of_All AS PurchaseRate_A,
            ProductID_B, Description_B,
            t2.Num_of_Purchaser AS Num_of_Purchaser_B,
            t2.Num_of_Purchaser / Num_of_All AS PurchaseRate_B,
            Num_of_Simultaneous_Purchaser,
            Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser AS CombinedSalesRate,
            (Num_of_Simultaneous_Purchaser / t_purchaser.Num_of_Purchaser) / (t2.Num_of_Purchaser / Num_of_All) AS Lift

        FROM t_purchaser
        LEFT OUTER JOIN t_simultaneous_purchaser
            ON t_purchaser.ProductID = t_simultaneous_purchaser.ProductID_A
            AND t_purchaser.Description = t_simultaneous_purchaser.Description_A

        LEFT OUTER JOIN t_purchaser as t2
            ON t_simultaneous_purchaser.ProductID_B = t2.ProductID
            AND t_simultaneous_purchaser.Description_B = t2.Description

        FULL OUTER JOIN t_all
            ON True

        -- The full pair list gets too large, so keep only pairs where both
        -- products are among the 10 with the most purchasers.
        WHERE
            Description_A in (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
            AND
            Description_B in (SELECT Description FROM t_purchaser ORDER BY Num_of_Purchaser DESC LIMIT 10)
        """

    elif analysis_menu == "時系列分析":
        sql = f"""
        SELECT
            CAST(InvoiceDate AS DATE) AS YearMonthDate,
            COUNT(DISTINCT CustomerID) AS Num_of_Purchaser,
            SUM(Quantity) AS Total_of_Amount,
            SUM(UnitPrice * Quantity) AS SalesTotal
        FROM df
        WHERE {row_filter}
        GROUP BY YearMonthDate
        ORDER BY YearMonthDate
        """

    else:
        # Previously an unknown menu fell through and failed with an
        # UnboundLocalError on ``sql``; fail with an explicit message instead.
        raise ValueError(f"unsupported analysis menu: {analysis_menu!r}")

    return sql
146
+
147
+
148
+ # 分析メニューごとのグラフ
149
def create_graph(analysis_menu, df):
    """Build the Plotly figure for an analysis result, if the menu has one.

    Args:
        analysis_menu: The analysis menu string that produced ``df``.
        df: Result DataFrame of the matching query. For "バスケット分析" the
            columns ``Description_A``, ``Description_B`` and
            ``CombinedSalesRate`` are read; for "時系列分析" the columns
            ``YearMonthDate`` and ``Total_of_Amount`` are read.

    Returns:
        A Plotly figure, or ``None`` when no chart is defined for the menu
        (e.g. "ABC分析"). Previously that case failed with an
        UnboundLocalError on ``fig``.
    """
    if analysis_menu == "バスケット分析":
        # Heat map of CombinedSalesRate over (product A, product B) pairs.
        ordered = df.sort_values(
            ["Description_A", "Description_B"], ascending=[True, False]
        ).reset_index()
        heatmap = go.Heatmap(
            z=ordered.CombinedSalesRate,
            x=ordered.Description_A.values,
            y=ordered.Description_B.values,
        )
        return go.Figure([heatmap])

    if analysis_menu == "時系列分析":
        # Line chart of the total quantity over time.
        return px.line(df, x="YearMonthDate", y="Total_of_Amount")

    # No chart is defined for this menu.
    return None
data/online_retail.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c820e928a9cb01d05738b0c36b5033ef661eccfb82f09f2e5ce8542da73b0b99
3
+ size 48581636
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ streamlit==1.22.0
2
+ pandas==2.0.3
3
+ datetime==5.5
4
+ duckdb==0.10.0
5
+ plotly