skanderovitch committed on
Commit
d880747
·
verified ·
1 Parent(s): 161023b

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -0
app.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit app exploring occupations in a 1% sample of the 2023 US Census.

Two "games" are offered:

* 'Birds of a feather' - pick a protagonist (couple type, gender, age range)
  and look at the occupations of their partners.
* 'Different strokes' - build two groups with side-by-side filters and compare
  their HHWT-weighted occupation distributions.
"""
import streamlit as st
import streamlit.components.v1 as components
import pandas as pd
import plotly.express as px

DATA_PATH = 'final_occ.parquet'
UNEMPLOYED = 'Unemployed'


@st.cache_data
def load_data(path: str) -> pd.DataFrame:
    """Read the census extract from *path*.

    Cached so the parquet file is not re-read on every widget interaction
    (Streamlit re-runs the whole script each time). The cache is keyed on
    the path only, so a changed file requires a cache clear / app restart.
    """
    return pd.read_parquet(path)


def _toast_share(label: str, mask: pd.Series) -> None:
    """Show a toast with the fraction of rows kept by *mask*."""
    st.toast(f'{label} : {mask.mean():.1%}')


def _occupation_shares(data: pd.DataFrame) -> pd.DataFrame:
    """Return the HHWT-weighted share of each OCC value in *data*."""
    summary = data.groupby('OCC')['HHWT'].sum().reset_index()
    summary['HHWT'] /= summary['HHWT'].sum()
    return summary


def _filter_group(container, data: pd.DataFrame, col: str, key: str) -> pd.DataFrame:
    """Render one filter widget for *col* in *container* and apply it to *data*.

    'AGE' gets a range slider spanning the values still present in *data*;
    every other column gets a select box over its remaining unique values.
    """
    if col == 'AGE':
        low, high = data[col].min(), data[col].max()
        chosen_low, chosen_high = container.slider(col, low, high, (low, high), key=key)
        return data[(data[col] >= chosen_low) & (data[col] <= chosen_high)]

    choice = container.selectbox(col, data[col].unique(), key=key)
    return data[data[col] == choice]


def play_birds_of_a_feather(df: pd.DataFrame) -> None:
    """Render the 'Birds of a feather' game: partner occupation by protagonist."""
    couple_types = sorted(df['couple_type'].value_counts().index)
    couple_type = st.selectbox('Couple Type', couple_types)

    # Give every person a unique 1-based id so that the household self-join
    # below can tell the protagonist apart from the partner.
    people = df.assign(id=range(1, len(df) + 1))

    mask = people['couple_type'] == couple_type
    _toast_share(f'couple_type = {couple_type}', mask)
    couples = people[mask]

    # --- Protagonist selection -------------------------------------------
    st.subheader('Who is your protagonist ?')
    from_genders = sorted(couples['SEX'].unique())
    from_gender = st.selectbox('Protagonist Gender', from_genders)

    mask = couples['SEX'] == from_gender
    _toast_share(f'protagonist gender = {from_gender}', mask)
    protagonists = couples[mask]

    data_min, data_max = protagonists['AGE'].agg(['min', 'max'])
    min_age, max_age = st.slider(
        'Protagonist Age Range', data_min, data_max, (data_min, data_max)
    )

    mask = protagonists['AGE'] >= min_age
    _toast_share(f'protagonist age >={min_age}', mask)
    protagonists = protagonists[mask]

    mask = protagonists['AGE'] <= max_age
    _toast_share(f'protagonist age <= {max_age}', mask)
    protagonists = protagonists[mask]

    ages = protagonists.groupby('AGE')['HHWT'].sum().reset_index()
    st.plotly_chart(px.bar(ages, x='AGE', y='HHWT'))

    # --- Top protagonist occupations -------------------------------------
    n_top_professions = st.slider('Only keep top N protagonist occupations', 10, 100, 30)
    keep_unemployed = st.checkbox('Keep unemployed protagonist')
    occupation_weights = protagonists.groupby('OCC')['HHWT'].sum()
    if not keep_unemployed:
        # Drop 'Unemployed' *before* taking the top N; otherwise it uses up
        # one of the N slots and the user ends up with only N-1 occupations.
        occupation_weights = occupation_weights[occupation_weights.index != UNEMPLOYED]
    top_professions = (
        occupation_weights.sort_values(ascending=False)[:n_top_professions].reset_index()
    )
    st.plotly_chart(px.bar(top_professions, x='OCC', y='HHWT', height=800))

    # --- Pair every protagonist with the other member(s) of the household -
    protagonists_ids = set(protagonists['id'].unique())
    protagonists_house_ids = set(protagonists['CBSERIAL'].unique())

    household = couples[couples['CBSERIAL'].isin(protagonists_house_ids)]
    members = household[['CBSERIAL', 'HHWT', 'OCC', 'id']]
    data = pd.merge(
        members, members, on=['CBSERIAL', 'HHWT'], suffixes=('_protagonist', '_partner')
    )
    # Keep rows whose left side is a selected protagonist and whose right
    # side is somebody else (the self-join also pairs each person with themselves).
    data = data[data['id_protagonist'].isin(protagonists_ids)]
    data = data[data['id_protagonist'] != data['id_partner']]

    data = (
        data.groupby(['OCC_protagonist', 'OCC_partner'])['HHWT']
        .sum()
        .rename('perc_partner')
        .reset_index()
    )
    data = data[data['perc_partner'] > 0]

    top_protagonist_occ = (
        data.groupby('OCC_protagonist')['perc_partner']
        .sum()
        .rename('total_protagonist')
        .reset_index()
        .sort_values('total_protagonist', ascending=False)
    )

    # Turn the weighted counts into a share of each protagonist occupation.
    data = pd.merge(data, top_protagonist_occ, on=['OCC_protagonist'])
    data['perc_partner'] /= data['total_protagonist']

    filter_min_perc = 0.01
    heatmap = data[data['perc_partner'] > filter_min_perc].copy()

    # --- Insights: how often partners share the same occupation ----------
    st.subheader('Some insights')
    same = data[data['OCC_partner'] == data['OCC_protagonist']]
    n_top_protagonists = 10
    ranked = same.sort_values('perc_partner', ascending=False)

    st.plotly_chart(px.bar(
        ranked[:n_top_protagonists],
        x='OCC_protagonist',
        y='perc_partner',
        color='total_protagonist',
        title='Professions most commonly shared with the partner',
    ))
    st.plotly_chart(px.bar(
        ranked[-n_top_protagonists:],
        x='OCC_protagonist',
        y='perc_partner',
        color='total_protagonist',
        title='Professions least commonly shared with the partner',
    ))

    # --- Partner distribution for one protagonist occupation -------------
    st.subheader('Pick the occupation of your protagonist')
    contains = st.text_input('Filter occupations', '')
    candidates = top_professions
    if contains:
        # regex=False: the filter is free text typed by the user; characters
        # such as '(' or '+' must match literally instead of raising re.error.
        mask = candidates['OCC'].str.lower().str.contains(contains.lower(), regex=False)
        candidates = candidates[mask]
    protagonist_occupation = st.selectbox('Occupation', candidates['OCC'])

    if protagonist_occupation is None:
        # selectbox returns None when there is nothing to choose from.
        st.info('No occupation matches this filter.')
        return

    subset = heatmap[heatmap['OCC_protagonist'] == protagonist_occupation]
    st.subheader(
        f'Distribution of partner occupations for protagonist occupation = {protagonist_occupation}'
    )
    fig = px.pie(subset, names="OCC_partner", values='perc_partner')
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.update_traces(showlegend=False)
    st.plotly_chart(fig)


def play_different_strokes(df: pd.DataFrame) -> None:
    """Render the 'Different strokes' game: compare occupations of two groups."""
    data1 = df
    data2 = df

    st.write('Select the attribute to compare between the two groups')
    col1, col2 = st.columns(2)
    col1.subheader('Group1')
    col2.subheader('Group2')

    # Filters are applied in sequence, so each widget only offers the values
    # still present after the previous filters of the same group.
    for col in ['couple_type', 'SEX', 'AGE', 'MARRNO']:
        col1, col2 = st.columns(2)
        data1 = _filter_group(col1, data1, col, key=f'{col}_1')
        data2 = _filter_group(col2, data2, col, key=f'{col}_2')

    comparison = pd.merge(
        _occupation_shares(data1),
        _occupation_shares(data2),
        on='OCC',
        suffixes=('_group1', '_group2'),
        how='outer',
    )
    # An occupation missing from one group has a share of 0 there. Assign the
    # result back instead of `comparison[col].fillna(0, inplace=True)`: that
    # form acts on a temporary and is a no-op under pandas Copy-on-Write.
    share_cols = ['HHWT_group1', 'HHWT_group2']
    comparison[share_cols] = comparison[share_cols].fillna(0)
    comparison['diff'] = comparison['HHWT_group1'] - comparison['HHWT_group2']
    comparison['abs_diff'] = comparison['diff'].abs()
    comparison = comparison.sort_values('abs_diff', ascending=False)[:30]
    st.plotly_chart(px.bar(
        comparison.sort_values('diff'),
        x='OCC',
        y='diff',
        color='diff',
        title='Occupation distribution difference between the two groups (group1 - group2)',
        height=800,
    ))


df = load_data(DATA_PATH)

games = {
    'Birds of a feather': 'How do people choose their partner, based on their profession ?',
    'Different strokes': 'How do people occupation differ, based on their circumstances ?',
}

st.title('Play with the US Census data 2023')
game = st.selectbox('Pick your Game', list(games.keys()))

st.subheader(game)
st.markdown(games[game])
st.markdown('*(Source : 1% sample of 2023 US Census)*')

if game == 'Birds of a feather':
    play_birds_of_a_feather(df)
else:
    play_different_strokes(df)