Jawad committed on
Commit
1678afb
·
1 Parent(s): aac9eba

manage empty revenues

Browse files
Files changed (3) hide show
  1. data_processing.py +6 -5
  2. model.py +2 -3
  3. stream_app.py +187 -175
data_processing.py CHANGED
@@ -44,6 +44,7 @@ def process_data(data):
44
  decisions = pd.DataFrame(data['decisions'])
45
  decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
46
  decisions.monetary_sanction = decisions.monetary_sanction.astype(float)
 
47
  decisions = decisions[decisions.status == 'V']
48
  decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
49
  decisions['id'] = decisions.url.apply(get_id)
@@ -77,10 +78,10 @@ def process_data(data):
77
 
78
  def get_monetary_dataframe(decision_scope):
79
  monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
80
- monetary_decision = monetary_decision[monetary_decision.org_revenues != ""]
81
- monetary_decision['org_revenues'] = monetary_decision.org_revenues.astype(float)
82
- monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
83
- monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
84
  monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
85
  monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
86
  monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
@@ -89,4 +90,4 @@ def get_monetary_dataframe(decision_scope):
89
 
90
  def get_themes_per_year(monetary_decision):
91
  #return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().unstack().fillna(0)
92
- return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().reset_index()
 
44
  decisions = pd.DataFrame(data['decisions'])
45
  decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
46
  decisions.monetary_sanction = decisions.monetary_sanction.astype(float)
47
+ # keep validated decisions
48
  decisions = decisions[decisions.status == 'V']
49
  decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
50
  decisions['id'] = decisions.url.apply(get_id)
 
78
 
79
  def get_monetary_dataframe(decision_scope):
80
  monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
81
+ monetary_decision['has_revenues'] = (monetary_decision.org_revenues != "")
82
+ monetary_decision['org_revenues'] = monetary_decision.org_revenues.str.replace('', '0').astype(float)
83
+ monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(lambda x: np.log10(x+1))
84
+ monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(lambda x: np.log10(x+1))
85
  monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
86
  monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
87
  monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
 
90
 
91
  def get_themes_per_year(monetary_decision):
92
  #return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().unstack().fillna(0)
93
+ return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().reset_index()
model.py CHANGED
@@ -44,9 +44,8 @@ def run_training(predictors_train, predictors_test):
44
  'gamma': 0.5,
45
  'objective': 'reg:squarederror'}
46
  num_round = 1000
47
- xgb_cv = cv(dtrain=data_train, params=params, nfold=3,
48
- num_boost_round=1000, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
49
- print(xgb_cv)
50
  return xgb.train(params, data_train, num_round)
51
 
52
 
 
44
  'gamma': 0.5,
45
  'objective': 'reg:squarederror'}
46
  num_round = 1000
47
+ #xgb_cv = cv(dtrain=data_train, params=params, nfold=3,
48
+ # num_boost_round=1000, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
 
49
  return xgb.train(params, data_train, num_round)
50
 
51
 
stream_app.py CHANGED
@@ -29,7 +29,7 @@ def _max_width_():
29
  _max_width_()
30
 
31
  st.title("Data Analysis 🌎 📃")
32
- st.write("by [Teolex](https://www.theolex.io/)")
33
 
34
  # load and process data
35
  data = load_data()
@@ -44,7 +44,7 @@ else:
44
  select_auth = authorities.name.sort_values()
45
 
46
  authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
47
- min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))
48
 
49
  # apply filters
50
  authority_filter = True
@@ -55,192 +55,204 @@ else:
55
  year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
56
  decision_scope = decisions[authority_filter & year_filter]
57
 
 
 
 
 
 
 
 
58
  # explore monetary sanctions
59
  monetary_decision = get_monetary_dataframe(decision_scope)
60
 
 
 
61
  ##
62
  # Plot Graphs
63
  ##
64
 
65
- st.subheader("The organizations' sectors targeted by the sanctions: ")
66
- st.markdown("The graph shows the cumulated monetary sanction for the current filters")
67
-
68
- fig = px.treemap(monetary_decision,
69
- path=['org_company_type'],
70
- color='org_revenues',
71
- color_continuous_scale='RdBu',
72
- template="simple_white",
73
- values='monetary_sanction',
74
- width=1000, height=600)
75
- st.plotly_chart(fig)
76
-
77
- st.subheader("The organizations' regions targeted by the sanctions: ")
78
- st.markdown("The graph shows the cumulated monetary sanction for the current filters")
79
- fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
80
- path=['org_continent', 'org_country'],
81
- color_continuous_scale='RdBu',
82
- template="simple_white",
83
- values='monetary_sanction',
84
- width=1000, height=600)
85
- st.plotly_chart(fig)
86
-
87
- st.subheader("Revenues vs monetary sanctions representation ")
88
- st.markdown("The graph shows the cumulated monetary sanction for the current filters")
89
- fig = px.scatter(monetary_decision,
90
- x="org_revenues",
91
- y="monetary_sanction",
92
- log_x=True,
93
- log_y=True,
94
- template="simple_white",
95
- color="same_country",
96
- color_continuous_scale='RdBu',
97
- hover_name="org_name",
98
- width=1000, height=600)
99
- st.plotly_chart(fig)
100
-
101
- fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
102
- x="decision_date",
103
- size="log10_monetary_sanction",
104
- y="org_revenues",
105
- log_y=True,
106
- template="simple_white",
107
- color="same_country",
108
- hover_name="monetary_sanction",
109
- width=1000, height=600)
110
- st.plotly_chart(fig)
111
-
112
- fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
113
- # y="log10_org_revenues",
114
- color="same_country",
115
- marginal="box", # or violin, rug
116
- template="simple_white",
117
- width=1000, height=600, nbins=40, opacity=0.5,
118
- hover_data=monetary_decision.columns)
119
-
120
- st.plotly_chart(fig)
121
-
122
- fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
123
- # y="log10_org_revenues",
124
- color="same_country",
125
- marginal="box", # or violin, rug
126
- template="simple_white",
127
- width=1000, height=600, nbins=40, opacity=0.5,
128
- hover_data=monetary_decision.columns)
129
-
130
- st.plotly_chart(fig)
131
-
132
- p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
133
- monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate']
134
- , alternative='two-sided', mode='auto')
135
-
136
- st.metric(label="p-value", value=f"{round(p.pvalue, 2)}%")
137
-
138
- st.subheader("Sum of monetary sanctions over time ")
139
- st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
140
- chart_data = get_themes_per_year(monetary_decision)
141
- fig = px.area(chart_data, x="year",
142
- y="monetary_sanction",
143
- color="violation_theme",
144
- template="simple_white",
145
- # groupnorm="fraction",
146
- line_group="violation_theme",
147
- width=1000, height=600)
148
- st.plotly_chart(fig)
149
-
150
- st.sidebar.title("Organizations view")
151
 
152
- col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
153
-
154
- predictors, target = prepare_data(monetary_decision)
155
 
 
 
 
 
 
156
  st.title("Training phase")
157
- st.write(f"dataset size: {monetary_decision.shape[0]}")
158
- st.markdown("Plot taget distribution: log 10 of monetary sanctions")
159
- fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
160
- fig.update_layout(width=1000,
161
- template="simple_white",
162
- height=600,
163
- bargap=0.01)
164
- st.plotly_chart(fig)
165
-
166
- # split data set
167
- predictors_train, predictors_test, target_train, target_test = split(predictors, target)
168
- st.subheader("Split dataset between training and test:")
169
- st.metric(label="Training size", value=predictors_train.shape[0])
170
- st.metric(label="Test size", value=predictors_test.shape[0])
171
 
 
172
  # train the model
173
- xgb_model = run_training(predictors_train, target_train)
174
-
175
- # evaluate model error
176
- target_train_predicted = predict(xgb_model, predictors_train)
177
- training_bias = np.mean(target_train_predicted - target_train)
178
- st.metric(label="Training bias", value=training_bias)
179
-
180
- target_test_predicted = predict(xgb_model, predictors_test)
181
- test_errors = target_test_predicted - target_test
182
- test_bias = np.mean(test_errors)
183
- st.metric(label="Test bias", value=test_bias)
184
-
185
- fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
186
- fig.update_layout(width=1000,
187
- template="simple_white",
188
- height=600,
189
- bargap=0.01)
190
- st.plotly_chart(fig)
191
-
192
- st.subheader("Plot features importance for the trained model")
193
- xgb_features_importance = features_importance(xgb_model)
194
-
195
- fig = px.bar(xgb_features_importance,
196
- orientation='h',
197
- width=1000,
198
- template="simple_white",
199
- height=600,
200
- )
201
- st.plotly_chart(fig)
202
-
203
- st.subheader("Plot predicted vs real")
204
- compare = pd.concat([pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
205
- pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
206
- fig = px.scatter(
207
- compare,
208
- x='predicted',
209
- y='target',
210
- color='sample',
211
- marginal_y="violin",
212
- width=1000,
213
- template="simple_white",
214
- height=600,
215
- trendline="ols")
216
-
217
- st.plotly_chart(fig)
218
-
219
-
220
- naive_error_std = np.std(target_train - np.mean(target_train_predicted))
221
- model_error_std = np.std(target_train - target_train_predicted)
222
-
223
- st.metric(label="Naive error standard deviation", value=naive_error_std)
224
- st.metric(label="Model error standard deviation", value=model_error_std)
225
-
226
- corr_matrix = np.corrcoef(target_train, target_train_predicted)
227
- R_sq = corr_matrix[0, 1] ** 2
228
- st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100*R_sq, 2)}%")
229
-
230
- naive_error_std = np.std(target_test - np.mean(target_test_predicted))
231
- model_error_std = np.std(target_test - target_test_predicted)
232
-
233
- st.metric(label="Naive error standard deviation", value=naive_error_std)
234
- st.metric(label="Model error standard deviation", value=model_error_std)
235
-
236
- corr_matrix = np.corrcoef(target_test, target_test_predicted)
237
- R_sq = corr_matrix[0, 1] ** 2
238
- st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100*R_sq, 2)}%")
239
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
 
 
241
  sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
242
  authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
243
  authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)
244
-
245
-
246
-
 
29
  _max_width_()
30
 
31
  st.title("Data Analysis 🌎 📃")
32
+ st.write("by [Theolex](https://www.theolex.io/)")
33
 
34
  # load and process data
35
  data = load_data()
 
44
  select_auth = authorities.name.sort_values()
45
 
46
  authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
47
+ min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2001, 2021))
48
 
49
  # apply filters
50
  authority_filter = True
 
55
  year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
56
  decision_scope = decisions[authority_filter & year_filter]
57
 
58
+ st.subheader("Dataset Description")
59
+
60
+ st.metric('Number of validated decisions liked to organisations (and not individuals)', decision_scope.shape[0])
61
+
62
+ st.metric('Decisions with monetary sanctions',
63
+ decision_scope[decision_scope.monetary_sanction > 0].shape[0])
64
+
65
  # explore monetary sanctions
66
  monetary_decision = get_monetary_dataframe(decision_scope)
67
 
68
+ st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues))
69
+
70
  ##
71
  # Plot Graphs
72
  ##
73
 
74
+ with st.expander("Data exploration"):
75
+ st.subheader("The organizations' sectors targeted by the sanctions: ")
76
+ st.markdown("The graph shows the cumulated monetary sanction for the current filters")
77
+
78
+ fig = px.treemap(monetary_decision,
79
+ path=['org_company_type'],
80
+ color='org_revenues',
81
+ color_continuous_scale='RdBu',
82
+ template="simple_white",
83
+ values='monetary_sanction',
84
+ width=1000, height=600)
85
+ st.plotly_chart(fig)
86
+
87
+ st.subheader("The organizations' regions targeted by the sanctions: ")
88
+ st.markdown("The graph shows the cumulated monetary sanction for the current filters")
89
+ fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
90
+ path=['org_continent', 'org_country'],
91
+ color_continuous_scale='RdBu',
92
+ template="simple_white",
93
+ values='monetary_sanction',
94
+ width=1000, height=600)
95
+ st.plotly_chart(fig)
96
+
97
+ st.subheader("Revenues vs monetary sanctions representation ")
98
+ st.markdown("The graph shows the cumulated monetary sanction for the current filters")
99
+ fig = px.scatter(monetary_decision,
100
+ x="org_revenues",
101
+ y="monetary_sanction",
102
+ log_x=True,
103
+ log_y=True,
104
+ template="simple_white",
105
+ color="same_country",
106
+ color_continuous_scale='RdBu',
107
+ hover_name="org_name",
108
+ width=1000, height=600)
109
+ st.plotly_chart(fig)
110
+
111
+ fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
112
+ x="decision_date",
113
+ size="log10_monetary_sanction",
114
+ y="org_revenues",
115
+ log_y=True,
116
+ template="simple_white",
117
+ color="same_country",
118
+ hover_name="monetary_sanction",
119
+ width=1000, height=600)
120
+ st.plotly_chart(fig)
121
+
122
+ fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
123
+ # y="log10_org_revenues",
124
+ color="same_country",
125
+ marginal="box", # or violin, rug
126
+ template="simple_white",
127
+ width=1000, height=600, nbins=40, opacity=0.5,
128
+ hover_data=monetary_decision.columns)
129
+
130
+ st.plotly_chart(fig)
131
+
132
+ fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
133
+ # y="log10_org_revenues",
134
+ color="same_country",
135
+ marginal="box", # or violin, rug
136
+ template="simple_white",
137
+ width=1000, height=600, nbins=40, opacity=0.5,
138
+ hover_data=monetary_decision.columns)
139
+
140
+ st.plotly_chart(fig)
141
+
142
+ p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
143
+ monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate']
144
+ , alternative='two-sided', mode='auto')
145
+
146
+ st.metric(label="p-value", value=f"{round(p.pvalue, 2)}%")
147
+
148
+ st.subheader("Sum of monetary sanctions over time ")
149
+ st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
150
+ chart_data = get_themes_per_year(monetary_decision)
151
+ fig = px.area(chart_data, x="year",
152
+ y="monetary_sanction",
153
+ color="violation_theme",
154
+ template="simple_white",
155
+ # groupnorm="fraction",
156
+ line_group="violation_theme",
157
+ width=1000, height=600)
158
+ st.plotly_chart(fig)
 
159
 
 
 
 
160
 
161
+ ##############################################
162
+ ####
163
+ # build ML model
164
+ ####
165
+ ##############################################
166
  st.title("Training phase")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
167
 
168
+ predictors, target = prepare_data(monetary_decision)
169
  # train the model
170
+ if st.button('Run training'):
171
+ with st.expander("Training results"):
172
+ st.write(f"dataset size: {monetary_decision.shape[0]}")
173
+ st.markdown("Plot taget distribution: log 10 of monetary sanctions")
174
+ fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
175
+ fig.update_layout(width=1000,
176
+ template="simple_white",
177
+ height=600,
178
+ bargap=0.01)
179
+ st.plotly_chart(fig)
180
+
181
+ # split data set
182
+ predictors_train, predictors_test, target_train, target_test = split(predictors, target)
183
+ st.subheader("Split dataset between training and test:")
184
+ st.metric(label="Training size", value=predictors_train.shape[0])
185
+ st.metric(label="Test size", value=predictors_test.shape[0])
186
+
187
+ xgb_model = run_training(predictors_train, target_train)
188
+
189
+ # evaluate model error
190
+ target_train_predicted = predict(xgb_model, predictors_train)
191
+ training_bias = np.mean(target_train_predicted - target_train)
192
+ st.metric(label="Training bias", value=training_bias)
193
+
194
+ target_test_predicted = predict(xgb_model, predictors_test)
195
+ test_errors = target_test_predicted - target_test
196
+ test_bias = np.mean(test_errors)
197
+ st.metric(label="Test bias", value=test_bias)
198
+
199
+ fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
200
+ fig.update_layout(width=1000,
201
+ template="simple_white",
202
+ height=600,
203
+ bargap=0.01)
204
+ st.plotly_chart(fig)
205
+
206
+ st.subheader("Plot features importance for the trained model")
207
+ xgb_features_importance = features_importance(xgb_model)
208
+
209
+ fig = px.bar(xgb_features_importance,
210
+ orientation='h',
211
+ width=1000,
212
+ template="simple_white",
213
+ height=600,
214
+ )
215
+ st.plotly_chart(fig)
216
+
217
+ st.subheader("Plot predicted vs real")
218
+ compare = pd.concat(
219
+ [pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
220
+ pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
221
+ fig = px.scatter(
222
+ compare,
223
+ x='predicted',
224
+ y='target',
225
+ color='sample',
226
+ marginal_y="violin",
227
+ width=1000,
228
+ template="simple_white",
229
+ height=600,
230
+ trendline="ols")
231
+
232
+ st.plotly_chart(fig)
233
+
234
+ naive_error_std = np.std(target_train - np.mean(target_train_predicted))
235
+ model_error_std = np.std(target_train - target_train_predicted)
236
+
237
+ st.metric(label="Naive error standard deviation", value=naive_error_std)
238
+ st.metric(label="Model error standard deviation", value=model_error_std)
239
+
240
+ corr_matrix = np.corrcoef(target_train, target_train_predicted)
241
+ R_sq = corr_matrix[0, 1] ** 2
242
+ st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
243
+
244
+ naive_error_std = np.std(target_test - np.mean(target_test_predicted))
245
+ model_error_std = np.std(target_test - target_test_predicted)
246
+
247
+ st.metric(label="Naive error standard deviation", value=naive_error_std)
248
+ st.metric(label="Model error standard deviation", value=model_error_std)
249
+
250
+ corr_matrix = np.corrcoef(target_test, target_test_predicted)
251
+ R_sq = corr_matrix[0, 1] ** 2
252
+ st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
253
 
254
+ st.sidebar.title("Organizations view")
255
+ col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
256
  sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
257
  authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
258
  authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)