Spaces:

Theolex
/

theolex_streamlit

Runtime error

App Files Files Community

Jawad committed on Nov 16, 2021

Commit

1678afb

1 Parent(s): aac9eba

manage empty revenues

Browse files

Files changed (3) hide show

data_processing.py +6 -5
model.py +2 -3
stream_app.py +187 -175

data_processing.py CHANGED Viewed

@@ -44,6 +44,7 @@ def process_data(data):
     decisions = pd.DataFrame(data['decisions'])
     decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
     decisions.monetary_sanction = decisions.monetary_sanction.astype(float)
     decisions = decisions[decisions.status == 'V']
     decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
     decisions['id'] = decisions.url.apply(get_id)
@@ -77,10 +78,10 @@ def process_data(data):
 def get_monetary_dataframe(decision_scope):
     monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
-    monetary_decision = monetary_decision[monetary_decision.org_revenues != ""]
-    monetary_decision['org_revenues'] = monetary_decision.org_revenues.astype(float)
-    monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(np.log10)
-    monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(np.log10)
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
     monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
@@ -89,4 +90,4 @@ def get_monetary_dataframe(decision_scope):
 def get_themes_per_year(monetary_decision):
     #return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().unstack().fillna(0)
-    return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().reset_index()

     decisions = pd.DataFrame(data['decisions'])
     decisions['year'] = pd.to_datetime(decisions['decision_date']).dt.year
     decisions.monetary_sanction = decisions.monetary_sanction.astype(float)
+    # keep validated decisions
     decisions = decisions[decisions.status == 'V']
     decisions.decision_date = pd.to_datetime(decisions['decision_date']).dt.date
     decisions['id'] = decisions.url.apply(get_id)
 def get_monetary_dataframe(decision_scope):
     monetary_decision = decision_scope[decision_scope.monetary_sanction > 0]
+    monetary_decision['has_revenues'] = (monetary_decision.org_revenues != "")
+    monetary_decision['org_revenues'] = monetary_decision.org_revenues.str.replace('', '0').astype(float)
+    monetary_decision['log10_org_revenues'] = monetary_decision.org_revenues.apply(lambda x: np.log10(x+1))
+    monetary_decision['log10_monetary_sanction'] = monetary_decision.monetary_sanction.apply(lambda x: np.log10(x+1))
     monetary_decision['same_country'] = (monetary_decision.org_country == monetary_decision.authorities_country)
     monetary_decision['monetary_sanction_rate'] = monetary_decision.monetary_sanction/monetary_decision.org_revenues
     monetary_decision['log10_monetary_sanction_rate'] = monetary_decision.monetary_sanction_rate.apply(np.log10)
 def get_themes_per_year(monetary_decision):
     #return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().unstack().fillna(0)
+    return monetary_decision.groupby(['year', 'violation_theme'])['monetary_sanction'].sum().reset_index()

model.py CHANGED Viewed

@@ -44,9 +44,8 @@ def run_training(predictors_train, predictors_test):
              'gamma': 0.5,
              'objective': 'reg:squarederror'}
     num_round = 1000
-    xgb_cv = cv(dtrain=data_train, params=params, nfold=3,
-                num_boost_round=1000, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
-    print(xgb_cv)
     return xgb.train(params, data_train, num_round)

              'gamma': 0.5,
              'objective': 'reg:squarederror'}
     num_round = 1000
+    #xgb_cv = cv(dtrain=data_train, params=params, nfold=3,
+    #            num_boost_round=1000, early_stopping_rounds=10, metrics="rmse", as_pandas=True, seed=123)
     return xgb.train(params, data_train, num_round)

stream_app.py CHANGED Viewed

@@ -29,7 +29,7 @@ def _max_width_():
 _max_width_()
 st.title("Data Analysis 🌎 📃")
-st.write("by [Teolex](https://www.theolex.io/)")
 # load and process data
 data = load_data()
@@ -44,7 +44,7 @@ else:
     select_auth = authorities.name.sort_values()
 authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
-min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2010, 2021))
 # apply filters
 authority_filter = True
@@ -55,192 +55,204 @@ else:
 year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
 decision_scope = decisions[authority_filter & year_filter]
 # explore monetary sanctions
 monetary_decision = get_monetary_dataframe(decision_scope)
 ##
 # Plot Graphs
 ##
-st.subheader("The organizations' sectors targeted by the sanctions: ")
-st.markdown("The graph shows the cumulated monetary sanction for the current filters")
-fig = px.treemap(monetary_decision,
-                 path=['org_company_type'],
-                 color='org_revenues',
-                 color_continuous_scale='RdBu',
-                 template="simple_white",
-                 values='monetary_sanction',
-                 width=1000, height=600)
-st.plotly_chart(fig)
-st.subheader("The organizations' regions targeted by the sanctions: ")
-st.markdown("The graph shows the cumulated monetary sanction for the current filters")
-fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
-                 path=['org_continent', 'org_country'],
-                 color_continuous_scale='RdBu',
-                 template="simple_white",
-                 values='monetary_sanction',
-                 width=1000, height=600)
-st.plotly_chart(fig)
-st.subheader("Revenues vs monetary sanctions representation ")
-st.markdown("The graph shows the cumulated monetary sanction for the current filters")
-fig = px.scatter(monetary_decision,
-                 x="org_revenues",
-                 y="monetary_sanction",
-                 log_x=True,
-                 log_y=True,
-                 template="simple_white",
-                 color="same_country",
-                 color_continuous_scale='RdBu',
-                 hover_name="org_name",
-                 width=1000, height=600)
-st.plotly_chart(fig)
-fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
-                 x="decision_date",
-                 size="log10_monetary_sanction",
-                 y="org_revenues",
-                 log_y=True,
-                 template="simple_white",
-                 color="same_country",
-                 hover_name="monetary_sanction",
-                 width=1000, height=600)
-st.plotly_chart(fig)
-fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
-                   # y="log10_org_revenues",
-                   color="same_country",
-                   marginal="box",  # or violin, rug
-                   template="simple_white",
-                   width=1000, height=600, nbins=40, opacity=0.5,
-                   hover_data=monetary_decision.columns)
-st.plotly_chart(fig)
-fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
-                   # y="log10_org_revenues",
-                   color="same_country",
-                   marginal="box",  # or violin, rug
-                   template="simple_white",
-                   width=1000, height=600, nbins=40, opacity=0.5,
-                   hover_data=monetary_decision.columns)
-st.plotly_chart(fig)
-p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
-                         monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate']
-                         , alternative='two-sided', mode='auto')
-st.metric(label="p-value", value=f"{round(p.pvalue, 2)}%")
-st.subheader("Sum of monetary sanctions over time ")
-st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
-chart_data = get_themes_per_year(monetary_decision)
-fig = px.area(chart_data, x="year",
-              y="monetary_sanction",
-              color="violation_theme",
-              template="simple_white",
-              # groupnorm="fraction",
-              line_group="violation_theme",
-              width=1000, height=600)
-st.plotly_chart(fig)
-st.sidebar.title("Organizations view")
-col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
-predictors, target = prepare_data(monetary_decision)
 st.title("Training phase")
-st.write(f"dataset size: {monetary_decision.shape[0]}")
-st.markdown("Plot taget distribution: log 10 of monetary sanctions")
-fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
-fig.update_layout(width=1000,
-                  template="simple_white",
-                  height=600,
-                  bargap=0.01)
-st.plotly_chart(fig)
-# split data set
-predictors_train, predictors_test, target_train, target_test = split(predictors, target)
-st.subheader("Split dataset between training and test:")
-st.metric(label="Training size", value=predictors_train.shape[0])
-st.metric(label="Test size", value=predictors_test.shape[0])
 # train the model
-xgb_model = run_training(predictors_train, target_train)
-# evaluate model error
-target_train_predicted = predict(xgb_model, predictors_train)
-training_bias = np.mean(target_train_predicted - target_train)
-st.metric(label="Training bias", value=training_bias)
-target_test_predicted = predict(xgb_model, predictors_test)
-test_errors = target_test_predicted - target_test
-test_bias = np.mean(test_errors)
-st.metric(label="Test bias", value=test_bias)
-fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
-fig.update_layout(width=1000,
-                  template="simple_white",
-                  height=600,
-                  bargap=0.01)
-st.plotly_chart(fig)
-st.subheader("Plot features importance for the trained model")
-xgb_features_importance = features_importance(xgb_model)
-fig = px.bar(xgb_features_importance,
-             orientation='h',
-             width=1000,
-             template="simple_white",
-             height=600,
-             )
-st.plotly_chart(fig)
-st.subheader("Plot predicted vs real")
-compare = pd.concat([pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
-                     pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
-fig = px.scatter(
-    compare,
-    x='predicted',
-    y='target',
-    color='sample',
-    marginal_y="violin",
-    width=1000,
-    template="simple_white",
-    height=600,
-    trendline="ols")
-st.plotly_chart(fig)
-naive_error_std = np.std(target_train - np.mean(target_train_predicted))
-model_error_std = np.std(target_train - target_train_predicted)
-st.metric(label="Naive error standard deviation", value=naive_error_std)
-st.metric(label="Model error standard deviation", value=model_error_std)
-corr_matrix = np.corrcoef(target_train, target_train_predicted)
-R_sq = corr_matrix[0, 1] ** 2
-st.metric(label="Explained variation thanks to model (R^2)",  value=f"{round(100*R_sq, 2)}%")
-naive_error_std = np.std(target_test - np.mean(target_test_predicted))
-model_error_std = np.std(target_test - target_test_predicted)
-st.metric(label="Naive error standard deviation", value=naive_error_std)
-st.metric(label="Model error standard deviation", value=model_error_std)
-corr_matrix = np.corrcoef(target_test, target_test_predicted)
-R_sq = corr_matrix[0, 1] ** 2
-st.metric(label="Explained variation thanks to model (R^2)",  value=f"{round(100*R_sq, 2)}%")
 sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
 authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
 authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)

 _max_width_()
 st.title("Data Analysis 🌎 📃")
+st.write("by [Theolex](https://www.theolex.io/)")
 # load and process data
 data = load_data()
     select_auth = authorities.name.sort_values()
 authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
+min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2001, 2021))
 # apply filters
 authority_filter = True
 year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
 decision_scope = decisions[authority_filter & year_filter]
+st.subheader("Dataset Description")
+st.metric('Number of validated decisions liked to organisations (and not individuals)', decision_scope.shape[0])
+st.metric('Decisions with monetary sanctions',
+          decision_scope[decision_scope.monetary_sanction > 0].shape[0])
 # explore monetary sanctions
 monetary_decision = get_monetary_dataframe(decision_scope)
+st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues))
 ##
 # Plot Graphs
 ##
+with st.expander("Data exploration"):
+    st.subheader("The organizations' sectors targeted by the sanctions: ")
+    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
+    fig = px.treemap(monetary_decision,
+                     path=['org_company_type'],
+                     color='org_revenues',
+                     color_continuous_scale='RdBu',
+                     template="simple_white",
+                     values='monetary_sanction',
+                     width=1000, height=600)
+    st.plotly_chart(fig)
+    st.subheader("The organizations' regions targeted by the sanctions: ")
+    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
+    fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
+                     path=['org_continent', 'org_country'],
+                     color_continuous_scale='RdBu',
+                     template="simple_white",
+                     values='monetary_sanction',
+                     width=1000, height=600)
+    st.plotly_chart(fig)
+    st.subheader("Revenues vs monetary sanctions representation ")
+    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
+    fig = px.scatter(monetary_decision,
+                     x="org_revenues",
+                     y="monetary_sanction",
+                     log_x=True,
+                     log_y=True,
+                     template="simple_white",
+                     color="same_country",
+                     color_continuous_scale='RdBu',
+                     hover_name="org_name",
+                     width=1000, height=600)
+    st.plotly_chart(fig)
+    fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
+                     x="decision_date",
+                     size="log10_monetary_sanction",
+                     y="org_revenues",
+                     log_y=True,
+                     template="simple_white",
+                     color="same_country",
+                     hover_name="monetary_sanction",
+                     width=1000, height=600)
+    st.plotly_chart(fig)
+    fig = px.histogram(monetary_decision, x="log10_monetary_sanction",
+                       # y="log10_org_revenues",
+                       color="same_country",
+                       marginal="box",  # or violin, rug
+                       template="simple_white",
+                       width=1000, height=600, nbins=40, opacity=0.5,
+                       hover_data=monetary_decision.columns)
+    st.plotly_chart(fig)
+    fig = px.histogram(monetary_decision, x="log10_monetary_sanction_rate",
+                       # y="log10_org_revenues",
+                       color="same_country",
+                       marginal="box",  # or violin, rug
+                       template="simple_white",
+                       width=1000, height=600, nbins=40, opacity=0.5,
+                       hover_data=monetary_decision.columns)
+    st.plotly_chart(fig)
+    p = scipy.stats.ks_2samp(monetary_decision[monetary_decision.same_country]['log10_monetary_sanction_rate'],
+                             monetary_decision[~monetary_decision.same_country]['log10_monetary_sanction_rate']
+                             , alternative='two-sided', mode='auto')
+    st.metric(label="p-value", value=f"{round(p.pvalue, 2)}%")
+    st.subheader("Sum of monetary sanctions over time ")
+    st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
+    chart_data = get_themes_per_year(monetary_decision)
+    fig = px.area(chart_data, x="year",
+                  y="monetary_sanction",
+                  color="violation_theme",
+                  template="simple_white",
+                  # groupnorm="fraction",
+                  line_group="violation_theme",
+                  width=1000, height=600)
+    st.plotly_chart(fig)
+##############################################
+####
+# build ML model
+####
+##############################################
 st.title("Training phase")
+predictors, target = prepare_data(monetary_decision)
 # train the model
+if st.button('Run training'):
+    with st.expander("Training results"):
+        st.write(f"dataset size: {monetary_decision.shape[0]}")
+        st.markdown("Plot taget distribution: log 10 of monetary sanctions")
+        fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
+        fig.update_layout(width=1000,
+                          template="simple_white",
+                          height=600,
+                          bargap=0.01)
+        st.plotly_chart(fig)
+        # split data set
+        predictors_train, predictors_test, target_train, target_test = split(predictors, target)
+        st.subheader("Split dataset between training and test:")
+        st.metric(label="Training size", value=predictors_train.shape[0])
+        st.metric(label="Test size", value=predictors_test.shape[0])
+        xgb_model = run_training(predictors_train, target_train)
+        # evaluate model error
+        target_train_predicted = predict(xgb_model, predictors_train)
+        training_bias = np.mean(target_train_predicted - target_train)
+        st.metric(label="Training bias", value=training_bias)
+        target_test_predicted = predict(xgb_model, predictors_test)
+        test_errors = target_test_predicted - target_test
+        test_bias = np.mean(test_errors)
+        st.metric(label="Test bias", value=test_bias)
+        fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
+        fig.update_layout(width=1000,
+                          template="simple_white",
+                          height=600,
+                          bargap=0.01)
+        st.plotly_chart(fig)
+        st.subheader("Plot features importance for the trained model")
+        xgb_features_importance = features_importance(xgb_model)
+        fig = px.bar(xgb_features_importance,
+                     orientation='h',
+                     width=1000,
+                     template="simple_white",
+                     height=600,
+                     )
+        st.plotly_chart(fig)
+        st.subheader("Plot predicted vs real")
+        compare = pd.concat(
+            [pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
+             pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'})])
+        fig = px.scatter(
+            compare,
+            x='predicted',
+            y='target',
+            color='sample',
+            marginal_y="violin",
+            width=1000,
+            template="simple_white",
+            height=600,
+            trendline="ols")
+        st.plotly_chart(fig)
+        naive_error_std = np.std(target_train - np.mean(target_train_predicted))
+        model_error_std = np.std(target_train - target_train_predicted)
+        st.metric(label="Naive error standard deviation", value=naive_error_std)
+        st.metric(label="Model error standard deviation", value=model_error_std)
+        corr_matrix = np.corrcoef(target_train, target_train_predicted)
+        R_sq = corr_matrix[0, 1] ** 2
+        st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
+        naive_error_std = np.std(target_test - np.mean(target_test_predicted))
+        model_error_std = np.std(target_test - target_test_predicted)
+        st.metric(label="Naive error standard deviation", value=naive_error_std)
+        st.metric(label="Model error standard deviation", value=model_error_std)
+        corr_matrix = np.corrcoef(target_test, target_test_predicted)
+        R_sq = corr_matrix[0, 1] ** 2
+        st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * R_sq, 2)}%")
+st.sidebar.title("Organizations view")
+col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
 sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
 authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
 authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)