# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import scipy
# ``import scipy`` alone does not guarantee that the ``stats`` submodule is
# loaded on every SciPy version; the script calls ``scipy.stats.ks_2samp``.
import scipy.stats
import streamlit as st

from data_processing import load_data, process_data, get_monetary_dataframe, get_themes_per_year
from model import prepare_data, run_training, split, predict, features_importance


def _max_width_():
    """Send a raw-HTML markdown snippet meant to widen the page layout.

    NOTE(review): the payload passed to ``st.markdown`` is blank in this file
    and ``max_width_str`` is never interpolated, so the call currently has no
    visible effect. Presumably a ``<style>`` block was lost -- confirm against
    the original source before relying on this helper.
    """
    max_width_str = "max-width: 1500px;"  # intended CSS rule, currently unused
    st.markdown(
        """ """,
        unsafe_allow_html=True,
    )


# force screen width
_max_width_()

st.title("Data Analysis 🌎 📃")
st.write("by [Theolex](https://www.theolex.io/)")

# load and process data
data = load_data()
decisions, organizations, authorities = process_data(data)

st.sidebar.title("Authorities parameters")
authorities_country = st.sidebar.selectbox('Authority country', ['All', *authorities.country.unique()])
if authorities_country != 'All':
    select_auth = authorities[authorities.country == authorities_country].name.sort_values()
else:
    select_auth = authorities.name.sort_values()
authority = st.sidebar.selectbox('Authority', ['All', *select_auth])
min_year, max_year = st.sidebar.slider('Decisions year', min_value=2001, max_value=2021, value=(2001, 2021))

# apply filters
if authority != 'All':
    # keep decisions whose authority collection contains the selected authority
    authority_filter = decisions.authorities_name.apply(lambda a: authority in a)
else:
    # keep decisions involving at least one authority of the selected country;
    # the lookup set is built once instead of once per row
    selectable_names = set(select_auth)
    authority_filter = decisions.authorities_name.apply(lambda a: not selectable_names.isdisjoint(a))
year_filter = (decisions.year >= min_year) & (decisions.year <= max_year)
decision_scope = decisions[authority_filter & year_filter]

st.subheader("Dataset Description")
st.metric('Number of validated decisions linked to organisations (and not individuals)', decision_scope.shape[0])
st.metric('Decisions with monetary sanctions', decision_scope[decision_scope.monetary_sanction > 0].shape[0])

# explore monetary sanctions
monetary_decision = get_monetary_dataframe(decision_scope)
st.metric('Decisions with organizations that have published yearly revenues', sum(monetary_decision.has_revenues))
##
# Plot Graphs
##
with st.expander("Data exploration"):
    st.subheader("The organizations' sectors targeted by the sanctions: ")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.treemap(monetary_decision,
                     path=['org_company_type'],
                     color='org_revenues',
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("The organizations' regions targeted by the sanctions: ")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.treemap(monetary_decision[~monetary_decision.org_continent.isnull()],
                     path=['org_continent', 'org_country'],
                     color_continuous_scale='RdBu',
                     template="simple_white",
                     values='monetary_sanction',
                     width=1000, height=600)
    st.plotly_chart(fig)

    st.subheader("Revenues vs monetary sanctions representation ")
    st.markdown("The graph shows the cumulated monetary sanction for the current filters")
    fig = px.scatter(monetary_decision,
                     x="org_revenues",
                     y="monetary_sanction",
                     log_x=True,
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     color_continuous_scale='RdBu',
                     hover_name="org_name",
                     width=1000, height=600)
    st.plotly_chart(fig)

    fig = px.scatter(monetary_decision[~monetary_decision.org_revenues.isnull()],
                     x="decision_date",
                     size="log10_monetary_sanction",
                     y="org_revenues",
                     log_y=True,
                     template="simple_white",
                     color="same_country",
                     hover_name="monetary_sanction",
                     width=1000, height=600)
    st.plotly_chart(fig)

    # Same histogram layout for the sanction amount and for the sanction rate.
    for histogram_column in ("log10_monetary_sanction", "log10_monetary_sanction_rate"):
        fig = px.histogram(monetary_decision,
                           x=histogram_column,
                           color="same_country",
                           marginal="box",  # or violin, rug
                           template="simple_white",
                           width=1000, height=600,
                           nbins=40,
                           opacity=0.5,
                           hover_data=monetary_decision.columns)
        st.plotly_chart(fig)

    # Two-sample Kolmogorov-Smirnov test on the sanction rate, split by the
    # ``same_country`` flag.
    # NOTE(review): the rate is presumably NaN when revenues are missing, hence
    # the dropna() -- confirm against get_monetary_dataframe.
    sanction_rate = monetary_decision['log10_monetary_sanction_rate']
    same_country_rates = sanction_rate[monetary_decision.same_country].dropna()
    other_country_rates = sanction_rate[~monetary_decision.same_country].dropna()
    if same_country_rates.empty or other_country_rates.empty:
        # ks_2samp raises on an empty sample, which narrow filters can produce
        st.metric(label="p-value", value="n/a")
    else:
        # the test-mode keyword is left at its default ('auto'); newer SciPy
        # releases renamed it from ``mode`` to ``method``
        p = scipy.stats.ks_2samp(same_country_rates, other_country_rates, alternative='two-sided')
        # pvalue is a probability in [0, 1]; scale it so the "%" suffix is correct
        st.metric(label="p-value", value=f"{round(100 * p.pvalue, 2)}%")

    st.subheader("Sum of monetary sanctions over time ")
    st.markdown("The graph shows the cumulated monetary sanction per year for each violation theme")
    chart_data = get_themes_per_year(monetary_decision)
    fig = px.area(chart_data,
                  x="year",
                  y="monetary_sanction",
                  color="violation_theme",
                  template="simple_white",
                  line_group="violation_theme",
                  width=1000, height=600)
    st.plotly_chart(fig)


def _show_fit_metrics(target, predicted):
    """Display error spread and squared correlation for one sample.

    Shows three ``st.metric`` widgets: the standard deviation of the target
    around a constant prediction, the standard deviation of the model
    residuals, and the squared Pearson correlation between target and
    prediction expressed as a percentage.

    Args:
        target: observed values of the sample.
        predicted: model predictions aligned with ``target``.
    """
    # np.std is invariant to a constant shift, so this equals the spread of
    # ``target`` itself, whichever constant is subtracted.
    naive_error_std = np.std(target - np.mean(predicted))
    model_error_std = np.std(target - predicted)
    st.metric(label="Naive error standard deviation", value=naive_error_std)
    st.metric(label="Model error standard deviation", value=model_error_std)

    r_squared = np.corrcoef(target, predicted)[0, 1] ** 2
    st.metric(label="Explained variation thanks to model (R^2)", value=f"{round(100 * r_squared, 2)}%")


##############################################
####
# build ML model
####
##############################################
st.title("Training phase")
predictors, target = prepare_data(monetary_decision)

# train the model
# NOTE(review): the nesting below is reconstructed from data flow; everything
# that needs the trained model has to run inside the button branch.
if st.button('Run training'):
    with st.expander("Training results"):
        st.write(f"dataset size: {monetary_decision.shape[0]}")

        st.markdown("Plot target distribution: log 10 of monetary sanctions")
        fig = ff.create_distplot([target], [' log 10 of monetary sanctions'], bin_size=0.1)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)

        # split data set
        predictors_train, predictors_test, target_train, target_test = split(predictors, target)
        st.subheader("Split dataset between training and test:")
        st.metric(label="Training size", value=predictors_train.shape[0])
        st.metric(label="Test size", value=predictors_test.shape[0])

        xgb_model = run_training(predictors_train, target_train)

        # evaluate model error
        target_train_predicted = predict(xgb_model, predictors_train)
        training_bias = np.mean(target_train_predicted - target_train)
        st.metric(label="Training bias", value=training_bias)

        target_test_predicted = predict(xgb_model, predictors_test)
        test_errors = target_test_predicted - target_test
        test_bias = np.mean(test_errors)
        st.metric(label="Test bias", value=test_bias)

        fig = ff.create_distplot([test_errors], ['errors distribution'], bin_size=0.1)
        fig.update_layout(width=1000,
                          template="simple_white",
                          height=600,
                          bargap=0.01)
        st.plotly_chart(fig)

        st.subheader("Plot features importance for the trained model")
        xgb_features_importance = features_importance(xgb_model)
        fig = px.bar(xgb_features_importance,
                     orientation='h',
                     width=1000,
                     template="simple_white",
                     height=600)
        st.plotly_chart(fig)

        st.subheader("Plot predicted vs real")
        compare = pd.concat([
            pd.DataFrame({'target': target_test, 'predicted': target_test_predicted, 'sample': 'test'}),
            pd.DataFrame({'target': target_train, 'predicted': target_train_predicted, 'sample': 'train'}),
        ])
        fig = px.scatter(compare,
                         x='predicted',
                         y='target',
                         color='sample',
                         marginal_y="violin",
                         width=1000,
                         template="simple_white",
                         height=600,
                         trendline="ols")
        st.plotly_chart(fig)

        # training sample first, then test sample (same three metrics each)
        _show_fit_metrics(target_train, target_train_predicted)
        _show_fit_metrics(target_test, target_test_predicted)

st.sidebar.title("Organizations view")
col_x = ['log10_org_revenues', 'authorities_country', 'violation_theme', 'org_country', 'org_company_type']
sample_revenues = st.sidebar.number_input('Yearly revenues', value=1000000)
# NOTE(review): both selections are bound to ``authority``, so the country
# choice is overwritten by the activity choice and the authority picked in the
# filters above is no longer reachable under that name. Left unchanged because
# code beyond this excerpt may read these names -- confirm and rename.
authority = st.sidebar.selectbox('Organization country', predictors.org_country.cat.categories)
authority = st.sidebar.selectbox('Organization activity', predictors.org_company_type.cat.categories)