|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import numpy as np |
|
import pandas as pd |
|
import os |
|
import time |
|
from datetime import datetime |
|
import requests |
|
import matplotlib |
|
from matplotlib import pyplot as plt |
|
import seaborn as sns |
|
import plotly.express as px |
|
from scipy.stats import norm |
|
from fuzzywuzzy import fuzz |
|
|
|
|
|
# Matplotlib 3.6 renamed the bundled seaborn style sheets to "seaborn-v0_8*"
# and later releases dropped the old names, so use whichever name this
# installation actually ships (and keep the default style if neither exists).
for _style_name in ("seaborn-v0_8", "seaborn"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break

# Configure the browser tab before any other Streamlit element is emitted.
st.set_page_config(page_title="COVID statistics and Benford's Law",
                   page_icon='images/icon.ico',
                   initial_sidebar_state='auto')
|
|
|
|
|
|
|
|
|
def argmax(iterable):
    """Return the position of the largest item in *iterable*.

    When several items tie for the maximum, the position of the first one
    is returned. An empty iterable raises ``ValueError`` (from ``max``).
    """

    def _item(pair):
        # pair is (position, item) as produced by enumerate.
        return pair[1]

    best_position, _ = max(enumerate(iterable), key=_item)
    return best_position
|
|
|
|
|
|
|
|
|
def have_todays_data(file):
    """Return True when *file* exists and was last modified today.

    The check is used to decide whether a cached download is still fresh.
    It compares the full calendar date (year, month and day) of the file's
    modification time with today's local date.

    Args:
        file: Path of the cached file.

    Returns:
        bool: True if the file was modified today, False if it is older,
        missing, or cannot be stat'ed.
    """
    try:
        # Modification time, not access time: merely reading the cache must
        # not make stale data look fresh.
        modified = datetime.fromtimestamp(os.path.getmtime(file))
    except OSError:
        # Missing or inaccessible file: there is no usable cache.
        return False
    return modified.date() == datetime.today().date()
|
|
|
|
|
# Page header followed by a horizontal rule.
st.title("COVID statistics and Benford's Law")
st.markdown("""---""")

# Raw CSV time series (confirmed cases / deaths) in the CSSEGISandData
# COVID-19 repository.
url_cases = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv'

url_deaths = 'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv'

switch = st.sidebar.selectbox('Choose data', ('Cases', 'Deaths'), index=0)

# (download URL, local cache file, noun used in captions).
# Anything other than 'Deaths' selects the cases data set.
url, csv_file, description = (
    (url_deaths, 'deaths_stat.csv', 'deaths')
    if switch == 'Deaths'
    else (url_cases, 'cases_stat.csv', 'cases'))
|
|
|
|
|
# Reuse today's cached copy when there is one; otherwise download a fresh
# snapshot and cache it for later reruns.
if not have_todays_data(csv_file):
    # The timeout keeps the app from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)
    # Fail here instead of caching an HTTP error page as if it were CSV data
    # (the cache would otherwise be trusted for the rest of the day).
    response.raise_for_status()
    with open(csv_file, 'wb') as f:
        f.write(response.content)
raw_data = pd.read_csv(csv_file)

# Country names are collected before the frame is reduced to the
# country column plus the per-date columns.
countries = raw_data['Country/Region'].unique()
raw_data = raw_data.drop(columns=['Lat', 'Long', 'Province/State'])
|
|
|
|
|
# Ask for a country and resolve it to a name that exists in the data set.
# The previous retry loop could never iterate twice (both branches ended it),
# so the lookup is written as a single pass.
country = st.sidebar.text_input(
    'Country',
    value='Russia',
    help='Minor typos in countries names are forgiven)')
if country.title() not in countries:
    # No exact match on the title-cased input: fall back to the known name
    # with the highest fuzzy token-set ratio (first one wins on ties).
    requested = country.title()
    country = countries[argmax(
        [fuzz.token_set_ratio(name, requested) for name in countries])]
# Module-level flag kept for compatibility; a country is always resolved here.
exists = True
|
|
|
# Collapse the rows of each country (one per province/state in the raw file)
# into a single series per country.
raw_data = raw_data.groupby('Country/Region').sum()
# Day-over-day change along the date columns (presumably turning cumulative
# totals into daily new counts). The first date has no predecessor, so its
# NaN is replaced by 0. DataFrame.diff(axis=1) is the vectorized equivalent
# of applying Series.diff to every row.
raw_data = raw_data.diff(axis=1).fillna(0)
|
|
|
|
|
# Threshold used by the hypothesis check; the sidebar offers the options as
# strings, so the chosen one is converted to a float.
p_choice = st.sidebar.selectbox('Choose p-value', ('0.05', '0.01'), index=0)
p = float(p_choice)
|
|
|
|
|
# Reduce every daily figure to its leading decimal digit (sign ignored,
# zero stays 0). Series.map per column is used instead of
# DataFrame.applymap, which newer pandas releases deprecate.
raw_data = raw_data.apply(
    lambda column: column.map(lambda x: int(str(abs(x))[0])))

# Per country: number of days on which each leading digit occurred.
counts = raw_data.apply(lambda row: row.value_counts(), axis=1)
# Days with a zero figure have no leading digit and are excluded.
counts = counts.drop(columns=0)
# Turn the counts into relative frequencies within each country.
# NOTE(review): a digit that never occurs for a country stays NaN (not 0)
# and is therefore skipped by the means/statistics below - confirm intended.
counts = counts.div(counts.sum(axis=1), axis=0)

ru_counts = counts.loc[country]
# Average frequency of each digit across all countries.
world_counts = counts.mean(axis=0)
|
|
|
|
|
# Figure 1: world-average first-digit frequencies against Benford's law.
fig1 = plt.figure(figsize=(12, 5))
plt.plot(
    world_counts,
    label=f'Daily new {description} in all countries (first digit)')
# Benford's expected frequency for each leading digit 1..9.
log = [np.log10(1 + 1 / i) for i in range(1, 10)]
plt.plot(range(1, 10), log, color='green', label="Benford's law: log(1+1/n)")
plt.legend(fontsize=14)

st.write(f'''
Frequencies of the first digit occurence of daily new
COVID {description} in all countries vs [Benford's
law:](https://en.wikipedia.org/wiki/Benford's_law)
''')
# Streamlit renders the figure itself, so plt.show() is not needed here.
st.write(fig1)
# Close the figure once rendered; otherwise every script rerun leaves another
# open figure behind in pyplot's registry.
plt.close(fig1)

st.markdown('''
Observation: distribution is well aligned with Benford's law.
''')
st.markdown("""---""")
|
|
|
|
|
# Figure 2: per-digit spread over all countries (box plot) with the selected
# country and Benford's law drawn on top.
fig2 = plt.figure(figsize=(12, 7))
sns.boxplot(data=counts)
# The overlays use 0-based x positions (0..8) for digits 1..9 - presumably to
# line up with the categorical positions of the boxes.
sns.lineplot(x=range(9),
             y=ru_counts,
             color='red',
             marker='o',
             label=f'Daily new {description} in {country} (first digit)')
# Benford's expected frequency for each leading digit 1..9.
log = [np.log10(1 + 1 / i) for i in range(1, 10)]
plt.plot(log,
         color='green',
         linestyle='-',
         linewidth=2,
         label="Benford's law: log(1+1/n)")
plt.legend(fontsize=14)
st.markdown(f'''
Frequencies of the first digit occurence of daily new
COVID {description} in **{country}** vs all other countries:
''')
# Streamlit renders the figure itself, so plt.show() is not needed here.
st.write(fig2)
# Close the figure once rendered; otherwise every script rerun leaves another
# open figure behind in pyplot's registry.
plt.close(fig2)
|
|
|
st.markdown("""---""")
st.markdown('**Null hypothesis:**')
# The wording follows the selected data set (cases or deaths) instead of
# always saying "cases", matching the other captions on the page.
st.write(
    f'First digit of new daily COVID {description} in the country is distributed similarly to first digits of new daily COVID {description} in all other countries.'
)
|
|
|
|
|
|
|
def check_null(country_counts, df_stat, p):
    """Flag the digits whose frequency is an outlier versus all countries.

    For every digit column of *df_stat*, the country's frequency is placed
    on a normal distribution with that column's mean and standard deviation.
    The digit is flagged when the CDF value is below ``p`` or above
    ``1 - p``.

    Args:
        country_counts: First-digit frequencies of one country, indexable by
            the digit labels used as columns of *df_stat*.
        df_stat: Summary statistics per digit, as produced by
            ``DataFrame.describe()``; the ``'mean'`` and ``'std'`` rows are
            used. Column labels are the integer digits (1..9).
        p: Tail threshold applied to each side of the distribution.

    Returns:
        tuple: ``(check_results, strikes)`` where ``check_results`` is a
        length-9 float array holding 1.0 at index ``digit - 1`` for every
        flagged digit, and ``strikes`` is the number of flagged digits.
    """
    check_results = np.zeros(9)
    for n in df_stat.columns:
        # Look the statistics up by label: a positional lookup (n - 1) reads
        # the wrong column, or fails, as soon as a digit column is missing.
        prob = norm.cdf(country_counts[n],
                        loc=df_stat.loc['mean', n],
                        scale=df_stat.loc['std', n])
        # NaN probabilities (missing frequency or undefined std) compare
        # False on both sides and are therefore never flagged.
        # NOTE(review): each tail uses p, so both tails together cover 2 * p.
        check_results[n - 1] = prob < p or prob > 1 - p
    strikes = sum(check_results)
    return check_results, strikes
|
|
|
|
|
|
|
# Test the selected country's digit frequencies against the statistics of
# all countries and show, per digit, whether the hypothesis was rejected.
check_result, no_strikes = check_null(ru_counts, counts.describe(), p)
check_result = list(map(int, check_result))
check_df = pd.DataFrame({'Rejected': check_result}, index=range(1, 10))

st.markdown("""---""")
st.markdown(f'Check null hypothesis for **{country}**:')
st.markdown(
    f'Null hypothesis is rejected for {int(no_strikes)} digit(s).')
# Transposed so the digits run across the columns.
st.write(check_df.transpose())
|
|
|
|
|
|
|
# Statistics are taken before the 'strikes' column is added, so they cover
# the digit columns only.
counts_desc = counts.describe()


def _count_rejections(row):
    # Number of digits for which the hypothesis is rejected for this country.
    return check_null(row, counts_desc, p)[1]


counts['strikes'] = counts.apply(_count_rejections, axis=1)

# Countries with at least two rejections, most rejections first.
flagged = counts['strikes'] >= 2
x = counts[flagged].sort_values(by='strikes', ascending=False)
x.columns = x.columns.astype("str")
# Move the rejection count to the front, keeping the other columns in order.
cols = ['strikes']
cols.extend(col for col in x if col != 'strikes')
x = x[cols].rename(columns={'strikes': 'Rejections #'})
|
|
|
st.markdown("""---""")
st.write(
    f'Distribution of countries/regions by the number of null hypothesis rejections, p = {p}:'
)
# Plotly builds its own figure object; no Matplotlib figure is created here
# because it would only be left open and unused.
fig3 = px.histogram(counts, x="strikes", width=1200, height=500)
fig3.update_layout(
    xaxis_title_text='Rejections #',
    yaxis_title_text='Count',
    bargap=0.5
)

st.plotly_chart(fig3, use_container_width=True)
|
|
|
st.markdown("""---""")

st.write("COVID statistics 'champions':")
# Each format call is limited to its own columns via `subset`. A second,
# unrestricted call with a dict formatter may reset the columns it does not
# name to the default format on newer pandas (NOTE(review): confirm against
# the installed version); `subset` makes the result the same either way.
digit_columns = [col for col in x.columns if col != 'Rejections #']
st.write(
    x.style
    .format("{:.2}", subset=digit_columns, na_rep="-")
    .format("{:.0f}", subset=['Rejections #']))
|
|