HugoLaurencon's picture
first draft
4bbaeac
raw
history blame
No virus
2.27 kB
import streamlit as st
import json
import pandas as pd
import numpy as np
st.title('5k English documents from Oscar with their stats.')

# Path is relative to the working directory the app is launched from.
path_data = "./10K_english_examples_with_stats.json"

# JSON is UTF-8 by specification; pass the encoding explicitly so decoding
# does not depend on the platform's locale default.
with open(path_data, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Only the first 5000 records are used by the app (matching the "5k" in the
# title); the rest of the file is ignored.
data = data[:5000]
data = pd.DataFrame(data)

# Remove the "len_words" column from the frame; raises KeyError if the
# records do not carry that field.
del data["len_words"]
st.header('Parameters of the filtering')

# One slider per statistic. "Max" sliders default to the top of their range
# and "Min" sliders to the bottom, so the initial state applies the loosest
# possible thresholds.
cutoff_special_characters_ratio = st.slider(
    "Max cutoff special characters ratio", 0., 1., 1., step=0.01
)
cutoff_stopwords_ratio = st.slider(
    "Min cutoff stopwords ratio", 0., 1., 0., step=0.01
)
cutoff_badwords_ratio = st.slider(
    "Max cutoff badwords ratio", 0., 1., 1., step=0.001
)
cutoff_lang_id_score = st.slider(
    "Min cutoff lang id score", 0., 1., 0., step=0.01
)
# Label fixed to follow the "Max/Min cutoff <stat>" pattern of the other
# sliders: this threshold is applied as an upper bound (see `keys` below).
cutoff_perplexity_score = st.slider(
    "Max cutoff perplexity score", 0, 14000000, 14000000
)

# (column name, chosen cutoff, is_max_cutoff). When the flag is True a row is
# kept if its value is <= cutoff; when False, if its value is >= cutoff.
keys = [
    ("special_characters_ratio", cutoff_special_characters_ratio, True),
    ("stopwords_ratio", cutoff_stopwords_ratio, False),
    ("badwords_ratio", cutoff_badwords_ratio, True),
    ("lang_id_score", cutoff_lang_id_score, False),
    ("perplexity_score", cutoff_perplexity_score, True),
]

# One boolean Series per statistic, then AND them row-wise: a document is
# kept only if it passes every individual threshold.
cond = [
    (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
    for key, cutoff, max_cutoff in keys
]
cond = np.all(cond, axis=0)
data_keep = data.loc[cond]
def _show_table(title, frame):
    """Render one results section: a header, two usage hints, then the table."""
    st.header(title)
    st.markdown("Click on a column to sort by it.")
    st.markdown("Place the cursor on the text to display it.")
    st.dataframe(frame)


# Rows that satisfy every cutoff.
_show_table('Data that we keep', data_keep)

# Complement of the keep-mask: rows that fail at least one cutoff.
data_not_keep = data.loc[~cond]
_show_table('Data that is thrown away', data_not_keep)
def plot_hist(key, num_bins=50):
    """Render a histogram of one statistic column of the module-level ``data``.

    Writes a header (``key`` with underscores replaced by spaces), a bar
    chart of the per-bin counts, and a note giving the bin width. The chart
    receives only the counts, not the bin edges, so the note is what lets a
    reader relate a bar to a value range.

    Bins are equal-width over ``[0, max(values)]``; values below 0 fall
    outside that range and are not counted.

    Args:
        key: Name of the column in ``data`` to plot.
        num_bins: Number of equal-width bins.
    """
    st.header(" ".join(key.split("_")))

    # Missing values would make np.max return NaN, which np.histogram
    # rejects as a range bound, so drop them first.
    values = data[key].dropna().values
    if values.size == 0:
        # np.max raises on an empty array; there is nothing to draw anyway.
        st.markdown("No values to plot.")
        return

    max_range = np.max(values)
    counts, edges = np.histogram(
        values,
        bins=num_bins,
        range=(0, max_range),
    )
    st.bar_chart(counts)

    # When every value is 0 the requested range (0, 0) is degenerate and
    # NumPy widens it, so report the spacing of the edges it actually used
    # instead of 0 / num_bins.
    bin_size = max_range / num_bins if max_range > 0 else edges[1] - edges[0]
    st.markdown(f"Each bin is of size: {bin_size}.")
# One histogram section per filtered statistic, in the order listed in `keys`.
for key, _, _ in keys:
    plot_hist(key)

st.header('Download data')

# Hand Streamlit the raw bytes: binary mode avoids decoding the file with the
# locale's default encoding only for it to be re-encoded for the download.
# NOTE(review): this serves the whole file at `path_data`, not the 5000-row
# subset or the filtered rows shown above — confirm that is intended.
with open(path_data, "rb") as json_file:
    btn = st.download_button(
        label="Download data as json",
        data=json_file,
        file_name='data.json',
        # Without an explicit MIME type the payload is not labelled as JSON.
        mime="application/json",
    )