|
import streamlit as st |
|
import json |
|
import pandas as pd |
|
import math |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
|
|
|
|
def _cutoff_at_percent(values, percent):
    """Return the element of *values* found at the ``percent`` mark in sorted order.

    The rank used is ``floor(percent * len(values) / 100) - 1``, clamped to 0.
    ``np.partition`` places the element of that rank at its sorted position
    without sorting the whole array.
    """
    rank = max(0, math.floor(percent * len(values) / 100) - 1)
    return np.partition(values, rank)[rank]


def visualization(path_data: str, lang: str, num_docs: int, num_docs_for_words: int) -> None:
    """Render the Streamlit page for inspecting and filtering documents.

    The JSON file at ``path_data`` is loaded as a list of documents. Each
    document must have a ``"text"`` entry and may carry the statistics
    ``"special_%"``, ``"stop_%"``, ``"bad_%"`` and ``"perplexity"``. For every
    statistic present, a sidebar slider selects the percentage of documents
    to filter out; the page then shows the filtered and the kept documents,
    a word-length filter over the words of the first documents, and a
    download button for the raw file.

    Args:
        path_data: Path of the JSON file to load and to offer for download.
        lang: Language name shown in the page title.
        num_docs: Maximum number of documents to display.
        num_docs_for_words: Number of leading documents whose words feed the
            word-length section.
    """
    sort_hint = "Click on a column to sort by it, place the cursor on the text to display it."

    with open(path_data, encoding="utf-8") as json_file:
        data = json.load(json_file)

    num_docs = min(num_docs, len(data))

    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

    # Unique space-separated tokens of the first `num_docs_for_words` documents.
    words = {
        word
        for doc in data[:num_docs_for_words]
        for word in doc["text"].split(" ")
    }
    # Columns are declared explicitly so the frame keeps them even when no
    # word was collected.
    words_data = pd.DataFrame(
        [{"len_word": len(word), "word": word} for word in words],
        columns=["len_word", "word"],
    )

    data = pd.DataFrame(data[:num_docs])
    columns = list(data)

    st.header("Filtering based on document content")

    # (column, slider label, sidebar summary template, max_cutoff)
    # max_cutoff=True keeps rows with value <= cutoff; False keeps value >= cutoff.
    filter_specs = (
        (
            "special_%",
            "% filtered by special characters ratio",
            "Kept text with <{:.1f}% special chars",
            True,
        ),
        (
            "stop_%",
            "% filtered by stop word ratio",
            "Kept text with >{:.1f}% stop words",
            False,
        ),
        (
            "bad_%",
            "% filtered by badwords ratio",
            "Kept text with <{:.1f}% bad words",
            True,
        ),
        (
            "perplexity",
            "% filtered by perplexity",
            "Kept text with <{:.0f} perplexity",
            True,
        ),
    )

    keys = []
    for column, label, summary, max_cutoff in filter_specs:
        if column not in columns:
            continue
        ratio = st.sidebar.slider(label, 0.0, 100.0, 0.0, step=1.0)
        # An upper-bound filter drops the top `ratio` percent, so its cutoff
        # sits at the (100 - ratio) mark; a lower-bound filter drops the
        # bottom `ratio` percent.
        percent = 100 - ratio if max_cutoff else ratio
        cutoff = _cutoff_at_percent(data[column], percent)
        st.sidebar.text(summary.format(cutoff))
        keys.append((column, cutoff, max_cutoff))

    # Start from "keep every row" so that a file without any statistic
    # column still yields a proper boolean row mask instead of a scalar.
    cond = np.ones(len(data.index), dtype=bool)
    for key, cutoff, max_cutoff in keys:
        column_ok = (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
        cond &= column_ok.to_numpy()

    data_not_keep = data.loc[~cond]
    st.subheader("Filtered data")
    st.markdown(sort_hint)
    st.dataframe(data_not_keep)

    data_keep = data.loc[cond]
    st.subheader("Kept data")
    st.markdown(sort_hint)
    st.dataframe(data_keep)

    def plot_hist(dataframe, key, num_bins=50):
        """Draw a bar chart of the histogram of ``dataframe[key]``.

        Bins span ``0`` to the column maximum. This helper is not called
        anywhere in this function at the moment.
        """
        st.subheader(" ".join(key.split("_")))
        hist_values = dataframe[key].values
        max_range = np.max(hist_values)
        hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
        st.bar_chart(hist_values)
        st.markdown(f"Each bin is of size: {max_range / num_bins}.")

    st.header("Filtering links and concatenated words")
    # With no words at all, treat the longest word as having length 0.
    longest_word = int(np.max(words_data["len_word"])) if len(words_data.index) else 0
    max_len_word = longest_word + 1
    cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
    cond_words = words_data["len_word"] <= cutoff_word

    words_keep = words_data.loc[cond_words]
    st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
    st.markdown(sort_hint)
    st.dataframe(words_keep)

    words_not_keep = words_data.loc[np.invert(cond_words)]
    st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
    st.markdown(sort_hint)
    st.dataframe(words_not_keep)

    st.header("Download data")

    # The open file handle itself is handed to the download button.
    with open(path_data, encoding="utf-8") as json_file:
        st.download_button(
            label="Download data as json",
            data=json_file,
            file_name="data.json",
        )
|
|
|
|
|
# Settings for the page rendered when this script runs.
path_data = "./en_examples_with_stats_no_small_docs.json"
lang = "English"
# Documents shown in the tables, and documents scanned for the word section.
num_docs, num_docs_for_words = 5000, 500

visualization(
    path_data=path_data,
    lang=lang,
    num_docs=num_docs,
    num_docs_for_words=num_docs_for_words,
)
|
|