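# Streamlit app for inspecting documents from the OSCAR corpus and their
# precomputed statistics, and for tuning filtering thresholds interactively.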
import json
import math

import numpy as np
import pandas as pd
import streamlit as st
def visualization(path_data, lang, num_docs, num_docs_for_words):
    with open(path_data) as json_file:
        data = json.load(json_file)

    num_docs = min(num_docs, len(data))
    st.title(f"{num_docs} {lang} documents from Oscar with their stats.")
    sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
    words = {word for sentence in sentences for word in sentence}
    words_data = pd.DataFrame([{"len_word": len(word), "word": word} for word in words])

    data = pd.DataFrame(data[:num_docs])
    columns = list(data.columns)
    keys = []
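    # Each entry in `keys` is (column, cutoff, max_cutoff): max_cutoff=True
    # keeps rows with values <= cutoff, max_cutoff=False keeps rows >= cutoff.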
st.header("Filtering based on document content") | |
if "special_%" in columns: | |
special_ratio = st.sidebar.slider( | |
"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0 | |
) | |
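        # Keep the (100 - special_ratio)% of documents with the lowest special
        # character ratio; np.partition finds the cutoff without a full sort.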
        cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
        special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
        st.sidebar.text(f"Kept text with <{special_cutoff:.1f}% special chars")
        keys.append(("special_%", special_cutoff, True))
if "stop_%" in columns: | |
stop_ratio = st.sidebar.slider( | |
"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0 | |
) | |
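        # Stop words work the other way around: drop the bottom stop_ratio%
        # of documents, i.e. those with too few stop words.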
        cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
        stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
        st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
        keys.append(("stop_%", stop_cutoff, False))
    def recalculate_bad_words(file):
        def bad_word_ratio(text: str, bad_word_list):
            words = text.split()
            if not words:  # guard against empty documents
                return 0.0
            return len([word for word in words if word.lower().strip() in bad_word_list]) / len(words)

        bad_word_list = {word.decode().strip() for word in file.readlines()}
        data["bad_%"] = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
    bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")
    # Only recompute when a new file is uploaded; st.session_state persists
    # the previous upload across Streamlit reruns.
    if "old_bad_word_file" not in st.session_state:
        st.session_state.old_bad_word_file = None
    if bad_word_file is not None and bad_word_file != st.session_state.old_bad_word_file:
        recalculate_bad_words(bad_word_file)
        st.session_state.old_bad_word_file = bad_word_file
if "bad_%" in columns: | |
bad_ratio = st.sidebar.slider( | |
"% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1 | |
) | |
bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1) | |
bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index] | |
st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words") | |
keys.append(("bad_%", bad_cutoff, True)) | |
if "perplexity" in columns: | |
ppl_ratio = st.sidebar.slider( | |
"% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0 | |
) | |
ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1) | |
ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index] | |
st.sidebar.text(f"Kept text with <{ppl_cutoff:.0f} perplexity") | |
keys.append(("perplexity", ppl_cutoff, True)) | |
    cond = [
        (data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
        for key, cutoff, max_cutoff in keys
    ]
    cond = np.all(cond, axis=0)

    data_not_keep = data.loc[np.invert(cond)]
    st.subheader("Filtered data")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(data_not_keep)

    data_keep = data.loc[cond]
    st.subheader("Kept data")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(data_keep)
    # def plot_hist(dataframe, key, num_bins=50):
    #     st.subheader(" ".join(key.split("_")))
    #     hist_values = dataframe[key].values
    #     max_range = np.max(hist_values)
    #     hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
    #     st.bar_chart(hist_values)
    #     st.markdown(f"Each bin is of size: {max_range/num_bins}.")

    # for key, _, _ in keys:
    #     plot_hist(data, key)
st.header("Filtering links and concatenated words") | |
    max_len_word = int(np.max(words_data["len_word"])) + 1
    cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
    cond_words = words_data["len_word"] <= cutoff_word

    words_keep = words_data.loc[cond_words]
    st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(words_keep)

    words_not_keep = words_data.loc[np.invert(cond_words)]
    st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
    st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
    st.dataframe(words_not_keep)
st.header("Download data") | |
with open(path_data) as json_file: | |
btn = st.download_button( | |
label="Download data as json", | |
data=json_file, | |
file_name="data.json", | |
) | |
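# Default configuration: a JSON file of English documents with precomputed
# stats, 5000 documents for filtering and 500 for the word-length table.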
path_data = "./en_examples_with_stats_ldnoob.json" | |
lang = "English" | |
num_docs = 5000 | |
num_docs_for_words = 500 | |
visualization(path_data, lang, num_docs, num_docs_for_words) | |