Spaces:

huggingface
/

text-data-filtering

Running

File size: 2,268 Bytes

fc95975
4bbaeac
 
 
fc95975
f622ed0
4bbaeac
4f5af5a
4bbaeac
 
 
4f5af5a
4bbaeac
 
 
f622ed0
4bbaeac
4f5af5a
4bbaeac
 
 
 
 
4f5af5a
4bbaeac
 
 
 
 
 
 
4f5af5a
4bbaeac
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f622ed0

import streamlit as st
import json
import pandas as pd
import numpy as np


st.title('5k English documents from Oscar with their stats.')

path_data = "./10K_english_examples_with_stats.json"
# JSON text is UTF-8; pin the encoding so decoding does not depend on the
# platform's locale default.
with open(path_data, encoding="utf-8") as json_file:
    data = json.load(json_file)

# Keep only the first 5000 records of the file for the demo.
# NOTE(review): assumes the file holds a JSON array of per-document records
# (one dict of stats per document) — confirm against the data file.
data = data[:5000]
data = pd.DataFrame(data)
# "len_words" is dropped before anything is displayed or filtered.
del data["len_words"]

st.header("Parameters of the filtering")

# One slider per statistic. The defaults (1.0 for the "Max" ratio cutoffs,
# 0.0 for the "Min" cutoffs, and the slider maximum for perplexity) sit at
# the end of each slider's range.
cutoff_special_characters_ratio = st.slider(
    "Max cutoff special characters ratio",
    min_value=0.0,
    max_value=1.0,
    value=1.0,
    step=0.01,
)
cutoff_stopwords_ratio = st.slider(
    "Min cutoff stopwords ratio",
    min_value=0.0,
    max_value=1.0,
    value=0.0,
    step=0.01,
)
cutoff_badwords_ratio = st.slider(
    "Max cutoff badwords ratio",
    min_value=0.0,
    max_value=1.0,
    value=1.0,
    step=0.001,
)
cutoff_lang_id_score = st.slider(
    "Min cutoff lang id score",
    min_value=0.0,
    max_value=1.0,
    value=0.0,
    step=0.01,
)
# Integer slider; the perplexity cutoff is treated as an upper bound below.
cutoff_perplexity_score = st.slider(
    "Perplexity cutoff perplexity score",
    min_value=0,
    max_value=14_000_000,
    value=14_000_000,
)

# (column name, chosen cutoff, is_upper_bound): True means rows are kept when
# value <= cutoff, False means rows are kept when value >= cutoff.
keys = [
    ("special_characters_ratio", cutoff_special_characters_ratio, True),
    ("stopwords_ratio", cutoff_stopwords_ratio, False),
    ("badwords_ratio", cutoff_badwords_ratio, True),
    ("lang_id_score", cutoff_lang_id_score, False),
    ("perplexity_score", cutoff_perplexity_score, True),
]

def _show_table(heading, frame):
    """Render `frame` under `heading`, preceded by the two usage hints."""
    st.header(heading)
    st.markdown("Click on a column to sort by it.")
    st.markdown("Place the cursor on the text to display it.")
    st.dataframe(frame)


# Build one boolean mask per statistic (<= cutoff for upper bounds, >= cutoff
# for lower bounds) and AND them row-wise: a row is kept only if it passes
# every cutoff.
cond = np.all(
    [
        data[column].le(limit) if is_upper_bound else data[column].ge(limit)
        for column, limit, is_upper_bound in keys
    ],
    axis=0,
)

data_keep = data.loc[cond]
_show_table("Data that we keep", data_keep)

# The complement of the mask: rows failing at least one cutoff.
data_not_keep = data.loc[~cond]
_show_table("Data that is thrown away", data_not_keep)

def plot_hist(key, num_bins=50):
    """Render a histogram of one column of the module-level ``data`` frame.

    The section header is the column name with underscores replaced by
    spaces. The histogram uses ``num_bins`` equal-width bins spanning from 0
    to the column's maximum, and the bin width is reported below the chart.

    Args:
        key: Name of the column in ``data`` to plot.
        num_bins: Number of equal-width bins.
    """
    st.header(key.replace("_", " "))

    column_values = data[key].values
    upper_edge = np.max(column_values)
    # Only the per-bin counts are charted; the bin edges are discarded.
    counts, _edges = np.histogram(
        column_values,
        bins=num_bins,
        range=(0, upper_edge),
    )
    st.bar_chart(counts)

    bin_width = upper_edge / num_bins
    st.markdown(f"Each bin is of size: {bin_width}.")

# Render one histogram per filtered statistic, in the order of the `keys`
# table (only the column name of each entry is needed here).
for key, _, _ in keys:
    plot_hist(key)

st.header('Download data')

# Open in binary mode so the file's bytes are served verbatim. Text mode
# would decode with the platform's default encoding and re-encode on the way
# out, which can fail or alter non-ASCII content on a non-UTF-8 locale.
# The MIME type is set explicitly so the download is labelled as JSON.
# NOTE(review): this offers the complete source file at `path_data`, not the
# 5000-row or filtered subset shown above — confirm that is intended.
with open(path_data, "rb") as json_file:
    btn = st.download_button(
        label="Download data as json",
        data=json_file,
        file_name='data.json',
        mime="application/json",
    )