Spaces:

huggingface
/

text-data-filtering

Runtime error

App Files Files Community

text-data-filtering / app.py

teven

update app

a446a8b about 3 years ago

raw

history blame

5.55 kB

	import streamlit as st
	import json
	import pandas as pd
	import math
	import numpy as np
	import matplotlib.pyplot as plt


	def visualization(path_data, lang, num_docs, num_docs_for_words):
	with open(path_data) as json_file:
	data = json.load(json_file)

	num_docs = min(num_docs, len(data))

	st.title(f"{num_docs} {lang} documents from Oscar with their stats.")

	sentences = [doc["text"].split(" ") for doc in data[:num_docs_for_words]]
	words = set([word for sentence in sentences for word in sentence])
	words_data = [{"len_word": len(word), "word": word} for word in words]
	words_data = pd.DataFrame(words_data)

	data = data[:num_docs]
	data = pd.DataFrame(data)

	columns = list(data)
	keys = []
	values = {}

	st.header("Filtering based on document content")

	if "special_%" in columns:
	special_ratio = st.sidebar.slider(
	"% filtered by special characters ratio", 0.0, 50.0, 0.0, step=1.0
	)
	cutoff_index = max(0, math.floor((100 - special_ratio) * len(data.index) / 100) - 1)
	special_cutoff = np.partition(data["special_%"], cutoff_index)[cutoff_index]
	st.sidebar.text(f"Kept text with <{special_cutoff:.1f}% special chars")
	keys.append(("special_%", special_cutoff, True))

	if "stop_%" in columns:
	stop_ratio = st.sidebar.slider(
	"% filtered by stop word ratio", 0.0, 50.0, 0.0, step=1.0
	)
	cutoff_index = max(0, math.floor(stop_ratio * len(data.index) / 100) - 1)
	stop_cutoff = np.partition(data["stop_%"], cutoff_index)[cutoff_index]
	st.sidebar.text(f"Kept text with >{stop_cutoff:.2f}% stop words")
	keys.append(("stop_%", stop_cutoff, False))

	@st.cache(suppress_st_warning=True)
	def recalculate_bad_words(file):

	def bad_word_ratio(text: str, bad_word_list):
	return len([word for word in text.split() if word.lower().strip() in bad_word_list]) / len(text.split())

	bad_word_list = [word.decode().strip() for word in file.readlines()]

	bad_word_ratios = [bad_word_ratio(text, bad_word_list) * 100 for text in data["text"]]
	data["bad_%"] = bad_word_ratios

	bad_word_file = st.sidebar.file_uploader("Upload your own list of bad words (1 word per line)")

	st.session_state.old_bad_word_file = None
	if bad_word_file != st.write(st.session_state.old_bad_word_file):
	recalculate_bad_words(bad_word_file)
	st.session_state.old_bad_word_file = bad_word_file

	if "bad_%" in columns:
	bad_ratio = st.sidebar.slider(
	"% filtered by badwords ratio", 0.0, 50.0, 0.0, step=0.1
	)
	bad_index = max(0, math.floor((100 - bad_ratio) * len(data.index) / 100) - 1)
	bad_cutoff = np.partition(data["bad_%"], bad_index)[bad_index]
	st.sidebar.text(f"Kept text with <{bad_cutoff:.2f}% bad words")
	keys.append(("bad_%", bad_cutoff, True))

	if "perplexity" in columns:
	ppl_ratio = st.sidebar.slider(
	"% filtered by perplexity", 0.0, 50.0, 0.0, step=1.0
	)
	ppl_index = max(0, math.floor((100 - ppl_ratio) * len(data.index) / 100) - 1)
	ppl_cutoff = np.partition(data["perplexity"], ppl_index)[ppl_index]
	st.sidebar.text(f"Kept text with <{ppl_cutoff:.0f} perplexity")
	keys.append(("perplexity", ppl_cutoff, True))

	cond = [
	(data[key] <= cutoff) if max_cutoff else (data[key] >= cutoff)
	for key, cutoff, max_cutoff in keys
	]
	cond = np.all(cond, axis=0)

	data_not_keep = data.loc[np.invert(cond)]
	st.subheader("Filtered data")
	st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
	st.dataframe(data_not_keep)

	data_keep = data.loc[cond]
	st.subheader("Kept data")
	st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
	st.dataframe(data_keep)

	# def plot_hist(dataframe, key, num_bins=50):
	# st.subheader(" ".join(key.split("_")))
	# hist_values = dataframe[key].values
	# max_range = np.max(hist_values)
	# hist_values = np.histogram(hist_values, bins=num_bins, range=(0, max_range))[0]
	# st.bar_chart(hist_values)
	# st.markdown(f"Each bin is of size: {max_range/num_bins}.")

	# for key, _, _ in keys:
	# plot_hist(data, key)

	st.header("Filtering links and concatenated words")
	max_len_word = int(np.max(words_data["len_word"])) + 1
	cutoff_word = st.sidebar.slider("Word length cutoff", 0, max_len_word, max_len_word)
	cond_words = words_data["len_word"] <= cutoff_word

	words_keep = words_data.loc[cond_words]
	st.subheader(f"Words that we keep (for {num_docs_for_words} documents)")
	st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
	st.dataframe(words_keep)

	words_not_keep = words_data.loc[np.invert(cond_words)]
	st.subheader(f"Words that are thrown away (for {num_docs_for_words} documents)")
	st.markdown("Click on a column to sort by it, place the cursor on the text to display it.")
	st.dataframe(words_not_keep)

	st.header("Download data")

	with open(path_data) as json_file:
	btn = st.download_button(
	label="Download data as json",
	data=json_file,
	file_name="data.json",
	)


	path_data = "./en_examples_with_stats_ldnoob.json"
	lang = "English"
	num_docs = 5000
	num_docs_for_words = 500

	visualization(path_data, lang, num_docs, num_docs_for_words)