Spaces:

HuggingFaceM4
/

IDEFICS_Data_Measurement_Tool

Runtime error

App Files Files Community

IDEFICS_Data_Measurement_Tool / widgets /general_stats.py

Ezi

Upload 312 files

46df0b6 about 1 year ago

raw

history blame contribute delete

No virus

2.25 kB

	import gradio as gr
	import pandas as pd

	from widgets.widget_base import Widget
	from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls
	import utils

	logs = utils.prepare_logging(__file__)


	class GeneralStats(Widget):
	def __init__(self):
	self.general_stats = gr.Markdown(render=False)
	self.general_stats_top_vocab = gr.DataFrame(render=False)
	self.general_stats_missing = gr.Markdown(render=False)
	self.general_stats_duplicates = gr.Markdown(render=False)

	def render(self):
	with gr.TabItem(f"General Text Statistics"):
	self.general_stats.render()
	self.general_stats_missing.render()
	self.general_stats_duplicates.render()
	self.general_stats_top_vocab.render()

	def update(self, dstats: dmt_cls):
	general_stats_text = f"""
	Use this widget to check whether the terms you see most represented in the dataset make sense for the goals of the dataset.

	There are {str(dstats.total_words)} total words.

	There are {dstats.total_open_words} after removing closed class words.

	The most common [open class words](https://dictionary.apa.org/open-class-words) and their counts are:
	"""
	top_vocab = pd.DataFrame(dstats.sorted_top_vocab_df).round(4)
	missing_text = (
	f"There are {dstats.text_nan_count} missing values in the dataset"
	)
	if dstats.dups_frac > 0:
	dupes_text = f"The dataset is {round(dstats.dups_frac * 100, 2)}% duplicates, For more information about the duplicates, click the 'Duplicates' tab."
	else:
	dupes_text = "There are 0 duplicate items in the dataset"
	return {
	self.general_stats: general_stats_text,
	self.general_stats_top_vocab: top_vocab,
	self.general_stats_missing: missing_text,
	self.general_stats_duplicates: dupes_text,
	}

	@property
	def output_components(self):
	return [
	self.general_stats,
	self.general_stats_top_vocab,
	self.general_stats_missing,
	self.general_stats_duplicates,
	]

	def add_events(self, state: gr.State):
	pass