File size: 2,251 Bytes
46df0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import gradio as gr
import pandas as pd

from widgets.widget_base import Widget
from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls
import utils

logs = utils.prepare_logging(__file__)


class GeneralStats(Widget):
    """Widget backing the "General Text Statistics" tab.

    Owns four Gradio components (three Markdown blocks and one DataFrame)
    and knows how to lay them out and how to fill them from a
    dataset-statistics object.
    """

    def __init__(self):
        # Every component is built with render=False so that nothing is
        # placed in the layout until render() is called.
        self.general_stats = gr.Markdown(render=False)
        self.general_stats_top_vocab = gr.DataFrame(render=False)
        self.general_stats_missing = gr.Markdown(render=False)
        self.general_stats_duplicates = gr.Markdown(render=False)

    def render(self):
        """Place the components inside a "General Text Statistics" tab item."""
        # Display order: summary text, missing values, duplicates, vocab table.
        display_order = (
            self.general_stats,
            self.general_stats_missing,
            self.general_stats_duplicates,
            self.general_stats_top_vocab,
        )
        with gr.TabItem("General Text Statistics"):
            for component in display_order:
                component.render()

    def update(self, dstats: dmt_cls):
        """Build the new value for each output component from ``dstats``.

        Reads ``total_words``, ``total_open_words``, ``sorted_top_vocab_df``,
        ``text_nan_count`` and ``dups_frac`` from ``dstats``.

        Returns:
            A dict mapping each of this widget's components to its new value.
        """
        summary_md = f"""
        Use this widget to check whether the terms you see most represented in the dataset make sense for the goals of the dataset.

        There are {dstats.total_words!s} total words.

        There are {dstats.total_open_words} after removing closed class words.

        The most common [open class words](https://dictionary.apa.org/open-class-words) and their counts are: 
        """
        # Vocabulary table, with numeric columns rounded to 4 decimals.
        vocab_frame = pd.DataFrame(dstats.sorted_top_vocab_df)
        vocab_frame = vocab_frame.round(4)

        missing_md = f"There are {dstats.text_nan_count} missing values in the dataset"

        # dups_frac is a fraction; it is shown as a percentage with 2 decimals.
        duplicates_md = (
            f"The dataset is {round(dstats.dups_frac * 100, 2)}% duplicates, "
            "For more information about the duplicates, "
            "click the 'Duplicates' tab."
            if dstats.dups_frac > 0
            else "There are 0 duplicate items in the dataset"
        )

        new_values = {}
        new_values[self.general_stats] = summary_md
        new_values[self.general_stats_top_vocab] = vocab_frame
        new_values[self.general_stats_missing] = missing_md
        new_values[self.general_stats_duplicates] = duplicates_md
        return new_values

    @property
    def output_components(self):
        """List of the components that ``update`` provides values for."""
        components = [self.general_stats, self.general_stats_top_vocab]
        components.append(self.general_stats_missing)
        components.append(self.general_stats_duplicates)
        return components

    def add_events(self, state: gr.State):
        """No events are attached by this widget."""
        return None