File size: 6,695 Bytes
46df0b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
import gradio as gr
import pandas as pd

from widgets.widget_base import Widget
from data_measurements.dataset_statistics import DatasetStatisticsCacheClass as dmt_cls
import utils

logs = utils.prepare_logging(__file__)


class Npmi(Widget):
    def __init__(self):
        self.npmi_first_word = gr.Dropdown(
            render=False, label="What is the first word you want to select?"
        )
        self.npmi_second_word = gr.Dropdown(
            render=False, label="What is the second word you want to select?"
        )
        self.npmi_error_text = gr.Markdown(render=False)
        self.npmi_df = gr.HTML(render=False)
        self.sort = gr.Dropdown(label="Sort By Column", render=False)
        self.npmi_empty_text = gr.Markdown(render=False)
        self.npmi_description = gr.Markdown(render=False)

    @property
    def output_components(self):
        return [
            self.npmi_first_word,
            self.npmi_second_word,
            self.sort,
            self.npmi_error_text,
            self.npmi_df,
            self.npmi_description,
            self.npmi_empty_text,
        ]

    def render(self):
        with gr.TabItem("Word Association: nPMI"):
            self.npmi_description.render()
            self.npmi_first_word.render()
            self.npmi_second_word.render()
            self.sort.render()
            self.npmi_df.render()
            self.npmi_empty_text.render()
            self.npmi_error_text.render()

    def update(self, dstats: dmt_cls):
        min_vocab = dstats.min_vocab_count
        npmi_stats = dstats.npmi_obj
        available_terms = npmi_stats.avail_identity_terms
        output = {comp: gr.update(visible=False) for comp in self.output_components}
        if npmi_stats and len(available_terms) > 0:
            output[self.npmi_description] = gr.Markdown.update(
                value=self.expander_npmi_description(min_vocab), visible=True
            )
            output[self.npmi_first_word] = gr.Dropdown.update(
                choices=available_terms, value=available_terms[0], visible=True
            )
            output[self.npmi_second_word] = gr.Dropdown.update(
                choices=available_terms[::-1], value=available_terms[-1], visible=True
            )
            output[self.sort] = gr.Dropdown.update(choices=['bias', available_terms[0], available_terms[-1]],
                                                   value='bias')
            output.update(
                self.npmi_show(available_terms[0], available_terms[-1], 'bias', dstats)
            )
        else:
            output[self.npmi_error_text] = gr.Markdown.update(
                visible=True,
                value="No words found co-occurring with both of the selected identity terms.",
            )
        return output

    def npmi_show(self, term1, term2, sort_col, dstats):
        npmi_stats = dstats.npmi_obj
        paired_results = npmi_stats.get_display(term1, term2)
        output = {}
        if paired_results.empty:
            output[self.npmi_empty_text] = gr.Markdown.update(
                value="""No words that co-occur enough times for results! Or there's a 🐛. 
                        Or we're still computing this one. 🤷""",
                visible=True,
            )
            output[self.npmi_df] = gr.DataFrame.update(visible=False)
        else:
            output[self.npmi_empty_text] = gr.Markdown.update(visible=False)
            logs.debug("Results to be shown in streamlit are")
            logs.debug(paired_results)
            s = pd.DataFrame(
                paired_results.sort_values(sort_col, ascending=False)
            )
            s.index.name = "word"
            s = s.reset_index().round(4)
            bias_col = [col for col in s.columns if col != "word"]
            # Keep the dataframe from being crazy big.
            if s.shape[0] > 10000:
                bias_thres = max(abs(s[s[0]][5000]), abs(s[s[0]][-5000]))
                logs.info(f"filtering with bias threshold: {bias_thres}")
                s_filtered = s[s[0].abs() > bias_thres]
            else:
                s_filtered = s
            out_df = (
                s_filtered.style.background_gradient(subset=bias_col)
                .format(formatter="{:,.3f}", subset=bias_col)
                .set_properties(**{"text-align": "center", "width": "100em"})
                .set_caption(
                    "nPMI scores between the selected identity terms and the words they both co-occur with"
                )
            )
            output[self.npmi_df] = out_df.to_html()
        return output

    @staticmethod
    def expander_npmi_description(min_vocab):
        return f"""
        Use this widget to identify problematic biases and stereotypes in 
        your data.

        nPMI scores for a word help to identify potentially
        problematic associations, ranked by how close the association is.

        nPMI bias scores for paired words help to identify how word
        associations are skewed between the selected selected words
        ([Aka et al., 2021](https://arxiv.org/abs/2103.03417)).

        You can select from gender and sexual orientation
        identity terms that appear in the dataset at least {min_vocab} times.

        The resulting ranked words are those that co-occur with both identity terms.

        The more *positive* the score, the more associated the word is with 
        the first identity term.
        The more *negative* the score, the more associated the word is with 
        the second identity term.

        -----
        """

    def update_sort_and_npmi(self, first_word, second_word, sort_col, dstats):
        output = {self.sort: gr.Dropdown.update(choices=['bias', first_word, second_word],
                                                value='bias')}
        new_df = self.npmi_show(first_word, second_word, sort_col, dstats)
        output.update(new_df)
        return output

    def add_events(self, state: gr.State):
        self.npmi_first_word.change(
            self.update_sort_and_npmi,
            inputs=[self.npmi_first_word, self.npmi_second_word, self.sort, state],
            outputs=[self.npmi_df, self.npmi_empty_text, self.sort],
        )
        self.npmi_second_word.change(
            self.update_sort_and_npmi,
            inputs=[self.npmi_first_word, self.npmi_second_word, self.sort, state],
            outputs=[self.npmi_df, self.npmi_empty_text, self.sort],
        )
        self.sort.change(
            self.npmi_show,
            inputs=[self.npmi_first_word, self.npmi_second_word, self.sort, state],
            outputs=[self.npmi_df, self.npmi_empty_text],
        )