nanom committed
Commit 743fd42
1 Parent(s): 5a3c3c7

Upload datos app

.gitignore ADDED
@@ -0,0 +1,3 @@
+ __pycache__/
+ .env
+ bias_tool_logs/
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2022 Fundación Vía Libre
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
app.py ADDED
@@ -0,0 +1,28 @@
+ # --- Import modules ---
+ from modules.module_vocabulary import Vocabulary
+
+ # --- Import interfaces ---
+ from interfaces.interface_datos import interface as interface_datos
+
+ # --- Tool config ---
+ AVAILABLE_LOGS = True       # [True | False]
+ LANGUAGE = "spanish"        # [spanish]
+ VOCABULARY_SUBSET = "full"  # [full]
+ # ToDo: Change context dataset owner from nanom to vialibre
+ CONTEXTS_DATASET = "nanom/splittedspanish3bwc"
+
+ # --- Init classes ---
+ vocabulary = Vocabulary(
+     subset_name=VOCABULARY_SUBSET
+ )
+
+ # --- Main App ---
+ iface = interface_datos(
+     vocabulary=vocabulary,
+     contexts=CONTEXTS_DATASET,
+     available_logs=AVAILABLE_LOGS,
+     lang=LANGUAGE
+ )
+
+ iface.queue(concurrency_count=8)
+ iface.launch(debug=False)
data/full_vocab_v6.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:478fa3e953fbc65746681b1b9770e726f0cd28a0a9992735c00001a09d04b42a
+ size 205538236
interfaces/.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
interfaces/interface_datos.py ADDED
@@ -0,0 +1,100 @@
+ from modules.module_logsManager import HuggingFaceDatasetSaver
+ from modules.module_connection import Word2ContextExplorerConnector
+ from tool_info import TOOL_INFO
+ import gradio as gr
+ import pandas as pd
+
+ def interface(vocabulary, contexts, available_logs, lang="spanish"):
+
+     # --- Init logs ---
+     log_callback = HuggingFaceDatasetSaver(
+         available_logs=available_logs
+     )
+
+     # --- Init Class ---
+     connector = Word2ContextExplorerConnector(vocabulary=vocabulary, context=contexts)
+     labels = pd.read_json(f"language/{lang}.json")["DataExplorer_interface"]
+
+     # --- Interface ---
+     iface = gr.Blocks(css=".container { max-width: 90%; margin: auto;}")
+
+     with iface:
+         with gr.Row():
+             with gr.Column():
+                 with gr.Group():
+                     gr.Markdown(labels["step1"])
+                     with gr.Row(): input_word = gr.Textbox(label=labels["inputWord"]["title"],
+                                                            show_label=False,
+                                                            placeholder=labels["inputWord"]["placeholder"])
+                     with gr.Row(): btn_get_w_info = gr.Button(labels["wordInfoButton"])
+
+                 with gr.Group():
+                     gr.Markdown(labels["step2"])
+                     n_context = gr.Slider(label="",
+                                           step=1, minimum=1, maximum=30, value=5,
+                                           visible=True, interactive=True)
+                 with gr.Group():
+                     gr.Markdown(labels["step3"])
+                     subsets_choice = gr.CheckboxGroup(label="",
+                                                       interactive=True,
+                                                       visible=True)
+                     with gr.Row(): btn_get_contexts = gr.Button(labels["wordContextButton"], visible=True)
+
+                 with gr.Row(): out_msj = gr.Markdown(label="", visible=True)
+
+             with gr.Column():
+                 with gr.Group():
+                     gr.Markdown(labels["wordDistributionTitle"])
+                     dist_plot = gr.Plot(label="", show_label=False)
+                     # Set visible=True if you want to see the cloud of related words by frequency
+                     wc_plot = gr.Plot(label="", show_label=False, visible=False)
+
+                 with gr.Group():
+                     gr.Markdown(labels["frequencyPerSetTitle"])
+                     subsets_freq = gr.HTML(label="")
+
+         with gr.Row():
+             with gr.Group():
+                 with gr.Row(): gr.Markdown(labels["contextList"])
+                 with gr.Row(): out_context = gr.Dataframe(label="",
+                                                           interactive=False,
+                                                           value=pd.DataFrame([], columns=['']),
+                                                           wrap=True,
+                                                           datatype=['str','markdown','str','markdown'])
+
+         with gr.Group():
+             gr.Markdown(TOOL_INFO)
+
+         btn_get_w_info.click(
+             fn=connector.get_word_info,
+             inputs=[input_word],
+             outputs=[out_msj,
+                      out_context,
+                      subsets_freq,
+                      dist_plot,
+                      wc_plot,
+                      subsets_choice]
+         )
+
+         btn_get_contexts.click(
+             fn=connector.get_word_context,
+             inputs=[input_word, n_context, subsets_choice],
+             outputs=[out_msj, out_context]
+         )
+
+         # --- Logs ---
+         save_field = [input_word, subsets_choice]
+         log_callback.setup(components=save_field, flagging_dir="edia_datos_es")
+
+         btn_get_contexts.click(
+             fn=lambda *args: log_callback.flag(
+                 flag_data=args,
+                 flag_option="datos",
+                 username="vialibre"
+             ),
+             inputs=save_field,
+             outputs=None,
+             preprocess=False
+         )
+
+     return iface
language/spanish.json ADDED
@@ -0,0 +1,16 @@
+ {
+     "DataExplorer_interface": {
+         "step1": "1. Ingrese una palabra de interés",
+         "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
+         "step3": "3. Seleccione conjuntos de interés",
+         "inputWord": {
+             "title": "",
+             "placeholder": "Ingresar aquí la palabra ..."
+         },
+         "wordInfoButton": "Obtener información de palabra",
+         "wordContextButton": "Buscar contextos",
+         "wordDistributionTitle": "Distribución de palabra en vocabulario",
+         "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
+         "contextList": "Lista de contextos"
+     }
+ }
modules/.gitignore ADDED
@@ -0,0 +1 @@
+ __pycache__/
modules/module_connection.py ADDED
@@ -0,0 +1,73 @@
+ import pandas as pd
+ import gradio as gr
+ from abc import ABC
+ from modules.module_word2Context import Word2Context
+
+ class Connector(ABC):
+     def parse_word(self, word: str):
+         return word.lower().strip()
+
+     def parse_words(self, array_in_string: str):
+         words = array_in_string.strip()
+         if not words:
+             return []
+         words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
+         return words
+
+     def process_error(self, err: str):
+         if err is None:
+             return
+         return "<center><h3>" + err + "</h3></center>"
+
+
+ class Word2ContextExplorerConnector(Connector):
+     def __init__(self, **kwargs):
+         vocabulary = kwargs.get('vocabulary', None)
+         context = kwargs.get('context', None)
+
+         if vocabulary is None or context is None:
+             raise KeyError
+         self.word2context_explorer = Word2Context(context, vocabulary)
+
+     def get_word_info(self, word):
+         err = ""
+         contexts = pd.DataFrame([], columns=[''])
+         subsets_info = ""
+         distribution_plot = None
+         word_cloud_plot = None
+         subsets_choice = gr.CheckboxGroup.update(choices=[])
+
+         err = self.word2context_explorer.errorChecking(word)
+         if err:
+             return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+         word = self.parse_word(word)
+
+         subsets_info, subsets_origin_info = self.word2context_explorer.getSubsetsInfo(word)
+
+         clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
+         subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
+
+         distribution_plot = self.word2context_explorer.genDistributionPlot(word)
+         word_cloud_plot = self.word2context_explorer.genWordCloudPlot(word)
+
+         return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+     def get_word_context(self, word, n_context, subset_choice):
+         word = self.parse_word(word)
+         n_context = int(n_context)
+         err = ""
+         contexts = pd.DataFrame([], columns=[''])
+
+         if len(subset_choice) > 0:
+             ds = self.word2context_explorer.findSplits(word, subset_choice)
+         else:
+             err = self.process_error("Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!")
+             return err, contexts
+
+         list_of_contexts = self.word2context_explorer.getContexts(word, n_context, ds)
+
+         contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
+         contexts["buscar"] = contexts.contexto.apply(lambda text: self.word2context_explorer.genWebLink(text))
+
+         return self.process_error(err), contexts
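The connector above is what the Gradio callbacks in `interfaces/interface_datos.py` call into. A hedged usage sketch follows; the word `"casa"` and the `DGT` subset are illustrative values, not taken from this commit, and assume the word exists in the loaded vocabulary.

```python
# Hypothetical standalone use of the connector (values are examples only).
from modules.module_vocabulary import Vocabulary
from modules.module_connection import Word2ContextExplorerConnector

vocabulary = Vocabulary(subset_name="full")
connector = Word2ContextExplorerConnector(
    vocabulary=vocabulary,
    context="nanom/splittedspanish3bwc",
)

# Returns: message HTML, contexts DataFrame, subset-frequency HTML,
# distribution plot, word-cloud plot, and the CheckboxGroup update.
outputs = connector.get_word_info("casa")

# Up to 5 highlighted contexts drawn from the selected subsets.
msg, contexts = connector.get_word_context("casa", 5, ["DGT"])
```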
modules/module_customSubsetsLabel.py ADDED
@@ -0,0 +1,89 @@
+ class CustomSubsetsLabel:
+     def __init__(self):
+         self.html_head = """
+             <html>
+             <head>
+             <meta charset="utf-8">
+             <meta name="viewport" content="width=device-width, initial-scale=1">
+             <style>
+                 progress {
+                     -webkit-appearance: none;
+                 }
+                 progress::-webkit-progress-bar {
+                     background-color: #666;
+                     border-radius: 7px;
+                 }
+                 progress {
+                     width:100%;
+                     height:4px;
+                     border-radius: 1px;
+                 }
+                 #myturn {
+                     display: block;
+                     position: relative;
+                     margin: auto;
+                     width: 90%;
+                     padding: 2px;
+                 }
+             </style>
+             </head>
+             <body>
+         """
+
+         self.html_footer = "</body></html>"
+
+         self.subset_links = {
+             'allwikis': "https://github.com/josecannete/wikiextractorforBERT",
+             'DGT': "http://opus.nlpl.eu/DGT.php",
+             'DOGC': "http://opus.nlpl.eu/DOGC.php",
+             'ECB': "http://opus.nlpl.eu/ECB.php",
+             'EMEA': "http://opus.nlpl.eu/EMEA.php",
+             'EUBookShop': "http://opus.nlpl.eu/EUbookshop.php",
+             'Europarl': "http://opus.nlpl.eu/Europarl.php",
+             'GlobalVoices': "http://opus.nlpl.eu/GlobalVoices.php",
+             'JRC': "http://opus.nlpl.eu/JRC-Acquis.php",
+             'multiUN': "http://opus.nlpl.eu/MultiUN.php",
+             'NewsCommentary11': "http://opus.nlpl.eu/News-Commentary-v11.php",
+             'OpenSubtitles2018': "http://opus.nlpl.eu/OpenSubtitles-v2018.php",
+             'ParaCrawl': "http://opus.nlpl.eu/ParaCrawl.php",
+             'TED': "http://opus.nlpl.eu/TED2013.php",
+             'UN': "http://opus.nlpl.eu/UN.php",
+         }
+
+     def __progressbar(self, percentage, subset, freq, size=15):
+         html = f"""
+             <div id="myturn">
+                 <progress value="{int(percentage)}" max="100"></progress>
+                 <p style="text-align:left; font-size:{size}px; padding:0px;">
+                     <a href="{self.subset_links[subset]}" target="_blank">
+                         <strong>{subset}</strong> <span style="font-size:{size-2}px">(Frecuencia: {freq})</span>
+                     </a>
+                     <span style="float:right;">
+                         <strong>{percentage}%</strong>
+                     </span>
+                 </p>
+             </div>
+         """
+         return html
+
+     def __render(self, subsets, freqs, percentages):
+         html = ""
+         for subset, freq, perc in zip(subsets, freqs, percentages):
+             html += self.__progressbar(
+                 percentage=perc,
+                 subset=subset,
+                 freq=freq
+             )
+
+         return self.html_head + html + self.html_footer
+
+     def compute(self, subsets_dic):
+         subsets_dic_info = {
+             k.split()[0]: {'freq': int(k.split()[1][1:-1]), 'perc': round(v*100, 2)}
+             for k, v in subsets_dic.items()
+         }
+
+         subsets = list(subsets_dic_info.keys())
+         freqs = [d['freq'] for d in subsets_dic_info.values()]
+         percentages = [d['perc'] for d in subsets_dic_info.values()]
+         return self.__render(subsets, freqs, percentages)
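`compute()` expects keys in the `"<subset> (<freq>)"` format produced by `Word2Context.getSubsetsInfo()`, mapped to fractions of the word's total frequency. A minimal sketch with made-up numbers:

```python
from modules.module_customSubsetsLabel import CustomSubsetsLabel

label = CustomSubsetsLabel()
html = label.compute({
    "DGT (120)": 0.6,        # 60% of the word's occurrences come from DGT
    "Europarl (80)": 0.4,    # remaining 40% from Europarl
})
# `html` is a standalone HTML fragment with one progress bar per subset,
# ready to be rendered by the gr.HTML component in interface_datos.py.
```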
modules/module_logsManager.py ADDED
@@ -0,0 +1,175 @@
+ from distutils.log import debug
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
+ from gradio.components import IOComponent
+ from gradio import utils
+ from typing import Any, List, Optional
+ from dotenv import load_dotenv
+ from datetime import datetime
+ import csv, os, pytz
+
+
+ # --- Load environment vars ---
+ load_dotenv()
+
+
+ # --- Classes declaration ---
+ class DateLogs:
+     def __init__(self, zone="America/Argentina/Cordoba"):
+         self.time_zone = pytz.timezone(zone)
+
+     def full(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%H:%M:%S %d-%m-%Y")
+
+     def day(self):
+         now = datetime.now(self.time_zone)
+         return now.strftime("%d-%m-%Y")
+
+ class HuggingFaceDatasetSaver(FlaggingCallback):
+     """
+     A callback that saves each flagged sample (both the input and output data)
+     to a HuggingFace dataset.
+     Example:
+         import gradio as gr
+         hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
+         def image_classifier(inp):
+             return {'cat': 0.3, 'dog': 0.7}
+         demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
+                             allow_flagging="manual", flagging_callback=hf_writer)
+     Guides: using_flagging
+     """
+
+     def __init__(
+         self,
+         hf_token: str = os.getenv('HF_TOKEN'),
+         dataset_name: str = os.getenv('DS_LOGS_NAME'),
+         organization: Optional[str] = os.getenv('ORG_NAME'),
+         private: bool = True,
+         available_logs: bool = False
+     ):
+         """
+         Parameters:
+             hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
+             dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
+             organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
+             private: Whether the dataset should be private (defaults to True).
+         """
+         self.hf_token = hf_token
+         self.dataset_name = dataset_name
+         self.organization_name = organization
+         self.dataset_private = private
+         self.datetime = DateLogs()
+         self.available_logs = available_logs
+
+         if not available_logs:
+             print("Push: logs DISABLED!...")
+
+
+     def setup(
+         self,
+         components: List[IOComponent],
+         flagging_dir: str
+     ):
+         """
+         Params:
+             flagging_dir (str): local directory where the dataset is cloned,
+             updated, and pushed from.
+         """
+         if self.available_logs:
+
+             try:
+                 import huggingface_hub
+             except (ImportError, ModuleNotFoundError):
+                 raise ImportError(
+                     "Package `huggingface_hub` not found. It is needed "
+                     "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
+                 )
+
+             path_to_dataset_repo = huggingface_hub.create_repo(
+                 repo_id=os.path.join(self.organization_name, self.dataset_name),
+                 token=self.hf_token,
+                 private=self.dataset_private,
+                 repo_type="dataset",
+                 exist_ok=True,
+             )
+
+             self.path_to_dataset_repo = path_to_dataset_repo
+             self.components = components
+             self.flagging_dir = flagging_dir
+             self.dataset_dir = self.dataset_name
+
+             self.repo = huggingface_hub.Repository(
+                 local_dir=self.dataset_dir,
+                 clone_from=path_to_dataset_repo,
+                 use_auth_token=self.hf_token,
+             )
+
+             self.repo.git_pull(lfs=True)
+
+             # Should filename be user-specified?
+             # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
+             self.log_file = os.path.join(self.dataset_dir, self.flagging_dir + ".csv")
+
+     def flag(
+         self,
+         flag_data: List[Any],
+         flag_option: Optional[str] = None,
+         flag_index: Optional[int] = None,
+         username: Optional[str] = None,
+     ) -> int:
+
+         if self.available_logs:
+             self.repo.git_pull(lfs=True)
+
+             is_new = not os.path.exists(self.log_file)
+
+             with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
+                 writer = csv.writer(csvfile)
+
+                 # File previews for certain input and output types
+                 infos, file_preview_types, headers = _get_dataset_features_info(
+                     is_new, self.components
+                 )
+
+                 # Generate the headers and dataset_infos
+                 if is_new:
+                     headers = [
+                         component.label or f"component {idx}"
+                         for idx, component in enumerate(self.components)
+                     ] + [
+                         "flag",
+                         "username",
+                         "timestamp",
+                     ]
+                     writer.writerow(utils.sanitize_list_for_csv(headers))
+
+                 # Generate the row corresponding to the flagged sample
+                 csv_data = []
+                 for component, sample in zip(self.components, flag_data):
+                     save_dir = os.path.join(
+                         self.dataset_dir,
+                         utils.strip_invalid_filename_characters(component.label),
+                     )
+                     filepath = component.deserialize(sample, save_dir, None)
+                     csv_data.append(filepath)
+                     if isinstance(component, tuple(file_preview_types)):
+                         csv_data.append(
+                             "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
+                         )
+
+                 csv_data.append(flag_option if flag_option is not None else "")
+                 csv_data.append(username if username is not None else "")
+                 csv_data.append(self.datetime.full())
+                 writer.writerow(utils.sanitize_list_for_csv(csv_data))
+
+             with open(self.log_file, "r", encoding="utf-8") as csvfile:
+                 line_count = len([None for row in csv.reader(csvfile)]) - 1
+
+             self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))
+
+         else:
+             line_count = 0
+             print("Logs: Virtual push...")
+
+         return line_count
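The saver reads its credentials from environment variables through python-dotenv (note that `.env` is git-ignored in this commit). A minimal sketch of the expected configuration, with the variable names taken from the `os.getenv` calls above and placeholder values:

```python
import os

# Local stand-in for a .env file; values are placeholders, not real credentials.
os.environ.setdefault("HF_TOKEN", "hf_xxxxxxxxxxxxxxxx")   # token with write access to the logs dataset
os.environ.setdefault("DS_LOGS_NAME", "edia-datos-logs")   # dataset repo that receives the flagged rows
os.environ.setdefault("ORG_NAME", "vialibre")              # organization the dataset repo is created under
```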
modules/module_segmentedWordCloud.py ADDED
@@ -0,0 +1,64 @@
+ from wordcloud import WordCloud
+ import matplotlib.pyplot as plt
+
+
+ class SimpleGroupedColorFunc(object):
+     """Create a color function object which assigns EXACT colors
+     to certain words based on the color to words mapping
+
+     Parameters
+     ----------
+     color_to_words : dict(str -> list(str))
+         A dictionary that maps a color to the list of words.
+
+     default_color : str
+         Color that will be assigned to a word that's not a member
+         of any value from color_to_words.
+     """
+
+     def __init__(self, color_to_words, default_color):
+         self.word_to_color = {
+             word: color
+             for (color, words) in color_to_words.items()
+             for word in words
+         }
+
+         self.default_color = default_color
+
+     def __call__(self, word, **kwargs):
+         return self.word_to_color.get(word, self.default_color)
+
+
+ class SegmentedWordCloud:
+     def __init__(self, freq_dic, less_group, greater_group):
+         colors = {
+             'less': '#529ef3',
+             'salient': '#d35400',
+             'greater': '#5d6d7e',
+         }
+
+         color_to_words = {
+             colors['greater']: greater_group,
+             colors['less']: less_group,
+         }
+
+         grouped_color_func = SimpleGroupedColorFunc(
+             color_to_words=color_to_words,
+             default_color=colors['salient']
+         )
+
+         self.wc = WordCloud(
+             background_color="white",
+             width=900,
+             height=300,
+             random_state=None).generate_from_frequencies(freq_dic)
+
+         self.wc.recolor(color_func=grouped_color_func)
+
+     def plot(self, figsize):
+         fig, ax = plt.subplots(figsize=figsize)
+         ax.imshow(self.wc, interpolation="bilinear")
+         ax.axis("off")
+         fig.tight_layout()
+         return fig
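A hedged usage sketch of the segmented cloud, mirroring how `Word2Context.genWordCloudPlot()` calls it; the frequencies and group membership below are illustrative only. Words in `less_group` / `greater_group` take the blue / grey colors defined above, and any other word keeps the salient (orange) default.

```python
from modules.module_segmentedWordCloud import SegmentedWordCloud

freq_dic = {"casa": 120, "casona": 40, "caza": 90}   # made-up frequencies
wc = SegmentedWordCloud(freq_dic, less_group=["casona"], greater_group=["caza"])
fig = wc.plot(figsize=(9, 3))   # matplotlib Figure, as returned to gr.Plot
```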
modules/module_vocabulary.py ADDED
@@ -0,0 +1,85 @@
+ from memory_profiler import profile
+ import pandas as pd
+
+ class Vocabulary:
+     @profile
+     def __init__(self, subset_name):
+         # Dataset info
+         self.subset_name = subset_name
+         self.ds_path = f"data/{subset_name}_vocab_v6.zip"
+
+         # Pandas dataset
+         self.df_vocab = None
+
+         # Minimal list of (percentile, freq) tuples used to plot the word distribution graph
+         self.histogram = None
+
+         # Load vocabulary dataset
+         self.__load()
+
+     def __contains__(self, word):
+         return word in self.df_vocab['word'].to_list()
+
+     def __load(self):
+         print(f"Preparing {self.subset_name} vocabulary...")
+
+         # --- Download vocab dataset ---
+         self.df_vocab = pd.read_json(self.ds_path)
+
+         # --- Create min histogram to plot the word distribution graph ---
+         x_values = self.df_vocab['percentile'].to_list()
+         y_values = self.df_vocab['freq'].to_list()
+
+         # Delete duplicated tuples
+         uniques_tups_list = set(list(zip(x_values, y_values)))
+         # Keep only tuples with distinct first elements
+         uniques_tups_list = dict(uniques_tups_list)
+
+         self.histogram = sorted(
+             uniques_tups_list.items(),
+             key=lambda tup: tup[0],
+             reverse=True
+         )
+
+     def __getValue(self, word, feature):
+         word_id, value = None, None
+
+         if word in self:
+             word_id = self.df_vocab['word'].to_list().index(word)
+
+         if word_id is not None:
+             value = self.df_vocab[feature].to_list()[word_id]
+
+         return value
+
+     def getFreq(self, word):
+         return self.__getValue(word, 'freq')
+
+     def getPercentile(self, word):
+         return self.__getValue(word, 'percentile')
+
+     def getSplits(self, word):
+         return self.__getValue(word, 'splits')
+
+     def getSubsets(self, word):
+         return self.__getValue(word, 'in_subset')
+
+     def distribution(self):
+         x_values, y_values = zip(*self.histogram)
+         return x_values, y_values
+
+     def getWordNeighbors(self, word, n_neighbors=20):
+         word_id = self.df_vocab['word'].to_list().index(word)
+         words = self.df_vocab['word'].to_list()
+         freqs = self.df_vocab['freq'].to_list()
+         l_sorted = list(zip(words, freqs))
+
+         g = l_sorted[max(0, word_id-n_neighbors):word_id]   # vocabulary positions before the word
+         e = l_sorted[word_id]                               # the word itself
+         l = l_sorted[word_id+1:word_id+n_neighbors]         # vocabulary positions after the word
+
+         dic = dict(g + [e] + l)
+         l = [x[0] for x in l]
+         g = [x[0] for x in g]
+
+         return dic, l, g
+
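The class above only tells us the vocabulary's shape indirectly, through the columns it accesses (`word`, `freq`, `percentile`, `splits`, `in_subset`). A hedged two-row sketch of that shape, with illustrative values rather than real entries from `data/full_vocab_v6.zip`:

```python
import pandas as pd

# Hypothetical frame in the layout Vocabulary.__load() reads; values are examples only.
example_vocab = pd.DataFrame({
    "word":       ["casa", "casona"],
    "freq":       [12345, 678],                                        # total occurrences
    "percentile": [99.2, 87.5],                                        # x-axis of the distribution plot
    "splits":     [["DGT_0", "Europarl_3"], ["OpenSubtitles2018_1"]],  # dataset splits containing the word
    "in_subset":  [{"DGT": 9000, "Europarl": 3345},                    # per-subset frequencies
                   {"OpenSubtitles2018": 678}],
})
```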
modules/module_word2Context.py ADDED
@@ -0,0 +1,199 @@
+ from datasets import load_dataset, interleave_datasets
+ from modules.module_segmentedWordCloud import SegmentedWordCloud
+ from modules.module_customSubsetsLabel import CustomSubsetsLabel
+
+ from random import sample as random_sample
+ import re
+
+ import matplotlib as mpl
+ mpl.use('Agg')
+ import matplotlib.pyplot as plt
+
+
+ class Word2Context:
+     def __init__(self, context_ds_name, vocabulary):
+         self.context_ds_name = context_ds_name
+
+         # Vocabulary class
+         self.vocab = vocabulary
+
+         # Custom Label component
+         self.Label = CustomSubsetsLabel()
+
+     def errorChecking(self, word):
+         out_msj = ""
+
+         if not word:
+             out_msj = "Error: Primero debe ingresar una palabra!"
+         else:
+             if word not in self.vocab:
+                 out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
+
+         return out_msj
+
+     def genWebLink(self, text):
+         text = text.replace("\"", "'")
+         text = text.replace("<u><b>", "")
+         text = text.replace("</b></u>", "")
+         url = "https://www.google.com.tr/search?q={}".format(text)
+         return '<a href="{}" rel="noopener noreferrer" target="_blank"><center>🌐🔍</center></a>'.format(url)
+
+     def genWordCloudPlot(self, word, figsize=(9,3)):
+         freq_dic, l_group, g_group = self.vocab.getWordNeighbors(word, n_neighbors=10)
+         wc = SegmentedWordCloud(freq_dic, l_group, g_group)
+         return wc.plot(figsize)
+
+     def genDistributionPlot(self, word, figsize=(6,1)):
+         x_values, y_values = self.vocab.distribution()
+         w_percentile = self.vocab.getPercentile(word)
+         w_freq = self.vocab.getFreq(word)
+
+         fig, ax = plt.subplots(figsize=figsize)
+         ax.plot(x_values, y_values, color='green')
+         ax.fill_between(x_values, y_values, color='lightgreen',)
+
+         # -- Uncomment if wordcloud is enabled in the application interface --
+         # ax.axvline(x=max(0,w_percentile-.01),
+         #     color='blue',
+         #     linewidth=7,
+         #     alpha=.2,
+         #     linestyle='-'
+         # )
+         # ax.axvline(x=min(100,w_percentile+.01),
+         #     color='black',
+         #     linewidth=7,
+         #     alpha=.2,
+         #     linestyle='-'
+         # )
+         ax.axvline(x=w_percentile,
+             color='#d35400',
+             linewidth=2,
+             linestyle='--',
+             label=f'{w_freq}\n(frecuencia total)'
+         )
+
+         ax.axis('off')
+         plt.legend(loc='upper left', prop={'size': 7})
+         return fig
+
+     def findSplits(self, word, subsets_list):
+         w_splits = self.vocab.getSplits(word)
+
+         splits_list = []
+         for subset in subsets_list:
+             current_split_list = []
+             for s in w_splits:
+                 if (subset == s.split("_")[0]):
+                     current_split_list.append(s)
+
+             if current_split_list:
+                 splits_list.append(current_split_list)
+
+         splits_list = [random_sample(s_list, 1)[0] for s_list in splits_list]
+
+         ds_list = [
+             load_dataset(path=self.context_ds_name, name=split, streaming=True, split='all')
+             for split in splits_list
+         ]
+
+         datasets = ds_list[0]
+         if len(ds_list) > 1:
+             datasets = interleave_datasets(ds_list, probabilities=None)
+
+         return datasets
+
+     def findContexts(self, sample, word):
+         sample = sample['text'].strip()
+         context = ""
+         m = re.search(r'\b{}\b'.format(word), sample)
+         if m:
+             init = m.span()[0]
+             end = init + len(word)
+             context = sample[:init] + "<u><b>" + word + "</b></u>" + sample[end:]
+         return {'context': context}
+
+     def getSubsetsInfo(self, word):
+         total_freq = self.vocab.getFreq(word)
+         subsets_name_list = list(self.vocab.getSubsets(word).keys())
+         subsets_freq_list = list(self.vocab.getSubsets(word).values())
+
+         # Create the subset frequency dict for the subsets_freq component
+         subsets_info = {
+             s_name + f" ({s_freq})": s_freq/total_freq
+             for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
+         }
+
+         subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
+         subsets_info = self.Label.compute(subsets_origin_info)
+         return subsets_info, subsets_origin_info
+
+     def getContexts(self, word, n_context, ds):
+         ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
+         only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
+         shuffle_contexts = only_contexts.shuffle(buffer_size=10)
+
+         list_of_dict = list(shuffle_contexts.take(n_context))
+         list_of_contexts = [(i, dic['context'], dic['subset']) for i, dic in enumerate(list_of_dict)]
+
+         return list_of_contexts
+
+     # TODO: The methods below can be removed, or kept as wrappers around the methods above
+     '''
+     def getWordInfo(self, word):
+         errors = ""
+         contexts = pd.DataFrame([], columns=[''])
+         subsets_info = ""
+         distribution_plot = None
+         word_cloud_plot = None
+         subsets_choice = gr.CheckboxGroup.update(choices=[])
+
+         errors = self.errorChecking(word)
+         if errors:
+             return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+         total_freq = self.vocab.getFreq(word)
+         subsets_name_list = list(self.vocab.getSubsets(word).keys())
+         subsets_freq_list = list(self.vocab.getSubsets(word).values())
+
+         # Create the subset frequency dict for the subsets_freq component
+         subsets_info = {
+             s_name + f" ({s_freq})": s_freq/total_freq
+             for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
+         }
+         subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
+         subsets_info = self.Label.compute(subsets_origin_info)
+
+         # Create the sorted list for the subsets_choice component
+         clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
+         subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
+
+         # Get the word distribution and wordcloud graphs
+         distribution_plot = self.genDistributionPlot(word)
+         word_cloud_plot = self.genWordCloudPlot(word)
+
+         return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
+
+     def getWordContext(self, word, n_context, subset_choice):
+         n_context = int(n_context)
+         errors = ""
+
+         if len(subset_choice) > 0:
+             ds = self.findSplits(word, subset_choice)
+
+         else:
+             errors = "Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!"
+             errors = "<center><h3>" + errors + "</h3></center>"
+             return errors, pd.DataFrame([], columns=[''])
+
+         ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
+         only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
+         shuffle_contexts = only_contexts.shuffle(buffer_size=10)
+
+         list_of_dict = list(shuffle_contexts.take(n_context))
+         list_of_contexts = [(i, dic['context'], dic['subset']) for i, dic in enumerate(list_of_dict)]
+
+         contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
+         contexts["buscar"] = contexts.contexto.apply(lambda text: self.genWebLink(text))
+
+         return errors, contexts
+     '''
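A hedged end-to-end sketch of the context-search path that the connector drives: pick a word, sample one split per chosen subset, stream it, and extract highlighted contexts. The word `"casa"` and the `DGT` subset are illustrative values, and the sketch assumes the vocabulary from app.py is available locally.

```python
from modules.module_vocabulary import Vocabulary
from modules.module_word2Context import Word2Context

vocabulary = Vocabulary(subset_name="full")
w2c = Word2Context("nanom/splittedspanish3bwc", vocabulary)

word = "casa"                                              # must be present in the vocabulary
subsets_html, subsets_origin = w2c.getSubsetsInfo(word)    # per-subset frequency summary (HTML + dict)
ds = w2c.findSplits(word, ["DGT"])                         # streaming dataset over the chosen subsets
rows = w2c.getContexts(word, n_context=5, ds=ds)           # [(idx, highlighted context, subset), ...]
```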
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ regex
+ torch
+ transformers
+ wordcloud
+ matplotlib
+ numpy
+ uuid
+ python-dotenv
+ memory_profiler
tool_info.py ADDED
@@ -0,0 +1,23 @@
+ TOOL_INFO = """
+ > ### A tool to overcome technical barriers for bias assessment in human language technologies
+
+ * [Read Full Paper](https://arxiv.org/abs/2207.06591)
+
+ > ### Licensing Information
+ * [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/edia_datos_es/resolve/main/LICENSE)
+
+ > ### Citation Information
+ ```c
+ @misc{https://doi.org/10.48550/arxiv.2207.06591,
+     doi = {10.48550/ARXIV.2207.06591},
+     url = {https://arxiv.org/abs/2207.06591},
+     author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
+     keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI),
+     FOS: Computer and information sciences, FOS: Computer and information sciences},
+     title = {A tool to overcome technical barriers for bias assessment in human language technologies},
+     publisher = {arXiv},
+     year = {2022},
+     copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
+ }
+ ```
+ """