Upload datos app
- .gitignore +3 -0
- LICENSE +21 -0
- app.py +28 -0
- data/full_vocab_v6.zip +3 -0
- interfaces/.gitignore +1 -0
- interfaces/interface_datos.py +100 -0
- language/spanish.json +16 -0
- modules/.gitignore +1 -0
- modules/module_connection.py +73 -0
- modules/module_customSubsetsLabel.py +89 -0
- modules/module_logsManager.py +175 -0
- modules/module_segmentedWordCloud.py +64 -0
- modules/module_vocabulary.py +85 -0
- modules/module_word2Context.py +199 -0
- requirements.txt +9 -0
- tool_info.py +23 -0
.gitignore
ADDED
@@ -0,0 +1,3 @@
__pycache__/
.env
bias_tool_logs/
LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2022 Fundación Vía Libre

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
app.py
ADDED
@@ -0,0 +1,28 @@
# --- Imports modules ---
from modules.module_vocabulary import Vocabulary

# --- Imports interfaces ---
from interfaces.interface_datos import interface as interface_datos

# --- Tool config ---
AVAILABLE_LOGS = True           # [True | False]
LANGUAGE = "spanish"            # [spanish]
VOCABULARY_SUBSET = "full"      # [full]
# ToDo: Change context dataset owner from nanom to vialibre
CONTEXTS_DATASET = "nanom/splittedspanish3bwc"

# --- Init classes ---
vocabulary = Vocabulary(
    subset_name=VOCABULARY_SUBSET
)

# --- Main App ---
iface = interface_datos(
    vocabulary=vocabulary,
    contexts=CONTEXTS_DATASET,
    available_logs=AVAILABLE_LOGS,
    lang=LANGUAGE
)

iface.queue(concurrency_count=8)
iface.launch(debug=False)
data/full_vocab_v6.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:478fa3e953fbc65746681b1b9770e726f0cd28a0a9992735c00001a09d04b42a
size 205538236
interfaces/.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__/
interfaces/interface_datos.py
ADDED
@@ -0,0 +1,100 @@
from modules.module_logsManager import HuggingFaceDatasetSaver
from modules.module_connection import Word2ContextExplorerConnector
from tool_info import TOOL_INFO
import gradio as gr
import pandas as pd

def interface(vocabulary, contexts, available_logs, lang="spanish"):

    # --- Init logs ---
    log_callback = HuggingFaceDatasetSaver(
        available_logs=available_logs
    )

    # --- Init Class ---
    connector = Word2ContextExplorerConnector(vocabulary=vocabulary, context=contexts)
    labels = pd.read_json(f"language/{lang}.json")["DataExplorer_interface"]

    # --- Interface ---
    iface = gr.Blocks(css=".container { max-width: 90%; margin: auto;}")

    with iface:
        with gr.Row():
            with gr.Column():
                with gr.Group():
                    gr.Markdown(labels["step1"])
                    with gr.Row(): input_word = gr.Textbox(label=labels["inputWord"]["title"],
                                                           show_label=False,
                                                           placeholder=labels["inputWord"]["placeholder"])
                    with gr.Row(): btn_get_w_info = gr.Button(labels["wordInfoButton"])

                with gr.Group():
                    gr.Markdown(labels["step2"])
                    n_context = gr.Slider(label="",
                                          step=1, minimum=1, maximum=30, value=5,
                                          visible=True, interactive=True)
                with gr.Group():
                    gr.Markdown(labels["step3"])
                    subsets_choice = gr.CheckboxGroup(label="",
                                                      interactive=True,
                                                      visible=True)
                    with gr.Row(): btn_get_contexts = gr.Button(labels["wordContextButton"], visible=True)

                with gr.Row(): out_msj = gr.Markdown(label="", visible=True)

            with gr.Column():
                with gr.Group():
                    gr.Markdown(labels["wordDistributionTitle"])
                    dist_plot = gr.Plot(label="", show_label=False)
                    # Set visibility to "true" if you want to see cloud of related words by frequency
                    wc_plot = gr.Plot(label="", show_label=False, visible=False)

                with gr.Group():
                    gr.Markdown(labels["frequencyPerSetTitle"])
                    subsets_freq = gr.HTML(label="")

        with gr.Row():
            with gr.Group():
                with gr.Row(): gr.Markdown(labels["contextList"])
                with gr.Row(): out_context = gr.Dataframe(label="",
                                                          interactive=False,
                                                          value=pd.DataFrame([], columns=['']),
                                                          wrap=True,
                                                          datatype=['str','markdown','str','markdown'])

        with gr.Group():
            gr.Markdown(TOOL_INFO)

        btn_get_w_info.click(
            fn=connector.get_word_info,
            inputs=[input_word],
            outputs=[out_msj,
                     out_context,
                     subsets_freq,
                     dist_plot,
                     wc_plot,
                     subsets_choice]
        )

        btn_get_contexts.click(
            fn=connector.get_word_context,
            inputs=[input_word, n_context, subsets_choice],
            outputs=[out_msj, out_context]
        )

        # --- Logs ---
        save_field = [input_word, subsets_choice]
        log_callback.setup(components=save_field, flagging_dir="edia_datos_es")

        btn_get_contexts.click(
            fn=lambda *args: log_callback.flag(
                flag_data=args,
                flag_option="datos",
                username="vialibre"
            ),
            inputs=save_field,
            outputs=None,
            preprocess=False
        )

    return iface
language/spanish.json
ADDED
@@ -0,0 +1,16 @@
{
    "DataExplorer_interface": {
        "step1": "1. Ingrese una palabra de interés",
        "step2": "2. Seleccione cantidad máxima de contextos a recuperar",
        "step3": "3. Seleccione conjuntos de interés",
        "inputWord": {
            "title": "",
            "placeholder": "Ingresar aquí la palabra ..."
        },
        "wordInfoButton": "Obtener información de palabra",
        "wordContextButton": "Buscar contextos",
        "wordDistributionTitle": "Distribución de palabra en vocabulario",
        "frequencyPerSetTitle": "Frecuencias de aparición por conjunto",
        "contextList": "Lista de contextos"
    },
}
modules/.gitignore
ADDED
@@ -0,0 +1 @@
__pycache__/
modules/module_connection.py
ADDED
@@ -0,0 +1,73 @@
import pandas as pd
import gradio as gr
from abc import ABC
from modules.module_word2Context import Word2Context

class Connector(ABC):
    def parse_word(self, word : str):
        return word.lower().strip()

    def parse_words(self, array_in_string : str):
        words = array_in_string.strip()
        if not words:
            return []
        words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
        return words

    def process_error(self, err: str):
        if err is None:
            return
        return "<center><h3>" + err + "</h3></center>"


class Word2ContextExplorerConnector(Connector):
    def __init__(self, **kwargs):
        vocabulary = kwargs.get('vocabulary', None)
        context = kwargs.get('context', None)

        if vocabulary is None and context is None:
            raise KeyError
        self.word2context_explorer = Word2Context(context, vocabulary)

    def get_word_info(self, word):
        err = ""
        contexts = pd.DataFrame([], columns=[''])
        subsets_info = ""
        distribution_plot = None
        word_cloud_plot = None
        subsets_choice = gr.CheckboxGroup.update(choices=[])

        err = self.word2context_explorer.errorChecking(word)
        if err:
            return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice

        word = self.parse_word(word)

        subsets_info, subsets_origin_info = self.word2context_explorer.getSubsetsInfo(word)

        clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
        subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)

        distribution_plot = self.word2context_explorer.genDistributionPlot(word)
        word_cloud_plot = self.word2context_explorer.genWordCloudPlot(word)

        return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice

    def get_word_context(self, word, n_context, subset_choice):
        word = self.parse_word(word)
        n_context = int(n_context)
        err = ""
        contexts = pd.DataFrame([], columns=[''])

        if len(subset_choice) > 0:
            ds = self.word2context_explorer.findSplits(word, subset_choice)
        else:
            err = self.process_error("Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!")
            return err, contexts

        list_of_contexts = self.word2context_explorer.getContexts(word, n_context, ds)

        contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
        contexts["buscar"] = contexts.contexto.apply(lambda text: self.word2context_explorer.genWebLink(text))

        return self.process_error(err), contexts
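As an aside, a minimal sketch of the shared `Connector` helpers; the base class declares no abstract methods, so it can be instantiated directly for a quick check (the sample input string is made up):

```python
# Minimal sketch (hypothetical input) of the Connector parsing and error-formatting helpers.
from modules.module_connection import Connector

conn = Connector()
print(conn.parse_words(" Casa, Perro ,,gato "))       # -> ['casa', 'perro', 'gato']
print(conn.process_error("Error: palabra inválida"))  # -> '<center><h3>Error: palabra inválida</h3></center>'
```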
modules/module_customSubsetsLabel.py
ADDED
@@ -0,0 +1,89 @@
class CustomSubsetsLabel:
    def __init__(self):
        self.html_head = """
            <html>
            <head>
            <meta charset="utf-8">
            <meta name="viewport" content="width=device-width, initial-scale=1">
            <style>
                progress {
                    -webkit-appearance: none;
                }
                progress::-webkit-progress-bar {
                    background-color: #666;
                    border-radius: 7px;
                }
                progress {
                    width:100%;
                    height:4px;
                    border-radius: 1px;
                }
                #myturn {
                    display: block;
                    position: relative;
                    margin: auto;
                    width: 90%;
                    padding: 2px;
                }
            </style>
            </head>
            <body>
        """

        self.html_footer ="</body></html>"

        self.subset_links = {
            'allwikis': "https://github.com/josecannete/wikiextractorforBERT",
            'DGT': "http://opus.nlpl.eu/DGT.php",
            'DOGC': "http://opus.nlpl.eu/DOGC.php",
            'ECB': "http://opus.nlpl.eu/ECB.php",
            'EMEA': "http://opus.nlpl.eu/EMEA.php",
            'EUBookShop': "http://opus.nlpl.eu/EUbookshop.php",
            'Europarl': "http://opus.nlpl.eu/Europarl.php",
            'GlobalVoices': "http://opus.nlpl.eu/GlobalVoices.php",
            'JRC': "http://opus.nlpl.eu/JRC-Acquis.php",
            'multiUN': "http://opus.nlpl.eu/MultiUN.php",
            'NewsCommentary11': "http://opus.nlpl.eu/News-Commentary-v11.php",
            'OpenSubtitles2018': "http://opus.nlpl.eu/OpenSubtitles-v2018.php",
            'ParaCrawl': "http://opus.nlpl.eu/ParaCrawl.php",
            'TED': "http://opus.nlpl.eu/TED2013.php",
            'UN': "http://opus.nlpl.eu/UN.php",
        }

    def __progressbar(self, percentage, subset, freq, size=15):
        html = f"""
            <div id="myturn">
                <progress value="{int(percentage)}" max="100"></progress>
                <p style="text-align:left; font-size:{size}px; padding:0px;">
                    <a href="{self.subset_links[subset]}" target="_blank">
                        <strong>{subset}</strong> <span style="font-size:{size-2}px">(Frecuencia: {freq})</span>
                    </a>
                    <span style="float:right;">
                        <strong>{percentage}%</strong>
                    </span>
                </p>
            </div>
        """
        return html

    def __render(self, subsets, freqs, percentages):
        html = ""
        for subset, freq, perc in zip(subsets, freqs, percentages):
            html += self.__progressbar(
                percentage=perc,
                subset=subset,
                freq=freq
            )

        return self.html_head + html + self.html_footer

    def compute(self, subsets_dic):
        subsets_dic_info = {
            k.split()[0]: {'freq': int(k.split()[1][1:-1]), 'perc': round(v*100, 2)}
            for k, v in subsets_dic.items()
        }

        subsets = list(subsets_dic_info.keys())
        freqs = [d['freq'] for d in subsets_dic_info.values()]
        percentages = [d['perc'] for d in subsets_dic_info.values()]
        return self.__render(subsets, freqs, percentages)
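For reference, a minimal sketch of how `CustomSubsetsLabel.compute` is meant to be called: the keys mirror the `"name (freq)"` format built by `Word2Context.getSubsetsInfo`, and the values are each subset's share of the word's total frequency (the numbers below are made up):

```python
# Minimal sketch (made-up frequencies) of the HTML progress-bar label renderer.
from modules.module_customSubsetsLabel import CustomSubsetsLabel

label = CustomSubsetsLabel()
html = label.compute({"DGT (120)": 0.6, "ECB (80)": 0.4})
print(html[:120])  # an <html> fragment with one linked progress bar per subset
```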
modules/module_logsManager.py
ADDED
@@ -0,0 +1,175 @@
from distutils.log import debug
from gradio.flagging import FlaggingCallback, _get_dataset_features_info
from gradio.components import IOComponent
from gradio import utils
from typing import Any, List, Optional
from dotenv import load_dotenv
from datetime import datetime
import csv, os, pytz


# --- Load environments vars ---
load_dotenv()


# --- Classes declaration ---
class DateLogs:
    def __init__(self, zone="America/Argentina/Cordoba"):
        self.time_zone = pytz.timezone(zone)

    def full(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%H:%M:%S %d-%m-%Y")

    def day(self):
        now = datetime.now(self.time_zone)
        return now.strftime("%d-%m-%Y")

class HuggingFaceDatasetSaver(FlaggingCallback):
    """
    A callback that saves each flagged sample (both the input and output data)
    to a HuggingFace dataset.
    Example:
        import gradio as gr
        hf_writer = gr.HuggingFaceDatasetSaver(HF_API_TOKEN, "image-classification-mistakes")
        def image_classifier(inp):
            return {'cat': 0.3, 'dog': 0.7}
        demo = gr.Interface(fn=image_classifier, inputs="image", outputs="label",
                            allow_flagging="manual", flagging_callback=hf_writer)
    Guides: using_flagging
    """

    def __init__(
        self,
        hf_token: str = os.getenv('HF_TOKEN'),
        dataset_name: str = os.getenv('DS_LOGS_NAME'),
        organization: Optional[str] = os.getenv('ORG_NAME'),
        private: bool = True,
        available_logs: bool = False
    ):
        """
        Parameters:
            hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
            dataset_name: The name of the dataset to save the data to, e.g. "image-classifier-1"
            organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
            private: Whether the dataset should be private (defaults to False).
        """
        self.hf_token = hf_token
        self.dataset_name = dataset_name
        self.organization_name = organization
        self.dataset_private = private
        self.datetime = DateLogs()
        self.available_logs = available_logs

        if not available_logs:
            print("Push: logs DISABLED!...")


    def setup(
        self,
        components: List[IOComponent],
        flagging_dir: str
    ):
        """
        Params:
            flagging_dir (str): local directory where the dataset is cloned,
            updated, and pushed from.
        """
        if self.available_logs:

            try:
                import huggingface_hub
            except (ImportError, ModuleNotFoundError):
                raise ImportError(
                    "Package `huggingface_hub` not found is needed "
                    "for HuggingFaceDatasetSaver. Try 'pip install huggingface_hub'."
                )

            path_to_dataset_repo = huggingface_hub.create_repo(
                repo_id=os.path.join(self.organization_name, self.dataset_name),
                token=self.hf_token,
                private=self.dataset_private,
                repo_type="dataset",
                exist_ok=True,
            )

            self.path_to_dataset_repo = path_to_dataset_repo
            self.components = components
            self.flagging_dir = flagging_dir
            self.dataset_dir = self.dataset_name

            self.repo = huggingface_hub.Repository(
                local_dir=self.dataset_dir,
                clone_from=path_to_dataset_repo,
                use_auth_token=self.hf_token,
            )

            self.repo.git_pull(lfs=True)

            # Should filename be user-specified?
            # log_file_name = self.datetime.day()+"_"+self.flagging_dir+".csv"
            self.log_file = os.path.join(self.dataset_dir, self.flagging_dir+".csv")

    def flag(
        self,
        flag_data: List[Any],
        flag_option: Optional[str] = None,
        flag_index: Optional[int] = None,
        username: Optional[str] = None,
    ) -> int:

        if self.available_logs:
            self.repo.git_pull(lfs=True)

            is_new = not os.path.exists(self.log_file)

            with open(self.log_file, "a", newline="", encoding="utf-8") as csvfile:
                writer = csv.writer(csvfile)

                # File previews for certain input and output types
                infos, file_preview_types, headers = _get_dataset_features_info(
                    is_new, self.components
                )

                # Generate the headers and dataset_infos
                if is_new:
                    headers = [
                        component.label or f"component {idx}"
                        for idx, component in enumerate(self.components)
                    ] + [
                        "flag",
                        "username",
                        "timestamp",
                    ]
                    writer.writerow(utils.sanitize_list_for_csv(headers))

                # Generate the row corresponding to the flagged sample
                csv_data = []
                for component, sample in zip(self.components, flag_data):
                    save_dir = os.path.join(
                        self.dataset_dir,
                        utils.strip_invalid_filename_characters(component.label),
                    )
                    filepath = component.deserialize(sample, save_dir, None)
                    csv_data.append(filepath)
                    if isinstance(component, tuple(file_preview_types)):
                        csv_data.append(
                            "{}/resolve/main/{}".format(self.path_to_dataset_repo, filepath)
                        )

                csv_data.append(flag_option if flag_option is not None else "")
                csv_data.append(username if username is not None else "")
                csv_data.append(self.datetime.full())
                writer.writerow(utils.sanitize_list_for_csv(csv_data))


            with open(self.log_file, "r", encoding="utf-8") as csvfile:
                line_count = len([None for row in csv.reader(csvfile)]) - 1

            self.repo.push_to_hub(commit_message="Flagged sample #{}".format(line_count))

        else:
            line_count = 0
            print("Logs: Virtual push...")

        return line_count
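A minimal sketch of the logger in its "virtual push" mode (`available_logs=False`), which is the safe way to exercise it without an `HF_TOKEN`, `DS_LOGS_NAME` or `ORG_NAME` environment variable and without any dataset repo; the component label and sample value are made up:

```python
# Minimal sketch: with available_logs=False, setup() is a no-op and flag() only prints
# "Logs: Virtual push..." and returns 0, so no Hub access is attempted.
import gradio as gr
from modules.module_logsManager import HuggingFaceDatasetSaver

logger = HuggingFaceDatasetSaver(available_logs=False)
logger.setup(components=[gr.Textbox(label="palabra")], flagging_dir="edia_datos_es")
print(logger.flag(flag_data=["ejemplo"], flag_option="datos", username="vialibre"))  # -> 0
```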
modules/module_segmentedWordCloud.py
ADDED
@@ -0,0 +1,64 @@
from wordcloud import WordCloud
import matplotlib.pyplot as plt


class SimpleGroupedColorFunc(object):
    """Create a color function object which assigns EXACT colors
    to certain words based on the color to words mapping

    Parameters
    ----------
    color_to_words : dict(str -> list(str))
        A dictionary that maps a color to the list of words.

    default_color : str
        Color that will be assigned to a word that's not a member
        of any value from color_to_words.
    """

    def __init__(self, color_to_words, default_color):
        self.word_to_color = {
            word: color
            for (color, words) in color_to_words.items()
            for word in words
        }

        self.default_color = default_color

    def __call__(self, word, **kwargs):
        return self.word_to_color.get(word, self.default_color)


class SegmentedWordCloud:
    def __init__(self, freq_dic, less_group, greater_group):
        colors = {
            'less': '#529ef3',
            'salient': '#d35400',
            'greater': '#5d6d7e',
        }

        color_to_words = {
            colors['greater']: greater_group,
            colors['less']: less_group,
        }


        grouped_color_func = SimpleGroupedColorFunc(
            color_to_words=color_to_words,
            default_color=colors['salient']
        )

        self.wc = WordCloud(
            background_color="white",
            width=900,
            height=300,
            random_state=None).generate_from_frequencies(freq_dic)

        self.wc.recolor(color_func=grouped_color_func)

    def plot(self, figsize):
        fig, ax = plt.subplots(figsize=figsize)
        ax.imshow(self.wc, interpolation="bilinear")
        ax.axis("off")
        fig.tight_layout()
        return fig
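A minimal sketch of rendering a segmented cloud with toy frequencies: words listed in neither group fall back to the salient color (`#d35400`), which is how the queried word itself stands out in the app.

```python
# Minimal sketch (toy frequencies, hypothetical output path) of the segmented word cloud.
from modules.module_segmentedWordCloud import SegmentedWordCloud

freqs = {"casa": 120, "perro": 80, "gato": 60}
cloud = SegmentedWordCloud(freq_dic=freqs, less_group=["gato"], greater_group=["casa"])
fig = cloud.plot(figsize=(9, 3))
fig.savefig("wordcloud_demo.png")
```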
modules/module_vocabulary.py
ADDED
@@ -0,0 +1,85 @@
from memory_profiler import profile
import pandas as pd

class Vocabulary:
    @profile
    def __init__(self, subset_name):
        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"

        # Pandas dataset
        self.df_vocab = None

        # Minimal list with (percentile,freq) tuples to be able to plot the word distribution graph
        self.histogram = None

        # Load vocabulary dataset
        self.__load()

    def __contains__(self, word):
        return word in self.df_vocab['word'].to_list()

    def __load(self):
        print(f"Preparing {self.subset_name} vocabulary...")

        # --- Download vocab dataset ---
        self.df_vocab = pd.read_json(self.ds_path)

        # --- Create min histogram to plot the word distribution graph ---
        x_values = self.df_vocab['percentile'].to_list()
        y_values = self.df_vocab['freq'].to_list()

        # Delete duplicated tups
        uniques_tups_list = set(list(zip(x_values, y_values)))
        # Leave only tuples with different first element
        uniques_tups_list = dict(uniques_tups_list)

        self.histogram = sorted(
            uniques_tups_list.items(),
            key=lambda tup: tup[0],
            reverse=True
        )

    def __getValue(self, word, feature):
        word_id, value = None, None

        if word in self:
            word_id = self.df_vocab['word'].to_list().index(word)

        if word_id != None:
            value = self.df_vocab[feature].to_list()[word_id]

        return value

    def getFreq(self, word):
        return self.__getValue(word, 'freq')

    def getPercentile(self, word):
        return self.__getValue(word, 'percentile')

    def getSplits(self, word):
        return self.__getValue(word, 'splits')

    def getSubsets(self, word):
        return self.__getValue(word, 'in_subset')

    def distribution(self):
        x_values, y_values = zip(*self.histogram)
        return x_values, y_values

    def getWordNeighbors(self, word, n_neighbors=20):
        word_id = self.df_vocab['word'].to_list().index(word)
        words = self.df_vocab['word'].to_list()
        freqs = self.df_vocab['freq'].to_list()
        l_sorted = list(zip(words, freqs))

        g = l_sorted[max(0, word_id-n_neighbors):word_id]   # less than
        e = l_sorted[word_id]                                # equal to
        l = l_sorted[word_id+1:word_id+n_neighbors]          # greater than

        dic = dict(g+[e]+l)
        l = [x[0] for x in l]
        g = [x[0] for x in g]

        return dic, l, g
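A minimal sketch of querying the vocabulary directly (it assumes the `data/full_vocab_v6.zip` file shipped in this commit is present locally; "palabra" is a hypothetical query word):

```python
# Minimal sketch (assumes data/full_vocab_v6.zip locally; "palabra" is hypothetical).
from modules.module_vocabulary import Vocabulary

vocab = Vocabulary(subset_name="full")
word = "palabra"
if word in vocab:  # __contains__ does a linear scan over the 'word' column
    print(vocab.getFreq(word), vocab.getPercentile(word), vocab.getSubsets(word))
```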
modules/module_word2Context.py
ADDED
@@ -0,0 +1,199 @@
from datasets import load_dataset, interleave_datasets
from modules.module_segmentedWordCloud import SegmentedWordCloud
from modules.module_customSubsetsLabel import CustomSubsetsLabel

from random import sample as random_sample
import re

import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


class Word2Context:
    def __init__(self, context_ds_name, vocabulary):
        self.context_ds_name = context_ds_name

        # Vocabulary class
        self.vocab = vocabulary

        # Custom Label component
        self.Label = CustomSubsetsLabel()

    def errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
            if word not in self.vocab:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        return out_msj

    def genWebLink(self, text):
        text = text.replace("\"", "'")
        text = text.replace("<u><b>", "")
        text = text.replace("</b></u>", "")
        url = "https://www.google.com.tr/search?q={}".format(text)
        return '<a href="{}" rel="noopener noreferrer" target="_blank"><center>🌐🔍</center></a>'.format(url)

    def genWordCloudPlot(self, word, figsize=(9,3)):
        freq_dic, l_group, g_group = self.vocab.getWordNeighbors(word, n_neighbors=10)
        wc = SegmentedWordCloud(freq_dic, l_group, g_group)
        return wc.plot(figsize)

    def genDistributionPlot(self, word, figsize=(6,1)):
        x_values, y_values = self.vocab.distribution()
        w_percentile = self.vocab.getPercentile(word)
        w_freq = self.vocab.getFreq(word)

        fig, ax = plt.subplots(figsize=figsize)
        ax.plot(x_values, y_values, color='green')
        ax.fill_between(x_values, y_values, color='lightgreen',)

        # -- Uncomment if wordcloud is enabled in the application interface --
        # ax.axvline(x=max(0,w_percentile-.01),
        #            color='blue',
        #            linewidth=7,
        #            alpha=.2,
        #            linestyle='-'
        # )
        # ax.axvline(x=min(100,w_percentile+.01),
        #            color='black',
        #            linewidth=7,
        #            alpha=.2,
        #            linestyle='-'
        # )
        ax.axvline(x=w_percentile,
                   color='#d35400',
                   linewidth=2,
                   linestyle='--',
                   label=f'{w_freq}\n(frecuencia total)'
        )

        ax.axis('off')
        plt.legend(loc='upper left', prop={'size': 7})
        return fig

    def findSplits(self, word, subsets_list):
        w_splits = self.vocab.getSplits(word)

        splits_list = []
        for subset in subsets_list:
            current_split_list = []
            for s in w_splits:
                if (subset == s.split("_")[0]):
                    current_split_list.append(s)

            if current_split_list:
                splits_list.append(current_split_list)

        splits_list = [random_sample(s_list, 1)[0] for s_list in splits_list]

        ds_list = [
            load_dataset(path=self.context_ds_name, name=split, streaming=True, split='all')
            for split in splits_list
        ]

        datasets = ds_list[0]
        if len(ds_list) > 1:
            datasets = interleave_datasets(ds_list, probabilities=None)

        return datasets

    def findContexts(self, sample, word):
        sample = sample['text'].strip()
        context = ""
        m = re.search(r'\b{}\b'.format(word), sample)
        if m:
            init = m.span()[0]
            end = init+len(word)
            context = sample[:init]+"<u><b>"+word+"</b></u>"+sample[end:]
        return {'context': context}

    def getSubsetsInfo(self, word):
        total_freq = self.vocab.getFreq(word)
        subsets_name_list = list(self.vocab.getSubsets(word).keys())
        subsets_freq_list = list(self.vocab.getSubsets(word).values())

        # Create subset frequency dict to subset_freq component
        subsets_info = {
            s_name + f" ({s_freq})": s_freq/total_freq
            for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
        }

        subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
        subsets_info = self.Label.compute(subsets_origin_info)
        return subsets_info, subsets_origin_info

    def getContexts(self, word, n_context, ds):
        ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
        only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
        shuffle_contexts = only_contexts.shuffle(buffer_size=10)

        list_of_dict = list(shuffle_contexts.take(n_context))
        list_of_contexts = [(i, dic['context'], dic['subset']) for i, dic in enumerate(list_of_dict)]

        return list_of_contexts

    # TODO: The next methods can be removed, or keep them as a wrapper method of several ones
    '''
    def getWordInfo(self, word):
        errors = ""
        contexts = pd.DataFrame([],columns=[''])
        subsets_info = ""
        distribution_plot = None
        word_cloud_plot = None
        subsets_choice = gr.CheckboxGroup.update(choices=[])

        errors = self.errorChecking(word)
        if errors:
            return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice

        total_freq = self.vocab.getFreq(word)
        subsets_name_list = list(self.vocab.getSubsets(word).keys())
        subsets_freq_list = list(self.vocab.getSubsets(word).values())

        # Create subset frequency dict to subset_freq component
        subsets_info = {
            s_name + f" ({s_freq})": s_freq/total_freq
            for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
        }
        subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
        subsets_info = self.Label.compute(subsets_origin_info)

        # Create sort list to subsets_choice component
        clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
        subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)

        # Get word distribution, and wordcloud graph
        distribution_plot = self.genDistributionPlot(word)
        word_cloud_plot = self.genWordCloudPlot(word)

        return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice

    def getWordContext(self, word, n_context, subset_choice):
        n_context = int(n_context)
        errors = ""

        if len(subset_choice) > 0:
            ds = self.findSplits(word, subset_choice)

        else:
            errors = "Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!"
            errors = "<center><h3>"+errors+"</h3></center>"
            return errors, pd.DataFrame([], columns=[''])

        ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
        only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
        shuffle_contexts = only_contexts.shuffle(buffer_size=10)

        list_of_dict = list(shuffle_contexts.take(n_context))
        list_of_contexts = [(i,dic['context'],dic['subset']) for i,dic in enumerate(list_of_dict)]

        contexts = pd.DataFrame(list_of_contexts, columns=['#','contexto','conjunto'])
        contexts["buscar"] = contexts.contexto.apply(lambda text: self.genWebLink(text))

        return errors, contexts
    '''
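Putting the pieces together, an end-to-end sketch of the explorer outside Gradio (it assumes the local vocabulary file and streaming access to the nanom/splittedspanish3bwc dataset; the query word "palabra" and the "DGT" subset choice are hypothetical):

```python
# End-to-end sketch: Vocabulary -> Word2ContextExplorerConnector -> contexts DataFrame.
from modules.module_vocabulary import Vocabulary
from modules.module_connection import Word2ContextExplorerConnector

vocab = Vocabulary(subset_name="full")
connector = Word2ContextExplorerConnector(vocabulary=vocab, context="nanom/splittedspanish3bwc")
msg, contexts = connector.get_word_context("palabra", n_context=5, subset_choice=["DGT"])
print(msg)       # HTML-wrapped message (empty wrapper on success)
print(contexts)  # columns: '#', 'contexto', 'conjunto', 'buscar'
```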
requirements.txt
ADDED
@@ -0,0 +1,9 @@
regex
torch
transformers
wordcloud
matplotlib
numpy
uuid
python-dotenv
memory_profiler
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/edia_datos_es/resolve/main/LICENSE)

> ### Citation Information
```c
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI),
              FOS: Computer and information sciences, FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""