Code correction and typing added
- app.py +9 -3
- interfaces/interface_datos.py +103 -38
- modules/module_connection.py +46 -13
- modules/module_customSubsetsLabel.py +35 -6
- modules/module_logsManager.py +26 -17
- modules/module_segmentedWordCloud.py +18 -4
- modules/module_vocabulary.py +50 -10
- modules/module_word2Context.py +71 -85
app.py
CHANGED
@@ -1,26 +1,32 @@
 # --- Imports modules ---
 from modules.module_vocabulary import Vocabulary
 
+
 # --- Imports interfaces ---
 from interfaces.interface_datos import interface as interface_datos
 
+
 # --- Tool config ---
-AVAILABLE_LOGS = True # [True | False]
-LANGUAGE = "spanish" # [spanish]
-VOCABULARY_SUBSET = "full" # [full]
 # ToDo: Change the owner of the context dataset from nanom to vialibre
 CONTEXTS_DATASET = "nanom/splittedspanish3bwc"
+AVAILABLE_WORDCLOUD = False # [True | False]
+AVAILABLE_LOGS = True # [True | False]
+LANGUAGE = "spanish" # [spanish]
+VOCABULARY_SUBSET = "full" # [full]
+
 
 # --- Init classes ---
 vocabulary = Vocabulary(
     subset_name=VOCABULARY_SUBSET
 )
 
+
 # --- Main App ---
 iface = interface_datos(
     vocabulary=vocabulary,
     contexts=CONTEXTS_DATASET,
     available_logs=AVAILABLE_LOGS,
+    available_wordcloud=AVAILABLE_WORDCLOUD,
     lang=LANGUAGE
 )
 
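The new AVAILABLE_WORDCLOUD flag follows the same pattern as the existing config constants: a module-level switch passed explicitly into the interface factory. A minimal sketch of that pattern, where build_interface is an illustrative stand-in for the repo's interface_datos:

    # Illustrative sketch only: `build_interface` stands in for the repo's
    # `interface_datos`, which receives the flag as `available_wordcloud`.
    AVAILABLE_WORDCLOUD = False  # [True | False]

    def build_interface(available_wordcloud: bool) -> dict:
        # In the real interface this drives gr.Plot(visible=available_wordcloud).
        return {"wordcloud_visible": available_wordcloud}

    assert build_interface(AVAILABLE_WORDCLOUD)["wordcloud_visible"] is False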
interfaces/interface_datos.py
CHANGED
@@ -4,7 +4,14 @@ from tool_info import TOOL_INFO
 import gradio as gr
 import pandas as pd
 
-def interface(vocabulary, contexts, available_logs, lang="spanish"):
+
+def interface(
+    vocabulary, # Vocabulary class instance
+    contexts: str,
+    available_logs: bool,
+    available_wordcloud: bool,
+    lang: str="spanish"
+) -> gr.Blocks:
 
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
@@ -12,58 +19,112 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
     )
 
     # --- Init Class ---
-    connector = Word2ContextExplorerConnector(
-
+    connector = Word2ContextExplorerConnector(
+        vocabulary=vocabulary,
+        context=contexts
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["DataExplorer_interface"]
 
     # --- Interface ---
-    iface = gr.Blocks(
+    iface = gr.Blocks(
+        css=".container { max-width: 90%; margin: auto;}"
+    )
 
     with iface:
         with gr.Row():
             with gr.Column():
                 with gr.Group():
-                    gr.Markdown(
-
-
-
-
+                    gr.Markdown(
+                        value=labels["step1"]
+                    )
+                    with gr.Row():
+                        input_word = gr.Textbox(
+                            label=labels["inputWord"]["title"],
+                            show_label=False,
+                            placeholder=labels["inputWord"]["placeholder"]
+                        )
+                    with gr.Row():
+                        btn_get_w_info = gr.Button(
+                            value=labels["wordInfoButton"]
+                        )
 
                 with gr.Group():
-                    gr.Markdown(
-
-
-
+                    gr.Markdown(
+                        value=labels["step2"]
+                    )
+                    n_context = gr.Slider(
+                        label="",
+                        step=1, minimum=1, maximum=30, value=5,
+                        visible=True,
+                        interactive=True
+                    )
                 with gr.Group():
-                    gr.Markdown(
-
-
-
-
+                    gr.Markdown(
+                        value=labels["step3"]
+                    )
+                    subsets_choice = gr.CheckboxGroup(
+                        label="",
+                        interactive=True,
+                        visible=True
+                    )
+                    with gr.Row():
+                        btn_get_contexts = gr.Button(
+                            value=labels["wordContextButton"],
+                            visible=True
+                        )
 
-                with gr.Row():
+                with gr.Row():
+                    out_msj = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
 
             with gr.Column():
                 with gr.Group():
-                    gr.Markdown(
-
-
-
+                    gr.Markdown(
+                        value=labels["wordDistributionTitle"]
+                    )
+                    dist_plot = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
+                    wc_plot = gr.Plot(
+                        label="",
+                        show_label=False,
+                        visible=available_wordcloud
+                    )
 
                 with gr.Group():
-                    gr.Markdown(
-
+                    gr.Markdown(
+                        value=labels["frequencyPerSetTitle"]
+                    )
+                    subsets_freq = gr.HTML(
+                        label=""
+                    )
 
         with gr.Row():
             with gr.Group():
-                with gr.Row():
-
-
-
-
-
+                with gr.Row():
+                    gr.Markdown(
+                        value=labels["contextList"]
+                    )
+                with gr.Row():
+                    out_context = gr.Dataframe(
+                        label="",
+                        interactive=False,
+                        value=pd.DataFrame([], columns=['']),
+                        wrap=True,
+                        datatype=['str','markdown','str','markdown']
+                    )
 
         with gr.Group():
-            gr.Markdown(
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
         btn_get_w_info.click(
             fn=connector.get_word_info,
@@ -73,10 +134,11 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
                 subsets_freq,
                 dist_plot,
                 wc_plot,
-                subsets_choice]
+                subsets_choice
+            ]
         )
 
-        btn_get_contexts.click(
+        btn_get_contexts.click(
             fn=connector.get_word_context,
             inputs=[input_word, n_context, subsets_choice],
             outputs=[out_msj, out_context]
@@ -84,13 +146,16 @@ def interface(vocabulary, contexts, available_logs, lang="spanish"):
 
         # --- Logs ---
         save_field = [input_word, subsets_choice]
-        log_callback.setup(
+        log_callback.setup(
+            components=save_field,
+            flagging_dir=f"edia_datos_{lang}"
+        )
 
         btn_get_contexts.click(
             fn=lambda *args: log_callback.flag(
-
-
-
+                flag_data=args,
+                flag_option="datos",
+                username="vialibre"
            ),
            inputs=save_field,
            outputs=None,
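Note the wiring at the end of the diff: btn_get_contexts is registered twice, once for the actual query and once for logging, so a single click fires both callbacks. A hedged sketch of that double-registration pattern, with fake_log as an illustrative stand-in for log_callback.flag:

    import gradio as gr

    def fake_log(*args) -> None:
        # Stand-in for log_callback.flag(flag_data=args, ...).
        print("logged:", args)

    with gr.Blocks() as demo:
        word = gr.Textbox(placeholder="word")
        out = gr.Markdown()
        btn = gr.Button("Buscar")
        # First listener: the real computation.
        btn.click(fn=lambda w: f"Contexts for *{w}*", inputs=word, outputs=out)
        # Second listener: fire-and-forget logging, no outputs.
        btn.click(fn=lambda *args: fake_log(*args), inputs=[word], outputs=None)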
modules/module_connection.py
CHANGED
@@ -1,37 +1,64 @@
+from modules.module_word2Context import Word2Context
+from typing import List, Tuple
 import pandas as pd
 import gradio as gr
 from abc import ABC
-from modules.module_word2Context import Word2Context
 
 class Connector(ABC):
-    def parse_word(
+    def parse_word(
+        self,
+        word: str
+    ) -> str:
+
        return word.lower().strip()
 
-    def parse_words(
+    def parse_words(
+        self,
+        array_in_string: str
+    ) -> List[str]:
+
        words = array_in_string.strip()
        if not words:
            return []
-        words = [
+        words = [
+            self.parse_word(word)
+            for word in words.split(',') if word.strip() != ''
+        ]
        return words
 
-    def process_error(
-
-
-
+    def process_error(
+        self,
+        err: str
+    ) -> str:
 
+        if err:
+            err = "<center><h3>" + err + "</h3></center>"
+        return err
 
 class Word2ContextExplorerConnector(Connector):
-    def __init__(
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
+
        vocabulary = kwargs.get('vocabulary', None)
        context = kwargs.get('context', None)
 
        if vocabulary is None and context is None:
            raise KeyError
-        self.word2context_explorer = Word2Context(context, vocabulary)
 
-
+        self.word2context_explorer = Word2Context(
+            context,    # Context dataset HF name | path
+            vocabulary  # Vocabulary class instance
+        )
+
+    def get_word_info(
+        self,
+        word: str
+    ) -> Tuple:
+
        err = ""
-        contexts = pd.DataFrame([],columns=[''])
+        contexts = pd.DataFrame([], columns=[''])
        subsets_info = ""
        distribution_plot = None
        word_cloud_plot = None
@@ -53,7 +80,13 @@ class Word2ContextExplorerConnector(Connector):
 
        return self.process_error(err), contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
 
-    def get_word_context(
+    def get_word_context(
+        self,
+        word: str,
+        n_context: int,
+        subset_choice: List[str]
+    ) -> Tuple:
+
        word = self.parse_word(word)
        n_context = int(n_context)
        err = ""
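The completed parse_words is comma-splitting plus per-token normalization through parse_word. Isolated below as free functions for a quick check of the behavior the diff pins down:

    def parse_word(word: str) -> str:
        return word.lower().strip()

    def parse_words(array_in_string: str):
        words = array_in_string.strip()
        if not words:
            return []
        # Split on commas, normalize each token, drop empty entries.
        return [parse_word(w) for w in words.split(',') if w.strip() != '']

    assert parse_words(" Hola, , MUNDO ") == ["hola", "mundo"]
    assert parse_words("   ") == []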
modules/module_customSubsetsLabel.py
CHANGED
@@ -1,5 +1,10 @@
+from typing import List, Dict
+
 class CustomSubsetsLabel:
-    def __init__(
+    def __init__(
+        self
+    ) -> None:
+
        self.html_head = """
        <html>
        <head>
@@ -50,7 +55,14 @@ class CustomSubsetsLabel:
            'UN': "http://opus.nlpl.eu/UN.php",
        }
 
-    def __progressbar(
+    def __progressbar(
+        self,
+        percentage: float,
+        subset: str,
+        freq: int,
+        size: int=15
+    ) -> str:
+
        html = f"""
        <div id="myturn">
            <progress value="{int(percentage)}" max="100"></progress>
@@ -66,7 +78,13 @@ class CustomSubsetsLabel:
        """
        return html
 
-    def __render(
+    def __render(
+        self,
+        subsets: List[str],
+        freqs: List[int],
+        percentages: List[float]
+    ) -> str:
+
        html = ""
        for subset, freq, perc in zip(subsets, freqs, percentages):
            html += self.__progressbar(
@@ -77,13 +95,24 @@ class CustomSubsetsLabel:
 
        return self.html_head + html + self.html_footer
 
-    def compute(
+    def compute(
+        self,
+        subsets_dic: Dict[str, int]
+    ) -> str:
+
        subsets_dic_info = {
            k.split()[0]:{'freq':int(k.split()[1][1:-1]),'perc':round(v*100,2)}
            for k,v in subsets_dic.items()
        }
 
        subsets = list(subsets_dic_info.keys())
-        freqs = [
-
+        freqs = [
+            d['freq']
+            for d in subsets_dic_info.values()
+        ]
+        percentages = [
+            d['perc']
+            for d in subsets_dic_info.values()
+        ]
+
        return self.__render(subsets, freqs, percentages)
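compute encodes its expected input format in the first comprehension: each key is a "NAME (freq)" string and each value is that subset's fraction of the total frequency. Run in isolation with illustrative data:

    subsets_dic = {"DGT (1500)": 0.75, "UN (500)": 0.25}  # example input

    subsets_dic_info = {
        k.split()[0]: {'freq': int(k.split()[1][1:-1]), 'perc': round(v*100, 2)}
        for k, v in subsets_dic.items()
    }
    assert subsets_dic_info == {
        'DGT': {'freq': 1500, 'perc': 75.0},
        'UN': {'freq': 500, 'perc': 25.0},
    }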
modules/module_logsManager.py
CHANGED
@@ -1,4 +1,3 @@
-from distutils.log import debug
 from gradio.flagging import FlaggingCallback, _get_dataset_features_info
 from gradio.components import IOComponent
 from gradio import utils
@@ -14,14 +13,24 @@ load_dotenv()
 
 # --- Classes declaration ---
 class DateLogs:
-    def __init__(
+    def __init__(
+        self,
+        zone: str="America/Argentina/Cordoba"
+    ) -> None:
+
        self.time_zone = pytz.timezone(zone)
 
-    def full(
+    def full(
+        self
+    ) -> str:
+
        now = datetime.now(self.time_zone)
        return now.strftime("%H:%M:%S %d-%m-%Y")
 
-    def day(
+    def day(
+        self
+    ) -> str:
+
        now = datetime.now(self.time_zone)
        return now.strftime("%d-%m-%Y")
 
@@ -41,12 +50,12 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
 
    def __init__(
        self,
-        hf_token: str
-        dataset_name: str
-        organization: Optional[str]
-        private: bool
-        available_logs: bool
-    ):
+        hf_token: str=os.getenv('HF_TOKEN'),
+        dataset_name: str=os.getenv('DS_LOGS_NAME'),
+        organization: Optional[str]=os.getenv('ORG_NAME'),
+        private: bool=True,
+        available_logs: bool=False
+    ) -> None:
        """
        Parameters:
        hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
@@ -66,10 +75,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
 
 
    def setup(
-
-
-
-
+        self,
+        components: List[IOComponent],
+        flagging_dir: str
+    ) -> None:
        """
        Params:
            flagging_dir (str): local directory where the dataset is cloned,
@@ -113,9 +122,9 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
    def flag(
        self,
        flag_data: List[Any],
-        flag_option: Optional[str]
-        flag_index: Optional[int]
-        username: Optional[str]
+        flag_option: Optional[str]=None,
+        flag_index: Optional[int]=None,
+        username: Optional[str]=None,
    ) -> int:
 
        if self.available_logs:
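The new constructor defaults pull credentials from the environment, which is why load_dotenv() runs at import time. One subtlety: os.getenv(...) in a default argument is evaluated once, when the def statement executes, so the .env file must already be loaded by then. A minimal sketch of the pattern, with make_saver as an illustrative stand-in for HuggingFaceDatasetSaver.__init__ (variable names mirror the diff):

    import os
    from typing import Optional
    from dotenv import load_dotenv

    load_dotenv()  # must run before the defaults below are evaluated

    def make_saver(
        hf_token: Optional[str] = os.getenv('HF_TOKEN'),  # read at def time
        private: bool = True,
        available_logs: bool = False
    ) -> dict:
        # Stand-in: just echoes the resolved configuration.
        return {"token": hf_token, "private": private, "logs": available_logs}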
modules/module_segmentedWordCloud.py
CHANGED
@@ -1,6 +1,6 @@
 from wordcloud import WordCloud
 import matplotlib.pyplot as plt
-
+from typing import Dict, Tuple, List
 
 class SimpleGroupedColorFunc(object):
    """Create a color function object which assigns EXACT colors
@@ -16,7 +16,12 @@ class SimpleGroupedColorFunc(object):
       of any value from color_to_words.
    """
 
-    def __init__(
+    def __init__(
+        self,
+        color_to_words: Dict,
+        default_color: str
+    ) -> Dict:
+
        self.word_to_color = {
            word: color
            for (color, words) in color_to_words.items()
@@ -30,7 +35,13 @@ class SimpleGroupedColorFunc(object):
 
 
 class SegmentedWordCloud:
-    def __init__(
+    def __init__(
+        self,
+        freq_dic: Dict[str, int],
+        less_group: List[str],
+        greater_group: List[str]
+    ) :
+
        colors = {
            'less': '#529ef3',
            'salient':'#d35400',
@@ -56,7 +67,10 @@ class SegmentedWordCloud:
 
        self.wc.recolor(color_func=grouped_color_func)
 
-    def plot(
+    def plot(
+        self,
+        figsize: Tuple[int,int]
+    ):
        fig, ax = plt.subplots(figsize=figsize)
        ax.imshow(self.wc, interpolation="bilinear")
        ax.axis("off")
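The class wraps the standard wordcloud recoloring hook: generate the cloud from a frequency dict, then recolor each word by group membership. A reduced sketch of that idea, with the group colors taken from the diff and the data purely illustrative:

    from wordcloud import WordCloud

    freq_dic = {"hola": 120, "mundo": 80, "che": 40}  # example frequencies
    less_group = ["che"]                              # example grouping

    def color_func(word, **kwargs):
        # '#529ef3' ('less') and '#d35400' ('salient') mirror the diff's colors.
        return '#529ef3' if word in less_group else '#d35400'

    wc = WordCloud(background_color="white").generate_from_frequencies(freq_dic)
    wc.recolor(color_func=color_func)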
modules/module_vocabulary.py
CHANGED
@@ -1,9 +1,14 @@
 from memory_profiler import profile
 import pandas as pd
+from typing import List, Dict, Tuple
 
 class Vocabulary:
    @profile
-    def __init__(
+    def __init__(
+        self,
+        subset_name: str
+    ) -> None:
+
        # Dataset info
        self.subset_name = subset_name
        self.ds_path = f"data/{subset_name}_vocab_v6.zip"
@@ -17,10 +22,17 @@ class Vocabulary:
        # Load vocabulary dataset
        self.__load()
 
-    def __contains__(
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
        return word in self.df_vocab['word'].to_list()
 
-    def __load(
+    def __load(
+        self
+    ) -> None:
+
        print(f"Preparing {self.subset_name} vocabulary...")
 
        # --- Download vocab dataset ---
@@ -41,7 +53,11 @@ class Vocabulary:
            reverse=True
        )
 
-    def __getValue(
+    def __getValue(
+        self,
+        word: str,
+        feature: str
+    ):
        word_id, value = None, None
 
        if word in self:
@@ -52,23 +68,47 @@ class Vocabulary:
 
        return value
 
-    def getFreq(
+    def getFreq(
+        self,
+        word
+    ) -> int:
+
        return self.__getValue(word, 'freq')
 
-    def getPercentile(
+    def getPercentile(
+        self,
+        word:str
+    ) -> float:
+
        return self.__getValue(word, 'percentile')
 
-    def getSplits(
+    def getSplits(
+        self,
+        word: str
+    ) -> List[str]:
+
        return self.__getValue(word, 'splits')
 
-    def getSubsets(
+    def getSubsets(
+        self,
+        word: str
+    ) -> Dict[str, int]:
+
        return self.__getValue(word, 'in_subset')
 
-    def distribution(
+    def distribution(
+        self
+    ) -> Tuple:
+
        x_values, y_values = zip(*self.histogram)
        return x_values, y_values
 
-    def getWordNeighbors(
+    def getWordNeighbors(
+        self,
+        word: str,
+        n_neighbors: int=20
+    )-> Tuple:
+
        word_id = self.df_vocab['word'].to_list().index(word)
        words = self.df_vocab['word'].to_list()
        freqs = self.df_vocab['freq'].to_list()
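Typing __contains__ is what lets every other method test membership with plain `word in self`. A stand-alone reduction of that protocol, using the same 'word' column the class assumes (TinyVocab is illustrative, not part of the repo):

    import pandas as pd

    class TinyVocab:
        def __init__(self, words):
            self.df_vocab = pd.DataFrame({'word': words})

        def __contains__(self, word: str) -> bool:
            return word in self.df_vocab['word'].to_list()

    vocab = TinyVocab(["hola", "mundo"])
    assert "hola" in vocab
    assert "adios" not in vocab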
modules/module_word2Context.py
CHANGED
@@ -1,8 +1,8 @@
 from datasets import load_dataset, interleave_datasets
 from modules.module_segmentedWordCloud import SegmentedWordCloud
 from modules.module_customSubsetsLabel import CustomSubsetsLabel
-
 from random import sample as random_sample
+from typing import Tuple, List, Dict
 import re
 
 import matplotlib as mpl
@@ -11,7 +11,12 @@ import matplotlib.pyplot as plt
 
 
 class Word2Context:
-    def __init__(
+    def __init__(
+        self,
+        context_ds_name: str,
+        vocabulary  # Vocabulary class instance
+    ) -> None:
+
        self.context_ds_name = context_ds_name
 
        # Vocabulary class
@@ -20,7 +25,11 @@ class Word2Context:
        # Custom Label component
        self.Label = CustomSubsetsLabel()
 
-    def errorChecking(
+    def errorChecking(
+        self,
+        word: str
+    ) -> str:
+
        out_msj = ""
 
        if not word:
@@ -31,19 +40,33 @@ class Word2Context:
 
        return out_msj
 
-    def genWebLink(
+    def genWebLink(
+        self,
+        text: str
+    ) -> str:
+
        text = text.replace("\"", "'")
        text = text.replace("<u><b>", "")
        text = text.replace("</b></u>", "")
        url = "https://www.google.com.tr/search?q={}".format(text)
        return '<a href="{}" rel="noopener noreferrer" target="_blank"><center>ππ</center></a>'.format(url)
 
-    def genWordCloudPlot(
+    def genWordCloudPlot(
+        self,
+        word: str,
+        figsize: Tuple[int,int]=(9,3)
+    ): # ToDO: Figure typing
+
        freq_dic, l_group, g_group = self.vocab.getWordNeighbors(word, n_neighbors=10)
        wc = SegmentedWordCloud(freq_dic, l_group, g_group)
        return wc.plot(figsize)
 
-    def genDistributionPlot(
+    def genDistributionPlot(
+        self,
+        word: str,
+        figsize: Tuple[int,int]=(6,1)
+    ): # ToDO: Figure typing
+
        x_values, y_values = self.vocab.distribution()
        w_percentile = self.vocab.getPercentile(word)
        w_freq = self.vocab.getFreq(word)
@@ -52,19 +75,20 @@ class Word2Context:
        ax.plot(x_values, y_values, color='green')
        ax.fill_between(x_values, y_values, color='lightgreen',)
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+        ax.axvline(x=max(0,w_percentile-.01),
+                   color='blue',
+                   linewidth=7,
+                   alpha=.1,
+                   linestyle='-'
+        )
+
+        ax.axvline(x=min(100,w_percentile+.01),
+                   color='black',
+                   linewidth=7,
+                   alpha=.1,
+                   linestyle='-'
+        )
+
        ax.axvline(x=w_percentile,
                   color='#d35400',
                   linewidth=2,
@@ -76,7 +100,12 @@ class Word2Context:
        plt.legend(loc='upper left', prop={'size': 7})
        return fig
 
-    def findSplits(
+    def findSplits(
+        self,
+        word: str,
+        subsets_list: List[str]
+    ):
+
        w_splits = self.vocab.getSplits(word)
 
        splits_list = []
@@ -102,7 +131,12 @@ class Word2Context:
 
        return datasets
 
-    def findContexts(
+    def findContexts(
+        self,
+        sample: str,
+        word: str
+    ) -> Dict[str,str]:
+
        sample = sample['text'].strip()
        context = ""
        m = re.search(r'\b{}\b'.format(word), sample)
@@ -112,7 +146,11 @@ class Word2Context:
            context = sample[:init]+"<u><b>"+word+"</b></u>"+sample[end:]
        return {'context':context}
 
-    def getSubsetsInfo(
+    def getSubsetsInfo(
+        self,
+        word: str
+    ) -> Tuple:
+
        total_freq = self.vocab.getFreq(word)
        subsets_name_list = list(self.vocab.getSubsets(word).keys())
        subsets_freq_list = list(self.vocab.getSubsets(word).values())
@@ -127,73 +165,21 @@ class Word2Context:
        subsets_info = self.Label.compute(subsets_origin_info)
        return subsets_info, subsets_origin_info
 
-    def getContexts(
-
-
-
-
-
-        list_of_contexts = [(i,dic['context'],dic['subset']) for i,dic in enumerate(list_of_dict)]
-
-        return list_of_contexts
-
-    # TODO: The next methods can be removed, or keep them as a wrapper method of several ones
-    '''
-    def getWordInfo(self, word):
-        errors = ""
-        contexts = pd.DataFrame([],columns=[''])
-        subsets_info = ""
-        distribution_plot = None
-        word_cloud_plot = None
-        subsets_choice = gr.CheckboxGroup.update(choices=[])
-
-        errors = self.errorChecking(word)
-        if errors:
-            return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
-
-        total_freq = self.vocab.getFreq(word)
-        subsets_name_list = list(self.vocab.getSubsets(word).keys())
-        subsets_freq_list = list(self.vocab.getSubsets(word).values())
-
-        # Create subset frequency dict to subset_freq component
-        subsets_info = {
-            s_name + f" ({s_freq})": s_freq/total_freq
-            for s_name, s_freq in zip(subsets_name_list, subsets_freq_list)
-        }
-        subsets_origin_info = dict(sorted(subsets_info.items(), key=lambda x: x[1], reverse=True))
-        subsets_info = self.Label.compute(subsets_origin_info)
-
-        # Create sort list to subsets_choice component
-        clean_keys = [key.split(" ")[0].strip() for key in subsets_origin_info]
-        subsets_choice = gr.CheckboxGroup.update(choices=clean_keys)
-
-        # Get word distribution, and wordcloud graph
-        distribution_plot = self.genDistributionPlot(word)
-        word_cloud_plot = self.genWordCloudPlot(word)
-
-        return errors, contexts, subsets_info, distribution_plot, word_cloud_plot, subsets_choice
-
-    def getWordContext(self, word, n_context, subset_choice):
-        n_context = int(n_context)
-        errors = ""
-
-        if len(subset_choice) > 0:
-            ds = self.findSplits(word, subset_choice)
-
-        else:
-            errors = "Error: Palabra no ingresada y/o conjunto/s de interés no seleccionado/s!"
-            errors = "<center><h3>"+errors+"</h3></center>"
-            return errors, pd.DataFrame([], columns=[''])
+    def getContexts(
+        self,
+        word: str,
+        n_context: int,
+        ds
+    ) -> List:
 
        ds_w_contexts = ds.map(lambda sample: self.findContexts(sample, word))
        only_contexts = ds_w_contexts.filter(lambda sample: sample['context'] != "")
        shuffle_contexts = only_contexts.shuffle(buffer_size=10)
 
        list_of_dict = list(shuffle_contexts.take(n_context))
-        list_of_contexts = [
-
-
-
+        list_of_contexts = [
+            (i, dic['context'], dic['subset'])
+            for i,dic in enumerate(list_of_dict)
+        ]
 
-    return
-    '''
+        return list_of_contexts
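findContexts is the core of the context search: a whole-word regex match whose span is used to wrap the hit in <u><b>...</b></u> for the markdown columns of the results Dataframe. Isolated as a free function for a quick check (the sample argument is a dataset row, i.e. a dict with a 'text' field, despite the str annotation in the diff):

    import re

    def find_context(sample: dict, word: str) -> dict:
        text = sample['text'].strip()
        context = ""
        m = re.search(r'\b{}\b'.format(word), text)
        if m:
            init, end = m.span()
            context = text[:init] + "<u><b>" + word + "</b></u>" + text[end:]
        return {'context': context}

    assert find_context({'text': "hola mundo"}, "mundo") == \
        {'context': "hola <u><b>mundo</b></u>"}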