Spaces:
Configuration error
Configuration error
Typing. Added __init_ann_method and __init_sklearn_method in embedding class. Upgrade getNearestNeighbors method. Fix bug in get method from ann class. Etc
Browse files- .gitignore +1 -1
- app.py +10 -8
- interfaces/interface_BiasWordExplorer.py +96 -40
- interfaces/interface_WordExplorer.py +105 -32
- language/.gitignore +1 -0
- modules/model_embbeding.py +112 -51
- modules/module_BiasExplorer.py +125 -57
- modules/module_WordExplorer.py +128 -56
- modules/module_ann.py +53 -23
- modules/module_connection.py +116 -66
- modules/module_logsManager.py +7 -5
- tool_info.py +1 -1
.gitignore
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
__pycache__/
|
2 |
-
bias_tool_logs/
|
3 |
*.env
|
|
|
|
1 |
__pycache__/
|
|
|
2 |
*.env
|
3 |
+
logs_edia_we_spanish/
|
app.py
CHANGED
@@ -4,30 +4,32 @@ import pandas as pd
|
|
4 |
|
5 |
|
6 |
# --- Imports modules ---
|
7 |
-
from modules.model_embbeding import Embedding
|
8 |
|
9 |
|
10 |
# --- Imports interfaces ---
|
11 |
-
from interfaces.interface_WordExplorer import interface as wordExplorer_interface
|
12 |
from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
|
13 |
|
14 |
|
15 |
# --- Tool config ---
|
16 |
-
AVAILABLE_LOGS = True # [True | False]
|
17 |
-
LANGUAGE = "spanish" # [spanish | english]
|
18 |
EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
|
19 |
-
|
|
|
|
|
|
|
20 |
|
21 |
|
22 |
# --- Init classes ---
|
23 |
embedding = Embedding(
|
24 |
path=EMBEDDINGS_PATH,
|
25 |
-
binary=EMBEDDINGS_PATH.endswith('.bin'),
|
26 |
limit=None,
|
27 |
randomizedPCA=False,
|
28 |
-
max_neighbors=MAX_NEIGHBORS
|
|
|
29 |
)
|
30 |
|
|
|
31 |
# --- Init Vars ---
|
32 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
33 |
|
@@ -41,7 +43,7 @@ INTERFACE_LIST = [
|
|
41 |
wordExplorer_interface(
|
42 |
embedding=embedding,
|
43 |
available_logs=AVAILABLE_LOGS,
|
44 |
-
max_neighbors=MAX_NEIGHBORS,
|
45 |
lang=LANGUAGE),
|
46 |
]
|
47 |
|
|
|
4 |
|
5 |
|
6 |
# --- Imports modules ---
|
7 |
+
from modules.model_embbeding import Embedding
|
8 |
|
9 |
|
10 |
# --- Imports interfaces ---
|
11 |
+
from interfaces.interface_WordExplorer import interface as wordExplorer_interface
|
12 |
from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
|
13 |
|
14 |
|
15 |
# --- Tool config ---
|
|
|
|
|
16 |
EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
|
17 |
+
LANGUAGE = "spanish" # [spanish | english]
|
18 |
+
MAX_NEIGHBORS = 20
|
19 |
+
NN_METHOD = 'sklearn' # ['sklearn' | 'ann']
|
20 |
+
AVAILABLE_LOGS = True # [True | False]
|
21 |
|
22 |
|
23 |
# --- Init classes ---
|
24 |
embedding = Embedding(
|
25 |
path=EMBEDDINGS_PATH,
|
|
|
26 |
limit=None,
|
27 |
randomizedPCA=False,
|
28 |
+
max_neighbors=MAX_NEIGHBORS,
|
29 |
+
nn_method=NN_METHOD
|
30 |
)
|
31 |
|
32 |
+
|
33 |
# --- Init Vars ---
|
34 |
labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
|
35 |
|
|
|
43 |
wordExplorer_interface(
|
44 |
embedding=embedding,
|
45 |
available_logs=AVAILABLE_LOGS,
|
46 |
+
max_neighbors=MAX_NEIGHBORS,
|
47 |
lang=LANGUAGE),
|
48 |
]
|
49 |
|
interfaces/interface_BiasWordExplorer.py
CHANGED
@@ -1,48 +1,96 @@
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
3 |
-
from tkinter import image_names
|
4 |
|
5 |
-
from tool_info import TOOL_INFO
|
6 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
7 |
from modules.module_connection import BiasWordExplorerConnector
|
8 |
from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
|
|
|
|
|
9 |
|
10 |
# --- Interface ---
|
11 |
-
def interface(
|
|
|
|
|
|
|
|
|
|
|
12 |
# --- Init logs ---
|
13 |
log_callback = HuggingFaceDatasetSaver(
|
14 |
-
available_logs=available_logs
|
|
|
15 |
)
|
|
|
16 |
# --- Init vars ---
|
17 |
-
connector = BiasWordExplorerConnector(
|
18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
|
|
|
20 |
interface = gr.Blocks()
|
|
|
21 |
with interface:
|
22 |
-
gr.Markdown(
|
|
|
|
|
23 |
with gr.Row():
|
24 |
with gr.Column():
|
25 |
with gr.Row():
|
26 |
-
diagnose_list = gr.Textbox(
|
|
|
|
|
|
|
27 |
with gr.Row():
|
28 |
-
gr.Markdown(
|
|
|
|
|
29 |
with gr.Row():
|
30 |
-
wordlist_1 = gr.Textbox(
|
31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
32 |
with gr.Row():
|
33 |
-
gr.Markdown(
|
|
|
|
|
34 |
with gr.Row():
|
35 |
-
wordlist_3 = gr.Textbox(
|
36 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
with gr.Column():
|
38 |
with gr.Row():
|
39 |
-
bias2d = gr.Button(
|
|
|
|
|
40 |
with gr.Row():
|
41 |
-
bias4d = gr.Button(
|
|
|
|
|
42 |
with gr.Row():
|
43 |
-
err_msg = gr.Markdown(
|
|
|
|
|
|
|
44 |
with gr.Row():
|
45 |
-
bias_plot = gr.Plot(
|
|
|
|
|
|
|
|
|
46 |
with gr.Row():
|
47 |
examples = gr.Examples(
|
48 |
fn=connector.calculate_bias_2d,
|
@@ -54,51 +102,59 @@ def interface(embedding, available_logs, lang="spanish"):
|
|
54 |
with gr.Row():
|
55 |
examples = gr.Examples(
|
56 |
fn=connector.calculate_bias_4d,
|
57 |
-
inputs=[wordlist_1, wordlist_2,
|
58 |
-
|
59 |
-
|
|
|
60 |
examples=examples2_explorar_sesgo_en_palabras,
|
61 |
label=labels["examples4Spaces"]
|
62 |
)
|
63 |
|
64 |
with gr.Row():
|
65 |
-
gr.Markdown(
|
|
|
|
|
66 |
|
67 |
bias2d.click(
|
68 |
-
fn=connector.calculate_bias_2d,
|
69 |
-
inputs=[wordlist_1,wordlist_2,diagnose_list],
|
70 |
-
outputs=[bias_plot,err_msg]
|
71 |
)
|
72 |
-
|
73 |
bias4d.click(
|
74 |
fn=connector.calculate_bias_4d,
|
75 |
-
inputs=[wordlist_1,wordlist_2,
|
76 |
-
|
|
|
77 |
)
|
78 |
|
79 |
# --- Logs ---
|
80 |
-
save_field = [wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list]
|
81 |
-
log_callback.setup(
|
|
|
|
|
|
|
82 |
|
83 |
bias2d.click(
|
84 |
fn=lambda *args: log_callback.flag(
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
),
|
89 |
inputs=save_field,
|
90 |
-
outputs=None,
|
91 |
preprocess=False
|
92 |
)
|
93 |
-
|
94 |
bias4d.click(
|
95 |
fn=lambda *args: log_callback.flag(
|
96 |
-
|
97 |
-
|
98 |
-
|
99 |
),
|
100 |
inputs=save_field,
|
101 |
-
outputs=None,
|
102 |
preprocess=False
|
103 |
)
|
104 |
-
|
|
|
|
1 |
import gradio as gr
|
2 |
import pandas as pd
|
|
|
3 |
|
|
|
4 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
5 |
from modules.module_connection import BiasWordExplorerConnector
|
6 |
from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
|
7 |
+
from tool_info import TOOL_INFO
|
8 |
+
|
9 |
|
10 |
# --- Interface ---
|
11 |
+
def interface(
|
12 |
+
embedding, # Class Embedding instance
|
13 |
+
available_logs: bool,
|
14 |
+
lang: str="spanish"
|
15 |
+
) -> gr.Blocks:
|
16 |
+
|
17 |
# --- Init logs ---
|
18 |
log_callback = HuggingFaceDatasetSaver(
|
19 |
+
available_logs=available_logs,
|
20 |
+
dataset_name=f"logs_edia_we_{lang}"
|
21 |
)
|
22 |
+
|
23 |
# --- Init vars ---
|
24 |
+
connector = BiasWordExplorerConnector(
|
25 |
+
embedding=embedding
|
26 |
+
)
|
27 |
+
|
28 |
+
# --- Load language ---
|
29 |
+
labels = pd.read_json(
|
30 |
+
f"language/{lang}.json"
|
31 |
+
)["BiasWordExplorer_interface"]
|
32 |
|
33 |
+
# --- Interface ---
|
34 |
interface = gr.Blocks()
|
35 |
+
|
36 |
with interface:
|
37 |
+
gr.Markdown(
|
38 |
+
value=labels["step1"]
|
39 |
+
)
|
40 |
with gr.Row():
|
41 |
with gr.Column():
|
42 |
with gr.Row():
|
43 |
+
diagnose_list = gr.Textbox(
|
44 |
+
lines=2,
|
45 |
+
label=labels["wordListToDiagnose"]
|
46 |
+
)
|
47 |
with gr.Row():
|
48 |
+
gr.Markdown(
|
49 |
+
value=labels["step2&2Spaces"]
|
50 |
+
)
|
51 |
with gr.Row():
|
52 |
+
wordlist_1 = gr.Textbox(
|
53 |
+
lines=2,
|
54 |
+
label=labels["wordList1"]
|
55 |
+
)
|
56 |
+
wordlist_2 = gr.Textbox(
|
57 |
+
lines=2,
|
58 |
+
label=labels["wordList2"]
|
59 |
+
)
|
60 |
with gr.Row():
|
61 |
+
gr.Markdown(
|
62 |
+
value=labels["step2&4Spaces"]
|
63 |
+
)
|
64 |
with gr.Row():
|
65 |
+
wordlist_3 = gr.Textbox(
|
66 |
+
lines=2,
|
67 |
+
label=labels["wordList3"]
|
68 |
+
)
|
69 |
+
wordlist_4 = gr.Textbox(
|
70 |
+
lines=2,
|
71 |
+
label=labels["wordList4"]
|
72 |
+
)
|
73 |
+
|
74 |
with gr.Column():
|
75 |
with gr.Row():
|
76 |
+
bias2d = gr.Button(
|
77 |
+
value=labels["plot2SpacesButton"]
|
78 |
+
)
|
79 |
with gr.Row():
|
80 |
+
bias4d = gr.Button(
|
81 |
+
value=labels["plot4SpacesButton"]
|
82 |
+
)
|
83 |
with gr.Row():
|
84 |
+
err_msg = gr.Markdown(
|
85 |
+
label="",
|
86 |
+
visible=True
|
87 |
+
)
|
88 |
with gr.Row():
|
89 |
+
bias_plot = gr.Plot(
|
90 |
+
label="",
|
91 |
+
show_label=False
|
92 |
+
)
|
93 |
+
|
94 |
with gr.Row():
|
95 |
examples = gr.Examples(
|
96 |
fn=connector.calculate_bias_2d,
|
|
|
102 |
with gr.Row():
|
103 |
examples = gr.Examples(
|
104 |
fn=connector.calculate_bias_4d,
|
105 |
+
inputs=[wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list],
|
106 |
+
outputs=[
|
107 |
+
bias_plot, err_msg
|
108 |
+
],
|
109 |
examples=examples2_explorar_sesgo_en_palabras,
|
110 |
label=labels["examples4Spaces"]
|
111 |
)
|
112 |
|
113 |
with gr.Row():
|
114 |
+
gr.Markdown(
|
115 |
+
value=TOOL_INFO
|
116 |
+
)
|
117 |
|
118 |
bias2d.click(
|
119 |
+
fn=connector.calculate_bias_2d,
|
120 |
+
inputs=[wordlist_1, wordlist_2, diagnose_list],
|
121 |
+
outputs=[bias_plot, err_msg]
|
122 |
)
|
123 |
+
|
124 |
bias4d.click(
|
125 |
fn=connector.calculate_bias_4d,
|
126 |
+
inputs=[wordlist_1, wordlist_2,
|
127 |
+
wordlist_3, wordlist_4, diagnose_list],
|
128 |
+
outputs=[bias_plot, err_msg]
|
129 |
)
|
130 |
|
131 |
# --- Logs ---
|
132 |
+
save_field = [wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list]
|
133 |
+
log_callback.setup(
|
134 |
+
components=save_field,
|
135 |
+
flagging_dir="logs_word_bias"
|
136 |
+
)
|
137 |
|
138 |
bias2d.click(
|
139 |
fn=lambda *args: log_callback.flag(
|
140 |
+
flag_data=args,
|
141 |
+
flag_option="plot_2d",
|
142 |
+
username="vialibre"
|
143 |
),
|
144 |
inputs=save_field,
|
145 |
+
outputs=None,
|
146 |
preprocess=False
|
147 |
)
|
148 |
+
|
149 |
bias4d.click(
|
150 |
fn=lambda *args: log_callback.flag(
|
151 |
+
flag_data=args,
|
152 |
+
flag_option="plot_4d",
|
153 |
+
username="vialibre"
|
154 |
),
|
155 |
inputs=save_field,
|
156 |
+
outputs=None,
|
157 |
preprocess=False
|
158 |
)
|
159 |
+
|
160 |
+
return interface
|
interfaces/interface_WordExplorer.py
CHANGED
@@ -2,73 +2,140 @@ import gradio as gr
|
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
|
5 |
-
from
|
6 |
-
from modules.module_connection import WordExplorerConnector # Updated
|
7 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
8 |
from examples.examples import examples_explorar_relaciones_entre_palabras
|
|
|
9 |
|
10 |
plt.rcParams.update({'font.size': 14})
|
11 |
|
12 |
def interface(
|
13 |
-
embedding,
|
14 |
available_logs: bool,
|
15 |
-
max_neighbors: int,
|
16 |
lang: str="spanish",
|
17 |
) -> gr.Blocks:
|
18 |
|
19 |
# --- Init logs ---
|
20 |
log_callback = HuggingFaceDatasetSaver(
|
21 |
-
available_logs=available_logs
|
|
|
22 |
)
|
|
|
23 |
# --- Init vars ---
|
24 |
-
connector = WordExplorerConnector(
|
25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
# --- Interface ---
|
28 |
interface = gr.Blocks()
|
|
|
29 |
with interface:
|
30 |
-
gr.Markdown(
|
|
|
|
|
|
|
31 |
with gr.Row():
|
32 |
with gr.Column(scale=3):
|
33 |
with gr.Row(equal_height=True):
|
34 |
with gr.Column(scale=5):
|
35 |
-
diagnose_list = gr.Textbox(
|
|
|
|
|
|
|
36 |
with gr.Column(scale=1,min_width=10):
|
37 |
-
color_wordlist = gr.ColorPicker(
|
|
|
|
|
|
|
|
|
38 |
with gr.Row():
|
39 |
with gr.Column(scale=5):
|
40 |
-
wordlist_1 = gr.Textbox(
|
|
|
|
|
|
|
41 |
with gr.Column(scale=1,min_width=10):
|
42 |
-
color_wordlist_1 = gr.ColorPicker(
|
|
|
|
|
|
|
43 |
with gr.Row():
|
44 |
with gr.Column(scale=5):
|
45 |
-
wordlist_2 = gr.Textbox(
|
|
|
|
|
|
|
46 |
with gr.Column(scale=1,min_width=10):
|
47 |
-
color_wordlist_2 = gr.ColorPicker(
|
|
|
|
|
|
|
48 |
with gr.Row():
|
49 |
with gr.Column(scale=5):
|
50 |
-
wordlist_3 = gr.Textbox(
|
|
|
|
|
|
|
51 |
with gr.Column(scale=1,min_width=10):
|
52 |
-
color_wordlist_3 = gr.ColorPicker(
|
|
|
|
|
|
|
53 |
with gr.Row():
|
54 |
with gr.Column(scale=5):
|
55 |
-
wordlist_4 = gr.Textbox(
|
|
|
|
|
|
|
56 |
with gr.Column(scale=1,min_width=10):
|
57 |
-
color_wordlist_4 = gr.ColorPicker(
|
|
|
|
|
|
|
58 |
with gr.Column(scale=4):
|
59 |
with gr.Row():
|
60 |
with gr.Row():
|
61 |
-
gr.Markdown(
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
with gr.Row():
|
64 |
-
alpha = gr.Slider(
|
65 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
66 |
with gr.Row():
|
67 |
-
btn_plot = gr.Button(
|
|
|
|
|
68 |
with gr.Row():
|
69 |
-
err_msg = gr.Markdown(
|
|
|
|
|
|
|
70 |
with gr.Row():
|
71 |
-
word_proyections = gr.Plot(
|
|
|
|
|
|
|
72 |
|
73 |
with gr.Row():
|
74 |
gr.Examples(
|
@@ -80,7 +147,9 @@ def interface(
|
|
80 |
)
|
81 |
|
82 |
with gr.Row():
|
83 |
-
gr.Markdown(
|
|
|
|
|
84 |
|
85 |
btn_plot.click(
|
86 |
fn=connector.plot_proyection_2d,
|
@@ -99,21 +168,25 @@ def interface(
|
|
99 |
fontsize,
|
100 |
n_neighbors
|
101 |
],
|
102 |
-
outputs=[word_proyections,err_msg]
|
103 |
)
|
104 |
|
105 |
# --- Logs ---
|
106 |
-
save_field = [diagnose_list,wordlist_1,wordlist_2,wordlist_3,wordlist_4]
|
107 |
-
log_callback.setup(
|
|
|
|
|
|
|
108 |
|
109 |
btn_plot.click(
|
110 |
fn=lambda *args: log_callback.flag(
|
111 |
-
|
112 |
-
|
113 |
-
|
114 |
),
|
115 |
inputs=save_field,
|
116 |
outputs=None,
|
117 |
preprocess=False
|
118 |
)
|
|
|
119 |
return interface
|
|
|
2 |
import pandas as pd
|
3 |
import matplotlib.pyplot as plt
|
4 |
|
5 |
+
from modules.module_connection import WordExplorerConnector
|
|
|
6 |
from modules.module_logsManager import HuggingFaceDatasetSaver
|
7 |
from examples.examples import examples_explorar_relaciones_entre_palabras
|
8 |
+
from tool_info import TOOL_INFO
|
9 |
|
10 |
plt.rcParams.update({'font.size': 14})
|
11 |
|
12 |
def interface(
|
13 |
+
embedding, # Class Embedding instance
|
14 |
available_logs: bool,
|
15 |
+
max_neighbors: int,
|
16 |
lang: str="spanish",
|
17 |
) -> gr.Blocks:
|
18 |
|
19 |
# --- Init logs ---
|
20 |
log_callback = HuggingFaceDatasetSaver(
|
21 |
+
available_logs=available_logs,
|
22 |
+
dataset_name=f"logs_edia_we_{lang}"
|
23 |
)
|
24 |
+
|
25 |
# --- Init vars ---
|
26 |
+
connector = WordExplorerConnector(
|
27 |
+
embedding=embedding
|
28 |
+
)
|
29 |
+
|
30 |
+
# --- Load language ---
|
31 |
+
labels = pd.read_json(
|
32 |
+
f"language/{lang}.json"
|
33 |
+
)["WordExplorer_interface"]
|
34 |
|
35 |
# --- Interface ---
|
36 |
interface = gr.Blocks()
|
37 |
+
|
38 |
with interface:
|
39 |
+
gr.Markdown(
|
40 |
+
value=labels["title"]
|
41 |
+
)
|
42 |
+
|
43 |
with gr.Row():
|
44 |
with gr.Column(scale=3):
|
45 |
with gr.Row(equal_height=True):
|
46 |
with gr.Column(scale=5):
|
47 |
+
diagnose_list = gr.Textbox(
|
48 |
+
lines=2,
|
49 |
+
label=labels["wordListToDiagnose"]
|
50 |
+
)
|
51 |
with gr.Column(scale=1,min_width=10):
|
52 |
+
color_wordlist = gr.ColorPicker(
|
53 |
+
label="",
|
54 |
+
value='#000000'
|
55 |
+
)
|
56 |
+
|
57 |
with gr.Row():
|
58 |
with gr.Column(scale=5):
|
59 |
+
wordlist_1 = gr.Textbox(
|
60 |
+
lines=2,
|
61 |
+
label=labels["wordList1"]
|
62 |
+
)
|
63 |
with gr.Column(scale=1,min_width=10):
|
64 |
+
color_wordlist_1 = gr.ColorPicker(
|
65 |
+
label="",
|
66 |
+
value='#1f78b4'
|
67 |
+
)
|
68 |
with gr.Row():
|
69 |
with gr.Column(scale=5):
|
70 |
+
wordlist_2 = gr.Textbox(
|
71 |
+
lines=2,
|
72 |
+
label=labels["wordList2"]
|
73 |
+
)
|
74 |
with gr.Column(scale=1,min_width=10):
|
75 |
+
color_wordlist_2 = gr.ColorPicker(
|
76 |
+
label="",
|
77 |
+
value='#33a02c'
|
78 |
+
)
|
79 |
with gr.Row():
|
80 |
with gr.Column(scale=5):
|
81 |
+
wordlist_3 = gr.Textbox(
|
82 |
+
lines=2,
|
83 |
+
label=labels["wordList3"]
|
84 |
+
)
|
85 |
with gr.Column(scale=1,min_width=10):
|
86 |
+
color_wordlist_3 = gr.ColorPicker(
|
87 |
+
label="",
|
88 |
+
value='#e31a1c'
|
89 |
+
)
|
90 |
with gr.Row():
|
91 |
with gr.Column(scale=5):
|
92 |
+
wordlist_4 = gr.Textbox(
|
93 |
+
lines=2,
|
94 |
+
label=labels["wordList4"]
|
95 |
+
)
|
96 |
with gr.Column(scale=1,min_width=10):
|
97 |
+
color_wordlist_4 = gr.ColorPicker(
|
98 |
+
label="",
|
99 |
+
value='#6a3d9a'
|
100 |
+
)
|
101 |
with gr.Column(scale=4):
|
102 |
with gr.Row():
|
103 |
with gr.Row():
|
104 |
+
gr.Markdown(
|
105 |
+
value=labels["plotNeighbours"]["title"]
|
106 |
+
)
|
107 |
+
n_neighbors = gr.Slider(
|
108 |
+
minimum=0,
|
109 |
+
maximum=max_neighbors,
|
110 |
+
step=1,
|
111 |
+
label=labels["plotNeighbours"]["quantity"]
|
112 |
+
)
|
113 |
with gr.Row():
|
114 |
+
alpha = gr.Slider(
|
115 |
+
minimum=0.1,
|
116 |
+
maximum=0.9,
|
117 |
+
value=0.3,
|
118 |
+
step=0.1,
|
119 |
+
label=labels["options"]["transparency"]
|
120 |
+
)
|
121 |
+
fontsize=gr.Number(
|
122 |
+
value=25,
|
123 |
+
label=labels["options"]["font-size"]
|
124 |
+
)
|
125 |
with gr.Row():
|
126 |
+
btn_plot = gr.Button(
|
127 |
+
value=labels["plot_button"]
|
128 |
+
)
|
129 |
with gr.Row():
|
130 |
+
err_msg = gr.Markdown(
|
131 |
+
label="",
|
132 |
+
visible=True
|
133 |
+
)
|
134 |
with gr.Row():
|
135 |
+
word_proyections = gr.Plot(
|
136 |
+
label="",
|
137 |
+
show_label=False
|
138 |
+
)
|
139 |
|
140 |
with gr.Row():
|
141 |
gr.Examples(
|
|
|
147 |
)
|
148 |
|
149 |
with gr.Row():
|
150 |
+
gr.Markdown(
|
151 |
+
value=TOOL_INFO
|
152 |
+
)
|
153 |
|
154 |
btn_plot.click(
|
155 |
fn=connector.plot_proyection_2d,
|
|
|
168 |
fontsize,
|
169 |
n_neighbors
|
170 |
],
|
171 |
+
outputs=[word_proyections, err_msg]
|
172 |
)
|
173 |
|
174 |
# --- Logs ---
|
175 |
+
save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
|
176 |
+
log_callback.setup(
|
177 |
+
components=save_field,
|
178 |
+
flagging_dir="logs_word_explorer"
|
179 |
+
)
|
180 |
|
181 |
btn_plot.click(
|
182 |
fn=lambda *args: log_callback.flag(
|
183 |
+
flag_data=args,
|
184 |
+
flag_option="word_explorer",
|
185 |
+
username="vialibre",
|
186 |
),
|
187 |
inputs=save_field,
|
188 |
outputs=None,
|
189 |
preprocess=False
|
190 |
)
|
191 |
+
|
192 |
return interface
|
language/.gitignore
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
english.json
|
modules/model_embbeding.py
CHANGED
@@ -3,7 +3,7 @@ from memory_profiler import profile
|
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
from sklearn.decomposition import PCA
|
5 |
from gensim.models import KeyedVectors
|
6 |
-
from typing import List
|
7 |
import os
|
8 |
import pandas as pd
|
9 |
|
@@ -13,21 +13,22 @@ from gensim import matutils
|
|
13 |
|
14 |
|
15 |
class Embedding:
|
16 |
-
@profile
|
17 |
def __init__(self,
|
18 |
path: str,
|
19 |
-
|
20 |
-
limit: int=None,
|
21 |
randomizedPCA: bool=False,
|
22 |
-
max_neighbors: int=20
|
|
|
23 |
) -> None:
|
24 |
|
25 |
# Embedding vars
|
26 |
self.path = path
|
27 |
self.limit = limit
|
28 |
self.randomizedPCA = randomizedPCA
|
29 |
-
self.binary = binary
|
30 |
self.max_neighbors = max_neighbors
|
|
|
|
|
|
|
31 |
|
32 |
# Full embedding dataset
|
33 |
self.ds = None
|
@@ -43,36 +44,34 @@ class Embedding:
|
|
43 |
self,
|
44 |
) -> None:
|
45 |
|
|
|
|
|
46 |
print(f"Preparing {os.path.basename(self.path)} embeddings...")
|
47 |
|
48 |
# --- Prepare dataset ---
|
49 |
self.ds = self.__preparate(
|
50 |
-
self.path, self.
|
51 |
)
|
52 |
|
53 |
# --- Estimate Nearest Neighbors
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
68 |
-
self.neigh.fit(
|
69 |
-
X=self.ds['embedding'].to_list()
|
70 |
-
)
|
71 |
|
72 |
def __preparate(
|
73 |
self,
|
74 |
-
path: str,
|
75 |
-
binary: bool,
|
76 |
limit: int,
|
77 |
randomizedPCA: bool
|
78 |
) -> pd.DataFrame:
|
@@ -94,7 +93,7 @@ class Embedding:
|
|
94 |
print("--------> PATH:", path)
|
95 |
model = KeyedVectors.load_word2vec_format(
|
96 |
fname=path,
|
97 |
-
binary=
|
98 |
limit=limit
|
99 |
)
|
100 |
|
@@ -116,11 +115,48 @@ class Embedding:
|
|
116 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
117 |
return df_uncased
|
118 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
def __getValue(
|
120 |
self,
|
121 |
word: str,
|
122 |
feature: str
|
123 |
-
):
|
|
|
124 |
word_id, value = None, None
|
125 |
|
126 |
if word in self:
|
@@ -134,13 +170,15 @@ class Embedding:
|
|
134 |
def getEmbedding(
|
135 |
self,
|
136 |
word: str
|
137 |
-
):
|
|
|
138 |
return self.__getValue(word, 'embedding')
|
139 |
|
140 |
def getPCA(
|
141 |
self,
|
142 |
word: str
|
143 |
-
):
|
|
|
144 |
return self.__getValue(word, 'pca')
|
145 |
|
146 |
def getNearestNeighbors(
|
@@ -152,35 +190,58 @@ class Embedding:
|
|
152 |
|
153 |
assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
|
154 |
|
155 |
-
|
156 |
-
|
157 |
-
|
158 |
-
elif nn_method == 'sklearn':
|
159 |
-
word_emb = self.getEmbedding(word).reshape(1,-1)
|
160 |
-
_, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1) #Fix and Update
|
161 |
-
words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:] #Fix and Update
|
162 |
-
else:
|
163 |
-
words = []
|
164 |
-
return words
|
165 |
|
166 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
167 |
self,
|
168 |
-
|
169 |
-
|
170 |
-
|
171 |
-
return word in self.ds['word'].to_list()
|
172 |
-
|
173 |
-
# ToDo: Revisar estos dos métodos usados en la pestaña sesgoEnPalabras
|
174 |
-
# ya que ahora los embedding vienen normalizados
|
175 |
-
def cosineSimilarities(self, vector_1, vectors_all):
|
176 |
norm = np.linalg.norm(vector_1)
|
177 |
all_norms = np.linalg.norm(vectors_all, axis=1)
|
178 |
dot_products = dot(vectors_all, vector_1)
|
179 |
similarities = dot_products / (norm * all_norms)
|
180 |
return similarities
|
181 |
|
182 |
-
def getCosineSimilarities(
|
|
|
|
|
|
|
|
|
|
|
183 |
return dot(
|
184 |
matutils.unitvec(self.getEmbedding(w1)),
|
185 |
matutils.unitvec(self.getEmbedding(w2))
|
186 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3 |
from sklearn.neighbors import NearestNeighbors
|
4 |
from sklearn.decomposition import PCA
|
5 |
from gensim.models import KeyedVectors
|
6 |
+
from typing import List, Any
|
7 |
import os
|
8 |
import pandas as pd
|
9 |
|
|
|
13 |
|
14 |
|
15 |
class Embedding:
|
|
|
16 |
def __init__(self,
|
17 |
path: str,
|
18 |
+
limit: int=None,
|
|
|
19 |
randomizedPCA: bool=False,
|
20 |
+
max_neighbors: int=20,
|
21 |
+
nn_method: str='sklearn'
|
22 |
) -> None:
|
23 |
|
24 |
# Embedding vars
|
25 |
self.path = path
|
26 |
self.limit = limit
|
27 |
self.randomizedPCA = randomizedPCA
|
|
|
28 |
self.max_neighbors = max_neighbors
|
29 |
+
|
30 |
+
self.availables_nn_methods = ['sklearn', 'ann']
|
31 |
+
self.nn_method = nn_method
|
32 |
|
33 |
# Full embedding dataset
|
34 |
self.ds = None
|
|
|
44 |
self,
|
45 |
) -> None:
|
46 |
|
47 |
+
assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
|
48 |
+
|
49 |
print(f"Preparing {os.path.basename(self.path)} embeddings...")
|
50 |
|
51 |
# --- Prepare dataset ---
|
52 |
self.ds = self.__preparate(
|
53 |
+
self.path, self.limit, self.randomizedPCA
|
54 |
)
|
55 |
|
56 |
# --- Estimate Nearest Neighbors
|
57 |
+
if self.nn_method == 'sklearn':
|
58 |
+
# Method A: Througth Sklearn method
|
59 |
+
self.__init_sklearn_method(
|
60 |
+
max_neighbors=self.max_neighbors,
|
61 |
+
vectors=self.ds['embedding'].to_list()
|
62 |
+
)
|
63 |
+
|
64 |
+
elif self.nn_method == 'ann':
|
65 |
+
# Method B: Througth annoy using forest tree
|
66 |
+
self.__init_ann_method(
|
67 |
+
words=self.ds['word'].to_list(),
|
68 |
+
vectors=self.ds['embedding'].to_list(),
|
69 |
+
coord=self.ds['pca'].to_list()
|
70 |
+
)
|
|
|
|
|
|
|
71 |
|
72 |
def __preparate(
|
73 |
self,
|
74 |
+
path: str,
|
|
|
75 |
limit: int,
|
76 |
randomizedPCA: bool
|
77 |
) -> pd.DataFrame:
|
|
|
93 |
print("--------> PATH:", path)
|
94 |
model = KeyedVectors.load_word2vec_format(
|
95 |
fname=path,
|
96 |
+
binary=path.endswith('.bin'),
|
97 |
limit=limit
|
98 |
)
|
99 |
|
|
|
115 |
df_uncased = df_cased.drop_duplicates(subset='word')
|
116 |
return df_uncased
|
117 |
|
118 |
+
def __init_ann_method(
|
119 |
+
self,
|
120 |
+
words: List[str],
|
121 |
+
vectors: List[float],
|
122 |
+
coord: List[float],
|
123 |
+
n_trees: int=20,
|
124 |
+
metric: str='dot'
|
125 |
+
) -> None:
|
126 |
+
|
127 |
+
print("Initializing Annoy method to search for nearby neighbors...")
|
128 |
+
self.ann = Ann(
|
129 |
+
words=words,
|
130 |
+
vectors=vectors,
|
131 |
+
coord=coord,
|
132 |
+
)
|
133 |
+
|
134 |
+
self.ann.init(
|
135 |
+
n_trees=n_trees,
|
136 |
+
metric=metric,
|
137 |
+
n_jobs=-1
|
138 |
+
)
|
139 |
+
|
140 |
+
def __init_sklearn_method(
|
141 |
+
self,
|
142 |
+
max_neighbors: int,
|
143 |
+
vectors: List[float]
|
144 |
+
) -> None:
|
145 |
+
|
146 |
+
print("Initializing sklearn method to search for nearby neighbors...")
|
147 |
+
self.neigh = NearestNeighbors(
|
148 |
+
n_neighbors=max_neighbors
|
149 |
+
)
|
150 |
+
self.neigh.fit(
|
151 |
+
X=vectors
|
152 |
+
)
|
153 |
+
|
154 |
def __getValue(
|
155 |
self,
|
156 |
word: str,
|
157 |
feature: str
|
158 |
+
) -> Any:
|
159 |
+
|
160 |
word_id, value = None, None
|
161 |
|
162 |
if word in self:
|
|
|
170 |
def getEmbedding(
|
171 |
self,
|
172 |
word: str
|
173 |
+
) -> np.ndarray:
|
174 |
+
|
175 |
return self.__getValue(word, 'embedding')
|
176 |
|
177 |
def getPCA(
|
178 |
self,
|
179 |
word: str
|
180 |
+
) -> np.ndarray:
|
181 |
+
|
182 |
return self.__getValue(word, 'pca')
|
183 |
|
184 |
def getNearestNeighbors(
|
|
|
190 |
|
191 |
assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
|
192 |
|
193 |
+
assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
|
194 |
+
|
195 |
+
neighbords_list = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
196 |
|
197 |
+
if word in self:
|
198 |
+
if nn_method == 'ann':
|
199 |
+
if self.ann is None:
|
200 |
+
self.__init_ann_method(
|
201 |
+
words=self.ds['word'].to_list(),
|
202 |
+
vectors=self.ds['embedding'].to_list(),
|
203 |
+
coord=self.ds['pca'].to_list()
|
204 |
+
)
|
205 |
+
neighbords_list = self.ann.get(word, n_neighbors)
|
206 |
+
|
207 |
+
elif nn_method == 'sklearn':
|
208 |
+
if self.neigh is None:
|
209 |
+
self.__init_sklearn_method(
|
210 |
+
max_neighbors=self.max_neighbors,
|
211 |
+
vectors=self.ds['embedding'].to_list()
|
212 |
+
)
|
213 |
+
|
214 |
+
word_emb = self.getEmbedding(word).reshape(1,-1)
|
215 |
+
_, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
|
216 |
+
neighbords_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
|
217 |
+
|
218 |
+
return neighbords_list
|
219 |
+
|
220 |
+
def cosineSimilarities(
|
221 |
self,
|
222 |
+
vector_1,
|
223 |
+
vectors_all
|
224 |
+
):
|
|
|
|
|
|
|
|
|
|
|
225 |
norm = np.linalg.norm(vector_1)
|
226 |
all_norms = np.linalg.norm(vectors_all, axis=1)
|
227 |
dot_products = dot(vectors_all, vector_1)
|
228 |
similarities = dot_products / (norm * all_norms)
|
229 |
return similarities
|
230 |
|
231 |
+
def getCosineSimilarities(
|
232 |
+
self,
|
233 |
+
w1,
|
234 |
+
w2
|
235 |
+
):
|
236 |
+
|
237 |
return dot(
|
238 |
matutils.unitvec(self.getEmbedding(w1)),
|
239 |
matutils.unitvec(self.getEmbedding(w2))
|
240 |
+
)
|
241 |
+
|
242 |
+
def __contains__(
|
243 |
+
self,
|
244 |
+
word: str
|
245 |
+
) -> bool:
|
246 |
+
|
247 |
+
return word in self.ds['word'].to_list()
|
modules/module_BiasExplorer.py
CHANGED
@@ -5,10 +5,14 @@ import seaborn as sns
|
|
5 |
import matplotlib.pyplot as plt
|
6 |
from sklearn.decomposition import PCA
|
7 |
|
8 |
-
def take_two_sides_extreme_sorted(
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
12 |
head_df = df.head(n_extreme)[:]
|
13 |
tail_df = df.tail(n_extreme)[:]
|
14 |
|
@@ -56,39 +60,63 @@ __all__ = ['GenderBiasWE', 'BiasWordEmbedding']
|
|
56 |
|
57 |
|
58 |
class WordBiasExplorer():
|
59 |
-
def __init__(
|
60 |
-
|
|
|
|
|
61 |
|
62 |
-
self.
|
63 |
self.direction = None
|
64 |
self.positive_end = None
|
65 |
self.negative_end = None
|
66 |
|
67 |
-
def __copy__(
|
68 |
-
|
|
|
|
|
|
|
69 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
70 |
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
71 |
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
72 |
return bias_word_embedding
|
73 |
|
74 |
-
def __deepcopy__(
|
|
|
|
|
|
|
|
|
75 |
bias_word_embedding = copy.copy(self)
|
76 |
bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
|
77 |
return bias_word_embedding
|
78 |
|
79 |
-
def __getitem__(
|
80 |
-
|
|
|
|
|
81 |
|
82 |
-
|
83 |
-
return item in self.vocabulary
|
84 |
|
85 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
if self.direction is None:
|
87 |
raise RuntimeError('The direction was not identified'
|
88 |
' for this {} instance'
|
89 |
.format(self.__class__.__name__))
|
90 |
|
91 |
-
def _identify_subspace_by_pca(
|
|
|
|
|
|
|
|
|
|
|
92 |
matrix = []
|
93 |
|
94 |
for word1, word2 in definitional_pairs:
|
@@ -105,8 +133,14 @@ class WordBiasExplorer():
|
|
105 |
return pca
|
106 |
|
107 |
|
108 |
-
def _identify_direction(
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
if method not in DIRECTION_METHODS:
|
111 |
raise ValueError('method should be one of {}, {} was given'.format(
|
112 |
DIRECTION_METHODS, method))
|
@@ -154,7 +188,11 @@ class WordBiasExplorer():
|
|
154 |
self.positive_end = positive_end
|
155 |
self.negative_end = negative_end
|
156 |
|
157 |
-
def project_on_direction(
|
|
|
|
|
|
|
|
|
158 |
"""Project the normalized vector of the word on the direction.
|
159 |
:param str word: The word tor project
|
160 |
:return float: The projection scalar
|
@@ -163,13 +201,15 @@ class WordBiasExplorer():
|
|
163 |
self._is_direction_identified()
|
164 |
|
165 |
vector = self[word]
|
166 |
-
projection_score = self.
|
167 |
[vector])[0]
|
168 |
return projection_score
|
169 |
|
|
|
|
|
|
|
|
|
170 |
|
171 |
-
|
172 |
-
def _calc_projection_scores(self, words):
|
173 |
self._is_direction_identified()
|
174 |
|
175 |
df = pd.DataFrame({'word': words})
|
@@ -181,7 +221,11 @@ class WordBiasExplorer():
|
|
181 |
|
182 |
return df
|
183 |
|
184 |
-
def calc_projection_data(
|
|
|
|
|
|
|
|
|
185 |
"""
|
186 |
Calculate projection, projected and rejected vectors of a words list.
|
187 |
:param list words: List of words
|
@@ -206,7 +250,12 @@ class WordBiasExplorer():
|
|
206 |
|
207 |
return pd.DataFrame(projection_data)
|
208 |
|
209 |
-
def plot_dist_projections_on_direction(
|
|
|
|
|
|
|
|
|
|
|
210 |
"""Plot the projection scalars distribution on the direction.
|
211 |
:param dict word_groups word: The groups to projects
|
212 |
:return float: The ax object of the plot
|
@@ -221,7 +270,7 @@ class WordBiasExplorer():
|
|
221 |
words = word_groups[name]
|
222 |
label = '{} (#{})'.format(name, len(words))
|
223 |
vectors = [self[word] for word in words]
|
224 |
-
projections = self.
|
225 |
vectors)
|
226 |
sns.distplot(projections, hist=False, label=label, ax=ax)
|
227 |
|
@@ -236,18 +285,26 @@ class WordBiasExplorer():
|
|
236 |
|
237 |
return ax
|
238 |
|
239 |
-
def __errorChecking(
|
|
|
|
|
|
|
|
|
240 |
out_msj = ""
|
241 |
|
242 |
if not word:
|
243 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
244 |
else:
|
245 |
-
if word not in self.
|
246 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
247 |
|
248 |
return out_msj
|
249 |
|
250 |
-
def check_oov(
|
|
|
|
|
|
|
|
|
251 |
for wordlist in wordlists:
|
252 |
for word in wordlist:
|
253 |
msg = self.__errorChecking(word)
|
@@ -255,13 +312,15 @@ class WordBiasExplorer():
|
|
255 |
return msg
|
256 |
return None
|
257 |
|
258 |
-
def plot_biased_words(
|
259 |
-
|
260 |
-
|
261 |
-
|
262 |
-
|
263 |
-
|
264 |
-
|
|
|
|
|
265 |
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
266 |
|
267 |
if bias_2D and (not wordlist_right or not wordlist_left):
|
@@ -273,21 +332,24 @@ class WordBiasExplorer():
|
|
273 |
if err:
|
274 |
raise Exception(err)
|
275 |
|
276 |
-
return self.get_bias_plot(
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
|
|
281 |
|
282 |
-
def get_bias_plot(
|
283 |
-
|
284 |
-
|
285 |
-
|
286 |
-
|
287 |
-
|
288 |
-
|
289 |
-
|
290 |
-
|
|
|
|
|
291 |
fig, ax = plt.subplots(1, figsize=figsize)
|
292 |
self.method = method
|
293 |
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
@@ -298,14 +360,17 @@ class WordBiasExplorer():
|
|
298 |
|
299 |
return fig
|
300 |
|
301 |
-
def plot_projection_scores(
|
302 |
-
|
303 |
-
|
304 |
-
|
305 |
-
|
306 |
-
|
307 |
-
|
308 |
-
|
|
|
|
|
|
|
309 |
name_left = ', '.join(definitional_1[1])
|
310 |
name_right = ', '.join(definitional_1[0])
|
311 |
|
@@ -341,6 +406,9 @@ class WordBiasExplorer():
|
|
341 |
sns.barplot(x='projection', y='word', data=projections_df,
|
342 |
palette=projections_df['color'])
|
343 |
else:
|
|
|
|
|
|
|
344 |
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
345 |
palette=projections_df['color'])
|
346 |
|
|
|
5 |
import matplotlib.pyplot as plt
|
6 |
from sklearn.decomposition import PCA
|
7 |
|
8 |
+
def take_two_sides_extreme_sorted(
|
9 |
+
df,
|
10 |
+
n_extreme,
|
11 |
+
part_column=None,
|
12 |
+
head_value='',
|
13 |
+
tail_value=''
|
14 |
+
):
|
15 |
+
|
16 |
head_df = df.head(n_extreme)[:]
|
17 |
tail_df = df.tail(n_extreme)[:]
|
18 |
|
|
|
60 |
|
61 |
|
62 |
class WordBiasExplorer():
|
63 |
+
def __init__(
|
64 |
+
self,
|
65 |
+
embedding # Class Embedding instance
|
66 |
+
) -> None:
|
67 |
|
68 |
+
self.embedding = embedding
|
69 |
self.direction = None
|
70 |
self.positive_end = None
|
71 |
self.negative_end = None
|
72 |
|
73 |
+
def __copy__(
|
74 |
+
self
|
75 |
+
):
|
76 |
+
|
77 |
+
bias_word_embedding = self.__class__(self.embedding)
|
78 |
bias_word_embedding.direction = copy.deepcopy(self.direction)
|
79 |
bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
|
80 |
bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
|
81 |
return bias_word_embedding
|
82 |
|
83 |
+
    def __deepcopy__(
        self,
        memo
    ):
        """Deep-copy protocol: shallow-copies self, then deep-copies ``model``.

        NOTE(review): ``model`` is never assigned in ``__init__`` (which sets
        ``embedding``), so this line looks like it would raise AttributeError
        at runtime — confirm whether ``model`` should be ``embedding``.
        """
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding
91 |
|
92 |
+
def __getitem__(
|
93 |
+
self,
|
94 |
+
key: str
|
95 |
+
) -> np.ndarray:
|
96 |
|
97 |
+
return self.embedding.getEmbedding(key)
|
|
|
98 |
|
99 |
+
def __contains__(
|
100 |
+
self,
|
101 |
+
item: str
|
102 |
+
) -> bool:
|
103 |
+
|
104 |
+
return item in self.embedding
|
105 |
+
|
106 |
+
def _is_direction_identified(
|
107 |
+
self
|
108 |
+
):
|
109 |
if self.direction is None:
|
110 |
raise RuntimeError('The direction was not identified'
|
111 |
' for this {} instance'
|
112 |
.format(self.__class__.__name__))
|
113 |
|
114 |
+
def _identify_subspace_by_pca(
|
115 |
+
self,
|
116 |
+
definitional_pairs,
|
117 |
+
n_components
|
118 |
+
):
|
119 |
+
|
120 |
matrix = []
|
121 |
|
122 |
for word1, word2 in definitional_pairs:
|
|
|
133 |
return pca
|
134 |
|
135 |
|
136 |
+
def _identify_direction(
|
137 |
+
self,
|
138 |
+
positive_end,
|
139 |
+
negative_end,
|
140 |
+
definitional,
|
141 |
+
method='pca'
|
142 |
+
):
|
143 |
+
|
144 |
if method not in DIRECTION_METHODS:
|
145 |
raise ValueError('method should be one of {}, {} was given'.format(
|
146 |
DIRECTION_METHODS, method))
|
|
|
188 |
self.positive_end = positive_end
|
189 |
self.negative_end = negative_end
|
190 |
|
191 |
+
def project_on_direction(
|
192 |
+
self,
|
193 |
+
word: str
|
194 |
+
):
|
195 |
+
|
196 |
"""Project the normalized vector of the word on the direction.
|
197 |
:param str word: The word tor project
|
198 |
:return float: The projection scalar
|
|
|
201 |
self._is_direction_identified()
|
202 |
|
203 |
vector = self[word]
|
204 |
+
projection_score = self.embedding.cosineSimilarities(self.direction,
|
205 |
[vector])[0]
|
206 |
return projection_score
|
207 |
|
208 |
+
def _calc_projection_scores(
|
209 |
+
self,
|
210 |
+
words
|
211 |
+
):
|
212 |
|
|
|
|
|
213 |
self._is_direction_identified()
|
214 |
|
215 |
df = pd.DataFrame({'word': words})
|
|
|
221 |
|
222 |
return df
|
223 |
|
224 |
+
def calc_projection_data(
|
225 |
+
self,
|
226 |
+
words
|
227 |
+
):
|
228 |
+
|
229 |
"""
|
230 |
Calculate projection, projected and rejected vectors of a words list.
|
231 |
:param list words: List of words
|
|
|
250 |
|
251 |
return pd.DataFrame(projection_data)
|
252 |
|
253 |
+
def plot_dist_projections_on_direction(
|
254 |
+
self,
|
255 |
+
word_groups,
|
256 |
+
ax=None
|
257 |
+
):
|
258 |
+
|
259 |
"""Plot the projection scalars distribution on the direction.
|
260 |
:param dict word_groups word: The groups to projects
|
261 |
:return float: The ax object of the plot
|
|
|
270 |
words = word_groups[name]
|
271 |
label = '{} (#{})'.format(name, len(words))
|
272 |
vectors = [self[word] for word in words]
|
273 |
+
projections = self.embedding.cosineSimilarities(self.direction,
|
274 |
vectors)
|
275 |
sns.distplot(projections, hist=False, label=label, ax=ax)
|
276 |
|
|
|
285 |
|
286 |
return ax
|
287 |
|
288 |
+
def __errorChecking(
|
289 |
+
self,
|
290 |
+
word
|
291 |
+
):
|
292 |
+
|
293 |
out_msj = ""
|
294 |
|
295 |
if not word:
|
296 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
297 |
else:
|
298 |
+
if word not in self.embedding:
|
299 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
300 |
|
301 |
return out_msj
|
302 |
|
303 |
+
def check_oov(
|
304 |
+
self,
|
305 |
+
wordlists
|
306 |
+
):
|
307 |
+
|
308 |
for wordlist in wordlists:
|
309 |
for word in wordlist:
|
310 |
msg = self.__errorChecking(word)
|
|
|
312 |
return msg
|
313 |
return None
|
314 |
|
315 |
+
def plot_biased_words(
|
316 |
+
self,
|
317 |
+
words_to_diagnose,
|
318 |
+
wordlist_right,
|
319 |
+
wordlist_left,
|
320 |
+
wordlist_top=[],
|
321 |
+
wordlist_bottom=[]
|
322 |
+
):
|
323 |
+
|
324 |
bias_2D = wordlist_top == [] and wordlist_bottom == []
|
325 |
|
326 |
if bias_2D and (not wordlist_right or not wordlist_left):
|
|
|
332 |
if err:
|
333 |
raise Exception(err)
|
334 |
|
335 |
+
return self.get_bias_plot(
|
336 |
+
bias_2D,
|
337 |
+
words_to_diagnose,
|
338 |
+
definitional_1=(wordlist_right, wordlist_left),
|
339 |
+
definitional_2=(wordlist_top, wordlist_bottom)
|
340 |
+
)
|
341 |
|
342 |
+
def get_bias_plot(
|
343 |
+
self,
|
344 |
+
plot_2D,
|
345 |
+
words_to_diagnose,
|
346 |
+
definitional_1,
|
347 |
+
definitional_2=([], []),
|
348 |
+
method='sum',
|
349 |
+
n_extreme=10,
|
350 |
+
figsize=(15, 10)
|
351 |
+
):
|
352 |
+
|
353 |
fig, ax = plt.subplots(1, figsize=figsize)
|
354 |
self.method = method
|
355 |
self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
|
|
|
360 |
|
361 |
return fig
|
362 |
|
363 |
+
def plot_projection_scores(
|
364 |
+
self,
|
365 |
+
plot_2D,
|
366 |
+
words,
|
367 |
+
definitional_1,
|
368 |
+
definitional_2=([], []),
|
369 |
+
n_extreme=10,
|
370 |
+
ax=None,
|
371 |
+
axis_projection_step=0.1
|
372 |
+
):
|
373 |
+
|
374 |
name_left = ', '.join(definitional_1[1])
|
375 |
name_right = ', '.join(definitional_1[0])
|
376 |
|
|
|
406 |
sns.barplot(x='projection', y='word', data=projections_df,
|
407 |
palette=projections_df['color'])
|
408 |
else:
|
409 |
+
# ToDo: revisar este warning:
|
410 |
+
# Ignoring `palette` because no `hue` variable has been assigned. sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
411 |
+
|
412 |
sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
|
413 |
palette=projections_df['color'])
|
414 |
|
modules/module_WordExplorer.py
CHANGED
@@ -1,3 +1,4 @@
|
|
|
|
1 |
import numpy as np
|
2 |
import pandas as pd
|
3 |
import seaborn as sns
|
@@ -5,37 +6,63 @@ from numpy.linalg import norm
|
|
5 |
|
6 |
import matplotlib as mpl
|
7 |
mpl.use('Agg')
|
8 |
-
import
|
|
|
9 |
|
10 |
class WordToPlot:
|
11 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
12 |
self.word = word
|
13 |
self.color = color
|
14 |
self.bias_space = bias_space
|
15 |
self.alpha = alpha
|
16 |
|
|
|
17 |
class WordExplorer:
|
18 |
-
def __init__(
|
19 |
-
self
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
20 |
|
21 |
-
def __errorChecking(self, word):
|
22 |
out_msj = ""
|
23 |
|
24 |
if not word:
|
25 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
26 |
else:
|
27 |
-
if word not in self.
|
28 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
29 |
|
30 |
return out_msj
|
31 |
|
32 |
-
|
|
|
|
|
|
|
|
|
|
|
33 |
words = string.strip()
|
34 |
if words:
|
35 |
words = [word.strip() for word in words.split(',') if word != ""]
|
36 |
return words
|
37 |
|
38 |
-
def check_oov(
|
|
|
|
|
|
|
|
|
39 |
for wordlist in wordlists:
|
40 |
for word in wordlist:
|
41 |
msg = self.__errorChecking(word)
|
@@ -43,10 +70,21 @@ class WordExplorer:
|
|
43 |
return msg
|
44 |
return None
|
45 |
|
46 |
-
def get_neighbors(
|
47 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
48 |
|
49 |
-
def get_df(self, words_embedded, processed_word_list):
|
50 |
df = pd.DataFrame(words_embedded)
|
51 |
|
52 |
df['word'] = [wtp.word for wtp in processed_word_list]
|
@@ -55,16 +93,18 @@ class WordExplorer:
|
|
55 |
df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
|
56 |
return df
|
57 |
|
58 |
-
def get_plot(
|
59 |
-
|
60 |
-
|
61 |
-
|
62 |
-
|
63 |
-
|
64 |
-
|
65 |
-
|
66 |
-
|
67 |
-
|
|
|
|
|
68 |
fig, ax = plt.subplots(figsize=figsize)
|
69 |
|
70 |
sns.scatterplot(
|
@@ -89,11 +129,20 @@ class WordExplorer:
|
|
89 |
legend=False,
|
90 |
palette=color_dict
|
91 |
)
|
|
|
92 |
for i, wtp in enumerate(processed_word_list):
|
93 |
x, y = words_embedded[i, :]
|
94 |
-
ax.annotate(
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
|
98 |
ax.set_xticks([])
|
99 |
ax.set_yticks([])
|
@@ -103,25 +152,27 @@ class WordExplorer:
|
|
103 |
|
104 |
return fig
|
105 |
|
106 |
-
def plot_projections_2d(
|
107 |
-
|
108 |
-
|
109 |
-
|
110 |
-
|
111 |
-
|
112 |
-
|
113 |
-
|
|
|
|
|
114 |
# convertirlas a vector
|
115 |
choices = [0, 1, 2, 3, 4]
|
116 |
wordlist_choice = [
|
117 |
-
wordlist_0,
|
118 |
wordlist_1,
|
119 |
-
wordlist_2,
|
120 |
-
wordlist_3,
|
121 |
wordlist_4
|
122 |
]
|
123 |
|
124 |
-
err = self.check_oov(wordlist_choice)
|
125 |
if err:
|
126 |
raise Exception(err)
|
127 |
|
@@ -139,48 +190,69 @@ class WordExplorer:
|
|
139 |
processed_word_list = []
|
140 |
for word_list_to_process, color in zip(wordlist_choice, choices):
|
141 |
for word in word_list_to_process:
|
142 |
-
processed_word_list.append(
|
|
|
|
|
143 |
|
144 |
if n_neighbors > 0:
|
145 |
-
|
146 |
-
|
147 |
-
|
148 |
-
# n_neighbors=n_neighbors+1,
|
149 |
-
n_neighbors=n_neighbors,
|
150 |
nn_method=kwargs.get('nn_method', 'sklearn')
|
151 |
)
|
|
|
152 |
for n in neighbors:
|
153 |
if n not in [wtp.word for wtp in processed_word_list]:
|
154 |
-
processed_word_list.append(
|
|
|
|
|
155 |
|
156 |
if not processed_word_list:
|
157 |
raise Exception('Only empty lists were passed')
|
158 |
-
|
159 |
-
words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
|
160 |
|
161 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
162 |
|
163 |
-
fig = self.get_plot(data, processed_word_list, words_embedded,
|
164 |
-
color_dict, n_neighbors, n_alpha,
|
165 |
-
kwargs.get('fontsize', 18),
|
166 |
-
kwargs.get('figsize', (20, 15))
|
167 |
-
)
|
168 |
plt.show()
|
169 |
return fig
|
170 |
|
171 |
-
|
|
|
|
|
|
|
|
|
|
|
172 |
err = self.check_oov([wordlist])
|
173 |
if err:
|
174 |
raise Exception(err)
|
175 |
-
|
176 |
-
words_emb = np.array([self.
|
|
|
177 |
mean_vec = np.mean(words_emb, axis=0)
|
178 |
|
179 |
doesnt_match = ""
|
180 |
farthest_emb = 1.0
|
181 |
for word in wordlist:
|
182 |
-
word_emb = self.
|
183 |
-
cos_sim = np.dot(mean_vec, word_emb) /
|
|
|
184 |
if cos_sim <= farthest_emb:
|
185 |
farthest_emb = cos_sim
|
186 |
doesnt_match = word
|
|
|
1 |
+
import matplotlib.pyplot as plt
|
2 |
import numpy as np
|
3 |
import pandas as pd
|
4 |
import seaborn as sns
|
|
|
6 |
|
7 |
import matplotlib as mpl
|
8 |
mpl.use('Agg')
|
9 |
+
from typing import List, Dict, Tuple
|
10 |
+
|
11 |
|
12 |
class WordToPlot:
    """Plot-ready record for one word: its label plus rendering attributes."""

    def __init__(
        self,
        word: str,
        color: str,
        bias_space: int,
        alpha: float
    ):
        # Text label drawn on the scatter plot.
        self.word = word
        # Color assigned to the word's wordlist group.
        self.color = color
        # Index of the wordlist (bias space) the word belongs to.
        self.bias_space = bias_space
        # Transparency (neighbors are drawn more faded than user words).
        self.alpha = alpha
|
25 |
|
26 |
+
|
27 |
class WordExplorer:
|
28 |
+
def __init__(
|
29 |
+
self,
|
30 |
+
embedding # Class Embedding instance
|
31 |
+
) -> None:
|
32 |
+
|
33 |
+
self.embedding = embedding
|
34 |
+
|
35 |
+
def __errorChecking(
|
36 |
+
self,
|
37 |
+
word: str
|
38 |
+
) -> str:
|
39 |
|
|
|
40 |
out_msj = ""
|
41 |
|
42 |
if not word:
|
43 |
out_msj = "Error: Primero debe ingresar una palabra!"
|
44 |
else:
|
45 |
+
if word not in self.embedding:
|
46 |
out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
|
47 |
|
48 |
return out_msj
|
49 |
|
50 |
+
# ToDo: Este método no se usa. Creo que es el implementado en la clase connections base ¿Borrar?
|
51 |
+
def parse_words(
|
52 |
+
self,
|
53 |
+
string: str
|
54 |
+
) -> List[str]:
|
55 |
+
|
56 |
words = string.strip()
|
57 |
if words:
|
58 |
words = [word.strip() for word in words.split(',') if word != ""]
|
59 |
return words
|
60 |
|
61 |
+
def check_oov(
|
62 |
+
self,
|
63 |
+
wordlists: List[str]
|
64 |
+
) -> str:
|
65 |
+
|
66 |
for wordlist in wordlists:
|
67 |
for word in wordlist:
|
68 |
msg = self.__errorChecking(word)
|
|
|
70 |
return msg
|
71 |
return None
|
72 |
|
73 |
+
    def get_neighbors(
        self,
        word: str,
        n_neighbors: int,
        nn_method: str
    ) -> List[str]:
        """Return the ``n_neighbors`` vocabulary words nearest to ``word``.

        Thin delegate to ``Embedding.getNearestNeighbors``. ``nn_method``
        selects the neighbor backend — callers in this file pass 'sklearn';
        other accepted values depend on the Embedding class (confirm there).
        """
        return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)
81 |
+
|
82 |
+
def get_df(
|
83 |
+
self,
|
84 |
+
words_embedded: np.ndarray,
|
85 |
+
processed_word_list: List[str]
|
86 |
+
) -> pd.DataFrame:
|
87 |
|
|
|
88 |
df = pd.DataFrame(words_embedded)
|
89 |
|
90 |
df['word'] = [wtp.word for wtp in processed_word_list]
|
|
|
93 |
df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
|
94 |
return df
|
95 |
|
96 |
+
def get_plot(
|
97 |
+
self,
|
98 |
+
data: pd.DataFrame,
|
99 |
+
processed_word_list: List[str],
|
100 |
+
words_embedded: np.ndarray,
|
101 |
+
color_dict: Dict,
|
102 |
+
n_neighbors: int,
|
103 |
+
n_alpha: float,
|
104 |
+
fontsize: int=18,
|
105 |
+
figsize: Tuple[int, int]=(20, 15)
|
106 |
+
):
|
107 |
+
|
108 |
fig, ax = plt.subplots(figsize=figsize)
|
109 |
|
110 |
sns.scatterplot(
|
|
|
129 |
legend=False,
|
130 |
palette=color_dict
|
131 |
)
|
132 |
+
|
133 |
for i, wtp in enumerate(processed_word_list):
|
134 |
x, y = words_embedded[i, :]
|
135 |
+
ax.annotate(
|
136 |
+
wtp.word,
|
137 |
+
xy=(x, y),
|
138 |
+
xytext=(5, 2),
|
139 |
+
color=wtp.color,
|
140 |
+
textcoords='offset points',
|
141 |
+
ha='right',
|
142 |
+
va='bottom',
|
143 |
+
size=fontsize,
|
144 |
+
alpha=wtp.alpha
|
145 |
+
)
|
146 |
|
147 |
ax.set_xticks([])
|
148 |
ax.set_yticks([])
|
|
|
152 |
|
153 |
return fig
|
154 |
|
155 |
+
def plot_projections_2d(
|
156 |
+
self,
|
157 |
+
wordlist_0: List[str],
|
158 |
+
wordlist_1: List[str]=[],
|
159 |
+
wordlist_2: List[str]=[],
|
160 |
+
wordlist_3: List[str]=[],
|
161 |
+
wordlist_4: List[str]=[],
|
162 |
+
**kwargs
|
163 |
+
):
|
164 |
+
|
165 |
# convertirlas a vector
|
166 |
choices = [0, 1, 2, 3, 4]
|
167 |
wordlist_choice = [
|
168 |
+
wordlist_0,
|
169 |
wordlist_1,
|
170 |
+
wordlist_2,
|
171 |
+
wordlist_3,
|
172 |
wordlist_4
|
173 |
]
|
174 |
|
175 |
+
err = self.check_oov(wordlist_choice)
|
176 |
if err:
|
177 |
raise Exception(err)
|
178 |
|
|
|
190 |
processed_word_list = []
|
191 |
for word_list_to_process, color in zip(wordlist_choice, choices):
|
192 |
for word in word_list_to_process:
|
193 |
+
processed_word_list.append(
|
194 |
+
WordToPlot(word, color_dict[color], color, 1)
|
195 |
+
)
|
196 |
|
197 |
if n_neighbors > 0:
|
198 |
+
neighbors = self.get_neighbors(
|
199 |
+
word,
|
200 |
+
n_neighbors=n_neighbors,
|
|
|
|
|
201 |
nn_method=kwargs.get('nn_method', 'sklearn')
|
202 |
)
|
203 |
+
|
204 |
for n in neighbors:
|
205 |
if n not in [wtp.word for wtp in processed_word_list]:
|
206 |
+
processed_word_list.append(
|
207 |
+
WordToPlot(n, color_dict[color], color, n_alpha)
|
208 |
+
)
|
209 |
|
210 |
if not processed_word_list:
|
211 |
raise Exception('Only empty lists were passed')
|
|
|
|
|
212 |
|
213 |
+
words_embedded = np.array(
|
214 |
+
[self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
|
215 |
+
)
|
216 |
+
|
217 |
+
data = self.get_df(
|
218 |
+
words_embedded,
|
219 |
+
processed_word_list
|
220 |
+
)
|
221 |
+
|
222 |
+
fig = self.get_plot(
|
223 |
+
data,
|
224 |
+
processed_word_list,
|
225 |
+
words_embedded,
|
226 |
+
color_dict,
|
227 |
+
n_neighbors,
|
228 |
+
n_alpha,
|
229 |
+
kwargs.get('fontsize', 18),
|
230 |
+
kwargs.get('figsize', (20, 15))
|
231 |
+
)
|
232 |
|
|
|
|
|
|
|
|
|
|
|
233 |
plt.show()
|
234 |
return fig
|
235 |
|
236 |
+
# ToDo: No encuentro donde se usa este método. ¿Borrar?
|
237 |
+
def doesnt_match(
|
238 |
+
self,
|
239 |
+
wordlist
|
240 |
+
):
|
241 |
+
|
242 |
err = self.check_oov([wordlist])
|
243 |
if err:
|
244 |
raise Exception(err)
|
245 |
+
|
246 |
+
words_emb = np.array([self.embedding.getEmbedding(word)
|
247 |
+
for word in wordlist])
|
248 |
mean_vec = np.mean(words_emb, axis=0)
|
249 |
|
250 |
doesnt_match = ""
|
251 |
farthest_emb = 1.0
|
252 |
for word in wordlist:
|
253 |
+
word_emb = self.embedding.getEmbedding(word)
|
254 |
+
cos_sim = np.dot(mean_vec, word_emb) / \
|
255 |
+
(norm(mean_vec)*norm(word_emb))
|
256 |
if cos_sim <= farthest_emb:
|
257 |
farthest_emb = cos_sim
|
258 |
doesnt_match = word
|
modules/module_ann.py
CHANGED
@@ -1,45 +1,71 @@
|
|
1 |
import time
|
2 |
-
import operator
|
3 |
from tqdm import tqdm
|
4 |
from annoy import AnnoyIndex
|
5 |
from memory_profiler import profile
|
|
|
6 |
|
7 |
class TicToc:
|
8 |
-
def __init__(
|
|
|
|
|
|
|
9 |
self.i = None
|
10 |
-
|
|
|
|
|
|
|
|
|
11 |
self.i = time.time()
|
12 |
-
|
|
|
|
|
|
|
|
|
13 |
f = time.time()
|
14 |
print(f - self.i, "seg.")
|
15 |
|
|
|
16 |
class Ann:
|
17 |
-
def __init__(
|
18 |
-
self
|
19 |
-
|
20 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
21 |
self.tree = None
|
22 |
|
23 |
self.tt = TicToc()
|
24 |
|
25 |
-
|
26 |
-
|
|
|
|
|
|
|
|
|
27 |
# metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
|
28 |
# n_jobs=-1 Run over all CPU availables
|
29 |
|
30 |
-
print("
|
31 |
self.tt.start()
|
32 |
self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
|
33 |
-
for i,v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
|
34 |
-
self.tree.add_item(i,v)
|
35 |
self.tt.stop()
|
36 |
|
37 |
-
print("
|
38 |
self.tt.start()
|
39 |
self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
|
40 |
self.tt.stop()
|
41 |
|
42 |
-
def __getWordId(
|
|
|
|
|
|
|
|
|
43 |
word_id = None
|
44 |
try:
|
45 |
word_id = self.words.index(word)
|
@@ -47,16 +73,20 @@ class Ann:
|
|
47 |
pass
|
48 |
return word_id
|
49 |
|
50 |
-
def get(
|
|
|
|
|
|
|
|
|
|
|
51 |
word_id = self.__getWordId(word)
|
52 |
-
|
53 |
|
54 |
if word_id != None:
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
word_xy_list = operator.itemgetter(*neighbord_id)(self.words)
|
59 |
else:
|
60 |
print(f"The word '{word}' does not exist")
|
61 |
-
|
62 |
-
return
|
|
|
1 |
import time
|
|
|
2 |
from tqdm import tqdm
|
3 |
from annoy import AnnoyIndex
|
4 |
from memory_profiler import profile
|
5 |
+
from typing import List, Any
|
6 |
|
7 |
class TicToc:
    """Minimal wall-clock stopwatch: start() marks an origin, stop() prints elapsed seconds."""

    def __init__(self) -> None:
        # Timestamp captured by the latest start(); None until first use.
        self.i = None

    def start(self) -> None:
        """Record the current wall-clock time as the measurement origin."""
        self.i = time.time()

    def stop(self) -> None:
        """Print the seconds elapsed since the last start() call."""
        f = time.time()
        print(f - self.i, "seg.")
|
26 |
|
27 |
+
|
28 |
class Ann:
|
29 |
+
def __init__(
|
30 |
+
self,
|
31 |
+
words: List[str],
|
32 |
+
vectors: List[float],
|
33 |
+
coord: List[float],
|
34 |
+
) -> None:
|
35 |
+
|
36 |
+
self.words = words
|
37 |
+
self.vectors = vectors
|
38 |
+
self.coord = coord
|
39 |
self.tree = None
|
40 |
|
41 |
self.tt = TicToc()
|
42 |
|
43 |
+
    def init(self,
        n_trees: int=10,
        metric: str='angular',
        n_jobs: int=-1
    ) -> None:
        """Build the Annoy index over ``self.vectors``.

        :param n_trees: Number of random-projection trees (more trees: better
            recall, slower build).
        :param metric: Annoy distance metric.
        :param n_jobs: Build threads; -1 uses all available CPUs.
        """
        # metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
        # n_jobs=-1 Run over all CPU availables

        # Phase 1: load every vector into a fresh index (timed).
        print("\tInit tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
            self.tree.add_item(i, v)
        self.tt.stop()

        # Phase 2: build the forest of trees (timed). After build() the
        # index is read-only.
        print("\tBuild tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()
|
63 |
|
64 |
+
def __getWordId(
|
65 |
+
self,
|
66 |
+
word: str
|
67 |
+
) -> int:
|
68 |
+
|
69 |
word_id = None
|
70 |
try:
|
71 |
word_id = self.words.index(word)
|
|
|
73 |
pass
|
74 |
return word_id
|
75 |
|
76 |
+
def get(
|
77 |
+
self,
|
78 |
+
word: str,
|
79 |
+
n_neighbors: int=10
|
80 |
+
) -> List[str]:
|
81 |
+
|
82 |
word_id = self.__getWordId(word)
|
83 |
+
neighbords_list = None
|
84 |
|
85 |
if word_id != None:
|
86 |
+
neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
|
87 |
+
neighbords_list = [self.words[idx] for idx in neighbords_id][1:]
|
88 |
+
|
|
|
89 |
else:
|
90 |
print(f"The word '{word}' does not exist")
|
91 |
+
|
92 |
+
return neighbords_list
|
modules/module_connection.py
CHANGED
@@ -1,52 +1,75 @@
|
|
1 |
-
|
2 |
-
import pandas as pd
|
3 |
-
import gradio as gr
|
4 |
-
from abc import ABC, abstractmethod
|
5 |
|
6 |
-
from modules.module_WordExplorer import WordExplorer
|
7 |
from modules.module_BiasExplorer import WordBiasExplorer
|
|
|
|
|
8 |
|
9 |
class Connector(ABC):
|
10 |
-
def parse_word(
|
|
|
|
|
|
|
|
|
11 |
return word.lower().strip()
|
12 |
|
13 |
-
def parse_words(
|
|
|
|
|
|
|
|
|
14 |
words = array_in_string.strip()
|
15 |
if not words:
|
16 |
return []
|
17 |
-
|
|
|
|
|
|
|
|
|
18 |
return words
|
19 |
|
20 |
-
def process_error(
|
21 |
-
|
22 |
-
|
23 |
-
|
|
|
|
|
|
|
|
|
24 |
|
25 |
|
26 |
class WordExplorerConnector(Connector):
|
|
|
|
|
|
|
|
|
27 |
|
28 |
-
def __init__(self, **kwargs):
|
29 |
if 'embedding' in kwargs:
|
30 |
embedding = kwargs.get('embedding')
|
31 |
else:
|
32 |
raise KeyError
|
33 |
-
|
34 |
-
|
35 |
-
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
-
|
49 |
-
|
|
|
|
|
|
|
|
|
|
|
50 |
err = ""
|
51 |
neighbors_method = 'sklearn'
|
52 |
wordlist_0 = self.parse_words(wordlist_0)
|
@@ -59,49 +82,63 @@ class WordExplorerConnector(Connector):
|
|
59 |
err = self.process_error("Ingresa al menos 1 palabras para continuar")
|
60 |
return None, err
|
61 |
|
62 |
-
err = self.word_explorer.check_oov(
|
|
|
|
|
|
|
63 |
if err:
|
64 |
return None, self.process_error(err)
|
65 |
|
66 |
-
fig = self.word_explorer.plot_projections_2d(
|
67 |
-
|
68 |
-
|
69 |
-
|
70 |
-
|
71 |
-
|
72 |
-
|
73 |
-
|
74 |
-
|
75 |
-
|
76 |
-
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
|
|
|
|
81 |
return fig, self.process_error(err)
|
82 |
|
83 |
class BiasWordExplorerConnector(Connector):
|
84 |
|
85 |
-
def __init__(
|
|
|
|
|
|
|
|
|
86 |
if 'embedding' in kwargs:
|
87 |
embedding = kwargs.get('embedding')
|
88 |
else:
|
89 |
raise KeyError
|
90 |
-
self.bias_word_explorer = WordBiasExplorer(embedding)
|
91 |
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
97 |
err = ""
|
98 |
wordlist_1 = self.parse_words(wordlist_1)
|
99 |
wordlist_2 = self.parse_words(wordlist_2)
|
100 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
101 |
|
102 |
word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
|
103 |
-
for
|
104 |
-
if not
|
105 |
err = "Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2"
|
106 |
if err:
|
107 |
return None, self.process_error(err)
|
@@ -110,17 +147,23 @@ class BiasWordExplorerConnector(Connector):
|
|
110 |
if err:
|
111 |
return None, self.process_error(err)
|
112 |
|
113 |
-
fig = self.bias_word_explorer.plot_biased_words(
|
|
|
|
|
|
|
|
|
114 |
|
115 |
return fig, self.process_error(err)
|
116 |
|
117 |
-
def calculate_bias_4d(
|
118 |
-
|
119 |
-
|
120 |
-
|
121 |
-
|
122 |
-
|
123 |
-
|
|
|
|
|
124 |
err = ""
|
125 |
wordlist_1 = self.parse_words(wordlist_1)
|
126 |
wordlist_2 = self.parse_words(wordlist_2)
|
@@ -129,8 +172,8 @@ class BiasWordExplorerConnector(Connector):
|
|
129 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
130 |
|
131 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
132 |
-
for
|
133 |
-
if not
|
134 |
err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
|
135 |
if err:
|
136 |
return None, self.process_error(err)
|
@@ -139,5 +182,12 @@ class BiasWordExplorerConnector(Connector):
|
|
139 |
if err:
|
140 |
return None, self.process_error(err)
|
141 |
|
142 |
-
fig = self.bias_word_explorer.plot_biased_words(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
143 |
return fig, self.process_error(err)
|
|
|
1 |
+
from abc import ABC
|
|
|
|
|
|
|
2 |
|
3 |
+
from modules.module_WordExplorer import WordExplorer
|
4 |
from modules.module_BiasExplorer import WordBiasExplorer
|
5 |
+
from typing import List, Tuple
|
6 |
+
|
7 |
|
8 |
class Connector(ABC):
    """Shared text-parsing and error-formatting helpers for the UI connectors."""

    def parse_word(
        self,
        word: str
    ) -> str:
        """Normalize one token: surrounding whitespace removed, lowercased."""
        return word.lower().strip()

    def parse_words(
        self,
        array_in_string: str
    ) -> List[str]:
        """Split a comma-separated string into normalized words ([] when blank)."""
        stripped = array_in_string.strip()
        if not stripped:
            return []
        return [
            self.parse_word(token)
            for token in stripped.split(',')
            if token.strip() != ''
        ]

    def process_error(
        self,
        err: str
    ) -> str:
        """Wrap a non-empty error message in centered HTML; pass "" through."""
        if not err:
            return err
        return "<center><h3>" + err + "</h3></center>"
|
39 |
|
40 |
|
41 |
class WordExplorerConnector(Connector):
|
42 |
+
def __init__(
|
43 |
+
self,
|
44 |
+
**kwargs
|
45 |
+
) -> None:
|
46 |
|
|
|
47 |
if 'embedding' in kwargs:
|
48 |
embedding = kwargs.get('embedding')
|
49 |
else:
|
50 |
raise KeyError
|
51 |
+
|
52 |
+
self.word_explorer = WordExplorer(
|
53 |
+
embedding=embedding
|
54 |
+
)
|
55 |
+
|
56 |
+
def plot_proyection_2d(
|
57 |
+
self,
|
58 |
+
wordlist_0: str,
|
59 |
+
wordlist_1: str,
|
60 |
+
wordlist_2: str,
|
61 |
+
wordlist_3: str,
|
62 |
+
wordlist_4: str,
|
63 |
+
color_wordlist_0: str,
|
64 |
+
color_wordlist_1: str,
|
65 |
+
color_wordlist_2: str,
|
66 |
+
color_wordlist_3: str,
|
67 |
+
color_wordlist_4: str,
|
68 |
+
n_alpha: float,
|
69 |
+
fontsize: int,
|
70 |
+
n_neighbors: int
|
71 |
+
) -> Tuple:
|
72 |
+
|
73 |
err = ""
|
74 |
neighbors_method = 'sklearn'
|
75 |
wordlist_0 = self.parse_words(wordlist_0)
|
|
|
82 |
err = self.process_error("Ingresa al menos 1 palabras para continuar")
|
83 |
return None, err
|
84 |
|
85 |
+
err = self.word_explorer.check_oov(
|
86 |
+
[wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
|
87 |
+
)
|
88 |
+
|
89 |
if err:
|
90 |
return None, self.process_error(err)
|
91 |
|
92 |
+
fig = self.word_explorer.plot_projections_2d(
|
93 |
+
wordlist_0,
|
94 |
+
wordlist_1,
|
95 |
+
wordlist_2,
|
96 |
+
wordlist_3,
|
97 |
+
wordlist_4,
|
98 |
+
color_wordlist_0=color_wordlist_0,
|
99 |
+
color_wordlist_1=color_wordlist_1,
|
100 |
+
color_wordlist_2=color_wordlist_2,
|
101 |
+
color_wordlist_3=color_wordlist_3,
|
102 |
+
color_wordlist_4=color_wordlist_4,
|
103 |
+
n_alpha=n_alpha,
|
104 |
+
fontsize=fontsize,
|
105 |
+
n_neighbors=n_neighbors,
|
106 |
+
nn_method = neighbors_method
|
107 |
+
)
|
108 |
+
|
109 |
return fig, self.process_error(err)
|
110 |
|
111 |
class BiasWordExplorerConnector(Connector):
|
112 |
|
113 |
+
def __init__(
|
114 |
+
self,
|
115 |
+
**kwargs
|
116 |
+
) -> None:
|
117 |
+
|
118 |
if 'embedding' in kwargs:
|
119 |
embedding = kwargs.get('embedding')
|
120 |
else:
|
121 |
raise KeyError
|
|
|
122 |
|
123 |
+
self.bias_word_explorer = WordBiasExplorer(
|
124 |
+
embedding=embedding
|
125 |
+
)
|
126 |
+
|
127 |
+
def calculate_bias_2d(
|
128 |
+
self,
|
129 |
+
wordlist_1: str,
|
130 |
+
wordlist_2: str,
|
131 |
+
to_diagnose_list: str
|
132 |
+
) -> Tuple:
|
133 |
+
|
134 |
err = ""
|
135 |
wordlist_1 = self.parse_words(wordlist_1)
|
136 |
wordlist_2 = self.parse_words(wordlist_2)
|
137 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
138 |
|
139 |
word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
|
140 |
+
for _list in word_lists:
|
141 |
+
if not _list:
|
142 |
err = "Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2"
|
143 |
if err:
|
144 |
return None, self.process_error(err)
|
|
|
147 |
if err:
|
148 |
return None, self.process_error(err)
|
149 |
|
150 |
+
fig = self.bias_word_explorer.plot_biased_words(
|
151 |
+
to_diagnose_list,
|
152 |
+
wordlist_2,
|
153 |
+
wordlist_1
|
154 |
+
)
|
155 |
|
156 |
return fig, self.process_error(err)
|
157 |
|
158 |
+
def calculate_bias_4d(
|
159 |
+
self,
|
160 |
+
wordlist_1: str,
|
161 |
+
wordlist_2: str,
|
162 |
+
wordlist_3: str,
|
163 |
+
wordlist_4: str,
|
164 |
+
to_diagnose_list: str
|
165 |
+
) -> Tuple:
|
166 |
+
|
167 |
err = ""
|
168 |
wordlist_1 = self.parse_words(wordlist_1)
|
169 |
wordlist_2 = self.parse_words(wordlist_2)
|
|
|
172 |
to_diagnose_list = self.parse_words(to_diagnose_list)
|
173 |
|
174 |
wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
|
175 |
+
for _list in wordlists:
|
176 |
+
if not _list:
|
177 |
err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
|
178 |
if err:
|
179 |
return None, self.process_error(err)
|
|
|
182 |
if err:
|
183 |
return None, self.process_error(err)
|
184 |
|
185 |
+
fig = self.bias_word_explorer.plot_biased_words(
|
186 |
+
to_diagnose_list,
|
187 |
+
wordlist_1,
|
188 |
+
wordlist_2,
|
189 |
+
wordlist_3,
|
190 |
+
wordlist_4
|
191 |
+
)
|
192 |
+
|
193 |
return fig, self.process_error(err)
|
modules/module_logsManager.py
CHANGED
@@ -40,11 +40,11 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
|
40 |
|
41 |
def __init__(
|
42 |
self,
|
43 |
-
|
44 |
-
|
45 |
-
organization: Optional[str]
|
46 |
-
private: bool
|
47 |
-
available_logs: bool
|
48 |
):
|
49 |
"""
|
50 |
Parameters:
|
@@ -53,6 +53,8 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
|
|
53 |
organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
|
54 |
private: Whether the dataset should be private (defaults to False).
|
55 |
"""
|
|
|
|
|
56 |
self.hf_token = hf_token
|
57 |
self.dataset_name = dataset_name
|
58 |
self.organization_name = organization
|
|
|
40 |
|
41 |
def __init__(
|
42 |
self,
|
43 |
+
dataset_name: str=None,
|
44 |
+
hf_token: str=os.getenv('HF_TOKEN'),
|
45 |
+
organization: Optional[str]=os.getenv('ORG_NAME'),
|
46 |
+
private: bool=True,
|
47 |
+
available_logs: bool=False
|
48 |
):
|
49 |
"""
|
50 |
Parameters:
|
|
|
53 |
organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
|
54 |
private: Whether the dataset should be private (defaults to False).
|
55 |
"""
|
56 |
+
assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
|
57 |
+
|
58 |
self.hf_token = hf_token
|
59 |
self.dataset_name = dataset_name
|
60 |
self.organization_name = organization
|
tool_info.py
CHANGED
@@ -4,7 +4,7 @@ TOOL_INFO = """
|
|
4 |
* [Read Full Paper](https://arxiv.org/abs/2207.06591)
|
5 |
|
6 |
> ### Licensing Information
|
7 |
-
* [MIT Licence](https://huggingface.co/spaces/vialibre/
|
8 |
|
9 |
> ### Citation Information
|
10 |
```c
|
|
|
4 |
* [Read Full Paper](https://arxiv.org/abs/2207.06591)
|
5 |
|
6 |
> ### Licensing Information
|
7 |
+
* [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_es/resolve/main/LICENSE)
|
8 |
|
9 |
> ### Citation Information
|
10 |
```c
|