nanom committed on
Commit ced76fc
1 Parent(s): 37a709b

Typing: added __init_ann_method and __init_sklearn_method to the Embedding class, upgraded the getNearestNeighbors method, and fixed a bug in the Ann class's get method, among other cleanups.
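In short, nearest-neighbor search is now selectable per instance. A minimal usage sketch of the API after this commit, assuming only the signatures visible in the diffs below ("palabra" is an arbitrary example word, not something from the repo):

```python
from modules.model_embbeding import Embedding

# Mirrors the tool config in app.py after this commit.
embedding = Embedding(
    path="data/fasttext-sbwc.100k.vec",
    limit=None,            # load the full vocabulary
    randomizedPCA=False,
    max_neighbors=20,      # upper bound enforced by an assert
    nn_method='sklearn'    # new parameter: 'sklearn' or 'ann'
)

# getNearestNeighbors now checks that the word is in the vocabulary, validates
# nn_method, and lazily initializes the requested backend if it is missing.
neighbors = embedding.getNearestNeighbors("palabra", 10, 'sklearn')
```

Either backend can be requested per call: the sklearn path queries n_neighbors + 1 and drops the word itself, and the Annoy path now does the same (see module_ann.py below).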

.gitignore CHANGED
@@ -1,3 +1,3 @@
 __pycache__/
-bias_tool_logs/
 *.env
+logs_edia_we_spanish/
app.py CHANGED
@@ -4,30 +4,32 @@ import pandas as pd
 
 
 # --- Imports modules ---
-from modules.model_embbeding import Embedding # Fix and Updated
+from modules.model_embbeding import Embedding
 
 
 # --- Imports interfaces ---
-from interfaces.interface_WordExplorer import interface as wordExplorer_interface # Updated
+from interfaces.interface_WordExplorer import interface as wordExplorer_interface
 from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
 
 
 # --- Tool config ---
-AVAILABLE_LOGS = True # [True | False]
-LANGUAGE = "spanish" # [spanish | english]
 EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
-MAX_NEIGHBORS = 20 # Updated
+LANGUAGE = "spanish" # [spanish | english]
+MAX_NEIGHBORS = 20
+NN_METHOD = 'sklearn' # ['sklearn' | 'ann']
+AVAILABLE_LOGS = True # [True | False]
 
 
 # --- Init classes ---
 embedding = Embedding(
     path=EMBEDDINGS_PATH,
-    binary=EMBEDDINGS_PATH.endswith('.bin'),
     limit=None,
     randomizedPCA=False,
-    max_neighbors=MAX_NEIGHBORS # Updated
+    max_neighbors=MAX_NEIGHBORS,
+    nn_method=NN_METHOD
 )
 
+
 # --- Init Vars ---
 labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
 
@@ -41,7 +43,7 @@ INTERFACE_LIST = [
     wordExplorer_interface(
         embedding=embedding,
         available_logs=AVAILABLE_LOGS,
-        max_neighbors=MAX_NEIGHBORS, # Updated
+        max_neighbors=MAX_NEIGHBORS,
         lang=LANGUAGE),
 ]
 
interfaces/interface_BiasWordExplorer.py CHANGED
@@ -1,48 +1,96 @@
 import gradio as gr
 import pandas as pd
-from tkinter import image_names
 
-from tool_info import TOOL_INFO
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from modules.module_connection import BiasWordExplorerConnector
 from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
+from tool_info import TOOL_INFO
+
 
 # --- Interface ---
-def interface(embedding, available_logs, lang="spanish"):
+def interface(
+    embedding,              # Class Embedding instance
+    available_logs: bool,
+    lang: str="spanish"
+) -> gr.Blocks:
+
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
     )
+
     # --- Init vars ---
-    connector = BiasWordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["BiasWordExplorer_interface"]
+    connector = BiasWordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["BiasWordExplorer_interface"]
 
+    # --- Interface ---
     interface = gr.Blocks()
+
     with interface:
-        gr.Markdown(labels["step1"])
+        gr.Markdown(
+            value=labels["step1"]
+        )
         with gr.Row():
             with gr.Column():
                 with gr.Row():
-                    diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                    diagnose_list = gr.Textbox(
+                        lines=2,
+                        label=labels["wordListToDiagnose"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&2Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&2Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
-                    wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                    wordlist_1 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList1"]
+                    )
+                    wordlist_2 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList2"]
+                    )
                 with gr.Row():
-                    gr.Markdown(labels["step2&4Spaces"])
+                    gr.Markdown(
+                        value=labels["step2&4Spaces"]
+                    )
                 with gr.Row():
-                    wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
-                    wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                    wordlist_3 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList3"]
+                    )
+                    wordlist_4 = gr.Textbox(
+                        lines=2,
+                        label=labels["wordList4"]
+                    )
+
             with gr.Column():
                 with gr.Row():
-                    bias2d = gr.Button(labels["plot2SpacesButton"])
+                    bias2d = gr.Button(
+                        value=labels["plot2SpacesButton"]
+                    )
                 with gr.Row():
-                    bias4d = gr.Button(labels["plot4SpacesButton"])
+                    bias4d = gr.Button(
+                        value=labels["plot4SpacesButton"]
+                    )
                 with gr.Row():
-                    err_msg = gr.Markdown(label='',visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                 with gr.Row():
-                    bias_plot = gr.Plot(label="", show_label=False)
+                    bias_plot = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
+
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_2d,
@@ -54,51 +102,59 @@ def interface(embedding, available_logs, lang="spanish"):
         with gr.Row():
             examples = gr.Examples(
                 fn=connector.calculate_bias_4d,
-                inputs=[wordlist_1, wordlist_2,
-                        wordlist_3, wordlist_4, diagnose_list],
-                outputs=[bias_plot, err_msg],
+                inputs=[wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list],
+                outputs=[
+                    bias_plot, err_msg
+                ],
                 examples=examples2_explorar_sesgo_en_palabras,
                 label=labels["examples4Spaces"]
             )
 
         with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
         bias2d.click(
-            fn=connector.calculate_bias_2d,
-            inputs=[wordlist_1,wordlist_2,diagnose_list],
-            outputs=[bias_plot,err_msg]
+            fn=connector.calculate_bias_2d,
+            inputs=[wordlist_1, wordlist_2, diagnose_list],
+            outputs=[bias_plot, err_msg]
        )
-
+
        bias4d.click(
            fn=connector.calculate_bias_4d,
-            inputs=[wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list],
-            outputs=[bias_plot,err_msg]
+            inputs=[wordlist_1, wordlist_2,
+                    wordlist_3, wordlist_4, diagnose_list],
+            outputs=[bias_plot, err_msg]
        )
 
        # --- Logs ---
-        save_field = [wordlist_1,wordlist_2,wordlist_3,wordlist_4,diagnose_list]
-        log_callback.setup(components=save_field, flagging_dir="edia_bias_we_es")
+        save_field = [wordlist_1, wordlist_2,wordlist_3, wordlist_4, diagnose_list]
+        log_callback.setup(
+            components=save_field,
+            flagging_dir="logs_word_bias"
+        )
 
        bias2d.click(
            fn=lambda *args: log_callback.flag(
-                flag_data=args,
-                flag_option="plot_2d",
-                username="vialibre"
+                    flag_data=args,
+                    flag_option="plot_2d",
+                    username="vialibre"
            ),
            inputs=save_field,
-            outputs=None,
+            outputs=None,
            preprocess=False
        )
-
+
        bias4d.click(
            fn=lambda *args: log_callback.flag(
-                flag_data=args,
-                flag_option="plot_4d",
-                username="vialibre"
+                    flag_data=args,
+                    flag_option="plot_4d",
+                    username="vialibre"
            ),
            inputs=save_field,
-            outputs=None,
+            outputs=None,
            preprocess=False
        )
-    return interface
+
+    return interface
interfaces/interface_WordExplorer.py CHANGED
@@ -2,73 +2,140 @@ import gradio as gr
 import pandas as pd
 import matplotlib.pyplot as plt
 
-from tool_info import TOOL_INFO
-from modules.module_connection import WordExplorerConnector # Updated
+from modules.module_connection import WordExplorerConnector
 from modules.module_logsManager import HuggingFaceDatasetSaver
 from examples.examples import examples_explorar_relaciones_entre_palabras
+from tool_info import TOOL_INFO
 
 plt.rcParams.update({'font.size': 14})
 
 def interface(
-    embedding,
+    embedding,              # Class Embedding instance
     available_logs: bool,
-    max_neighbors: int, # Updated
+    max_neighbors: int,
     lang: str="spanish",
 ) -> gr.Blocks:
 
     # --- Init logs ---
     log_callback = HuggingFaceDatasetSaver(
-        available_logs=available_logs
+        available_logs=available_logs,
+        dataset_name=f"logs_edia_we_{lang}"
     )
+
     # --- Init vars ---
-    connector = WordExplorerConnector(embedding=embedding)
-    labels = pd.read_json(f"language/{lang}.json")["WordExplorer_interface"]
+    connector = WordExplorerConnector(
+        embedding=embedding
+    )
+
+    # --- Load language ---
+    labels = pd.read_json(
+        f"language/{lang}.json"
+    )["WordExplorer_interface"]
 
     # --- Interface ---
     interface = gr.Blocks()
+
     with interface:
-        gr.Markdown(labels["title"])
+        gr.Markdown(
+            value=labels["title"]
+        )
+
         with gr.Row():
             with gr.Column(scale=3):
                 with gr.Row(equal_height=True):
                     with gr.Column(scale=5):
-                        diagnose_list = gr.Textbox(lines=2, label=labels["wordListToDiagnose"])
+                        diagnose_list = gr.Textbox(
+                            lines=2,
+                            label=labels["wordListToDiagnose"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist = gr.ColorPicker(label="",value='#000000',)
+                        color_wordlist = gr.ColorPicker(
+                            label="",
+                            value='#000000'
+                        )
+
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_1 = gr.Textbox(lines=2, label=labels["wordList1"])
+                        wordlist_1 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList1"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_1 = gr.ColorPicker(label="",value='#1f78b4')
+                        color_wordlist_1 = gr.ColorPicker(
+                            label="",
+                            value='#1f78b4'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_2 = gr.Textbox(lines=2, label=labels["wordList2"])
+                        wordlist_2 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList2"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_2 = gr.ColorPicker(label="",value='#33a02c')
+                        color_wordlist_2 = gr.ColorPicker(
+                            label="",
+                            value='#33a02c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_3 = gr.Textbox(lines=2, label=labels["wordList3"])
+                        wordlist_3 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList3"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_3 = gr.ColorPicker(label="",value='#e31a1c')
+                        color_wordlist_3 = gr.ColorPicker(
+                            label="",
+                            value='#e31a1c'
+                        )
                with gr.Row():
                    with gr.Column(scale=5):
-                        wordlist_4 = gr.Textbox(lines=2, label=labels["wordList4"])
+                        wordlist_4 = gr.Textbox(
+                            lines=2,
+                            label=labels["wordList4"]
+                        )
                    with gr.Column(scale=1,min_width=10):
-                        color_wordlist_4 = gr.ColorPicker(label="",value='#6a3d9a')
+                        color_wordlist_4 = gr.ColorPicker(
+                            label="",
+                            value='#6a3d9a'
+                        )
            with gr.Column(scale=4):
                with gr.Row():
                    with gr.Row():
-                        gr.Markdown(labels["plotNeighbours"]["title"])
-                        n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
+                        gr.Markdown(
+                            value=labels["plotNeighbours"]["title"]
+                        )
+                        n_neighbors = gr.Slider(
+                            minimum=0,
+                            maximum=max_neighbors,
+                            step=1,
+                            label=labels["plotNeighbours"]["quantity"]
+                        )
                with gr.Row():
-                    alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
-                    fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
+                    alpha = gr.Slider(
+                        minimum=0.1,
+                        maximum=0.9,
+                        value=0.3,
+                        step=0.1,
+                        label=labels["options"]["transparency"]
+                    )
+                    fontsize=gr.Number(
+                        value=25,
+                        label=labels["options"]["font-size"]
+                    )
                with gr.Row():
-                    btn_plot = gr.Button(labels["plot_button"])
+                    btn_plot = gr.Button(
+                        value=labels["plot_button"]
+                    )
                with gr.Row():
-                    err_msg = gr.Markdown(label="", visible=True)
+                    err_msg = gr.Markdown(
+                        label="",
+                        visible=True
+                    )
                with gr.Row():
-                    word_proyections = gr.Plot(label="", show_label=False)
+                    word_proyections = gr.Plot(
+                        label="",
+                        show_label=False
+                    )
 
        with gr.Row():
            gr.Examples(
@@ -80,7 +147,9 @@ def interface(
            )
 
        with gr.Row():
-            gr.Markdown(TOOL_INFO)
+            gr.Markdown(
+                value=TOOL_INFO
+            )
 
        btn_plot.click(
            fn=connector.plot_proyection_2d,
@@ -99,21 +168,25 @@ def interface(
                fontsize,
                n_neighbors
            ],
-            outputs=[word_proyections,err_msg]
+            outputs=[word_proyections, err_msg]
        )
 
        # --- Logs ---
-        save_field = [diagnose_list,wordlist_1,wordlist_2,wordlist_3,wordlist_4]
-        log_callback.setup(components=save_field, flagging_dir="edia_we_es")
+        save_field = [diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        log_callback.setup(
+            components=save_field,
+            flagging_dir="logs_word_explorer"
+        )
 
        btn_plot.click(
            fn=lambda *args: log_callback.flag(
-                flag_data=args,
-                flag_option="explorar_palabras",
-                username="vialibre",
+                flag_data=args,
+                flag_option="word_explorer",
+                username="vialibre",
            ),
            inputs=save_field,
            outputs=None,
            preprocess=False
        )
+
    return interface
language/.gitignore ADDED
@@ -0,0 +1 @@
+english.json
modules/model_embbeding.py CHANGED
@@ -3,7 +3,7 @@ from memory_profiler import profile
 from sklearn.neighbors import NearestNeighbors
 from sklearn.decomposition import PCA
 from gensim.models import KeyedVectors
-from typing import List
+from typing import List, Any
 import os
 import pandas as pd
 
@@ -13,21 +13,22 @@ from gensim import matutils
 
 
 class Embedding:
-    @profile
     def __init__(self,
        path: str,
-        binary: bool,
-        limit: int=None,
+        limit: int=None,
        randomizedPCA: bool=False,
-        max_neighbors: int=20
+        max_neighbors: int=20,
+        nn_method: str='sklearn'
    ) -> None:
 
        # Embedding vars
        self.path = path
        self.limit = limit
        self.randomizedPCA = randomizedPCA
-        self.binary = binary
        self.max_neighbors = max_neighbors
+
+        self.availables_nn_methods = ['sklearn', 'ann']
+        self.nn_method = nn_method
 
        # Full embedding dataset
        self.ds = None
@@ -43,36 +44,34 @@ class Embedding:
        self,
    ) -> None:
 
+        assert(self.nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
        print(f"Preparing {os.path.basename(self.path)} embeddings...")
 
        # --- Prepare dataset ---
        self.ds = self.__preparate(
-            self.path, self.binary, self.limit, self.randomizedPCA
+            self.path, self.limit, self.randomizedPCA
        )
 
        # --- Estimate Nearest Neighbors
-        # Method A: Through annoy, using a forest of trees
-        self.ann = Ann(
-            words=self.ds['word'],
-            vectors=self.ds['embedding'],
-            coord=self.ds['pca']
-        )
-        self.ann.init(
-            n_trees=20, metric='dot', n_jobs=-1
-        )
-
-        # Method B: Through the Sklearn method
-        self.neigh = NearestNeighbors(
-            n_neighbors=self.max_neighbors
-        )
-        self.neigh.fit(
-            X=self.ds['embedding'].to_list()
-        )
+        if self.nn_method == 'sklearn':
+            # Method A: Through the Sklearn method
+            self.__init_sklearn_method(
+                max_neighbors=self.max_neighbors,
+                vectors=self.ds['embedding'].to_list()
+            )
+
+        elif self.nn_method == 'ann':
+            # Method B: Through annoy, using a forest of trees
+            self.__init_ann_method(
+                words=self.ds['word'].to_list(),
+                vectors=self.ds['embedding'].to_list(),
+                coord=self.ds['pca'].to_list()
+            )
 
    def __preparate(
        self,
-        path: str,
-        binary: bool,
+        path: str,
        limit: int,
        randomizedPCA: bool
    ) -> pd.DataFrame:
@@ -94,7 +93,7 @@ class Embedding:
        print("--------> PATH:", path)
        model = KeyedVectors.load_word2vec_format(
            fname=path,
-            binary=binary,
+            binary=path.endswith('.bin'),
            limit=limit
        )
 
@@ -116,11 +115,48 @@ class Embedding:
        df_uncased = df_cased.drop_duplicates(subset='word')
        return df_uncased
 
+    def __init_ann_method(
+        self,
+        words: List[str],
+        vectors: List[float],
+        coord: List[float],
+        n_trees: int=20,
+        metric: str='dot'
+    ) -> None:
+
+        print("Initializing Annoy method to search for nearby neighbors...")
+        self.ann = Ann(
+            words=words,
+            vectors=vectors,
+            coord=coord,
+        )
+
+        self.ann.init(
+            n_trees=n_trees,
+            metric=metric,
+            n_jobs=-1
+        )
+
+    def __init_sklearn_method(
+        self,
+        max_neighbors: int,
+        vectors: List[float]
+    ) -> None:
+
+        print("Initializing sklearn method to search for nearby neighbors...")
+        self.neigh = NearestNeighbors(
+            n_neighbors=max_neighbors
+        )
+        self.neigh.fit(
+            X=vectors
+        )
+
    def __getValue(
        self,
        word: str,
        feature: str
-    ):
+    ) -> Any:
+
        word_id, value = None, None
 
        if word in self:
@@ -134,13 +170,15 @@ class Embedding:
    def getEmbedding(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
+
        return self.__getValue(word, 'embedding')
 
    def getPCA(
        self,
        word: str
-    ):
+    ) -> np.ndarray:
+
        return self.__getValue(word, 'pca')
 
    def getNearestNeighbors(
@@ -152,35 +190,58 @@
 
        assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must be less than or equal to {self.max_neighbors}!."
 
-        if nn_method == 'ann':
-            words = self.ann.get(word, n_neighbors)
-
-        elif nn_method == 'sklearn':
-            word_emb = self.getEmbedding(word).reshape(1,-1)
-            _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1) #Fix and Update
-            words = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:] #Fix and Update
-        else:
-            words = []
-        return words
+        assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn_method' can only be {self.availables_nn_methods}!"
+
+        neighbords_list = None
 
-    def __contains__(
+        if word in self:
+            if nn_method == 'ann':
+                if self.ann is None:
+                    self.__init_ann_method(
+                        words=self.ds['word'].to_list(),
+                        vectors=self.ds['embedding'].to_list(),
+                        coord=self.ds['pca'].to_list()
+                    )
+                neighbords_list = self.ann.get(word, n_neighbors)
+
+            elif nn_method == 'sklearn':
+                if self.neigh is None:
+                    self.__init_sklearn_method(
+                        max_neighbors=self.max_neighbors,
+                        vectors=self.ds['embedding'].to_list()
+                    )
+
+                word_emb = self.getEmbedding(word).reshape(1,-1)
+                _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
+                neighbords_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
+
+        return neighbords_list
+
+    def cosineSimilarities(
        self,
-        word: str
-    ) -> bool:
-
-        return word in self.ds['word'].to_list()
-
-    # ToDo: Review these two methods used in the sesgoEnPalabras tab,
-    # since the embeddings now come normalized
-    def cosineSimilarities(self, vector_1, vectors_all):
+        vector_1,
+        vectors_all
+    ):
+
        norm = np.linalg.norm(vector_1)
        all_norms = np.linalg.norm(vectors_all, axis=1)
        dot_products = dot(vectors_all, vector_1)
        similarities = dot_products / (norm * all_norms)
        return similarities
 
-    def getCosineSimilarities(self, w1, w2):
+    def getCosineSimilarities(
+        self,
+        w1,
+        w2
+    ):
+
        return dot(
            matutils.unitvec(self.getEmbedding(w1)),
            matutils.unitvec(self.getEmbedding(w2))
-        )
+        )
+
+    def __contains__(
+        self,
+        word: str
+    ) -> bool:
+
+        return word in self.ds['word'].to_list()
modules/module_BiasExplorer.py CHANGED
@@ -5,10 +5,14 @@ import seaborn as sns
 import matplotlib.pyplot as plt
 from sklearn.decomposition import PCA
 
-def take_two_sides_extreme_sorted(df, n_extreme,
-                                  part_column=None,
-                                  head_value='',
-                                  tail_value=''):
+def take_two_sides_extreme_sorted(
+    df,
+    n_extreme,
+    part_column=None,
+    head_value='',
+    tail_value=''
+):
+
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]
 
@@ -56,39 +60,63 @@ __all__ = ['GenderBiasWE', 'BiasWordEmbedding']
 
 
 class WordBiasExplorer():
-    def __init__(self, vocabulary):
-        # pylint: disable=undefined-variable
+    def __init__(
+        self,
+        embedding # Class Embedding instance
+    ) -> None:
 
-        self.vocabulary = vocabulary
+        self.embedding = embedding
        self.direction = None
        self.positive_end = None
        self.negative_end = None
 
-    def __copy__(self):
-        bias_word_embedding = self.__class__(self.vocabulary)
+    def __copy__(
+        self
+    ):
+
+        bias_word_embedding = self.__class__(self.embedding)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding
 
-    def __deepcopy__(self, memo):
+    def __deepcopy__(
+        self,
+        memo
+    ):
+
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding
 
-    def __getitem__(self, key):
-        return self.vocabulary.getEmbedding(key)
+    def __getitem__(
+        self,
+        key: str
+    ) -> np.ndarray:
 
-    def __contains__(self, item):
-        return item in self.vocabulary
+        return self.embedding.getEmbedding(key)
 
-    def _is_direction_identified(self):
+    def __contains__(
+        self,
+        item: str
+    ) -> bool:
+
+        return item in self.embedding
+
+    def _is_direction_identified(
+        self
+    ):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))
 
-    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
+    def _identify_subspace_by_pca(
+        self,
+        definitional_pairs,
+        n_components
+    ):
+
        matrix = []
 
        for word1, word2 in definitional_pairs:
@@ -105,8 +133,14 @@ class WordBiasExplorer():
        return pca
 
 
-    def _identify_direction(self, positive_end, negative_end,
-                            definitional, method='pca'):
+    def _identify_direction(
+        self,
+        positive_end,
+        negative_end,
+        definitional,
+        method='pca'
+    ):
+
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))
@@ -154,7 +188,11 @@ class WordBiasExplorer():
        self.positive_end = positive_end
        self.negative_end = negative_end
 
-    def project_on_direction(self, word):
+    def project_on_direction(
+        self,
+        word: str
+    ):
+
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
@@ -163,13 +201,15 @@ class WordBiasExplorer():
        self._is_direction_identified()
 
        vector = self[word]
-        projection_score = self.vocabulary.cosineSimilarities(self.direction,
+        projection_score = self.embedding.cosineSimilarities(self.direction,
                                                              [vector])[0]
        return projection_score
 
-
-    def _calc_projection_scores(self, words):
+    def _calc_projection_scores(
+        self,
+        words
+    ):
 
        self._is_direction_identified()
 
        df = pd.DataFrame({'word': words})
@@ -181,7 +221,11 @@ class WordBiasExplorer():
 
        return df
 
-    def calc_projection_data(self, words):
+    def calc_projection_data(
+        self,
+        words
+    ):
+
        """
        Calculate projection, projected and rejected vectors of a words list.
        :param list words: List of words
@@ -206,7 +250,12 @@ class WordBiasExplorer():
 
        return pd.DataFrame(projection_data)
 
-    def plot_dist_projections_on_direction(self, word_groups, ax=None):
+    def plot_dist_projections_on_direction(
+        self,
+        word_groups,
+        ax=None
+    ):
+
        """Plot the projection scalars distribution on the direction.
        :param dict word_groups word: The groups to project
        :return float: The ax object of the plot
@@ -221,7 +270,7 @@ class WordBiasExplorer():
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
-            projections = self.vocabulary.cosineSimilarities(self.direction,
+            projections = self.embedding.cosineSimilarities(self.direction,
                                                             vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)
 
@@ -236,18 +285,26 @@ class WordBiasExplorer():
 
        return ax
 
-    def __errorChecking(self, word):
+    def __errorChecking(
+        self,
+        word
+    ):
+
        out_msj = ""
 
        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
 
        return out_msj
 
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists
+    ):
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -255,13 +312,15 @@ class WordBiasExplorer():
                    return msg
        return None
 
-    def plot_biased_words(self,
-                          words_to_diagnose,
-                          wordlist_right,
-                          wordlist_left,
-                          wordlist_top=[],
-                          wordlist_bottom=[]
-                          ):
+    def plot_biased_words(
+        self,
+        words_to_diagnose,
+        wordlist_right,
+        wordlist_left,
+        wordlist_top=[],
+        wordlist_bottom=[]
+    ):
+
        bias_2D = wordlist_top == [] and wordlist_bottom == []
 
        if bias_2D and (not wordlist_right or not wordlist_left):
@@ -273,21 +332,24 @@ class WordBiasExplorer():
        if err:
            raise Exception(err)
 
-        return self.get_bias_plot(bias_2D,
-                                  words_to_diagnose,
-                                  definitional_1=(wordlist_right, wordlist_left),
-                                  definitional_2=(wordlist_top, wordlist_bottom)
-                                  )
+        return self.get_bias_plot(
+            bias_2D,
+            words_to_diagnose,
+            definitional_1=(wordlist_right, wordlist_left),
+            definitional_2=(wordlist_top, wordlist_bottom)
+        )
 
-    def get_bias_plot(self,
-                      plot_2D,
-                      words_to_diagnose,
-                      definitional_1,
-                      definitional_2=([], []),
-                      method='sum',
-                      n_extreme=10,
-                      figsize=(15, 10)
-                      ):
+    def get_bias_plot(
+        self,
+        plot_2D,
+        words_to_diagnose,
+        definitional_1,
+        definitional_2=([], []),
+        method='sum',
+        n_extreme=10,
+        figsize=(15, 10)
+    ):
+
        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(plot_2D, words_to_diagnose, definitional_1, definitional_2, n_extreme, ax)
@@ -298,14 +360,17 @@ class WordBiasExplorer():
 
        return fig
 
-    def plot_projection_scores(self,
-                               plot_2D,
-                               words,
-                               definitional_1,
-                               definitional_2=([], []),
-                               n_extreme=10,
-                               ax=None,
-                               axis_projection_step=0.1):
+    def plot_projection_scores(
+        self,
+        plot_2D,
+        words,
+        definitional_1,
+        definitional_2=([], []),
+        n_extreme=10,
+        ax=None,
+        axis_projection_step=0.1
+    ):
+
        name_left = ', '.join(definitional_1[1])
        name_right = ', '.join(definitional_1[0])
 
@@ -341,6 +406,9 @@ class WordBiasExplorer():
            sns.barplot(x='projection', y='word', data=projections_df,
                        palette=projections_df['color'])
        else:
+            # ToDo: review this warning:
+            # Ignoring `palette` because no `hue` variable has been assigned.
+
            sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                            palette=projections_df['color'])
 
modules/module_WordExplorer.py CHANGED
@@ -1,3 +1,4 @@
+import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 import seaborn as sns
@@ -5,37 +6,63 @@ from numpy.linalg import norm
 
 import matplotlib as mpl
 mpl.use('Agg')
-import matplotlib.pyplot as plt
+from typing import List, Dict, Tuple
+
 
 class WordToPlot:
-    def __init__(self, word, color, bias_space, alpha):
+    def __init__(
+        self,
+        word: str,
+        color: str,
+        bias_space: int,
+        alpha: float
+    ):
+
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha
 
+
 class WordExplorer:
-    def __init__(self, vocabulary) -> None:
-        self.vocabulary = vocabulary
+    def __init__(
+        self,
+        embedding # Class Embedding instance
+    ) -> None:
+
+        self.embedding = embedding
+
+    def __errorChecking(
+        self,
+        word: str
+    ) -> str:
 
-    def __errorChecking(self, word):
        out_msj = ""
 
        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        else:
-            if word not in self.vocabulary:
+            if word not in self.embedding:
                out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
 
        return out_msj
 
-    def parse_words(self, string):
+    # ToDo: This method is not used. I think it is the one implemented in the base connection class. Delete?
+    def parse_words(
+        self,
+        string: str
+    ) -> List[str]:
+
        words = string.strip()
        if words:
            words = [word.strip() for word in words.split(',') if word != ""]
        return words
 
-    def check_oov(self, wordlists):
+    def check_oov(
+        self,
+        wordlists: List[str]
+    ) -> str:
+
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
@@ -43,10 +70,21 @@ class WordExplorer:
                    return msg
        return None
 
-    def get_neighbors(self, word, n_neighbors, nn_method):
-        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)
+    def get_neighbors(
+        self,
+        word: str,
+        n_neighbors: int,
+        nn_method: str
+    ) -> List[str]:
+
+        return self.embedding.getNearestNeighbors(word, n_neighbors, nn_method)
+
+    def get_df(
+        self,
+        words_embedded: np.ndarray,
+        processed_word_list: List[str]
+    ) -> pd.DataFrame:
 
-    def get_df(self, words_embedded, processed_word_list):
        df = pd.DataFrame(words_embedded)
 
        df['word'] = [wtp.word for wtp in processed_word_list]
@@ -55,16 +93,18 @@ class WordExplorer:
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df
 
-    def get_plot(self,
-                 data,
-                 processed_word_list,
-                 words_embedded,
-                 color_dict,
-                 n_neighbors,
-                 n_alpha,
-                 fontsize=18,
-                 figsize=(20, 15)
-                 ):
+    def get_plot(
+        self,
+        data: pd.DataFrame,
+        processed_word_list: List[str],
+        words_embedded: np.ndarray,
+        color_dict: Dict,
+        n_neighbors: int,
+        n_alpha: float,
+        fontsize: int=18,
+        figsize: Tuple[int, int]=(20, 15)
+    ):
+
        fig, ax = plt.subplots(figsize=figsize)
 
        sns.scatterplot(
@@ -89,11 +129,20 @@ class WordExplorer:
            legend=False,
            palette=color_dict
        )
+
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
-            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
-                        textcoords='offset points',
-                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)
+            ax.annotate(
+                wtp.word,
+                xy=(x, y),
+                xytext=(5, 2),
+                color=wtp.color,
+                textcoords='offset points',
+                ha='right',
+                va='bottom',
+                size=fontsize,
+                alpha=wtp.alpha
+            )
 
        ax.set_xticks([])
        ax.set_yticks([])
@@ -103,25 +152,27 @@ class WordExplorer:
 
        return fig
 
-    def plot_projections_2d(self,
-                            wordlist_0,
-                            wordlist_1 = [],
-                            wordlist_2 = [],
-                            wordlist_3 = [],
-                            wordlist_4 = [],
-                            **kwargs
-                            ):
+    def plot_projections_2d(
+        self,
+        wordlist_0: List[str],
+        wordlist_1: List[str]=[],
+        wordlist_2: List[str]=[],
+        wordlist_3: List[str]=[],
+        wordlist_4: List[str]=[],
+        **kwargs
+    ):
+
        # convert the word lists to vectors
        choices = [0, 1, 2, 3, 4]
        wordlist_choice = [
-            wordlist_0,
+            wordlist_0,
            wordlist_1,
-            wordlist_2,
-            wordlist_3,
+            wordlist_2,
+            wordlist_3,
            wordlist_4
        ]
 
-        err = self.check_oov(wordlist_choice)
+        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)
 
@@ -139,48 +190,69 @@ class WordExplorer:
        processed_word_list = []
        for word_list_to_process, color in zip(wordlist_choice, choices):
            for word in word_list_to_process:
-                processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
+                processed_word_list.append(
+                    WordToPlot(word, color_dict[color], color, 1)
+                )
 
                if n_neighbors > 0:
-                    # Updated: with the added max_neighbors parameter, the (n_neighbors+1)
-                    # exceeded that maximum and triggered an assertion
-                    neighbors = self.get_neighbors(word,
-                                                   # n_neighbors=n_neighbors+1,
-                                                   n_neighbors=n_neighbors,
+                    neighbors = self.get_neighbors(
+                        word,
+                        n_neighbors=n_neighbors,
                        nn_method=kwargs.get('nn_method', 'sklearn')
                    )
+
                    for n in neighbors:
                        if n not in [wtp.word for wtp in processed_word_list]:
-                            processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
+                            processed_word_list.append(
+                                WordToPlot(n, color_dict[color], color, n_alpha)
+                            )
 
        if not processed_word_list:
            raise Exception('Only empty lists were passed')
-
-        words_embedded = np.array([self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list])
 
-        data = self.get_df(words_embedded, processed_word_list)
+        words_embedded = np.array(
+            [self.embedding.getPCA(wtp.word) for wtp in processed_word_list]
+        )
+
+        data = self.get_df(
+            words_embedded,
+            processed_word_list
+        )
+
+        fig = self.get_plot(
+            data,
+            processed_word_list,
+            words_embedded,
+            color_dict,
+            n_neighbors,
+            n_alpha,
+            kwargs.get('fontsize', 18),
+            kwargs.get('figsize', (20, 15))
+        )
 
-        fig = self.get_plot(data, processed_word_list, words_embedded,
-                            color_dict, n_neighbors, n_alpha,
-                            kwargs.get('fontsize', 18),
-                            kwargs.get('figsize', (20, 15))
-                            )
        plt.show()
        return fig
 
-    def doesnt_match(self, wordlist):
+    # ToDo: I cannot find where this method is used. Delete?
+    def doesnt_match(
+        self,
+        wordlist
+    ):
+
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)
-
-        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in wordlist])
+
+        words_emb = np.array([self.embedding.getEmbedding(word)
+                              for word in wordlist])
        mean_vec = np.mean(words_emb, axis=0)
 
        doesnt_match = ""
        farthest_emb = 1.0
        for word in wordlist:
-            word_emb = self.vocabulary.getEmbedding(word)
-            cos_sim = np.dot(mean_vec, word_emb) / (norm(mean_vec)*norm(word_emb))
+            word_emb = self.embedding.getEmbedding(word)
+            cos_sim = np.dot(mean_vec, word_emb) / \
+                (norm(mean_vec)*norm(word_emb))
            if cos_sim <= farthest_emb:
                farthest_emb = cos_sim
                doesnt_match = word
modules/module_ann.py CHANGED
@@ -1,45 +1,71 @@
 import time
-import operator
 from tqdm import tqdm
 from annoy import AnnoyIndex
 from memory_profiler import profile
+from typing import List, Any
 
 class TicToc:
-    def __init__(self):
+    def __init__(
+        self
+    ) -> None:
+
        self.i = None
-    def start(self):
+
+    def start(
+        self
+    ) -> None:
+
        self.i = time.time()
-    def stop(self):
+
+    def stop(
+        self
+    ) -> None:
+
        f = time.time()
        print(f - self.i, "seg.")
 
+
 class Ann:
-    def __init__(self, words, vectors, coord):
-        self.words = words.to_list()
-        self.vectors = vectors.to_list()
-        self.coord = coord.to_list()
+    def __init__(
+        self,
+        words: List[str],
+        vectors: List[float],
+        coord: List[float],
+    ) -> None:
+
+        self.words = words
+        self.vectors = vectors
+        self.coord = coord
        self.tree = None
 
        self.tt = TicToc()
 
-    @profile
-    def init(self, n_trees=10, metric='angular', n_jobs=-1):
+    def init(self,
+        n_trees: int=10,
+        metric: str='angular',
+        n_jobs: int=-1
+    ) -> None:
+
        # metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
        # n_jobs=-1 runs over all available CPUs
 
-        print("Init tree...")
+        print("\tInit tree...")
        self.tt.start()
        self.tree = AnnoyIndex(len(self.vectors[0]), metric=metric)
-        for i,v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
-            self.tree.add_item(i,v)
+        for i, v in tqdm(enumerate(self.vectors), total=len(self.vectors)):
+            self.tree.add_item(i, v)
        self.tt.stop()
 
-        print("Build tree...")
+        print("\tBuild tree...")
        self.tt.start()
        self.tree.build(n_trees=n_trees, n_jobs=n_jobs)
        self.tt.stop()
 
-    def __getWordId(self, word):
+    def __getWordId(
+        self,
+        word: str
+    ) -> int:
+
        word_id = None
        try:
            word_id = self.words.index(word)
@@ -47,16 +73,20 @@ class Ann:
            pass
        return word_id
 
-    def get(self, word, n_neighbors=10):
+    def get(
+        self,
+        word: str,
+        n_neighbors: int=10
+    ) -> List[str]:
+
        word_id = self.__getWordId(word)
-        reword_xy_list = None
+        neighbords_list = None
 
        if word_id != None:
-            neighbord_id = self.tree.get_nns_by_item(word_id, n_neighbors)
-            # word_xy_list = list(map(lambda i: (self.words[i],self.coord[i]), neighbord_id))
-            # word_xy_list = list(map(lambda i: self.words[i], neighbord_id))
-            word_xy_list = operator.itemgetter(*neighbord_id)(self.words)
+            neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
+            neighbords_list = [self.words[idx] for idx in neighbords_id][1:]
+
        else:
            print(f"The word '{word}' does not exist")
-
-        return word_xy_list
+
+        return neighbords_list
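A note on the Ann.get rewrite above: the old code assigned reword_xy_list but returned word_xy_list, so a lookup of a missing word raised a NameError, and a hit included the query word among its own neighbors. The new code queries n_neighbors + 1 items and drops the head of the list, since Annoy typically ranks the query item first. A standalone sketch of that pattern with a toy index (words and vectors are invented for illustration):

```python
from annoy import AnnoyIndex

words = ["uno", "dos", "tres"]                  # toy vocabulary
vectors = [[1.0, 0.0], [0.9, 0.1], [0.0, 1.0]]  # made-up 2-d embeddings

tree = AnnoyIndex(2, metric='dot')
for i, v in enumerate(vectors):
    tree.add_item(i, v)
tree.build(n_trees=10)

word_id = words.index("uno")
# Fetch one extra hit because the query word itself ranks first...
ids = tree.get_nns_by_item(word_id, 2 + 1)
# ...then drop it, exactly as Ann.get now does.
print([words[i] for i in ids][1:])  # -> ['dos', 'tres']
```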
modules/module_connection.py CHANGED
@@ -1,52 +1,75 @@
-import numpy as np
-import pandas as pd
-import gradio as gr
-from abc import ABC, abstractmethod
+from abc import ABC
 
-from modules.module_WordExplorer import WordExplorer # Updated
+from modules.module_WordExplorer import WordExplorer
 from modules.module_BiasExplorer import WordBiasExplorer
+from typing import List, Tuple
+
 
 class Connector(ABC):
-    def parse_word(self, word : str):
+    def parse_word(
+        self,
+        word: str
+    ) -> str:
+
        return word.lower().strip()
 
-    def parse_words(self, array_in_string : str):
+    def parse_words(
+        self,
+        array_in_string: str
+    ) -> List[str]:
+
        words = array_in_string.strip()
        if not words:
            return []
-        words = [self.parse_word(word) for word in words.split(',') if word.strip() != '']
+
+        words = [
+            self.parse_word(word)
+            for word in words.split(',') if word.strip() != ''
+        ]
        return words
 
-    def process_error(self, err: str):
-        if err is None:
-            return
-        return "<center><h3>" + err + "</h3></center>"
+    def process_error(
+        self,
+        err: str
+    ) -> str:
+
+        if err:
+            err = "<center><h3>" + err + "</h3></center>"
+        return err
 
 
 class WordExplorerConnector(Connector):
-
-    def __init__(self, **kwargs):
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
+
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.word_explorer = WordExplorer(embedding)
-
-    def plot_proyection_2d( self,
-                            wordlist_0,
-                            wordlist_1,
-                            wordlist_2,
-                            wordlist_3,
-                            wordlist_4,
-                            color_wordlist_0,
-                            color_wordlist_1,
-                            color_wordlist_2,
-                            color_wordlist_3,
-                            color_wordlist_4,
-                            n_alpha,
-                            fontsize,
-                            n_neighbors
-                            ):
+
+        self.word_explorer = WordExplorer(
+            embedding=embedding
+        )
+
+    def plot_proyection_2d(
+        self,
+        wordlist_0: str,
+        wordlist_1: str,
+        wordlist_2: str,
+        wordlist_3: str,
+        wordlist_4: str,
+        color_wordlist_0: str,
+        color_wordlist_1: str,
+        color_wordlist_2: str,
+        color_wordlist_3: str,
+        color_wordlist_4: str,
+        n_alpha: float,
+        fontsize: int,
+        n_neighbors: int
+    ) -> Tuple:
+
        err = ""
        neighbors_method = 'sklearn'
        wordlist_0 = self.parse_words(wordlist_0)
@@ -59,49 +82,63 @@ class WordExplorerConnector(Connector):
            err = self.process_error("Ingresa al menos 1 palabras para continuar")
            return None, err
 
-        err = self.word_explorer.check_oov([wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4])
+        err = self.word_explorer.check_oov(
+            [wordlist_0, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+        )
+
        if err:
            return None, self.process_error(err)
 
-        fig = self.word_explorer.plot_projections_2d(wordlist_0,
-                                                     wordlist_1,
-                                                     wordlist_2,
-                                                     wordlist_3,
-                                                     wordlist_4,
-                                                     color_wordlist_0=color_wordlist_0,
-                                                     color_wordlist_1=color_wordlist_1,
-                                                     color_wordlist_2=color_wordlist_2,
-                                                     color_wordlist_3=color_wordlist_3,
-                                                     color_wordlist_4=color_wordlist_4,
-                                                     n_alpha=n_alpha,
-                                                     fontsize=fontsize,
-                                                     n_neighbors=n_neighbors,
-                                                     nn_method = neighbors_method
-                                                     )
+        fig = self.word_explorer.plot_projections_2d(
+            wordlist_0,
+            wordlist_1,
+            wordlist_2,
+            wordlist_3,
+            wordlist_4,
+            color_wordlist_0=color_wordlist_0,
+            color_wordlist_1=color_wordlist_1,
+            color_wordlist_2=color_wordlist_2,
+            color_wordlist_3=color_wordlist_3,
+            color_wordlist_4=color_wordlist_4,
+            n_alpha=n_alpha,
+            fontsize=fontsize,
+            n_neighbors=n_neighbors,
+            nn_method = neighbors_method
+        )
+
        return fig, self.process_error(err)
 
 class BiasWordExplorerConnector(Connector):
 
-    def __init__(self, **kwargs):
+    def __init__(
+        self,
+        **kwargs
+    ) -> None:
+
        if 'embedding' in kwargs:
            embedding = kwargs.get('embedding')
        else:
            raise KeyError
-        self.bias_word_explorer = WordBiasExplorer(embedding)
 
-    def calculate_bias_2d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          to_diagnose_list
-                          ):
+        self.bias_word_explorer = WordBiasExplorer(
+            embedding=embedding
+        )
+
+    def calculate_bias_2d(
+        self,
+        wordlist_1: str,
+        wordlist_2: str,
+        to_diagnose_list: str
+    ) -> Tuple:
+
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        word_lists = [wordlist_1, wordlist_2, to_diagnose_list]
-        for list in word_lists:
-            if not list:
+        for _list in word_lists:
+            if not _list:
                err = "Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2"
        if err:
            return None, self.process_error(err)
@@ -110,17 +147,23 @@ class BiasWordExplorerConnector(Connector):
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_2, wordlist_1)
+        fig = self.bias_word_explorer.plot_biased_words(
+            to_diagnose_list,
+            wordlist_2,
+            wordlist_1
+        )
 
        return fig, self.process_error(err)
 
-    def calculate_bias_4d(self,
-                          wordlist_1,
-                          wordlist_2,
-                          wordlist_3,
-                          wordlist_4,
-                          to_diagnose_list
-                          ):
+    def calculate_bias_4d(
+        self,
+        wordlist_1: str,
+        wordlist_2: str,
+        wordlist_3: str,
+        wordlist_4: str,
+        to_diagnose_list: str
+    ) -> Tuple:
+
        err = ""
        wordlist_1 = self.parse_words(wordlist_1)
        wordlist_2 = self.parse_words(wordlist_2)
@@ -129,8 +172,8 @@ class BiasWordExplorerConnector(Connector):
        to_diagnose_list = self.parse_words(to_diagnose_list)
 
        wordlists = [wordlist_1, wordlist_2, wordlist_3, wordlist_4, to_diagnose_list]
-        for list in wordlists:
-            if not list:
+        for _list in wordlists:
+            if not _list:
                err = "¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!"
        if err:
            return None, self.process_error(err)
@@ -139,5 +182,12 @@ class BiasWordExplorerConnector(Connector):
        if err:
            return None, self.process_error(err)
 
-        fig = self.bias_word_explorer.plot_biased_words(to_diagnose_list, wordlist_1, wordlist_2, wordlist_3, wordlist_4)
+        fig = self.bias_word_explorer.plot_biased_words(
+            to_diagnose_list,
+            wordlist_1,
+            wordlist_2,
+            wordlist_3,
+            wordlist_4
+        )
+
        return fig, self.process_error(err)
modules/module_logsManager.py CHANGED
@@ -40,11 +40,11 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
 
    def __init__(
        self,
-        hf_token: str = os.getenv('HF_TOKEN'),
-        dataset_name: str = os.getenv('DS_LOGS_NAME'),
-        organization: Optional[str] = os.getenv('ORG_NAME'),
-        private: bool = True,
-        available_logs: bool = False
+        dataset_name: str=None,
+        hf_token: str=os.getenv('HF_TOKEN'),
+        organization: Optional[str]=os.getenv('ORG_NAME'),
+        private: bool=True,
+        available_logs: bool=False
    ):
        """
        Parameters:
@@ -53,6 +53,8 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
        organization: The organization to save the dataset under. The hf_token must provide write access to this organization. If not provided, saved under the name of the user corresponding to the hf_token.
        private: Whether the dataset should be private (defaults to False).
        """
+        assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
+
        self.hf_token = hf_token
        self.dataset_name = dataset_name
        self.organization_name = organization
tool_info.py CHANGED
@@ -4,7 +4,7 @@ TOOL_INFO = """
 * [Read Full Paper](https://arxiv.org/abs/2207.06591)
 
 > ### Licensing Information
-* [MIT Licence](https://huggingface.co/spaces/vialibre/vialibre/bias_we_std_tool/resolve/main/LICENSE)
+* [MIT Licence](https://huggingface.co/spaces/vialibre/edia_we_es/resolve/main/LICENSE)
 
 > ### Citation Information
 ```c