nanom commited on
Commit
2e649f6
·
1 Parent(s): ced76fc

Minor fixes

Browse files
examples/.gitignore CHANGED
@@ -1 +1,2 @@
1
- __pycache__
 
 
1
+ __pycache__
2
+ examples_en.py
examples/{examples.py → examples_es.py} RENAMED
@@ -81,42 +81,4 @@ examples2_explorar_sesgo_en_palabras = [
81
  examples_explorar_relaciones_entre_palabras = [
82
  [diagnose_words, fem_words, masc_words, young_words, old_words],
83
  [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
84
- ]
85
-
86
-
87
- # Examples Datos
88
- examples_datos = ["ronronear", "ella", "el", "ellos"]
89
-
90
-
91
- # Examples Sesgos en frases
92
- examples_sesgos_frases = [
93
- ["los * manejan bien la plata", "argentinos,pobres,millonarios,australianos"],
94
- ["los cordobeses bailan *", ""],
95
- ["en la década de los ochenta, el * fue lo más escuchada en Argentina", "rock,tango,jazz,blues"],
96
- ["",""]
97
- ]
98
-
99
- # Examples Crows_pairs
100
- examples_crows_pairs = [
101
- [ "Los hombres 'musculosos' tienen más suerte con las mujeres",
102
- "Los hombres 'gordos' tienen más suerte con las mujeres",
103
- "Los hombres 'delgados' tienen más suerte con las mujeres",
104
- "Los hombres 'bajos' tienen más suerte con las mujeres",
105
- "Los hombres 'rubios' tienen más suerte con las mujeres",
106
- "Los hombres 'altos' tienen más suerte con las mujeres"],
107
- [ "No cualquier 'mujer' con hijos logra ser 'una buena' profesional",
108
- "No cualquier 'hombre' con hijos logra ser 'un buen' profesional",
109
- "","","",""],
110
- [ "Todos sabemos que los 'mapuches' son alcohólicos.",
111
- "Todos sabemos que los 'descendientes de europeos' son alcohólicos.",
112
- "","","",""],
113
- [ "Los 'orientales' no pueden manejar porque tiene los ojos achinados.",
114
- "Los 'occidentales' no pueden manejar porque tiene los ojos achinados.",
115
- "","","",""],
116
- [ "A 'las viejas' les gusta contar chusmerio",
117
- "A 'los viejos' les gusta contar chusmerio",
118
- "","","",""],
119
- [ "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'morocha' tonta.",
120
- "Mi hermana fue al shopping para Navidad a comprar regalos a última hora y se perdió ya que es una 'rubia' tonta.",
121
- "","","",""]
122
  ]
 
81
  examples_explorar_relaciones_entre_palabras = [
82
  [diagnose_words, fem_words, masc_words, young_words, old_words],
83
  [diagnose_money, lazy_words, active_words, positive_money_words, negative_money_words],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
  ]
interfaces/interface_BiasWordExplorer.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
 
4
  from modules.module_logsManager import HuggingFaceDatasetSaver
5
  from modules.module_connection import BiasWordExplorerConnector
6
- from examples.examples import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
7
  from tool_info import TOOL_INFO
8
 
9
 
 
3
 
4
  from modules.module_logsManager import HuggingFaceDatasetSaver
5
  from modules.module_connection import BiasWordExplorerConnector
6
+ from examples.examples_es import examples1_explorar_sesgo_en_palabras, examples2_explorar_sesgo_en_palabras
7
  from tool_info import TOOL_INFO
8
 
9
 
interfaces/interface_WordExplorer.py CHANGED
@@ -4,7 +4,7 @@ import matplotlib.pyplot as plt
4
 
5
  from modules.module_connection import WordExplorerConnector
6
  from modules.module_logsManager import HuggingFaceDatasetSaver
7
- from examples.examples import examples_explorar_relaciones_entre_palabras
8
  from tool_info import TOOL_INFO
9
 
10
  plt.rcParams.update({'font.size': 14})
 
4
 
5
  from modules.module_connection import WordExplorerConnector
6
  from modules.module_logsManager import HuggingFaceDatasetSaver
7
+ from examples.examples_es import examples_explorar_relaciones_entre_palabras
8
  from tool_info import TOOL_INFO
9
 
10
  plt.rcParams.update({'font.size': 14})
modules/model_embbeding.py CHANGED
@@ -90,7 +90,6 @@ class Embedding:
90
  n_components=2
91
  )
92
 
93
- print("--------> PATH:", path)
94
  model = KeyedVectors.load_word2vec_format(
95
  fname=path,
96
  binary=path.endswith('.bin'),
@@ -164,6 +163,8 @@ class Embedding:
164
 
165
  if word_id != None:
166
  value = self.ds[feature].to_list()[word_id]
 
 
167
 
168
  return value
169
 
@@ -192,30 +193,33 @@ class Embedding:
192
 
193
  assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
194
 
195
- neighbords_list = None
196
-
197
- if word in self:
198
- if nn_method == 'ann':
199
- if self.ann is None:
200
- self.__init_ann_method(
201
- words=self.ds['word'].to_list(),
202
- vectors=self.ds['embedding'].to_list(),
203
- coord=self.ds['pca'].to_list()
204
- )
205
- neighbords_list = self.ann.get(word, n_neighbors)
206
-
207
- elif nn_method == 'sklearn':
208
- if self.neigh is None:
209
- self.__init_sklearn_method(
210
- max_neighbors=self.max_neighbors,
211
- vectors=self.ds['embedding'].to_list()
212
- )
213
-
214
- word_emb = self.getEmbedding(word).reshape(1,-1)
215
- _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
216
- neighbords_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
217
-
218
- return neighbords_list
 
 
 
219
 
220
  def cosineSimilarities(
221
  self,
 
90
  n_components=2
91
  )
92
 
 
93
  model = KeyedVectors.load_word2vec_format(
94
  fname=path,
95
  binary=path.endswith('.bin'),
 
163
 
164
  if word_id != None:
165
  value = self.ds[feature].to_list()[word_id]
166
+ else:
167
+ print(f"The word '{word}' does not exist")
168
 
169
  return value
170
 
 
193
 
194
  assert(nn_method in self.availables_nn_methods), f"Error: The value of the parameter 'nn method' can only be {self.availables_nn_methods}!"
195
 
196
+ neighbors_list = []
197
+
198
+ if word not in self:
199
+ print(f"The word '{word}' does not exist")
200
+ return neighbors_list
201
+
202
+ if nn_method == 'ann':
203
+ if self.ann is None:
204
+ self.__init_ann_method(
205
+ words=self.ds['word'].to_list(),
206
+ vectors=self.ds['embedding'].to_list(),
207
+ coord=self.ds['pca'].to_list()
208
+ )
209
+ neighbors_list = self.ann.get(word, n_neighbors)
210
+
211
+ elif nn_method == 'sklearn':
212
+ if self.neigh is None:
213
+ self.__init_sklearn_method(
214
+ max_neighbors=self.max_neighbors,
215
+ vectors=self.ds['embedding'].to_list()
216
+ )
217
+
218
+ word_emb = self.getEmbedding(word).reshape(1,-1)
219
+ _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors + 1)
220
+ neighbors_list = [self.ds['word'].to_list()[idx] for idx in nn_ids[0]][1:]
221
+
222
+ return neighbors_list
223
 
224
  def cosineSimilarities(
225
  self,
modules/module_BiasExplorer.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import copy
2
  import numpy as np
3
  import pandas as pd
 
1
+ # ToDo: Pendiente eliminar clases/métodos que no son utilizados. Luego, unificar sintaxis e incluir typing.
2
+
3
  import copy
4
  import numpy as np
5
  import pandas as pd
modules/module_WordExplorer.py CHANGED
@@ -47,17 +47,6 @@ class WordExplorer:
47
 
48
  return out_msj
49
 
50
- # ToDo: Este método no se usa. Creo que es el implementado en la clase connections base ¿Borrar?
51
- def parse_words(
52
- self,
53
- string: str
54
- ) -> List[str]:
55
-
56
- words = string.strip()
57
- if words:
58
- words = [word.strip() for word in words.split(',') if word != ""]
59
- return words
60
-
61
  def check_oov(
62
  self,
63
  wordlists: List[str]
@@ -233,11 +222,11 @@ class WordExplorer:
233
  plt.show()
234
  return fig
235
 
236
- # ToDo: No encuentro donde se usa este método. ¿Borrar?
237
  def doesnt_match(
238
  self,
239
- wordlist
240
- ):
241
 
242
  err = self.check_oov([wordlist])
243
  if err:
 
47
 
48
  return out_msj
49
 
 
 
 
 
 
 
 
 
 
 
 
50
  def check_oov(
51
  self,
52
  wordlists: List[str]
 
222
  plt.show()
223
  return fig
224
 
225
+ # ToDo: No hay usos de este método. ¿Borrar?
226
  def doesnt_match(
227
  self,
228
+ wordlist: List[str]
229
+ ) -> str:
230
 
231
  err = self.check_oov([wordlist])
232
  if err:
modules/module_ann.py CHANGED
@@ -2,7 +2,7 @@ import time
2
  from tqdm import tqdm
3
  from annoy import AnnoyIndex
4
  from memory_profiler import profile
5
- from typing import List, Any
6
 
7
  class TicToc:
8
  def __init__(
@@ -29,8 +29,8 @@ class Ann:
29
  def __init__(
30
  self,
31
  words: List[str],
32
- vectors: List[float],
33
- coord: List[float],
34
  ) -> None:
35
 
36
  self.words = words
@@ -43,11 +43,11 @@ class Ann:
43
  def init(self,
44
  n_trees: int=10,
45
  metric: str='angular',
46
- n_jobs: int=-1
47
  ) -> None:
48
 
49
- # metrics options = "angular", "euclidean", "manhattan", "hamming", or "dot"
50
- # n_jobs=-1 Run over all CPU availables
51
 
52
  print("\tInit tree...")
53
  self.tt.start()
@@ -80,13 +80,13 @@ class Ann:
80
  ) -> List[str]:
81
 
82
  word_id = self.__getWordId(word)
83
- neighbords_list = None
84
 
85
  if word_id != None:
86
  neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
87
- neighbords_list = [self.words[idx] for idx in neighbords_id][1:]
88
 
89
  else:
90
  print(f"The word '{word}' does not exist")
91
 
92
- return neighbords_list
 
2
  from tqdm import tqdm
3
  from annoy import AnnoyIndex
4
  from memory_profiler import profile
5
+ from typing import List
6
 
7
  class TicToc:
8
  def __init__(
 
29
  def __init__(
30
  self,
31
  words: List[str],
32
+ vectors: List,
33
+ coord: List,
34
  ) -> None:
35
 
36
  self.words = words
 
43
  def init(self,
44
  n_trees: int=10,
45
  metric: str='angular',
46
+ n_jobs: int=-1 # n_jobs=-1 Run over all available CPUs
47
  ) -> None:
48
 
49
+ availables_metrics = ['angular','euclidean','manhattan','hamming','dot']
50
+ assert(metric in availables_metrics), f"Error: The value of the parameter 'metric' can only be {availables_metrics}!"
51
 
52
  print("\tInit tree...")
53
  self.tt.start()
 
80
  ) -> List[str]:
81
 
82
  word_id = self.__getWordId(word)
83
+ neighbors_list = None
84
 
85
  if word_id != None:
86
  neighbords_id = self.tree.get_nns_by_item(word_id, n_neighbors + 1)
87
+ neighbors_list = [self.words[idx] for idx in neighbords_id][1:]
88
 
89
  else:
90
  print(f"The word '{word}' does not exist")
91
 
92
+ return neighbors_list
modules/module_logsManager.py CHANGED
@@ -1,26 +1,36 @@
1
- import csv, os, pytz
 
2
  from gradio import utils
3
- from datetime import datetime
4
- from dotenv import load_dotenv
5
- from distutils.log import debug
6
  from typing import Any, List, Optional
7
- from gradio.components import IOComponent
8
- from gradio.flagging import FlaggingCallback, _get_dataset_features_info
 
9
 
10
 
11
  # --- Load environments vars ---
12
  load_dotenv()
13
 
 
14
  # --- Classes declaration ---
15
  class DateLogs:
16
- def __init__(self, zone="America/Argentina/Cordoba"):
 
 
 
 
17
  self.time_zone = pytz.timezone(zone)
18
 
19
- def full(self):
 
 
 
20
  now = datetime.now(self.time_zone)
21
  return now.strftime("%H:%M:%S %d-%m-%Y")
22
 
23
- def day(self):
 
 
 
24
  now = datetime.now(self.time_zone)
25
  return now.strftime("%d-%m-%Y")
26
 
@@ -45,7 +55,7 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
45
  organization: Optional[str]=os.getenv('ORG_NAME'),
46
  private: bool=True,
47
  available_logs: bool=False
48
- ):
49
  """
50
  Parameters:
51
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
@@ -54,7 +64,7 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
54
  private: Whether the dataset should be private (defaults to False).
55
  """
56
  assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
57
-
58
  self.hf_token = hf_token
59
  self.dataset_name = dataset_name
60
  self.organization_name = organization
@@ -67,10 +77,10 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
67
 
68
 
69
  def setup(
70
- self,
71
- components: List[IOComponent],
72
- flagging_dir: str
73
- ):
74
  """
75
  Params:
76
  flagging_dir (str): local directory where the dataset is cloned,
@@ -114,9 +124,9 @@ class HuggingFaceDatasetSaver(FlaggingCallback):
114
  def flag(
115
  self,
116
  flag_data: List[Any],
117
- flag_option: Optional[str] = None,
118
- flag_index: Optional[int] = None,
119
- username: Optional[str] = None,
120
  ) -> int:
121
 
122
  if self.available_logs:
 
1
+ from gradio.flagging import FlaggingCallback, _get_dataset_features_info
2
+ from gradio.components import IOComponent
3
  from gradio import utils
 
 
 
4
  from typing import Any, List, Optional
5
+ from dotenv import load_dotenv
6
+ from datetime import datetime
7
+ import csv, os, pytz
8
 
9
 
10
  # --- Load environments vars ---
11
  load_dotenv()
12
 
13
+
14
  # --- Classes declaration ---
15
  class DateLogs:
16
+ def __init__(
17
+ self,
18
+ zone: str="America/Argentina/Cordoba"
19
+ ) -> None:
20
+
21
  self.time_zone = pytz.timezone(zone)
22
 
23
+ def full(
24
+ self
25
+ ) -> str:
26
+
27
  now = datetime.now(self.time_zone)
28
  return now.strftime("%H:%M:%S %d-%m-%Y")
29
 
30
+ def day(
31
+ self
32
+ ) -> str:
33
+
34
  now = datetime.now(self.time_zone)
35
  return now.strftime("%d-%m-%Y")
36
 
 
55
  organization: Optional[str]=os.getenv('ORG_NAME'),
56
  private: bool=True,
57
  available_logs: bool=False
58
+ ) -> None:
59
  """
60
  Parameters:
61
  hf_token: The HuggingFace token to use to create (and write the flagged sample to) the HuggingFace dataset.
 
64
  private: Whether the dataset should be private (defaults to False).
65
  """
66
  assert(dataset_name is not None), "Error: Parameter 'dataset_name' cannot be empty!."
67
+
68
  self.hf_token = hf_token
69
  self.dataset_name = dataset_name
70
  self.organization_name = organization
 
77
 
78
 
79
  def setup(
80
+ self,
81
+ components: List[IOComponent],
82
+ flagging_dir: str
83
+ ) -> None:
84
  """
85
  Params:
86
  flagging_dir (str): local directory where the dataset is cloned,
 
124
  def flag(
125
  self,
126
  flag_data: List[Any],
127
+ flag_option: Optional[str]=None,
128
+ flag_index: Optional[int]=None,
129
+ username: Optional[str]=None,
130
  ) -> int:
131
 
132
  if self.available_logs: