nanom committed on
Commit
ad445e5
·
1 Parent(s): 3f5e308

Embedding class fix: neighbors bug, added max_n neighbors, typing, etc.

Browse files
app.py CHANGED
@@ -4,26 +4,34 @@ import pandas as pd
4
 
5
 
6
  # --- Imports modules ---
7
- from modules.model_embbeding import Embedding
 
8
 
9
  # --- Imports interfaces ---
10
- from interfaces.interface_WordExplorer import interface as wordExplorer_interface
11
  from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
12
 
 
13
  # --- Tool config ---
14
  AVAILABLE_LOGS = True # [True | False]
15
  LANGUAGE = "spanish" # [spanish | english]
16
  EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
 
 
17
 
18
  # --- Init classes ---
19
  embedding = Embedding(
20
  path=EMBEDDINGS_PATH,
21
  binary=EMBEDDINGS_PATH.endswith('.bin'),
22
  limit=None,
23
- randomizedPCA=False
 
24
  )
 
 
25
  labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
26
 
 
27
  # --- Main App ---
28
  INTERFACE_LIST = [
29
  biasWordExplorer_interface(
@@ -33,6 +41,7 @@ INTERFACE_LIST = [
33
  wordExplorer_interface(
34
  embedding=embedding,
35
  available_logs=AVAILABLE_LOGS,
 
36
  lang=LANGUAGE),
37
  ]
38
 
 
4
 
5
 
6
  # --- Imports modules ---
7
+ from modules.model_embbeding import Embedding # Fix and Updated
8
+
9
 
10
  # --- Imports interfaces ---
11
+ from interfaces.interface_WordExplorer import interface as wordExplorer_interface # Updated
12
  from interfaces.interface_BiasWordExplorer import interface as biasWordExplorer_interface
13
 
14
+
15
  # --- Tool config ---
16
  AVAILABLE_LOGS = True # [True | False]
17
  LANGUAGE = "spanish" # [spanish | english]
18
  EMBEDDINGS_PATH = "data/fasttext-sbwc.100k.vec"
19
+ MAX_NEIGHBORS = 20 # Updated
20
+
21
 
22
  # --- Init classes ---
23
  embedding = Embedding(
24
  path=EMBEDDINGS_PATH,
25
  binary=EMBEDDINGS_PATH.endswith('.bin'),
26
  limit=None,
27
+ randomizedPCA=False,
28
+ max_neighbors=MAX_NEIGHBORS # Updated
29
  )
30
+
31
+ # --- Init Vars ---
32
  labels = pd.read_json(f"language/{LANGUAGE}.json")["app"]
33
 
34
+
35
  # --- Main App ---
36
  INTERFACE_LIST = [
37
  biasWordExplorer_interface(
 
41
  wordExplorer_interface(
42
  embedding=embedding,
43
  available_logs=AVAILABLE_LOGS,
44
+ max_neighbors=MAX_NEIGHBORS, # Updated
45
  lang=LANGUAGE),
46
  ]
47
 
data/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ __pycache__/
2
+ data_loader.py
interfaces/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
interfaces/interface_WordExplorer.py CHANGED
@@ -3,13 +3,19 @@ import pandas as pd
3
  import matplotlib.pyplot as plt
4
 
5
  from tool_info import TOOL_INFO
6
- from modules.module_connection import WordExplorerConnector
7
  from modules.module_logsManager import HuggingFaceDatasetSaver
8
  from examples.examples import examples_explorar_relaciones_entre_palabras
9
 
10
  plt.rcParams.update({'font.size': 14})
11
 
12
- def interface(embedding, available_logs, lang="spanish"):
 
 
 
 
 
 
13
  # --- Init logs ---
14
  log_callback = HuggingFaceDatasetSaver(
15
  available_logs=available_logs
@@ -53,10 +59,10 @@ def interface(embedding, available_logs, lang="spanish"):
53
  with gr.Row():
54
  with gr.Row():
55
  gr.Markdown(labels["plotNeighbours"]["title"])
56
- n_neighbors = gr.Slider(minimum=0,maximum=100,step=1,label=labels["plotNeighbours"]["quantity"])
57
  with gr.Row():
58
  alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
59
- fontsize=gr.Number(value=18, label=labels["options"]["font-size"])
60
  with gr.Row():
61
  btn_plot = gr.Button(labels["plot_button"])
62
  with gr.Row():
 
3
  import matplotlib.pyplot as plt
4
 
5
  from tool_info import TOOL_INFO
6
+ from modules.module_connection import WordExplorerConnector # Updated
7
  from modules.module_logsManager import HuggingFaceDatasetSaver
8
  from examples.examples import examples_explorar_relaciones_entre_palabras
9
 
10
  plt.rcParams.update({'font.size': 14})
11
 
12
+ def interface(
13
+ embedding,
14
+ available_logs: bool,
15
+ max_neighbors: int, # Updated
16
+ lang: str="spanish",
17
+ ) -> gr.Blocks:
18
+
19
  # --- Init logs ---
20
  log_callback = HuggingFaceDatasetSaver(
21
  available_logs=available_logs
 
59
  with gr.Row():
60
  with gr.Row():
61
  gr.Markdown(labels["plotNeighbours"]["title"])
62
+ n_neighbors = gr.Slider(minimum=0,maximum=max_neighbors,step=1,label=labels["plotNeighbours"]["quantity"])
63
  with gr.Row():
64
  alpha = gr.Slider(minimum=0.1,maximum=0.9, value=0.3, step=0.1,label=labels["options"]["transparency"])
65
+ fontsize=gr.Number(value=25, label=labels["options"]["font-size"])
66
  with gr.Row():
67
  btn_plot = gr.Button(labels["plot_button"])
68
  with gr.Row():
modules/.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ __pycache__/
modules/model_embbeding.py CHANGED
@@ -1,58 +1,127 @@
 
 
 
 
 
 
1
  import os
2
  import operator
3
- import numpy as np
4
  import pandas as pd
 
 
5
  from numpy import dot
6
  from gensim import matutils
7
- from modules.module_ann import Ann
8
- from memory_profiler import profile
9
- from sklearn.neighbors import NearestNeighbors
10
- from data.data_loader import load_embeddings
11
 
12
 
13
  class Embedding:
14
  @profile
15
- def __init__(self, path, binary, limit = None, randomizedPCA = False):
16
- # Dataset info
 
 
 
 
 
 
 
17
  self.path = path
 
 
 
 
18
 
19
- # Pandas dataset
20
  self.ds = None
21
 
22
- # All Words embedding List[List[float]]
23
- self.embedding = None
24
-
25
- # Estimate AproximateNearestNeighbors
26
- self.ann = None
27
 
28
  # Load embedding and pca dataset
29
- self.__load(binary, limit, randomizedPCA)
30
 
31
- def __contains__(self, word):
32
- return word in self.ds['word'].to_list()
 
33
 
34
- def __load(self, binary, limit, randomizedPCA):
35
  print(f"Preparing {os.path.basename(self.path)} embeddings...")
36
 
37
  # --- Prepare dataset ---
38
- self.ds = load_embeddings(self.path, binary, randomizedPCA, limit)
39
-
40
- # --- Get embedding from string
41
- self.embedding = self.ds['embedding'].to_list()
42
 
43
- # --- Get forest tree to estimate Nearest Neighbors ---
 
44
  self.ann = Ann(
45
  words=self.ds['word'],
46
  vectors=self.ds['embedding'],
47
  coord=self.ds['pca']
48
  )
49
- self.ann.init(n_trees=20, metric='dot', n_jobs=-1)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- # --- Fit Sklearn NN method ---
52
- self.neigh = NearestNeighbors(n_neighbors=20)
53
- self.neigh.fit(self.embedding)
54
 
55
- def __getValue(self, word, feature):
 
 
 
 
56
  word_id, value = None, None
57
 
58
  if word in self:
@@ -63,30 +132,56 @@ class Embedding:
63
 
64
  return value
65
 
66
- def getEmbedding(self, word):
 
 
 
 
67
  return self.__getValue(word, 'embedding')
68
 
69
- def getPCA(self, word):
 
 
 
 
70
  return self.__getValue(word, 'pca')
71
 
72
- def cosineSimilarities(self, vector_1, vectors_all):
73
- norm = np.linalg.norm(vector_1)
74
- all_norms = np.linalg.norm(vectors_all, axis=1)
75
- dot_products = dot(vectors_all, vector_1)
76
- similarities = dot_products / (norm * all_norms)
77
- return similarities
78
-
79
- def getNearestNeighbors(self, word, n_neighbors=10, nn_method='sklearn'):
 
80
  if nn_method == 'ann':
81
  words = self.ann.get(word, n_neighbors)
 
82
  elif nn_method == 'sklearn':
83
- word_emb = self.getEmbedding(word)
84
- neighbors = self.neigh.kneighbors([word_emb], n_neighbors)[1][0]
85
- words = operator.itemgetter(*neighbors)(self.ds['word'])
86
  else:
87
  words = []
88
  return words
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def getCosineSimilarities(self, w1, w2):
91
  return dot(
92
  matutils.unitvec(self.getEmbedding(w1)),
 
1
+ from modules.module_ann import Ann
2
+ from memory_profiler import profile
3
+ from sklearn.neighbors import NearestNeighbors
4
+ from sklearn.decomposition import PCA
5
+ from gensim.models import KeyedVectors
6
+ from typing import List
7
  import os
8
  import operator
 
9
  import pandas as pd
10
+
11
+ import numpy as np
12
  from numpy import dot
13
  from gensim import matutils
 
 
 
 
14
 
15
 
16
  class Embedding:
17
  @profile
18
+ def __init__(self,
19
+ path: str,
20
+ binary: bool,
21
+ limit: int=None,
22
+ randomizedPCA: bool=False,
23
+ max_neighbors: int=20
24
+ ) -> None:
25
+
26
+ # Embedding vars
27
  self.path = path
28
+ self.limit = limit
29
+ self.randomizedPCA = randomizedPCA
30
+ self.binary = binary
31
+ self.max_neighbors = max_neighbors
32
 
33
+ # Full embedding dataset
34
  self.ds = None
35
 
36
+ # Estimate NearestNeighbors
37
+ self.ann = None # Approximate with Annoy method
38
+ self.neigh = None # Exact with Sklearn method
 
 
39
 
40
  # Load embedding and pca dataset
41
+ self.__load()
42
 
43
+ def __load(
44
+ self,
45
+ ) -> None:
46
 
 
47
  print(f"Preparing {os.path.basename(self.path)} embeddings...")
48
 
49
  # --- Prepare dataset ---
50
+ self.ds = self.__preparate(
51
+ self.path, self.binary, self.limit, self.randomizedPCA
52
+ )
 
53
 
54
+ # --- Estimate Nearest Neighbors
55
+ # Method A: Through Annoy, using a forest of trees
56
  self.ann = Ann(
57
  words=self.ds['word'],
58
  vectors=self.ds['embedding'],
59
  coord=self.ds['pca']
60
  )
61
+ self.ann.init(
62
+ n_trees=20, metric='dot', n_jobs=-1
63
+ )
64
+
65
+ # Method B: Through Sklearn method
66
+ self.neigh = NearestNeighbors(
67
+ n_neighbors=self.max_neighbors
68
+ )
69
+ self.neigh.fit(
70
+ X=self.ds['embedding'].to_list()
71
+ )
72
+
73
+ def __preparate(
74
+ self,
75
+ path: str,
76
+ binary: bool,
77
+ limit: int,
78
+ randomizedPCA: bool
79
+ ) -> pd.DataFrame:
80
+
81
+ if randomizedPCA:
82
+ pca = PCA(
83
+ n_components=2,
84
+ copy=False,
85
+ whiten=False,
86
+ svd_solver='randomized',
87
+ iterated_power='auto'
88
+ )
89
+
90
+ else:
91
+ pca = PCA(
92
+ n_components=2
93
+ )
94
+
95
+ print("--------> PATH:", path)
96
+ model = KeyedVectors.load_word2vec_format(
97
+ fname=path,
98
+ binary=binary,
99
+ limit=limit
100
+ )
101
+
102
+ # Cased Vocab
103
+ cased_words = model.index_to_key
104
+ cased_emb = model.get_normed_vectors()
105
+ cased_pca = pca.fit_transform(cased_emb)
106
+
107
+ df_cased = pd.DataFrame(
108
+ zip(
109
+ cased_words,
110
+ cased_emb,
111
+ cased_pca
112
+ ),
113
+ columns=['word', 'embedding', 'pca']
114
+ )
115
 
116
+ df_cased['word'] = df_cased.word.apply(lambda w: w.lower())
117
+ df_uncased = df_cased.drop_duplicates(subset='word')
118
+ return df_uncased
119
 
120
+ def __getValue(
121
+ self,
122
+ word: str,
123
+ feature: str
124
+ ):
125
  word_id, value = None, None
126
 
127
  if word in self:
 
132
 
133
  return value
134
 
135
+ def getEmbedding(
136
+ self,
137
+ word: str
138
+ ):
139
+
140
  return self.__getValue(word, 'embedding')
141
 
142
+ def getPCA(
143
+ self,
144
+ word: str
145
+ ):
146
+
147
  return self.__getValue(word, 'pca')
148
 
149
+ def getNearestNeighbors(
150
+ self,
151
+ word: str,
152
+ n_neighbors: int=10,
153
+ nn_method: str='sklearn'
154
+ ) -> List[str]:
155
+
156
+ assert(n_neighbors <= self.max_neighbors), f"Error: The value of the parameter 'n_neighbors:{n_neighbors}' must less than or equal to {self.max_neighbors}!."
157
+
158
  if nn_method == 'ann':
159
  words = self.ann.get(word, n_neighbors)
160
+
161
  elif nn_method == 'sklearn':
162
+ word_emb = self.getEmbedding(word).reshape(1,-1)
163
+ _, nn_ids = self.neigh.kneighbors(word_emb, n_neighbors)
164
+ words = operator.itemgetter(*nn_ids[0])(self.ds['word'].to_list())
165
  else:
166
  words = []
167
  return words
168
 
169
+ def __contains__(
170
+ self,
171
+ word: str
172
+ ) -> bool:
173
+
174
+ return word in self.ds['word'].to_list()
175
+
176
+ # ToDo: Revisar estos dos métodos usados en la pestaña sesgoEnPalabras
177
+ # ya que ahora los embedding vienen normalizados
178
+ def cosineSimilarities(self, vector_1, vectors_all):
179
+ norm = np.linalg.norm(vector_1)
180
+ all_norms = np.linalg.norm(vectors_all, axis=1)
181
+ dot_products = dot(vectors_all, vector_1)
182
+ similarities = dot_products / (norm * all_norms)
183
+ return similarities
184
+
185
  def getCosineSimilarities(self, w1, w2):
186
  return dot(
187
  matutils.unitvec(self.getEmbedding(w1)),
modules/module_WordExplorer.py CHANGED
@@ -142,10 +142,13 @@ class WordExplorer:
142
  processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
143
 
144
  if n_neighbors > 0:
 
 
145
  neighbors = self.get_neighbors(word,
146
- n_neighbors=n_neighbors+1,
147
- nn_method=kwargs.get('nn_method', 'sklearn')
148
- )
 
149
  for n in neighbors:
150
  if n not in [wtp.word for wtp in processed_word_list]:
151
  processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
 
142
  processed_word_list.append(WordToPlot(word, color_dict[color], color, 1))
143
 
144
  if n_neighbors > 0:
145
+ # Updated: Con el agregado del parámetro max_neighbors, el (n_neighbors+1)
146
+ # hacía superar ese valor máximo y se producía una aserción
147
  neighbors = self.get_neighbors(word,
148
+ # n_neighbors=n_neighbors+1,
149
+ n_neighbors=n_neighbors,
150
+ nn_method=kwargs.get('nn_method', 'sklearn')
151
+ )
152
  for n in neighbors:
153
  if n not in [wtp.word for wtp in processed_word_list]:
154
  processed_word_list.append(WordToPlot(n, color_dict[color], color, n_alpha))
modules/module_connection.py CHANGED
@@ -3,7 +3,7 @@ import pandas as pd
3
  import gradio as gr
4
  from abc import ABC, abstractmethod
5
 
6
- from modules.module_WordExplorer import WordExplorer
7
  from modules.module_BiasExplorer import WordBiasExplorer
8
 
9
  class Connector(ABC):
 
3
  import gradio as gr
4
  from abc import ABC, abstractmethod
5
 
6
+ from modules.module_WordExplorer import WordExplorer # Updated
7
  from modules.module_BiasExplorer import WordBiasExplorer
8
 
9
  class Connector(ABC):