Lucia Gonzalez committed on
Commit 41dd0ce · 1 Parent(s): dbdf324

Added code, missing vecs

README.md CHANGED
@@ -1,11 +1,11 @@
  ---
  title: Explorar Sesgos
- emoji: 🚀
- colorFrom: yellow
- colorTo: green
+ emoji: 🌖
+ colorFrom: indigo
+ colorTo: red
  sdk: gradio
- sdk_version: 3.3.1
- app_file: app.py
+ sdk_version: 3.2
+ app_file: explorar_sesgo_en_palabras.py
  pinned: false
  license: mit
  ---
examples/__pycache__/examples.cpython-38.pyc ADDED
Binary file (3.31 kB).
 
examples/examples.py ADDED
@@ -0,0 +1,59 @@
+ # Example word lists (word -> sample context) used to prefill the Gradio demo.
+ example_fem = {
+     "mujer": "la mente de una mujer que durante los últimos",
+     "chica": "enamorado de la misma chica desde la infancia mary",
+     "ella": "ella llego a la final",
+     "madre": "su padre y su madre margarita de parma",
+     "hija": "hija de inmigrantes españoles en",
+     "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
+ }
+ example_joven = {
+     "joven": "",
+     "inmaduro": "",
+     "niño": "",
+     "crio": ""
+ }
+ example_viejo = {
+     "viejo": "",
+     "maduro": "",
+     "anciano": "",
+     "adulto": ""
+ }
+
+
+ example_masc = {
+     "hombre": "deseo innato que todo hombre tiene de comunicar su",
+     "chico": "fue un chico interesado en artes",
+     "el": "el parque nacional liwonde",
+     "padre": "la muerte de su padre en 1832 se formó",
+     "hijo": "le dice a su hijo aún no nacido como",
+     "masculino": "el mito es esencialmente masculino y entre las causas",
+ }
+
+ example_diagnose = {
+     "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
+     "educación": "sentido de vida religión educación y cultura para cada mujer",
+     "pagado": "un rescate muy grande pagado por sus seguidores a",
+     "cocinar": "empezó a cocinar una sopa usando",
+     "lavar": "era directamente usado para lavar ropa por eso la",
+     "deporte": "se convirtió en el deporte más popular del país",
+     "ropa": "usan el kimono una ropa tradicional japonesa",
+     "pelea": "mal por la violenta pelea entre ambos hermanos",
+     "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
+     "ganar": "una necesidad un modo de ganar",
+     "líder": "del estado en manos del líder opositor henrique capriles para el",
+     "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
+     "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
+     "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
+     "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
+     "reina": "año ganó el título de reina de la bahía en el"
+ }
+
+
+ # Comma-separated strings expected by the Gradio text boxes.
+ fem_words = ','.join(example_fem.keys())
+ fem_contexts = ','.join(example_fem.values())
+ masc_words = ','.join(example_masc.keys())
+ masc_contexts = ','.join(example_masc.values())
+ young_words = ','.join(example_joven.keys())
+ old_words = ','.join(example_viejo.keys())
+ diagnose_words = ','.join(example_diagnose.keys())
+ diagnose_contexts = ','.join(example_diagnose.values())
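A minimal sketch of what these helpers produce, assuming it is run from the repo root so the `examples` package is importable; the demo strings are just the comma-joined dictionary keys:

```python
# Minimal sketch: inspect the comma-joined word lists fed to the UI.
from examples.examples import fem_words, masc_words

print(fem_words)   # mujer,chica,ella,madre,hija,femenino
print(masc_words)  # hombre,chico,el,padre,hijo,masculino
```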
explorar_sesgo_en_palabras.py ADDED
@@ -0,0 +1,82 @@
+ import matplotlib as mpl
+ mpl.use('Agg')  # headless backend: figures are rendered to arrays, not shown
+ import gradio as gr
+
+ from utils.modules_sesgo_en_palabras import WEBiasExplorer2d, WEBiasExplorer4d, Embedding
+ from examples.examples import fem_words, masc_words, old_words, young_words, diagnose_words
+
+ from tool_info import TOOL_INFO
+
+ word_vectors_path = 'fasttext-sbwc.100k.vec'
+ # Embedding.__init__ already loads the vectors and L2-normalizes them,
+ # so no extra load/init_sims calls are needed here.
+ we = Embedding(word_vectors_path)
+
+ LABEL_WORD_LIST_1 = 'Lista de palabras 1'
+ LABEL_WORD_LIST_2 = 'Lista de palabras 2'
+ LABEL_WORD_LIST_3 = 'Lista de palabras 3'
+ LABEL_WORD_LIST_4 = 'Lista de palabras 4'
+
+ LABEL_WORD_LIST_DIAGNOSE = 'Lista de palabras a diagnosticar'
+
+ we_bias_2d = WEBiasExplorer2d(we.wv)
+ we_bias_4d = WEBiasExplorer4d(we.wv)
+
+ explorar_sesgo_en_palabras_interface = gr.Blocks()
+ with explorar_sesgo_en_palabras_interface:
+     gr.Markdown("1. Escribí palabras para diagnosticar separadas por comas")
+     with gr.Row():
+         with gr.Column():
+             with gr.Row():
+                 diagnose_list = gr.Textbox(lines=2, label=LABEL_WORD_LIST_DIAGNOSE)
+             with gr.Row():
+                 gr.Markdown("2. Para graficar 2 espacios, completa las siguientes listas:")
+             with gr.Row():
+                 wordlist_1 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_1)
+                 wordlist_2 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_2)
+             with gr.Row():
+                 gr.Markdown("3. Para graficar 4 espacios, completa las siguientes listas:")
+             with gr.Row():
+                 wordlist_3 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_3)
+                 wordlist_4 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_4)
+         with gr.Column():
+             with gr.Row():
+                 bias2d = gr.Button('¡Graficar 2 estereotipos!')
+             with gr.Row():
+                 bias4d = gr.Button('¡Graficar 4 estereotipos!')
+             with gr.Row():
+                 err_msg = gr.Markdown(label='', visible=True)
+             with gr.Row():
+                 bias_plot = gr.Image(shape=(15, 15))
+             with gr.Row():
+                 examples_2d = gr.Examples(
+                     fn=we_bias_2d.calculate_bias,
+                     inputs=[wordlist_1, wordlist_2, diagnose_list],
+                     outputs=[bias_plot, err_msg],
+                     examples=[
+                         [fem_words, masc_words, diagnose_words],
+                         [young_words, old_words, diagnose_words]
+                     ]
+                 )
+             with gr.Row():
+                 examples_4d = gr.Examples(
+                     fn=we_bias_4d.calculate_bias,
+                     inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+                     outputs=[bias_plot, err_msg],
+                     examples=[[fem_words, masc_words, young_words, old_words, diagnose_words]]
+                 )
+     with gr.Row():
+         gr.Markdown(TOOL_INFO)
+
+     bias2d.click(
+         fn=we_bias_2d.calculate_bias,
+         inputs=[wordlist_1, wordlist_2, diagnose_list],
+         outputs=[bias_plot, err_msg])
+     bias4d.click(
+         fn=we_bias_4d.calculate_bias,
+         inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
+         outputs=[bias_plot, err_msg])
+
+
+ explorar_sesgo_en_palabras_interface.queue(concurrency_count=10)
+ explorar_sesgo_en_palabras_interface.launch()
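The explorers can also be driven without the Gradio UI. A usage sketch, assuming the `fasttext-sbwc.100k.vec` file is available locally and the script runs from the repo root; `calculate_bias` returns an image array plus an error string:

```python
# Hypothetical headless run of the same pipeline (no Gradio UI).
import matplotlib
matplotlib.use('Agg')  # render off-screen, as the Space does

from utils.modules_sesgo_en_palabras import WEBiasExplorer2d, Embedding
from examples.examples import fem_words, masc_words, diagnose_words

we = Embedding('fasttext-sbwc.100k.vec')  # path assumed to exist locally
explorer = WEBiasExplorer2d(we.wv)

im, err = explorer.calculate_bias(fem_words, masc_words, diagnose_words)
if err:
    print(err)         # HTML-formatted error message
else:
    print(im.shape)    # RGB array of the rendered bar plot
```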
requirements.txt ADDED
@@ -0,0 +1,13 @@
+ # sesgos_en_frases
+ regex
+ # datos
+ torch
+ transformers
+ # resto
+ scikit-learn
+ gensim==3.7.3
+ tensorflow
+ matplotlib
+ numpy
+ seaborn
tool_info.py ADDED
@@ -0,0 +1,23 @@
+ TOOL_INFO = """
+ > ### A tool to overcome technical barriers for bias assessment in human language technologies
+
+ * [Read Full Paper](https://arxiv.org/abs/2207.06591)
+
+ > ### Licensing Information
+ * [MIT Licence](https://huggingface.co/spaces/vialibre/new_bias_tools/resolve/main/LICENSE)
+
+ > ### Citation Information
+ ```bibtex
+ @misc{https://doi.org/10.48550/arxiv.2207.06591,
+     doi = {10.48550/ARXIV.2207.06591},
+     url = {https://arxiv.org/abs/2207.06591},
+     author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
+     keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
+     title = {A tool to overcome technical barriers for bias assessment in human language technologies},
+     publisher = {arXiv},
+     year = {2022},
+     copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
+ }
+ ```
+ """
utils/__pycache__/modules_sesgo_en_palabras.cpython-38.pyc ADDED
Binary file (18.5 kB).
 
utils/__pycache__/utils_sesgo_en_palabras.cpython-38.pyc ADDED
Binary file (7.99 kB).
 
utils/modules_sesgo_en_palabras.py ADDED
@@ -0,0 +1,714 @@
+ import copy
+
+ from sklearn.decomposition import PCA
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import seaborn as sns
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+ import pandas as pd
+ from gensim.models import KeyedVectors
+ from utils.utils_sesgo_en_palabras import (
+     cosine_similarity,
+     normalize,
+     project_params,
+     take_two_sides_extreme_sorted
+ )
+
+
+ DIRECTION_METHODS = ['single', 'sum', 'pca']
+ DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
+ FIRST_PC_THRESHOLD = 0.5
+ MAX_NON_SPECIFIC_EXAMPLES = 1000
+
+ __all__ = ['BiasExplorer', 'WEBiasExplorer2d', 'WEBiasExplorer4d', 'Embedding']
+
+
+ class Loader():
+     def __init__(self):
+         self.path_to_data = ''
+
+     def load_tokenizer(self, tokenizer_path):
+         tokenizer = AutoTokenizer.from_pretrained(
+             tokenizer_path, do_lower_case=True)
+         return tokenizer
+
+     def load_data_from_file(self, data):
+         return data
+
+     def load_corpus_from_file(self, data):
+         return data
+
+     def load_language_model(self, model_path):
+         model = AutoModelForMaskedLM.from_pretrained(
+             model_path, output_hidden_states=True)
+         return model
+
+
+ class Corpus():
+     def __init__(self, corpus) -> None:
+         # Set the corpus before deriving anything from it.
+         self.corpus = corpus
+         self.vocabulary = self.load_vocabulary_from_corpus()
+
+     def load_vocabulary_from_corpus(self):
+         # Placeholder: vocabulary extraction is not implemented yet.
+         return None
+
+     def get_context_from_text(self, word):
+         pass
+
+     def get_frequency(self, word):
+         pass
+
+     def get_most_frequent_coocurrence(self, word):
+         pass
+
+
+ class Embedding():
+     def __init__(self, word_vectors_path) -> None:
+         self.wv = self.load_we_as_keyed_vectors(word_vectors_path)
+
+     def load_we_as_keyed_vectors(self, word_vectors_path):
+         we = KeyedVectors.load_word2vec_format(word_vectors_path)
+         we.init_sims(replace=True)
+         return we
+
+     def get_word_vector(self, word, context=None):
+         # Static embeddings ignore the context argument.
+         return self.wv[word]
+
+
+ class BiasExplorer():
+     def __init__(self, model, only_lower=False, verbose=False,
+                  identify_direction=False, to_normalize=True):
+         # pylint: disable=undefined-variable
+
+         # TODO: this is bad Python, ask someone about it
+         # probably should be a better design
+         # identify_direction doesn't have any meaning
+         # for the base class; the goal is to force this
+         # interface on sub-classes.
+         if self.__class__ == __class__ and identify_direction is not False:
+             raise ValueError('identify_direction must be False'
+                              ' for an instance of {}'
+                              .format(__class__))
+
+         self.model = model
+
+         # TODO: write unit test for when it is False
+         self.only_lower = only_lower
+
+         self._verbose = verbose
+
+         self.direction = None
+         self.positive_end = None
+         self.negative_end = None
+
+         if to_normalize:
+             self.model.init_sims(replace=True)
+
+     def __copy__(self):
+         bias_word_embedding = self.__class__(self.model,
+                                              self.only_lower,
+                                              self._verbose,
+                                              identify_direction=False)
+         bias_word_embedding.direction = copy.deepcopy(self.direction)
+         bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
+         bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
+         return bias_word_embedding
+
+     def __deepcopy__(self, memo):
+         bias_word_embedding = copy.copy(self)
+         bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
+         return bias_word_embedding
+
+     def __getitem__(self, key):
+         return self.model[key]
+
+     def __contains__(self, item):
+         return item in self.model
+
+     def _is_direction_identified(self):
+         if self.direction is None:
+             raise RuntimeError('The direction was not identified'
+                                ' for this {} instance'
+                                .format(self.__class__.__name__))
+
+     def _identify_subspace_by_pca(self, definitional_pairs, n_components):
+         matrix = []
+
+         for word1, word2 in definitional_pairs:
+             vector1 = normalize(self[word1])
+             vector2 = normalize(self[word2])
+
+             center = (vector1 + vector2) / 2
+
+             matrix.append(vector1 - center)
+             matrix.append(vector2 - center)
+
+         pca = PCA(n_components=n_components)
+         pca.fit(matrix)
+
+         if self._verbose:
+             # Report how much variance each principal component explains.
+             for i, ratio in enumerate(pca.explained_variance_ratio_, start=1):
+                 print('Principal Component {}: explained variance ratio {:.3f}'
+                       .format(i, ratio))
+
+         return pca
+
+     def __errorChecking(self, word):
+         out_msj = ""
+
+         if not word:
+             out_msj = "Error: Primero debe ingresar una palabra!"
+         elif word not in self.model:
+             out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"
+
+         if out_msj:
+             out_msj = "<center><h3>" + out_msj + "</h3></center>"
+
+         return out_msj
+
+     # TODO: add the SVD method from section 6 step 1
+     # It seems there is a mistake there, I think it is the same as PCA
+     # just with replacing it with SVD
+     def _identify_direction(self, positive_end, negative_end,
+                             definitional, method='pca'):
+         if method not in DIRECTION_METHODS:
+             raise ValueError('method should be one of {}, {} was given'.format(
+                 DIRECTION_METHODS, method))
+
+         if positive_end == negative_end:
+             raise ValueError('positive_end and negative_end'
+                              ' should be different, and not the same "{}"'
+                              .format(positive_end))
+         if self._verbose:
+             print('Identify direction using {} method...'.format(method))
+
+         direction = None
+
+         if method == 'single':
+             if self._verbose:
+                 print('Positive definitional end:', definitional[0])
+                 print('Negative definitional end:', definitional[1])
+             direction = normalize(normalize(self[definitional[0]])
+                                   - normalize(self[definitional[1]]))
+
+         elif method == 'sum':
+             group1_sum_vector = np.sum([self[word]
+                                         for word in definitional[0]], axis=0)
+             group2_sum_vector = np.sum([self[word]
+                                         for word in definitional[1]], axis=0)
+
+             diff_vector = (normalize(group1_sum_vector)
+                            - normalize(group2_sum_vector))
+
+             direction = normalize(diff_vector)
+
+         elif method == 'pca':
+             pca = self._identify_subspace_by_pca(definitional, 10)
+             if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
+                 raise RuntimeError('The explained variance'
+                                    ' of the first principal component should be'
+                                    ' at least {}, but it is {}'
+                                    .format(FIRST_PC_THRESHOLD,
+                                            pca.explained_variance_ratio_[0]))
+             direction = pca.components_[0]
+
+         # Flip the direction if needed (e.g. we cannot control
+         # what sign the PCA will return).
+         ends_diff_projection = cosine_similarity((self[positive_end]
+                                                   - self[negative_end]),
+                                                  direction)
+         if ends_diff_projection < 0:
+             direction = -direction  # pylint: disable=invalid-unary-operand-type
+
+         self.direction = direction
+         self.positive_end = positive_end
+         self.negative_end = negative_end
+
+     def project_on_direction(self, word):
+         """Project the normalized vector of the word on the direction.
+         :param str word: The word to project
+         :return float: The projection scalar
+         """
+
+         self._is_direction_identified()
+
+         vector = self[word]
+         projection_score = self.model.cosine_similarities(self.direction,
+                                                           [vector])[0]
+         return projection_score
+
+     def _calc_projection_scores(self, words):
+         self._is_direction_identified()
+
+         df = pd.DataFrame({'word': words})
+
+         # TODO: maybe using cosine_similarities on all the vectors?
+         # it might be faster
+         df['projection'] = df['word'].apply(self.project_on_direction)
+         df = df.sort_values('projection', ascending=False)
+
+         return df
+
+     def calc_projection_data(self, words):
+         """
+         Calculate projection, projected and rejected vectors of a words list.
+         :param list words: List of words
+         :return: :class:`pandas.DataFrame` of the projection,
+                  projected and rejected vectors of the words list
+         """
+         projection_data = []
+         for word in words:
+             vector = self[word]
+             normalized_vector = normalize(vector)
+
+             (projection,
+              projected_vector,
+              rejected_vector) = project_params(normalized_vector,
+                                                self.direction)
+
+             projection_data.append({'word': word,
+                                     'vector': vector,
+                                     'projection': projection,
+                                     'projected_vector': projected_vector,
+                                     'rejected_vector': rejected_vector})
+
+         return pd.DataFrame(projection_data)
+
+     def plot_dist_projections_on_direction(self, word_groups, ax=None):
+         """Plot the distribution of projection scalars on the direction.
+         :param dict word_groups: The word groups to project
+         :return: The ax object of the plot
+         """
+
+         if ax is None:
+             _, ax = plt.subplots(1)
+
+         names = sorted(word_groups.keys())
+
+         for name in names:
+             words = word_groups[name]
+             label = '{} (#{})'.format(name, len(words))
+             vectors = [self[word] for word in words]
+             projections = self.model.cosine_similarities(self.direction,
+                                                          vectors)
+             sns.distplot(projections, hist=False, label=label, ax=ax)
+
+         plt.axvline(0, color='k', linestyle='--')
+
+         plt.title('← {} {} {} →'.format(self.negative_end,
+                                         ' ' * 20,
+                                         self.positive_end))
+         plt.xlabel('Direction Projection')
+         plt.ylabel('Density')
+         ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
+
+         return ax
+
+     def parse_words(self, string):
+         """Split a comma-separated string into a list of stripped words."""
+         words = string.strip()
+         if words:
+             words = [word.strip() for word in words.split(',') if word != ""]
+         return words
+
+     def check_oov(self, wordlists):
+         """Return an error message if any word is out of vocabulary, else None."""
+         for wordlist in wordlists:
+             parsed_words = self.parse_words(wordlist)
+             for word in parsed_words:
+                 msg = self.__errorChecking(word)
+                 if msg:
+                     return msg
+         return None
+
+     def plot_projections_2d(self,
+                             wordlist,
+                             wordlist_1,
+                             wordlist_2,
+                             wordlist_3,
+                             wordlist_4,
+                             color_wordlist,
+                             color_wordlist_1,
+                             color_wordlist_2,
+                             color_wordlist_3,
+                             color_wordlist_4,
+                             plot_neighbors,
+                             n_alpha,
+                             fontsize,
+                             figsize=(15, 15),
+                             method='pca'
+                             ):
+         # Convert the word lists to vectors and project them to 2D.
+         choices = [0, 1, 2, 3, 4]
+         word_list = []
+         wordlist_choice = [wordlist, wordlist_1, wordlist_2, wordlist_3, wordlist_4]
+         err = self.check_oov(wordlist_choice)
+         if err:
+             return None, err
+         words_colors = {}
+         label_dict = {
+             0: 'Diagnóstico',
+             1: 'Lista de palabras 1',
+             2: 'Lista de palabras 2',
+             3: 'Lista de palabras 3',
+             4: 'Lista de palabras 4'
+         }
+         color_dict = {
+             0: color_wordlist,
+             1: color_wordlist_1,
+             2: color_wordlist_2,
+             3: color_wordlist_3,
+             4: color_wordlist_4
+         }
+         word_bias_space = {}
+         alpha = {}
+
+         for raw_word_list, color in zip(wordlist_choice, choices):
+             parsed_words = self.parse_words(raw_word_list)
+             if parsed_words:
+                 for word in parsed_words:
+                     word_bias_space[word] = color
+                     words_colors[word] = color_dict[color]
+                     alpha[word] = 1
+                     if plot_neighbors:
+                         neighbors = [w for w, s in self.model.most_similar(word, topn=5)]
+                         for n in neighbors:
+                             if n not in alpha:
+                                 word_bias_space[n] = color
+                                 words_colors[n] = color_dict[color]
+                                 alpha[n] = n_alpha
+                         word_list += neighbors
+                 word_list += parsed_words
+         if not word_list:
+             return None, "<center><h3>Ingresa al menos 2 palabras para continuar</h3></center>"
+         embeddings = [self.model[word] for word in word_list]
+         words_embedded = PCA(
+             n_components=2, random_state=1).fit_transform(embeddings)
+         data = pd.DataFrame(words_embedded)
+         data['word'] = word_list
+         data['color'] = [words_colors[word] for word in word_list]
+         data['alpha'] = [alpha[word] for word in word_list]
+         data['word_bias_space'] = [word_bias_space[word] for word in word_list]
+         fig, ax = plt.subplots(figsize=figsize)
+
+         sns.scatterplot(
+             data=data[data['alpha'] == 1],
+             x=0,
+             y=1,
+             style='word_bias_space',
+             hue='word_bias_space',
+             ax=ax,
+             palette=color_dict
+         )
+         if plot_neighbors:
+             sns.scatterplot(
+                 data=data[data['alpha'] != 1],
+                 x=0,
+                 y=1,
+                 style='color',
+                 hue='word_bias_space',
+                 ax=ax,
+                 alpha=n_alpha,
+                 legend=False,
+                 palette=color_dict
+             )
+         for i, label in enumerate(word_list):
+             x, y = words_embedded[i, :]
+             ax.annotate(label, xy=(x, y), xytext=(5, 2), color=words_colors[label],
+                         textcoords='offset points',
+                         ha='right', va='bottom', size=fontsize, alpha=alpha[label])
+
+         ax.set_xticks([])
+         ax.set_yticks([])
+
+         fig.tight_layout()
+         fig.canvas.draw()
+
+         # Render the figure to an RGB array so Gradio can display it.
+         data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+         w, h = fig.canvas.get_width_height()
+         im = data.reshape((int(h), int(w), -1))
+         return im, ''
+
+
+ class WEBiasExplorer2d(BiasExplorer):
+     def __init__(self, word_embedding) -> None:
+         super().__init__(word_embedding)
+
+     def calculate_bias(
+         self,
+         palabras_extremo_1,
+         palabras_extremo_2,
+         palabras_para_situar
+     ):
+
+         wordlists = [
+             palabras_extremo_1,
+             palabras_extremo_2,
+             palabras_para_situar
+         ]
+         err = self.check_oov(wordlists)
+         for wordlist in wordlists:
+             if not wordlist:
+                 err = "<center><h3>Debe ingresar al menos 1 palabra en las listas de palabras a diagnosticar, sesgo 1 y sesgo 2</h3></center>"
+         if err:
+             return None, err
+
+         palabras_extremo_1 = self.parse_words(palabras_extremo_1)
+         palabras_extremo_2 = self.parse_words(palabras_extremo_2)
+         palabras_para_situar = self.parse_words(palabras_para_situar)
+         im = self.get_bias_plot(
+             palabras_para_situar,
+             definitional=(
+                 palabras_extremo_1, palabras_extremo_2),
+             method='sum',
+             n_extreme=10
+         )
+         return im, ''
+
+     def get_bias_plot(self,
+                       palabras_para_situar,
+                       definitional,
+                       method='sum',
+                       n_extreme=10,
+                       figsize=(10, 10)
+                       ):
+
+         fig, ax = plt.subplots(1, figsize=figsize)
+         self.method = method
+         self.plot_projection_scores(
+             definitional,
+             palabras_para_situar, n_extreme, ax=ax)
+
+         fig.tight_layout()
+         fig.canvas.draw()
+
+         data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+         w, h = fig.canvas.get_width_height()
+         im = data.reshape((int(h), int(w), -1))
+         return im
+
+     def plot_projection_scores(self, definitional,
+                                words, n_extreme=10,
+                                ax=None, axis_projection_step=None):
+         """Plot the projection scalar of words on the direction.
+         :param list words: The words to project
+         :param int or None n_extreme: The number of extreme words to show
+         :return: The ax object of the plot
+         """
+         nombre_del_extremo_1 = ', '.join(definitional[0])
+         nombre_del_extremo_2 = ', '.join(definitional[1])
+
+         self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
+                                  definitional=definitional,
+                                  method='sum')
+
+         self._is_direction_identified()
+
+         projections_df = self._calc_projection_scores(words)
+         projections_df['projection'] = projections_df['projection'].round(2)
+
+         if n_extreme is not None:
+             projections_df = take_two_sides_extreme_sorted(projections_df,
+                                                            n_extreme=n_extreme)
+
+         if ax is None:
+             _, ax = plt.subplots(1)
+
+         if axis_projection_step is None:
+             axis_projection_step = 0.1
+
+         cmap = plt.get_cmap('RdBu')
+         projections_df['color'] = ((projections_df['projection'] + 0.5)
+                                    .apply(cmap))
+
+         most_extreme_projection = np.round(
+             projections_df['projection']
+             .abs()
+             .max(),
+             decimals=1)
+
+         sns.barplot(x='projection', y='word', data=projections_df,
+                     palette=projections_df['color'])
+
+         plt.xticks(np.arange(-most_extreme_projection,
+                              most_extreme_projection + axis_projection_step,
+                              axis_projection_step))
+         xlabel = ('← {} {} {} →'.format(self.negative_end,
+                                         ' ' * 20,
+                                         self.positive_end))
+
+         plt.xlabel(xlabel)
+         plt.ylabel('Words')
+
+         return ax
+
+
+ class WEBiasExplorer4d(BiasExplorer):
+     def __init__(self, word_embedding) -> None:
+         super().__init__(word_embedding)
+
+     def calculate_bias(
+         self,
+         palabras_extremo_1,
+         palabras_extremo_2,
+         palabras_extremo_3,
+         palabras_extremo_4,
+         palabras_para_situar
+     ):
+         wordlists = [
+             palabras_extremo_1,
+             palabras_extremo_2,
+             palabras_extremo_3,
+             palabras_extremo_4,
+             palabras_para_situar
+         ]
+         err = self.check_oov(wordlists)
+         for wordlist in wordlists:
+             if not wordlist:
+                 err = "<center><h3>¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!</h3></center>"
+         if err:
+             return None, err
+
+         palabras_extremo_1 = self.parse_words(palabras_extremo_1)
+         palabras_extremo_2 = self.parse_words(palabras_extremo_2)
+         palabras_extremo_3 = self.parse_words(palabras_extremo_3)
+         palabras_extremo_4 = self.parse_words(palabras_extremo_4)
+
+         palabras_para_situar = self.parse_words(palabras_para_situar)
+
+         im = self.get_bias_plot(
+             palabras_para_situar,
+             definitional_1=(
+                 palabras_extremo_1, palabras_extremo_2),
+             definitional_2=(
+                 palabras_extremo_3, palabras_extremo_4),
+             method='sum',
+             n_extreme=10
+         )
+         return im, ''
+
+     def get_bias_plot(self,
+                       palabras_para_situar,
+                       definitional_1,
+                       definitional_2,
+                       method='sum',
+                       n_extreme=10,
+                       figsize=(10, 10)
+                       ):
+
+         fig, ax = plt.subplots(1, figsize=figsize)
+         self.method = method
+         self.plot_projection_scores(
+             definitional_1,
+             definitional_2,
+             palabras_para_situar, n_extreme, ax=ax)
+         fig.canvas.draw()
+
+         data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
+         w, h = fig.canvas.get_width_height()
+         im = data.reshape((int(h), int(w), -1))
+         return im
+
+     def plot_projection_scores(self, definitional_1, definitional_2,
+                                words, n_extreme=10,
+                                ax=None, axis_projection_step=None):
+         """Plot the projection scalars of words on two directions.
+         :param list words: The words to project
+         :param int or None n_extreme: The number of extreme words to show
+         :return: The ax object of the plot
+         """
+
+         nombre_del_extremo_1 = ', '.join(definitional_1[1])
+         nombre_del_extremo_2 = ', '.join(definitional_1[0])
+
+         self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
+                                  definitional=definitional_1,
+                                  method='sum')
+
+         self._is_direction_identified()
+
+         projections_df = self._calc_projection_scores(words)
+         projections_df['projection_x'] = projections_df['projection'].round(2)
+
+         nombre_del_extremo_3 = ', '.join(definitional_2[1])
+         nombre_del_extremo_4 = ', '.join(definitional_2[0])
+         self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
+                                  definitional=definitional_2,
+                                  method='sum')
+
+         self._is_direction_identified()
+
+         projections_df['projection_y'] = self._calc_projection_scores(words)[
+             'projection'].round(2)
+
+         if n_extreme is not None:
+             projections_df = take_two_sides_extreme_sorted(projections_df,
+                                                            n_extreme=n_extreme)
+
+         if ax is None:
+             _, ax = plt.subplots(1)
+
+         if axis_projection_step is None:
+             axis_projection_step = 0.1
+
+         cmap = plt.get_cmap('RdBu')
+         projections_df['color'] = ((projections_df['projection'] + 0.5)
+                                    .apply(cmap))
+         most_extreme_projection = np.round(
+             projections_df['projection']
+             .abs()
+             .max(),
+             decimals=1)
+         # Per-point colors via plain matplotlib scatter, since seaborn's
+         # palette argument is ignored without a hue column.
+         ax.scatter(projections_df['projection_x'],
+                    projections_df['projection_y'],
+                    c=projections_df['color'])
+
+         plt.xticks(np.arange(-most_extreme_projection,
+                              most_extreme_projection + axis_projection_step,
+                              axis_projection_step))
+         for _, row in projections_df.iterrows():
+             ax.annotate(
+                 row['word'], (row['projection_x'], row['projection_y']))
+         x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
+                                         ' ' * 20,
+                                         nombre_del_extremo_2)
+
+         y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
+                                         ' ' * 20,
+                                         nombre_del_extremo_4)
+
+         plt.xlabel(x_label)
+         ax.xaxis.set_label_position('bottom')
+         ax.xaxis.set_label_coords(.5, 0)
+
+         plt.ylabel(y_label)
+         ax.yaxis.set_label_position('left')
+         ax.yaxis.set_label_coords(0, .5)
+
+         ax.spines['left'].set_position('center')
+         ax.spines['bottom'].set_position('center')
+
+         ax.set_xticks([])
+         ax.set_yticks([])
+
+         return ax
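To make the direction math concrete, here is a small self-contained sketch of the 'sum' method used above, with toy 2-D vectors standing in for the real fastText embedding:

```python
# Toy illustration of the 'sum' direction method (assumed toy vectors,
# not the real fastText embedding).
import numpy as np

def normalize(v):
    norm = np.linalg.norm(v)
    return v if norm == 0 else v / norm

group1 = [np.array([1.0, 0.2]), np.array([0.9, 0.1])]    # e.g. feminine words
group2 = [np.array([-1.0, 0.3]), np.array([-0.8, 0.0])]  # e.g. masculine words

# Difference of the normalized group sums, normalized again.
direction = normalize(normalize(np.sum(group1, axis=0))
                      - normalize(np.sum(group2, axis=0)))

word = np.array([0.7, 0.4])  # a word to diagnose
projection = normalize(word) @ direction
print(round(float(projection), 2))  # positive -> leans toward group1's end
```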
utils/utils_sesgo_en_palabras.py ADDED
@@ -0,0 +1,272 @@
+ import math
+
+ import gensim
+ import matplotlib.pylab as plt
+ import numpy as np
+ import pandas as pd
+ from six import string_types
+ from sklearn.cluster import KMeans
+ from sklearn.manifold import TSNE
+ from sklearn.metrics import accuracy_score
+
+
+ WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
+                               gensim.models.fasttext.FastText,
+                               gensim.models.word2vec.Word2Vec,
+                               gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long
+
+
+ def assert_gensim_keyed_vectors(model):
+     if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
+         type_names = (model_type.__name__
+                       for model_type in WORD_EMBEDDING_MODEL_TYPES)
+         raise TypeError('model should be one of the types'
+                         ' ({}), not {}.'
+                         .format(', '.join(type_names),
+                                 type(model)))
+
+
+ def generate_words_forms(words):
+     """Expand each word into its lower, UPPER and Title case forms."""
+     return sum([generate_one_word_forms(word) for word in words], [])
+
+
+ def cosine_similarity(v, u):
+     """Calculate the cosine similarity between two vectors."""
+     v_norm = np.linalg.norm(v)
+     u_norm = np.linalg.norm(u)
+     similarity = v @ u / (v_norm * u_norm)
+     return similarity
+
+
+ def generate_one_word_forms(word):
+     return [word.lower(), word.upper(), word.title()]
+
+
+ def get_seed_vector(seed, bias_word_embedding):
+
+     if seed == 'direction':
+         positive_end = bias_word_embedding.positive_end
+         negative_end = bias_word_embedding.negative_end
+         bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
+         seed_vector = bias_word_embedding.direction
+     else:
+         if seed == 'ends':
+             positive_end = bias_word_embedding.positive_end
+             negative_end = bias_word_embedding.negative_end
+         else:
+             positive_end, negative_end = seed
+
+         seed_vector = normalize(bias_word_embedding.model[positive_end]
+                                 - bias_word_embedding.model[negative_end])
+
+     return seed_vector, positive_end, negative_end
+
+
+ def most_similar(model, positive=None, negative=None,
+                  topn=10, restrict_vocab=None, indexer=None,
+                  unrestricted=True):
+     """
+     Find the top-N most similar words.
+
+     Positive words contribute positively towards the similarity,
+     negative words negatively.
+
+     This function computes cosine similarity between a simple mean
+     of the projection weight vectors of the given words and
+     the vectors for each word in the model.
+     The function corresponds to the `word-analogy` and `distance`
+     scripts in the original word2vec implementation.
+
+     Based on the Gensim implementation.
+
+     :param model: Word embedding model of ``gensim.model.KeyedVectors``.
+     :param list positive: List of words that contribute positively.
+     :param list negative: List of words that contribute negatively.
+     :param int topn: Number of top-N similar words to return.
+     :param int restrict_vocab: Optional integer which limits the
+                                range of vectors which are searched
+                                for most-similar values. For example,
+                                restrict_vocab=10000 would only check
+                                the first 10000 word vectors in the
+                                vocabulary order. (This may be
+                                meaningful if you've sorted the
+                                vocabulary by descending frequency.)
+     :param bool unrestricted: Whether the results may include words
+                               from the positive or negative input
+                               lists; if False, input words are
+                               filtered out of the results.
+     :return: Sequence of (word, similarity).
+     """
+     if topn is not None and topn < 1:
+         return []
+
+     if positive is None:
+         positive = []
+     if negative is None:
+         negative = []
+
+     model.init_sims()
+
+     if (isinstance(positive, string_types)
+             and not negative):
+         # allow calls like most_similar('dog'),
+         # as a shorthand for most_similar(['dog'])
+         positive = [positive]
+
+     if ((isinstance(positive, string_types) and negative)
+             or (isinstance(negative, string_types) and positive)):
+         raise ValueError('If positives and negatives are given, '
+                          'both should be lists!')
+
+     # add weights for each word, if not already present;
+     # default to 1.0 for positive and -1.0 for negative words
+     positive = [
+         (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
+         else word
+         for word in positive
+     ]
+     negative = [
+         (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
+         else word
+         for word in negative
+     ]
+
+     # compute the weighted average of all words
+     all_words, mean = set(), []
+     for word, weight in positive + negative:
+         if isinstance(word, np.ndarray):
+             mean.append(weight * word)
+         else:
+             mean.append(weight * model.word_vec(word, use_norm=True))
+             if word in model.vocab:
+                 all_words.add(model.vocab[word].index)
+
+     if not mean:
+         raise ValueError("Cannot compute similarity with no input.")
+     mean = gensim.matutils.unitvec(np.array(mean)
+                                    .mean(axis=0)).astype(float)
+
+     if indexer is not None:
+         return indexer.most_similar(mean, topn)
+
+     limited = (model.vectors_norm if restrict_vocab is None
+                else model.vectors_norm[:restrict_vocab])
+     dists = limited @ mean
+
+     if topn is None:
+         return dists
+
+     best = gensim.matutils.argsort(dists,
+                                    topn=topn + len(all_words),
+                                    reverse=True)
+
+     # if not unrestricted, then ignore (don't return)
+     # words from the input
+     result = [(model.index2word[sim], float(dists[sim]))
+               for sim in best
+               if unrestricted or sim not in all_words]
+
+     return result[:topn]
+
+
+ def normalize(v):
+     """Normalize a 1-D vector."""
+     if v.ndim != 1:
+         raise ValueError('v should be 1-D, {}-D was given'.format(
+             v.ndim))
+     norm = np.linalg.norm(v)
+     if norm == 0:
+         return v
+     return v / norm
+
+
+ def project_params(u, v):
+     """Project and reject the vector v onto direction u, with the scalar."""
+     normalize_u = normalize(u)
+     projection = (v @ normalize_u)
+     projected_vector = projection * normalize_u
+     rejected_vector = v - projected_vector
+     return projection, projected_vector, rejected_vector
+
+
+ def project_reject_vector(v, u):
+     """Project and reject the vector v onto direction u."""
+     projected_vector = project_vector(v, u)
+     rejected_vector = v - projected_vector
+     return projected_vector, rejected_vector
+
+
+ def round_to_extreme(value, digits=2):
+     place = 10**digits
+     new_value = math.ceil(abs(value) * place) / place
+     if value < 0:
+         new_value = -new_value
+     return new_value
+
+
+ def take_two_sides_extreme_sorted(df, n_extreme,
+                                   part_column=None,
+                                   head_value='',
+                                   tail_value=''):
+     """Take the n_extreme rows from each end of a sorted DataFrame."""
+     head_df = df.head(n_extreme)[:]
+     tail_df = df.tail(n_extreme)[:]
+
+     if part_column is not None:
+         head_df[part_column] = head_value
+         tail_df[part_column] = tail_value
+
+     return (pd.concat([head_df, tail_df])
+             .drop_duplicates()
+             .reset_index(drop=True))
+
+
+ def project_vector(v, u):
+     """Project the vector v onto direction u."""
+     normalize_u = normalize(u)
+     return (v @ normalize_u) * normalize_u
+
+
+ def reject_vector(v, u):
+     """Reject the vector v from direction u."""
+     return v - project_vector(v, u)
+
+
+ def update_word_vector(model, word, new_vector):
+     model.vectors[model.vocab[word].index] = new_vector
+     if model.vectors_norm is not None:
+         model.vectors_norm[model.vocab[word].index] = normalize(new_vector)
+
+
+ def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):
+     """Cluster X with KMeans and score the clustering against y_true."""
+     if ax is None:
+         _, ax = plt.subplots(figsize=(10, 5))
+
+     y_cluster = (KMeans(n_clusters=2, random_state=random_state)
+                  .fit_predict(X))
+
+     embedded_vectors = (TSNE(n_components=2, random_state=random_state)
+                         .fit_transform(X))
+
+     for y_value in np.unique(y_cluster):
+         mask = (y_cluster == y_value)
+         label = 'Positive' if y_value else 'Negative'
+         ax.scatter(embedded_vectors[mask, 0],
+                    embedded_vectors[mask, 1],
+                    label=label)
+
+     ax.legend()
+
+     acc = accuracy_score(y_true, y_cluster)
+
+     # Cluster labels are arbitrary, so take the better of the two mappings.
+     return max(acc, 1 - acc)
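
A quick numeric check of the projection/rejection decomposition implemented by `project_params`, using assumed toy vectors:

```python
# Toy check of project_params: v decomposed along and against direction u.
import numpy as np

u = np.array([1.0, 0.0])          # direction
v = np.array([3.0, 4.0])          # vector to decompose

u_hat = u / np.linalg.norm(u)
projection = v @ u_hat            # 3.0
projected = projection * u_hat    # [3., 0.]
rejected = v - projected          # [0., 4.]

assert np.allclose(projected + rejected, v)
print(projection, projected, rejected)
```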