Lucia Gonzalez committed · Commit 41dd0ce
Parent(s): dbdf324
Added code missing vecs
Files changed:
- README.md +5 -5
- examples/__pycache__/examples.cpython-38.pyc +0 -0
- examples/examples.py +59 -0
- explorar_sesgo_en_palabras.py +82 -0
- requirements.txt +13 -0
- tool_info.py +23 -0
- utils/__pycache__/modules_sesgo_en_palabras.cpython-38.pyc +0 -0
- utils/__pycache__/utils_sesgo_en_palabras.cpython-38.pyc +0 -0
- utils/modules_sesgo_en_palabras.py +714 -0
- utils/utils_sesgo_en_palabras.py +272 -0
README.md
CHANGED
@@ -1,11 +1,11 @@
 ---
 title: Explorar Sesgos
-emoji:
-colorFrom:
-colorTo:
+emoji: π
+colorFrom: indigo
+colorTo: red
 sdk: gradio
-sdk_version: 3.
-app_file:
+sdk_version: 3.2
+app_file: explorar_sesgo_en_palabras.py
 pinned: false
 license: mit
 ---
examples/__pycache__/examples.cpython-38.pyc
ADDED
Binary file (3.31 kB)
examples/examples.py
ADDED
@@ -0,0 +1,59 @@
example_fem = {
    "mujer": "la mente de una mujer que durante los últimos",
    "chica": "enamorado de la misma chica desde la infancia mary",
    "ella": "ella llego a la final",
    "madre": "su padre y su madre margarita de parma",
    "hija": "hija de inmigrantes españoles en",
    "femenino": "campeonato mundial de voleibol femenino fue la duodécima edición",
}
example_joven = {
    "joven": "",
    "inmaduro": "",
    "niño": "",
    "crio": ""
}
example_viejo = {
    "viejo": "",
    "maduro": "",
    "anciano": "",
    "adulto": ""
}


example_masc = {
    "hombre": "deseo innato que todo hombre tiene de comunicar su",
    "chico": "fue un chico interesado en artes",
    "el": "el parque nacional liwonde",
    "padre": "la muerte de su padre en 1832 se formó",
    "hijo": "le dice a su hijo aún no nacido como",
    "masculino": "el mito es esencialmente masculino y entre las causas",
}

example_diagnose = {
    "ario": "establecer que el pueblo ario vivió en inmemoriales tiempos",
    "educación": "sentido de vida religión educación y cultura para cada mujer",
    "pagado": "un rescate muy grande pagado por sus seguidores a",
    "cocinar": "empezó a cocinar una sopa usando",
    "lavar": "era directamente usado para lavar ropa por eso la",
    "deporte": "se convirtió en el deporte más popular del país",
    "ropa": "usan el kimono una ropa tradicional japonesa",
    "pelea": "mal por la violenta pelea entre ambos hermanos",
    "enfermero": "en enfermería el diagnóstico enfermero o diagnóstico de enfermería es",
    "ganar": "una necesidad un modo de ganar",
    "líder": "del estado en manos del líder opositor henrique capriles para el",
    "coser": "realizar tareas domésticas básicas como coser y poner la mesa",
    "cuidar": "de la fpf encargada de cuidar los intereses de los clubes",
    "cirujano": "afrancesado ocupando el puesto de cirujano militar en el ejército josefino",
    "rey": "la princesa jeongsung esposa del rey danjong que ascendió al trono",
    "reina": "año ganó el título de reina de la bahía en el"
}


fem_words = ','.join(example_fem.keys())
fem_contexts = ','.join(example_fem.values())
masc_words = ','.join(example_masc.keys())
masc_contexts = ','.join(example_masc.values())
young_words = ','.join(example_joven.keys())
old_words = ','.join(example_viejo.keys())
diagnose_words = ','.join(example_diagnose.keys())
diagnose_contexts = ','.join(example_diagnose.values())
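These comma-joined strings are what the Gradio textboxes are pre-filled with; `BiasExplorer.parse_words` in `utils/modules_sesgo_en_palabras.py` splits them back into word lists. A minimal sketch of that round trip (the standalone `parse_words` here mirrors the method rather than importing the whole module):

```python
from examples.examples import fem_words, masc_words

def parse_words(string):
    # Split on commas and drop empty entries, as BiasExplorer.parse_words does.
    return [w.strip() for w in string.split(',') if w.strip()]

print(parse_words(fem_words))   # ['mujer', 'chica', 'ella', 'madre', 'hija', 'femenino']
print(parse_words(masc_words))  # ['hombre', 'chico', 'el', 'padre', 'hijo', 'masculino']
```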
explorar_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,82 @@
import matplotlib as mpl
mpl.use('Agg')
import gradio as gr

from utils.modules_sesgo_en_palabras import WEBiasExplorer2d, WEBiasExplorer4d, Embedding
from examples.examples import fem_words, masc_words, old_words, young_words, diagnose_words

from tool_info import TOOL_INFO

word_vectors_path = 'fasttext-sbwc.100k.vec'
# Embedding.__init__ already loads the vectors and normalizes them with
# init_sims(), so a second load_we_as_keyed_vectors() call here would
# only duplicate work.
we = Embedding(word_vectors_path)

LABEL_WORD_LIST_1 = 'Lista de palabras 1'
LABEL_WORD_LIST_2 = 'Lista de palabras 2'
LABEL_WORD_LIST_3 = 'Lista de palabras 3'
LABEL_WORD_LIST_4 = 'Lista de palabras 4'

LABEL_WORD_LIST_DIAGNOSE = 'Lista de palabras a diagnosticar'

we_bias = WEBiasExplorer2d(we.wv)
we_bias_4d = WEBiasExplorer4d(we.wv)

explorar_sesgo_en_palabras_interface = gr.Blocks()
with explorar_sesgo_en_palabras_interface:
    gr.Markdown("1. Escribí palabras para diagnosticar separadas por comas")
    with gr.Row():
        with gr.Column():
            with gr.Row():
                diagnose_list = gr.Textbox(lines=2, label=LABEL_WORD_LIST_DIAGNOSE)
            with gr.Row():
                gr.Markdown("2. Para graficar 2 espacios, completá las siguientes listas:")
            with gr.Row():
                wordlist_1 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_1)
                wordlist_2 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_2)
            with gr.Row():
                gr.Markdown("3. Para graficar 4 espacios, completá las siguientes listas:")
            with gr.Row():
                wordlist_3 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_3)
                wordlist_4 = gr.Textbox(lines=2, label=LABEL_WORD_LIST_4)
        with gr.Column():
            with gr.Row():
                bias2d = gr.Button('¡Graficar 2 estereotipos!')
            with gr.Row():
                bias4d = gr.Button('¡Graficar 4 estereotipos!')
            with gr.Row():
                err_msg = gr.Markdown(label='', visible=True)
            with gr.Row():
                bias_plot = gr.Image(shape=(15, 15))
            with gr.Row():
                examples = gr.Examples(
                    fn=we_bias.calculate_bias,
                    inputs=[wordlist_1, wordlist_2, diagnose_list],
                    outputs=[bias_plot, err_msg],
                    examples=[
                        [fem_words, masc_words, diagnose_words],
                        [young_words, old_words, diagnose_words]
                    ]
                )
            with gr.Row():
                examples = gr.Examples(
                    fn=we_bias_4d.calculate_bias,
                    inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
                    outputs=[bias_plot, err_msg],
                    examples=[[fem_words, masc_words, young_words, old_words, diagnose_words]]
                )
    with gr.Row():
        gr.Markdown(TOOL_INFO)

    bias2d.click(
        fn=we_bias.calculate_bias,
        inputs=[wordlist_1, wordlist_2, diagnose_list],
        outputs=[bias_plot, err_msg])
    bias4d.click(
        fn=we_bias_4d.calculate_bias,
        inputs=[wordlist_1, wordlist_2, wordlist_3, wordlist_4, diagnose_list],
        outputs=[bias_plot, err_msg])


explorar_sesgo_en_palabras_interface.queue(concurrency_count=10)
explorar_sesgo_en_palabras_interface.launch()
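The app expects `fasttext-sbwc.100k.vec` next to the script, and the file is not part of this commit. A hedged sketch of producing such a file from a full word2vec-format `.vec` download with gensim 3.x — the source filename `fasttext-sbwc.vec` is an assumption:

```python
from gensim.models import KeyedVectors

# Load only the first 100k vectors of an assumed full .vec file,
# then write them back out in the same word2vec text format.
wv = KeyedVectors.load_word2vec_format('fasttext-sbwc.vec', limit=100_000)
wv.save_word2vec_format('fasttext-sbwc.100k.vec')
```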
requirements.txt
ADDED
@@ -0,0 +1,13 @@
# sesgos_en_frases
regex
# data
torch
transformers
# everything else
scikit-learn
gensim==3.7.3
tensorflow
matplotlib
numpy
seaborn
tool_info.py
ADDED
@@ -0,0 +1,23 @@
TOOL_INFO = """
> ### A tool to overcome technical barriers for bias assessment in human language technologies

* [Read Full Paper](https://arxiv.org/abs/2207.06591)

> ### Licensing Information
* [MIT Licence](https://huggingface.co/spaces/vialibre/new_bias_tools/resolve/main/LICENSE)

> ### Citation Information
```bibtex
@misc{https://doi.org/10.48550/arxiv.2207.06591,
  doi = {10.48550/ARXIV.2207.06591},
  url = {https://arxiv.org/abs/2207.06591},
  author = {Alemany, Laura Alonso and Benotti, Luciana and González, Lucía and Maina, Hernán and Busaniche, Beatriz and Halvorsen, Alexia and Bordone, Matías and Sánchez, Jorge},
  keywords = {Computation and Language (cs.CL), Artificial Intelligence (cs.AI), FOS: Computer and information sciences},
  title = {A tool to overcome technical barriers for bias assessment in human language technologies},
  publisher = {arXiv},
  year = {2022},
  copyright = {Creative Commons Attribution Non Commercial Share Alike 4.0 International}
}
```
"""
utils/__pycache__/modules_sesgo_en_palabras.cpython-38.pyc
ADDED
Binary file (18.5 kB)
utils/__pycache__/utils_sesgo_en_palabras.cpython-38.pyc
ADDED
Binary file (7.99 kB)
utils/modules_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,714 @@
import copy

from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import euclidean_distances
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from transformers import AutoTokenizer, AutoModelForMaskedLM
import pandas as pd
from gensim.models import KeyedVectors
from utils.utils_sesgo_en_palabras import (
    cosine_similarity,
    normalize,
    project_params,
    take_two_sides_extreme_sorted
)


DIRECTION_METHODS = ['single', 'sum', 'pca']
DEBIAS_METHODS = ['neutralize', 'hard', 'soft']
FIRST_PC_THRESHOLD = 0.5
MAX_NON_SPECIFIC_EXAMPLES = 1000

__all__ = ['BiasExplorer', 'WEBiasExplorer2d', 'WEBiasExplorer4d']


class Loader():
    def __init__(self):
        self.path_to_data = ''

    def load_tokenizer(self, tokenizer_path):
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, do_lower_case=True)
        return tokenizer

    def load_data_from_file(self, data):
        return data

    def load_corpus_from_file(self, data):
        return data

    def load_language_model(self, model_path):
        model = AutoModelForMaskedLM.from_pretrained(
            model_path, output_hidden_states=True)
        return model


class Corpus():
    def __init__(self, corpus) -> None:
        self.corpus = corpus
        self.vocabulary = self.load_vocabulary_from_corpus()

    def load_vocabulary_from_corpus(self):
        # Stub: vocabulary extraction is not implemented yet.
        return None

    def get_context_from_text(self, word):
        pass

    def get_frequency(self, word):
        pass

    def get_most_frequent_coocurrence(self, word):
        pass


class Embedding():
    def __init__(self, word_vectors_path) -> None:
        self.wv = self.load_we_as_keyed_vectors(word_vectors_path)

    def load_we_as_keyed_vectors(self, word_vectors_path):
        we = KeyedVectors.load_word2vec_format(word_vectors_path)
        we.init_sims(replace=True)
        return we

    def get_word_vector(self, word, context=None):
        # Stub: contextualized lookup is not implemented yet.
        return word

class BiasExplorer():
    def __init__(self, model, only_lower=False, verbose=False,
                 identify_direction=False, to_normalize=True):
        # pylint: disable=undefined-variable

        # TODO: this is bad Python, ask someone about it
        # probably should be a better design
        # identify_direction doesn't have any meaning
        # for the class BiasWordEmbedding
        # The goal is to force this interface on sub-classes.
        if self.__class__ == __class__ and identify_direction is not False:
            raise ValueError('identify_direction must be False'
                             ' for an instance of {}'
                             .format(__class__))

        self.model = model

        # TODO: write unit test for when it is False
        self.only_lower = only_lower

        self._verbose = verbose

        self.direction = None
        self.positive_end = None
        self.negative_end = None

        if to_normalize:
            self.model.init_sims(replace=True)

    def __copy__(self):
        bias_word_embedding = self.__class__(self.model,
                                             self.only_lower,
                                             self._verbose,
                                             identify_direction=False)
        bias_word_embedding.direction = copy.deepcopy(self.direction)
        bias_word_embedding.positive_end = copy.deepcopy(self.positive_end)
        bias_word_embedding.negative_end = copy.deepcopy(self.negative_end)
        return bias_word_embedding

    def __deepcopy__(self, memo):
        bias_word_embedding = copy.copy(self)
        bias_word_embedding.model = copy.deepcopy(bias_word_embedding.model)
        return bias_word_embedding

    def __getitem__(self, key):
        return self.model[key]

    def __contains__(self, item):
        return item in self.model

    def _is_direction_identified(self):
        if self.direction is None:
            raise RuntimeError('The direction was not identified'
                               ' for this {} instance'
                               .format(self.__class__.__name__))

    def _identify_subspace_by_pca(self, definitional_pairs, n_components):
        matrix = []

        for word1, word2 in definitional_pairs:
            vector1 = normalize(self[word1])
            vector2 = normalize(self[word2])

            center = (vector1 + vector2) / 2

            matrix.append(vector1 - center)
            matrix.append(vector2 - center)

        pca = PCA(n_components=n_components)
        pca.fit(matrix)

        if self._verbose:
            table = enumerate(pca.explained_variance_ratio_, start=1)
            headers = ['Principal Component',
                       'Explained Variance Ratio']

        return pca

    def __errorChecking(self, word):
        out_msj = ""

        if not word:
            out_msj = "Error: Primero debe ingresar una palabra!"
        elif word not in self.model:
            out_msj = f"Error: La palabra '<b>{word}</b>' no se encuentra en el vocabulario!"

        if out_msj:
            out_msj = "<center><h3>" + out_msj + "</h3></center>"

        return out_msj

    # TODO: add the SVD method from section 6 step 1
    # It seems there is a mistake there, I think it is the same as PCA
    # just with replacing it with SVD
    def _identify_direction(self, positive_end, negative_end,
                            definitional, method='pca'):
        if method not in DIRECTION_METHODS:
            raise ValueError('method should be one of {}, {} was given'.format(
                DIRECTION_METHODS, method))

        if positive_end == negative_end:
            raise ValueError('positive_end and negative_end'
                             ' should be different, and not the same "{}"'
                             .format(positive_end))
        if self._verbose:
            print('Identify direction using {} method...'.format(method))

        direction = None

        if method == 'single':
            if self._verbose:
                print('Positive definitional end:', definitional[0])
                print('Negative definitional end:', definitional[1])
            direction = normalize(normalize(self[definitional[0]])
                                  - normalize(self[definitional[1]]))

        elif method == 'sum':
            group1_sum_vector = np.sum([self[word]
                                        for word in definitional[0]], axis=0)
            group2_sum_vector = np.sum([self[word]
                                        for word in definitional[1]], axis=0)

            diff_vector = (normalize(group1_sum_vector)
                           - normalize(group2_sum_vector))

            direction = normalize(diff_vector)

        elif method == 'pca':
            pca = self._identify_subspace_by_pca(definitional, 10)
            if pca.explained_variance_ratio_[0] < FIRST_PC_THRESHOLD:
                raise RuntimeError('The explained variance'
                                   ' of the first principal component should be'
                                   ' at least {}, but it is {}'
                                   .format(FIRST_PC_THRESHOLD,
                                           pca.explained_variance_ratio_[0]))
            direction = pca.components_[0]

            # flip the direction if it came out reversed (we cannot control
            # what sign the PCA will return)
            ends_diff_projection = cosine_similarity((self[positive_end]
                                                      - self[negative_end]),
                                                     direction)
            if ends_diff_projection < 0:
                direction = -direction  # pylint: disable=invalid-unary-operand-type

        self.direction = direction
        self.positive_end = positive_end
        self.negative_end = negative_end

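    # For intuition, the 'sum' method above reduces to a few lines of NumPy:
    # sum each definitional group, normalize both sums, and normalize their
    # difference. With toy 3-d stand-ins for word vectors:
    #
    #   g1 = [np.array([1.0, 0.2, 0.0]), np.array([0.9, 0.1, 0.1])]
    #   g2 = [np.array([0.0, 1.0, 0.1]), np.array([0.1, 0.8, 0.0])]
    #   direction = normalize(normalize(np.sum(g1, axis=0))
    #                         - normalize(np.sum(g2, axis=0)))
    #
    # 'direction' is then a unit vector pointing from group 2 towards group 1.
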
    def project_on_direction(self, word):
        """Project the normalized vector of the word on the direction.
        :param str word: The word to project
        :return float: The projection scalar
        """

        self._is_direction_identified()

        vector = self[word]
        projection_score = self.model.cosine_similarities(self.direction,
                                                          [vector])[0]
        return projection_score

    def _calc_projection_scores(self, words):
        self._is_direction_identified()

        df = pd.DataFrame({'word': words})

        # TODO: maybe using cosine_similarities on all the vectors?
        # it might be faster
        df['projection'] = df['word'].apply(self.project_on_direction)
        df = df.sort_values('projection', ascending=False)

        return df

    def calc_projection_data(self, words):
        """
        Calculate projection, projected and rejected vectors of a word list.
        :param list words: List of words
        :return: :class:`pandas.DataFrame` of the projection,
                 projected and rejected vectors of the word list
        """
        projection_data = []
        for word in words:
            vector = self[word]
            projection = self.project_on_direction(word)
            normalized_vector = normalize(vector)

            (projection,
             projected_vector,
             rejected_vector) = project_params(normalized_vector,
                                               self.direction)

            projection_data.append({'word': word,
                                    'vector': vector,
                                    'projection': projection,
                                    'projected_vector': projected_vector,
                                    'rejected_vector': rejected_vector})

        return pd.DataFrame(projection_data)

    def plot_dist_projections_on_direction(self, word_groups, ax=None):
        """Plot the distribution of projection scalars on the direction.
        :param dict word_groups: The word groups to project
        :return: The ax object of the plot
        """

        if ax is None:
            _, ax = plt.subplots(1)

        names = sorted(word_groups.keys())

        for name in names:
            words = word_groups[name]
            label = '{} (#{})'.format(name, len(words))
            vectors = [self[word] for word in words]
            projections = self.model.cosine_similarities(self.direction,
                                                         vectors)
            sns.distplot(projections, hist=False, label=label, ax=ax)

        plt.axvline(0, color='k', linestyle='--')

        plt.title('← {} {} {} →'.format(self.negative_end,
                                        ' ' * 20,
                                        self.positive_end))
        plt.xlabel('Direction Projection')
        plt.ylabel('Density')
        ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))

        return ax

    def parse_words(self, string):
        """Split a comma-separated string into a list of stripped words."""
        words = string.strip()
        if words:
            words = [word.strip() for word in words.split(',') if word != ""]
        return words

    def check_oov(self, wordlists):
        """Return an error message if any word is out of vocabulary, else None."""
        for wordlist in wordlists:
            parsed_words = self.parse_words(wordlist)
            for word in parsed_words:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def plot_projections_2d(self,
                            wordlist,
                            wordlist_1,
                            wordlist_2,
                            wordlist_3,
                            wordlist_4,
                            color_wordlist,
                            color_wordlist_1,
                            color_wordlist_2,
                            color_wordlist_3,
                            color_wordlist_4,
                            plot_neighbors,
                            n_alpha,
                            fontsize,
                            figsize=(15, 15),
                            method='pca'
                            ):
        # convert the word lists to vectors
        choices = [0, 1, 2, 3, 4]
        word_list = []
        wordlist_choice = [wordlist, wordlist_1, wordlist_2,
                           wordlist_3, wordlist_4]
        err = self.check_oov(wordlist_choice)
        if err:
            return None, err
        words_colors = {}
        label_dict = {
            0: 'Diagnostico',
            1: 'Lista de palabras 1',
            2: 'Lista de palabras 2',
            3: 'Lista de palabras 3',
            4: 'Lista de palabras 4'
        }
        color_dict = {
            0: color_wordlist,
            1: color_wordlist_1,
            2: color_wordlist_2,
            3: color_wordlist_3,
            4: color_wordlist_4
        }
        word_bias_space = {}
        alpha = {}

        for raw_word_list, color in zip(wordlist_choice, choices):
            parsed_words = self.parse_words(raw_word_list)
            if parsed_words:
                for word in parsed_words:
                    word_bias_space[word] = color
                    words_colors[word] = color_dict[color]
                    alpha[word] = 1
                    if plot_neighbors:
                        neighbors = [w for w, s in self.model.most_similar(word, topn=5)]
                        for n in neighbors:
                            if n not in alpha:
                                word_bias_space[n] = color
                                words_colors[n] = color_dict[color]
                                alpha[n] = n_alpha
                        word_list += neighbors
                word_list += parsed_words
        if not word_list:
            return None, "<center><h3>" + "Ingresa al menos 2 palabras para continuar" + "</h3></center>"
        embeddings = [self.model[word] for word in word_list]
        words_embedded = PCA(
            n_components=2, random_state=1).fit_transform(embeddings)
        data = pd.DataFrame(words_embedded)
        data['word'] = word_list
        data['color'] = [words_colors[word] for word in word_list]
        data['alpha'] = [alpha[word] for word in word_list]
        data['word_bias_space'] = [word_bias_space[word] for word in word_list]
        fig, ax = plt.subplots(figsize=figsize)

        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )
        if plot_neighbors:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )
        for i, label in enumerate(word_list):
            x, y = words_embedded[i, :]
            ax.annotate(label, xy=(x, y), xytext=(5, 2), color=words_colors[label],
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=alpha[label])

        ax.set_xticks([])
        ax.set_yticks([])

        fig.tight_layout()
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im, ''

+
class WEBiasExplorer2d(BiasExplorer):
|
446 |
+
def __init__(self, word_embedding) -> None:
|
447 |
+
super().__init__(word_embedding)
|
448 |
+
|
449 |
+
def calculate_bias(
|
450 |
+
self,
|
451 |
+
palabras_extremo_1,
|
452 |
+
palabras_extremo_2,
|
453 |
+
palabras_para_situar
|
454 |
+
):
|
455 |
+
|
456 |
+
wordlists = [
|
457 |
+
palabras_extremo_1,
|
458 |
+
palabras_extremo_2,
|
459 |
+
palabras_para_situar
|
460 |
+
]
|
461 |
+
err = self.check_oov(wordlists)
|
462 |
+
for wordlist in wordlists:
|
463 |
+
if not wordlist:
|
464 |
+
err = "<center><h3>" + 'Debe ingresar al menos 1 palabra en las lista de palabras a diagnosticar, sesgo 1 y sesgo 2' +"<center><h3>"
|
465 |
+
if err:
|
466 |
+
return None, err
|
467 |
+
|
468 |
+
|
469 |
+
err = self.check_oov([palabras_extremo_1,palabras_extremo_2,palabras_para_situar])
|
470 |
+
if err:
|
471 |
+
return None, err
|
472 |
+
palabras_extremo_1 = self.parse_words(palabras_extremo_1)
|
473 |
+
palabras_extremo_2 = self.parse_words(palabras_extremo_2)
|
474 |
+
palabras_para_situar = self.parse_words(palabras_para_situar)
|
475 |
+
im = self.get_bias_plot(
|
476 |
+
palabras_para_situar,
|
477 |
+
definitional=(
|
478 |
+
palabras_extremo_1, palabras_extremo_2),
|
479 |
+
method='sum',
|
480 |
+
n_extreme=10
|
481 |
+
)
|
482 |
+
return im, ''
|
483 |
+
|
484 |
+
def get_bias_plot(self,
|
485 |
+
palabras_para_situar,
|
486 |
+
definitional,
|
487 |
+
method='sum',
|
488 |
+
n_extreme=10,
|
489 |
+
figsize=(10, 10)
|
490 |
+
):
|
491 |
+
|
492 |
+
fig, ax = plt.subplots(1, figsize=figsize)
|
493 |
+
self.method = method
|
494 |
+
self.plot_projection_scores(
|
495 |
+
definitional,
|
496 |
+
palabras_para_situar, n_extreme, ax=ax,)
|
497 |
+
|
498 |
+
fig.tight_layout()
|
499 |
+
fig.canvas.draw()
|
500 |
+
|
501 |
+
data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
|
502 |
+
w, h = fig.canvas.get_width_height()
|
503 |
+
im = data.reshape((int(h), int(w), -1))
|
504 |
+
return im
|
505 |
+
|
506 |
+
def plot_projection_scores(self, definitional,
|
507 |
+
words, n_extreme=10,
|
508 |
+
ax=None, axis_projection_step=None):
|
509 |
+
"""Plot the projection scalar of words on the direction.
|
510 |
+
:param list words: The words tor project
|
511 |
+
:param int or None n_extreme: The number of extreme words to show
|
512 |
+
:return: The ax object of the plot
|
513 |
+
"""
|
514 |
+
nombre_del_extremo_1 = ', '.join(definitional[0])
|
515 |
+
nombre_del_extremo_2 = ', '.join(definitional[1])
|
516 |
+
|
517 |
+
self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
|
518 |
+
definitional=definitional,
|
519 |
+
method='sum')
|
520 |
+
|
521 |
+
self._is_direction_identified()
|
522 |
+
|
523 |
+
projections_df = self._calc_projection_scores(words)
|
524 |
+
projections_df['projection'] = projections_df['projection'].round(2)
|
525 |
+
|
526 |
+
if n_extreme is not None:
|
527 |
+
projections_df = take_two_sides_extreme_sorted(projections_df,
|
528 |
+
n_extreme=n_extreme)
|
529 |
+
|
530 |
+
if ax is None:
|
531 |
+
_, ax = plt.subplots(1)
|
532 |
+
|
533 |
+
if axis_projection_step is None:
|
534 |
+
axis_projection_step = 0.1
|
535 |
+
|
536 |
+
cmap = plt.get_cmap('RdBu')
|
537 |
+
projections_df['color'] = ((projections_df['projection'] + 0.5)
|
538 |
+
.apply(cmap))
|
539 |
+
|
540 |
+
most_extream_projection = np.round(
|
541 |
+
projections_df['projection']
|
542 |
+
.abs()
|
543 |
+
.max(),
|
544 |
+
decimals=1)
|
545 |
+
|
546 |
+
sns.barplot(x='projection', y='word', data=projections_df,
|
547 |
+
palette=projections_df['color'])
|
548 |
+
|
549 |
+
plt.xticks(np.arange(-most_extream_projection,
|
550 |
+
most_extream_projection + axis_projection_step,
|
551 |
+
axis_projection_step))
|
552 |
+
xlabel = ('β {} {} {} β'.format(self.negative_end,
|
553 |
+
' ' * 20,
|
554 |
+
self.positive_end))
|
555 |
+
|
556 |
+
plt.xlabel(xlabel)
|
557 |
+
plt.ylabel('Words')
|
558 |
+
|
559 |
+
return ax
|
560 |
+
|
561 |
+
|
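# A hedged usage sketch, kept as comments so the module stays import-safe:
# the 2-d explorer can be exercised outside Gradio, assuming the same
# fasttext-sbwc.100k.vec file used by the app is available locally:
#
#   we = Embedding('fasttext-sbwc.100k.vec')
#   explorer = WEBiasExplorer2d(we.wv)
#   image, err = explorer.calculate_bias('mujer,chica,ella',
#                                        'hombre,chico,el',
#                                        'cocinar,deporte,cuidar')
#   # On success, image is an (H, W, 3) uint8 array ready for gr.Image
#   # and err is ''; on failure, image is None and err holds the message.
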
class WEBiasExplorer4d(BiasExplorer):
    def __init__(self, word_embedding) -> None:
        super().__init__(word_embedding)

    def calculate_bias(
        self,
        palabras_extremo_1,
        palabras_extremo_2,
        palabras_extremo_3,
        palabras_extremo_4,
        palabras_para_situar
    ):
        wordlists = [
            palabras_extremo_1,
            palabras_extremo_2,
            palabras_extremo_3,
            palabras_extremo_4,
            palabras_para_situar
        ]
        err = self.check_oov(wordlists)
        for wordlist in wordlists:
            if not wordlist:
                err = "<center><h3>" + '¡Para graficar con 4 espacios, debe ingresar al menos 1 palabra en todas las listas!' + "</h3></center>"
        if err:
            return None, err

        palabras_extremo_1 = self.parse_words(palabras_extremo_1)
        palabras_extremo_2 = self.parse_words(palabras_extremo_2)
        palabras_extremo_3 = self.parse_words(palabras_extremo_3)
        palabras_extremo_4 = self.parse_words(palabras_extremo_4)

        palabras_para_situar = self.parse_words(palabras_para_situar)

        im = self.get_bias_plot(
            palabras_para_situar,
            definitional_1=(
                palabras_extremo_1, palabras_extremo_2),
            definitional_2=(
                palabras_extremo_3, palabras_extremo_4),
            method='sum',
            n_extreme=10
        )
        return im, ''

    def get_bias_plot(self,
                      palabras_para_situar,
                      definitional_1,
                      definitional_2,
                      method='sum',
                      n_extreme=10,
                      figsize=(10, 10)
                      ):

        fig, ax = plt.subplots(1, figsize=figsize)
        self.method = method
        self.plot_projection_scores(
            definitional_1,
            definitional_2,
            palabras_para_situar, n_extreme, ax=ax)
        fig.canvas.draw()

        data = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
        w, h = fig.canvas.get_width_height()
        im = data.reshape((int(h), int(w), -1))
        return im

    def plot_projection_scores(self, definitional_1, definitional_2,
                               words, n_extreme=10,
                               ax=None, axis_projection_step=None):
        """Plot the projection scalar of words on two directions.
        :param list words: The words to project
        :param int or None n_extreme: The number of extreme words to show
        :return: The ax object of the plot
        """

        nombre_del_extremo_1 = ', '.join(definitional_1[1])
        nombre_del_extremo_2 = ', '.join(definitional_1[0])

        self._identify_direction(nombre_del_extremo_1, nombre_del_extremo_2,
                                 definitional=definitional_1,
                                 method='sum')

        self._is_direction_identified()

        projections_df = self._calc_projection_scores(words)
        projections_df['projection_x'] = projections_df['projection'].round(2)

        nombre_del_extremo_3 = ', '.join(definitional_2[1])
        nombre_del_extremo_4 = ', '.join(definitional_2[0])
        self._identify_direction(nombre_del_extremo_3, nombre_del_extremo_4,
                                 definitional=definitional_2,
                                 method='sum')

        self._is_direction_identified()

        projections_df['projection_y'] = self._calc_projection_scores(words)[
            'projection'].round(2)

        if n_extreme is not None:
            projections_df = take_two_sides_extreme_sorted(projections_df,
                                                           n_extreme=n_extreme)

        if ax is None:
            _, ax = plt.subplots(1)

        if axis_projection_step is None:
            axis_projection_step = 0.1

        cmap = plt.get_cmap('RdBu')
        projections_df['color'] = ((projections_df['projection'] + 0.5)
                                   .apply(cmap))
        most_extreme_projection = np.round(
            projections_df['projection']
            .abs()
            .max(),
            decimals=1)
        sns.scatterplot(x='projection_x', y='projection_y', data=projections_df,
                        palette=projections_df['color'])

        plt.xticks(np.arange(-most_extreme_projection,
                             most_extreme_projection + axis_projection_step,
                             axis_projection_step))
        for _, row in projections_df.iterrows():
            ax.annotate(
                row['word'], (row['projection_x'], row['projection_y']))
        x_label = '← {} {} {} →'.format(nombre_del_extremo_1,
                                        ' ' * 20,
                                        nombre_del_extremo_2)

        y_label = '← {} {} {} →'.format(nombre_del_extremo_3,
                                        ' ' * 20,
                                        nombre_del_extremo_4)

        plt.xlabel(x_label)
        ax.xaxis.set_label_position('bottom')
        ax.xaxis.set_label_coords(.5, 0)

        plt.ylabel(y_label)
        ax.yaxis.set_label_position('left')
        ax.yaxis.set_label_coords(0, .5)

        ax.spines['left'].set_position('center')
        ax.spines['bottom'].set_position('center')

        ax.set_xticks([])
        ax.set_yticks([])

        return ax
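Both explorers hand images to Gradio by rasterizing the Matplotlib figure through its Agg canvas. That canvas-to-NumPy trick is worth isolating; a minimal standalone sketch:

```python
import matplotlib
matplotlib.use('Agg')  # headless backend, as in the app
import matplotlib.pyplot as plt
import numpy as np

fig, ax = plt.subplots(figsize=(4, 3))
ax.plot([0, 1], [0, 1])
fig.canvas.draw()

# Read the rendered RGB buffer back as an (H, W, 3) uint8 array.
buf = np.frombuffer(fig.canvas.tostring_rgb(), dtype=np.uint8)
w, h = fig.canvas.get_width_height()
image = buf.reshape((h, w, 3))
print(image.shape)
```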
utils/utils_sesgo_en_palabras.py
ADDED
@@ -0,0 +1,272 @@
import math

import gensim
import matplotlib.pylab as plt
import numpy as np
import pandas as pd
from six import string_types
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.metrics import accuracy_score
import gradio as gr


WORD_EMBEDDING_MODEL_TYPES = (gensim.models.keyedvectors.KeyedVectors,
                              gensim.models.fasttext.FastText,
                              gensim.models.word2vec.Word2Vec,
                              gensim.models.base_any2vec.BaseWordEmbeddingsModel,)  # pylint: disable=line-too-long


def assert_gensim_keyed_vectors(model):
    if not isinstance(model, WORD_EMBEDDING_MODEL_TYPES):
        type_names = (model_type.__name__
                      for model_type in WORD_EMBEDDING_MODEL_TYPES)
        raise TypeError('model should be one of the types'
                        ' ({}), not {}.'
                        .format(', '.join(type_names),
                                type(model)))


def generate_words_forms(words):
    return sum([generate_one_word_forms(word) for word in words], [])


def cosine_similarity(v, u):
    """Calculate the cosine similarity between two vectors."""
    v_norm = np.linalg.norm(v)
    u_norm = np.linalg.norm(u)
    similarity = v @ u / (v_norm * u_norm)
    return similarity


def generate_one_word_forms(word):
    return [word.lower(), word.upper(), word.title()]


def get_seed_vector(seed, bias_word_embedding):

    if seed == 'direction':
        positive_end = bias_word_embedding.positive_end
        negative_end = bias_word_embedding.negative_end
        bias_word_embedding._is_direction_identified()  # pylint: disable=protected-access
        seed_vector = bias_word_embedding.direction
    else:
        if seed == 'ends':
            positive_end = bias_word_embedding.positive_end
            negative_end = bias_word_embedding.negative_end

        else:
            positive_end, negative_end = seed

        seed_vector = normalize(bias_word_embedding.model[positive_end]
                                - bias_word_embedding.model[negative_end])

    return seed_vector, positive_end, negative_end

def most_similar(model, positive=None, negative=None,
                 topn=10, restrict_vocab=None, indexer=None,
                 unrestricted=True):
    """
    Find the top-N most similar words.

    Positive words contribute positively towards the similarity,
    negative words negatively.

    This function computes cosine similarity between a simple mean
    of the projection weight vectors of the given words and
    the vectors for each word in the model.
    The function corresponds to the `word-analogy` and `distance`
    scripts in the original word2vec implementation.

    Based on the Gensim implementation.

    :param model: Word embedding model of ``gensim.model.KeyedVectors``.
    :param list positive: List of words that contribute positively.
    :param list negative: List of words that contribute negatively.
    :param int topn: Number of top-N similar words to return.
    :param int restrict_vocab: Optional integer which limits the
                               range of vectors
                               which are searched for most-similar values.
                               For example, restrict_vocab=10000 would
                               only check the first 10000 word vectors
                               in the vocabulary order. (This may be
                               meaningful if you've sorted the vocabulary
                               by descending frequency.)
    :param bool unrestricted: Whether the returned words may include
                              the positive or negative input words
                              (if False, input words are filtered out).
    :return: Sequence of (word, similarity).
    """
    if topn is not None and topn < 1:
        return []

    if positive is None:
        positive = []
    if negative is None:
        negative = []

    model.init_sims()

    if (isinstance(positive, string_types)
            and not negative):
        # allow calls like most_similar('dog'),
        # as a shorthand for most_similar(['dog'])
        positive = [positive]

    if ((isinstance(positive, string_types) and negative)
            or (isinstance(negative, string_types) and positive)):
        raise ValueError('If positives and negatives are given, '
                         'both should be lists!')

    # add weights for each word, if not already present;
    # default to 1.0 for positive and -1.0 for negative words
    positive = [
        (word, 1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in positive
    ]
    negative = [
        (word, -1.0) if isinstance(word, string_types + (np.ndarray,))
        else word
        for word in negative
    ]

    # compute the weighted average of all words
    all_words, mean = set(), []
    for word, weight in positive + negative:
        if isinstance(word, np.ndarray):
            mean.append(weight * word)
        else:
            mean.append(weight * model.word_vec(word, use_norm=True))
            if word in model.vocab:
                all_words.add(model.vocab[word].index)

    if not mean:
        raise ValueError("Cannot compute similarity with no input.")
    mean = gensim.matutils.unitvec(np.array(mean)
                                   .mean(axis=0)).astype(float)

    if indexer is not None:
        return indexer.most_similar(mean, topn)

    limited = (model.vectors_norm if restrict_vocab is None
               else model.vectors_norm[:restrict_vocab])
    dists = limited @ mean

    if topn is None:
        return dists

    best = gensim.matutils.argsort(dists,
                                   topn=topn + len(all_words),
                                   reverse=True)

    # if not unrestricted, then ignore (don't return)
    # words from the input
    result = [(model.index2word[sim], float(dists[sim]))
              for sim in best
              if unrestricted or sim not in all_words]

    return result[:topn]

def normalize(v):
    """Normalize a 1-D vector."""
    if v.ndim != 1:
        raise ValueError('v should be 1-D, {}-D was given'.format(
            v.ndim))
    norm = np.linalg.norm(v)
    if norm == 0:
        return v
    return v / norm


def project_params(u, v):
    """Project and reject the vector v onto direction u, with the scalar projection."""
    normalize_u = normalize(u)
    projection = (v @ normalize_u)
    projected_vector = projection * normalize_u
    rejected_vector = v - projected_vector
    return projection, projected_vector, rejected_vector

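# Worked example of project_params with easy numbers (kept as a comment
# so the module stays side-effect free): for u = (1, 0) and v = (3, 4),
#
#   projection, projected, rejected = project_params(np.array([1., 0.]),
#                                                    np.array([3., 4.]))
#   # projection == 3.0, projected == [3., 0.], rejected == [0., 4.]
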
def project_reject_vector(v, u):
    """Project and reject the vector v onto direction u."""
    projected_vector = project_vector(v, u)
    rejected_vector = v - projected_vector
    return projected_vector, rejected_vector


def round_to_extreme(value, digits=2):
    place = 10**digits
    new_value = math.ceil(abs(value) * place) / place
    if value < 0:
        new_value = -new_value
    return new_value


def take_two_sides_extreme_sorted(df, n_extreme,
                                  part_column=None,
                                  head_value='',
                                  tail_value=''):
    head_df = df.head(n_extreme)[:]
    tail_df = df.tail(n_extreme)[:]

    if part_column is not None:
        head_df[part_column] = head_value
        tail_df[part_column] = tail_value

    return (pd.concat([head_df, tail_df])
            .drop_duplicates()
            .reset_index(drop=True))

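# take_two_sides_extreme_sorted expects a DataFrame already sorted by
# projection (descending, as _calc_projection_scores produces) and keeps
# only the n_extreme rows at each end, which keeps the bar plots readable:
#
#   df = pd.DataFrame({'word': ['reina', 'cocinar', 'deporte', 'rey'],
#                      'projection': [0.31, 0.18, -0.07, -0.35]})
#   take_two_sides_extreme_sorted(df, n_extreme=1)
#   # -> rows for 'reina' (most positive) and 'rey' (most negative)
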
def project_vector(v, u):
    """Project the vector v onto direction u."""
    normalize_u = normalize(u)
    return (v @ normalize_u) * normalize_u


def reject_vector(v, u):
    """Reject the vector v from direction u."""
    return v - project_vector(v, u)


def update_word_vector(model, word, new_vector):
    model.vectors[model.vocab[word].index] = new_vector
    if model.vectors_norm is not None:
        model.vectors_norm[model.vocab[word].index] = normalize(new_vector)


def plot_clustering_as_classification(X, y_true, random_state=1, ax=None):

    if ax is None:
        _, ax = plt.subplots(figsize=(10, 5))

    y_cluster = (KMeans(n_clusters=2, random_state=random_state)
                 .fit_predict(X))

    embedded_vectors = (TSNE(n_components=2, random_state=random_state)
                        .fit_transform(X))

    for y_value in np.unique(y_cluster):
        mask = (y_cluster == y_value)
        label = 'Positive' if y_value else 'Negative'
        ax.scatter(embedded_vectors[mask, 0],
                   embedded_vectors[mask, 1],
                   label=label)

    ax.legend()

    acc = accuracy_score(y_true, y_cluster)

    return max(acc, 1 - acc)