Spaces:
Configuration error
Configuration error
File size: 6,204 Bytes
a779273 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 |
import numpy as np
import pandas as pd
import seaborn as sns
from numpy.linalg import norm
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
class WordToPlot:
    """Plain record describing one word to draw on the projection plot.

    Attributes are stored exactly as passed to the constructor:
        word: the text that is annotated on the figure.
        color: colour used for the annotation text.
        bias_space: identifier of the word list (group) the word belongs to.
        alpha: opacity used when drawing the word.
    """

    def __init__(self, word, color, bias_space, alpha):
        self.word = word
        self.color = color
        self.bias_space = bias_space
        self.alpha = alpha

    def __repr__(self):
        # Readable representation so instances are meaningful in logs/debuggers.
        return (
            f"{type(self).__name__}(word={self.word!r}, color={self.color!r}, "
            f"bias_space={self.bias_space!r}, alpha={self.alpha!r})"
        )
class WordExplorer:
    """Validate words against a vocabulary, plot 2-D projections and find outliers.

    The ``vocabulary`` object is only used through these operations:
    ``word in vocabulary``, ``getNearestNeighbors(word, n_neighbors, nn_method)``,
    ``getPCA(word)`` and ``getEmbedding(word)``.
    """

    def __init__(self, vocabulary) -> None:
        self.vocabulary = vocabulary

    def __errorChecking(self, word):
        """Return an error message for ``word``, or ``""`` when it is valid.

        A word is invalid when it is empty or not contained in the vocabulary.
        """
        # Local import keeps this block self-contained (stdlib only).
        from html import escape

        if not word:
            return "Error: Primero debe ingresar una palabra!"
        if word not in self.vocabulary:
            # The message embeds HTML markup, so the user-supplied word is
            # escaped to avoid injecting markup into whatever renders it.
            return f"Error: La palabra '<b>{escape(str(word))}</b>' no se encuentra en el vocabulario!"
        return ""

    def parse_words(self, string):
        """Split a comma-separated string into a list of stripped words.

        Items that are empty after stripping whitespace are dropped, so
        ``"a, ,b"`` yields ``["a", "b"]``. Blank input yields ``[]``.
        """
        stripped_items = (item.strip() for item in string.split(','))
        return [item for item in stripped_items if item]

    def check_oov(self, wordlists):
        """Return the first validation error found in ``wordlists``, else ``None``.

        Args:
            wordlists: iterable of iterables of words.
        """
        for wordlist in wordlists:
            for word in wordlist:
                msg = self.__errorChecking(word)
                if msg:
                    return msg
        return None

    def get_neighbors(self, word, n_neighbors, nn_method):
        """Delegate the nearest-neighbour lookup for ``word`` to the vocabulary."""
        return self.vocabulary.getNearestNeighbors(word, n_neighbors, nn_method)

    def get_df(self, words_embedded, processed_word_list):
        """Build the plotting DataFrame.

        The coordinates in ``words_embedded`` become the leading columns and
        the ``word``, ``color``, ``alpha`` and ``word_bias_space`` columns are
        filled from the matching ``WordToPlot`` entries (same order).
        """
        df = pd.DataFrame(words_embedded)
        df['word'] = [wtp.word for wtp in processed_word_list]
        df['color'] = [wtp.color for wtp in processed_word_list]
        df['alpha'] = [wtp.alpha for wtp in processed_word_list]
        df['word_bias_space'] = [wtp.bias_space for wtp in processed_word_list]
        return df

    def get_plot(self,
                 data,
                 processed_word_list,
                 words_embedded,
                 color_dict,
                 n_neighbors,
                 n_alpha,
                 fontsize=18,
                 figsize=(20, 15)
                 ):
        """Draw the scatter plot with one text annotation per word.

        Rows of ``data`` whose ``alpha`` equals 1 are drawn first (with a
        legend). When ``n_neighbors > 0`` the remaining rows are drawn with
        opacity ``n_alpha`` and without a legend. Axis ticks and labels are
        removed.

        Returns:
            The matplotlib figure.
        """
        fig, ax = plt.subplots(figsize=figsize)

        # Rows flagged with alpha == 1.
        sns.scatterplot(
            data=data[data['alpha'] == 1],
            x=0,
            y=1,
            style='word_bias_space',
            hue='word_bias_space',
            ax=ax,
            palette=color_dict
        )

        # Remaining rows, drawn semi-transparent.
        if n_neighbors > 0:
            sns.scatterplot(
                data=data[data['alpha'] != 1],
                x=0,
                y=1,
                style='color',
                hue='word_bias_space',
                ax=ax,
                alpha=n_alpha,
                legend=False,
                palette=color_dict
            )

        # Each row of words_embedded is unpacked as the (x, y) position of
        # the word at the same index in processed_word_list.
        for i, wtp in enumerate(processed_word_list):
            x, y = words_embedded[i, :]
            ax.annotate(wtp.word, xy=(x, y), xytext=(5, 2), color=wtp.color,
                        textcoords='offset points',
                        ha='right', va='bottom', size=fontsize, alpha=wtp.alpha)

        ax.set_xticks([])
        ax.set_yticks([])
        ax.set_xlabel('')
        ax.set_ylabel('')
        fig.tight_layout()
        return fig

    def plot_projections_2d(self,
                            wordlist_0,
                            wordlist_1=None,
                            wordlist_2=None,
                            wordlist_3=None,
                            wordlist_4=None,
                            **kwargs
                            ):
        """Plot up to five word lists (and optionally their neighbours) in 2-D.

        Args:
            wordlist_0 .. wordlist_4: iterables of words; ``None`` is treated
                as an empty list.
            **kwargs: optional settings read with ``kwargs.get``:
                ``color_wordlist_0`` .. ``color_wordlist_4`` (group colours),
                ``n_neighbors`` (default 0), ``n_alpha`` (default 0.3),
                ``nn_method`` (default ``'sklearn'``), ``fontsize``
                (default 18) and ``figsize`` (default ``(20, 15)``).

        Returns:
            The matplotlib figure produced by ``get_plot``.

        Raises:
            Exception: if any word fails validation, or if every list is empty.
        """
        # None defaults avoid sharing mutable default lists between calls.
        wordlist_choice = [
            [] if wordlist is None else wordlist
            for wordlist in (wordlist_0, wordlist_1, wordlist_2,
                             wordlist_3, wordlist_4)
        ]

        err = self.check_oov(wordlist_choice)
        if err:
            raise Exception(err)

        color_dict = {
            0: kwargs.get('color_wordlist_0', '#000000'),
            1: kwargs.get('color_wordlist_1', '#1f78b4'),
            2: kwargs.get('color_wordlist_2', '#33a02c'),
            3: kwargs.get('color_wordlist_3', '#e31a1c'),
            4: kwargs.get('color_wordlist_4', '#6a3d9a')
        }
        n_neighbors = kwargs.get('n_neighbors', 0)
        n_alpha = kwargs.get('n_alpha', 0.3)
        nn_method = kwargs.get('nn_method', 'sklearn')

        processed_word_list = []
        # Every word queued so far, so a neighbour already queued is skipped
        # in O(1). Assumes words are hashable (strings) -- TODO confirm for
        # the values returned by getNearestNeighbors.
        seen_words = set()
        for group, word_list_to_process in enumerate(wordlist_choice):
            group_color = color_dict[group]
            for word in word_list_to_process:
                # Input words are always added, even when repeated.
                processed_word_list.append(WordToPlot(word, group_color, group, 1))
                seen_words.add(word)

                if n_neighbors > 0:
                    neighbors = self.get_neighbors(word,
                                                   n_neighbors=n_neighbors + 1,
                                                   nn_method=nn_method
                                                   )
                    for neighbor in neighbors:
                        if neighbor not in seen_words:
                            processed_word_list.append(
                                WordToPlot(neighbor, group_color, group, n_alpha)
                            )
                            seen_words.add(neighbor)

        if not processed_word_list:
            raise Exception('Only empty lists were passed')

        words_embedded = np.array(
            [self.vocabulary.getPCA(wtp.word) for wtp in processed_word_list]
        )
        data = self.get_df(words_embedded, processed_word_list)

        fig = self.get_plot(data, processed_word_list, words_embedded,
                            color_dict, n_neighbors, n_alpha,
                            kwargs.get('fontsize', 18),
                            kwargs.get('figsize', (20, 15))
                            )
        # NOTE(review): the module selects the non-interactive 'Agg' backend
        # at import time, so this call presumably displays nothing; it is kept
        # so behaviour is unchanged if another backend is active.
        plt.show()
        return fig

    def doesnt_match(self, wordlist):
        """Return the word least similar (cosine) to the mean embedding.

        Ties are resolved in favour of the last such word in ``wordlist``.
        An empty ``wordlist`` returns ``""``.

        Raises:
            Exception: if any word fails validation.
        """
        err = self.check_oov([wordlist])
        if err:
            raise Exception(err)

        words = list(wordlist)
        if not words:
            # Nothing to compare; avoids taking the mean of an empty array.
            return ""

        words_emb = np.array([self.vocabulary.getEmbedding(word) for word in words])
        mean_vec = np.mean(words_emb, axis=0)
        mean_norm = norm(mean_vec)

        outlier = ""
        # Start from +inf rather than 1.0: rounding can push a cosine
        # similarity slightly above 1.0, which would otherwise leave the
        # result empty (e.g. for a single-word list).
        lowest_sim = np.inf
        for word, word_emb in zip(words, words_emb):
            cos_sim = np.dot(mean_vec, word_emb) / (mean_norm * norm(word_emb))
            if cos_sim <= lowest_sim:
                lowest_sim = cos_sim
                outlier = word
        return outlier
|