# word_embeddings / app.py
# felipekitamura's picture
# Update app.py
# 9d66951 verified
# raw
# history blame
# 3.79 kB
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Word -> embedding-vector mapping. The .npy file holds a pickled dict wrapped
# in a 0-d object array, hence allow_pickle=True followed by .item().
# NOTE(security): allow_pickle=True unpickles arbitrary objects; only load
# embedding files that come from a trusted source.
model = np.load('gpt2-red-1k-words.npy', allow_pickle=True).item()
# Row i of `data` is the vector of word `keys[i]` (both follow the dict's
# iteration order), so an index found in `data` can be mapped back to a word.
data = np.asarray(list(model.values()))
keys = np.asarray(list(model.keys()))
# Image file that plot_reduced_data writes and inference returns to Gradio.
cache = "/home/user/app/d.jpg"
def find_most_similar_vectors(vector, lookup_table, k=3):
    """
    Finds the indices of the k vectors in the lookup table that are closest to the given vector.

    Similarity is measured by Euclidean distance: the smaller the distance, the more similar.

    :param vector: A 1xN numpy array (the vector to compare against others)
    :param lookup_table: An MxN numpy array (a matrix of vectors)
    :param k: How many nearest vectors to return (defaults to 3); if k exceeds M, all M indices are returned
    :return: A list of at most k row indices into the lookup table, ordered from most to least similar
    :raises ValueError: If k is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")
    # Broadcasting subtracts the query from every row; the norm along axis 1
    # then yields one distance per row of the lookup table.
    distances = np.linalg.norm(lookup_table - vector, axis=1)
    # A stable sort keeps equidistant rows in their original order, so the
    # result is deterministic when distances tie.
    nearest = np.argsort(distances, kind="stable")[:k]
    return nearest.tolist()
# Function to reduce dimensions
def reduce_dimensions(data, method='PCA'):
    """
    Projects the rows of `data` onto two components.

    :param data: The samples to project; handed unchanged to the reducer's fit_transform
    :param method: Which reducer to use, either 'PCA' or 'TSNE'
    :return: Whatever the chosen reducer's fit_transform returns for `data`
    :raises ValueError: If `method` is neither 'PCA' nor 'TSNE'
    """
    # The local is named `reducer` so it does not shadow the module-level
    # embedding table called `model`.
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'TSNE':
        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
    else:
        # Fail with a clear message instead of an UnboundLocalError on the
        # return line below.
        raise ValueError(
            "Unsupported reduction method {!r}; expected 'PCA' or 'TSNE'".format(method)
        )
    return reducer.fit_transform(data)
# Plotting function
def plot_reduced_data(reduced_data, labels, title):
    """
    Draws a labelled 2-D scatter plot with two arrows and saves it to the `cache` image file.

    The i-th label is written next to row i of `reduced_data`. One arrow goes
    from row 0 to row 1 and a second arrow goes from row 2 to the last row, so
    the array needs at least three rows.

    :param reduced_data: A 2-D numpy array; column 0 is used as x and column 1 as y
    :param labels: An iterable of strings, one per leading row of `reduced_data`
    :param title: The plot title
    :return: None; the figure is written to the module-level `cache` path
    """
    fig = plt.figure(figsize=(10, 8))
    try:
        plt.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
        for i, label in enumerate(labels):
            plt.annotate(" " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
        plt.title(title)

        arrow_style = dict(arrowstyle="->", color='green', lw=3)

        # Arrow 1: from row 0 to row 1
        start_point = (reduced_data[0, 0], reduced_data[0, 1])
        end_point = (reduced_data[1, 0], reduced_data[1, 1])
        plt.annotate('', xy=end_point, xytext=start_point,
                     arrowprops=dict(arrow_style))

        # Arrow 2: from row 2 to the last row
        start_point = (reduced_data[2, 0], reduced_data[2, 1])
        end_point = (reduced_data[-1, 0], reduced_data[-1, 1])
        plt.annotate('', xy=end_point, xytext=start_point,
                     arrowprops=dict(arrow_style))

        plt.xlabel('Component 1')
        plt.ylabel('Component 2')
        plt.grid(True)
        plt.savefig(cache)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each call would leave one more figure in memory.
        plt.close(fig)
# Text shown by the Gradio interface (passed as its `description`).
description = """
### Word Embedding Demo App
Universidade Federal de São Paulo - Escola Paulista de Medicina
The output is Word3 + (Word2 - Word1)
Credits:
* Gensim
* Glove
"""

# Three independent text inputs, one per term of the analogy.
Word1, Word2, Word3 = (gr.Textbox() for _ in range(3))

# Output components.
label = gr.Label(label="Word4", show_label=True)
sp = gr.Image()
def inference(word1, word2, word3):
    """
    Computes word3 + (word2 - word1), looks up the nearest vocabulary words and plots them.

    The plot contains the three input words, the nearest words returned by
    find_most_similar_vectors and the computed vector itself. The arrows are
    drawn by plot_reduced_data from the first to the second point and from
    the third to the last point (the computed vector).

    :param word1: The word whose vector is subtracted
    :param word2: The word whose vector is added
    :param word3: The word the difference (word2 - word1) is applied to
    :return: The path of the image file written by plot_reduced_data
    :raises gr.Error: If any of the three words is missing from the vocabulary
    """
    unknown = [word for word in (word1, word2, word3) if word not in model]
    if unknown:
        # Show a readable message in the UI instead of a bare KeyError.
        raise gr.Error("Word(s) not in the vocabulary: " + ", ".join(unknown))

    transform = model[word3] + model[word2] - model[word1]
    output = keys[find_most_similar_vectors(transform[np.newaxis, ...], data)]
    print(output)

    word_list = [word1, word2, word3]
    word_list.extend(output)
    # Dict insertion order fixes the row order of the plotted points. A word
    # that occurs twice (e.g. a neighbour equal to an input) is kept once.
    words = {key: model[key] for key in word_list}
    words[word3 + " + (" + word2 + " - " + word1 + ")"] = transform

    # Plot only the selected vectors. Passing the whole lookup table here would
    # attach the labels and arrows to unrelated rows. A separate local name is
    # used so the module-level `data` stays readable above.
    points = np.stack(list(words.values()))
    # NOTE(review): assumes each embedding is a 1-D vector; the projection is
    # only needed when the vectors have more than two components -- confirm
    # against the contents of the .npy file.
    if points.shape[1] > 2:
        points = reduce_dimensions(points, method='PCA')

    plot_reduced_data(points, list(words), 'PCA Results')
    return cache
# Example rows for the interface, in the order (word1, word2, word3).
examples = [
    ["woman", "man", third_word]
    for third_word in ("aunt", "girl", "granddaughter")
]

# Wire the three text inputs to `inference` and show the returned image.
iface = gr.Interface(
    description=description,
    examples=examples,
    fn=inference,
    inputs=[Word1, Word2, Word3],
    outputs=sp,
)

iface.launch()