# Source: Hugging Face Space "felipekitamura/word_embeddings"
# (page metadata at capture time: status "Sleeping", file size 3,786 bytes)

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# Embedding table. The .npy file wraps a pickled Python object, which is why
# allow_pickle is required and .item() is used to unwrap it; the rest of this
# file uses it as a mapping from word to vector.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
model = np.load('gpt2-red-1k-words.npy', allow_pickle=True).item()

# Parallel arrays: data[i] is the vector stored under keys[i]
# (values() and keys() iterate a dict in the same order).
data = np.asarray(list(model.values()))
keys = np.asarray(list(model.keys()))

# File the rendered plot is saved to; inference() returns this path.
cache = "/home/user/app/d.jpg"

def find_most_similar_vectors(vector, lookup_table, k=3):
    """
    Finds the indices of the k vectors in the lookup table that are closest to the given vector.

    Similarity is measured by Euclidean distance: the smaller the distance, the more similar.

    :param vector: A 1xN (or length-N) numpy array, the vector to compare against the others
    :param lookup_table: An MxN numpy array (one candidate vector per row)
    :param k: Number of nearest rows to return; defaults to 3
    :return: A list of at most k row indices of the lookup table, nearest first
    :raises ValueError: If k is negative
    """
    if k < 0:
        # A negative k would silently slice from the wrong end of the ranking.
        raise ValueError(f"k must be non-negative, got {k}")

    # Broadcasting subtracts `vector` from every row; the norm over axis 1
    # then yields one distance per row of the lookup table.
    distances = np.linalg.norm(lookup_table - vector, axis=1)

    # argsort ranks rows by ascending distance; keep the first k of them.
    nearest_first = np.argsort(distances)[:k]

    return nearest_first.tolist()

    
# Function to reduce dimensions
def reduce_dimensions(data, method='PCA'):
    """
    Projects the rows of `data` onto two components.

    :param data: The samples to reduce (one row per sample); passed to fit_transform
    :param method: Which reducer to use, either 'PCA' or 'TSNE'
    :return: Whatever the chosen reducer's fit_transform returns for n_components=2
    :raises ValueError: If `method` is not one of the supported names
    """
    # The local is named `reducer` so it does not shadow the module-level
    # `model` (the embedding table).
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'TSNE':
        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
    else:
        # Previously an unknown method fell through and failed with an
        # UnboundLocalError on the return line.
        raise ValueError(f"Unknown method {method!r}; expected 'PCA' or 'TSNE'")
    return reducer.fit_transform(data)

# Plotting function
def plot_reduced_data(reduced_data, labels, title):
    """
    Draws a labelled scatter plot with two green arrows and saves it to `cache`.

    Only columns 0 and 1 of `reduced_data` are plotted. The arrows run from
    row 0 to row 1 and from row 2 to the last row, so at least three rows
    are required.

    :param reduced_data: A 2-D numpy array with one point per row
    :param labels: An iterable of strings, one per row, in row order
    :param title: The plot title
    :return: None; the image is written to the module-level `cache` path
    """
    # Use an explicit figure/axes pair instead of pyplot's implicit "current
    # figure", so the drawing does not depend on global pyplot state.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
        for i, label in enumerate(labels):
            # Leading spaces keep the text from overlapping its marker.
            ax.annotate("  " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
        ax.set_title(title)

        # Arrow 1: row 0 -> row 1.  Arrow 2: row 2 -> last row.
        for start_row, end_row in ((0, 1), (2, -1)):
            start_point = (reduced_data[start_row, 0], reduced_data[start_row, 1])
            end_point = (reduced_data[end_row, 0], reduced_data[end_row, 1])
            # An empty annotation text draws only the arrow, from xytext to xy.
            ax.annotate('', xy=end_point, xytext=start_point,
                        arrowprops=dict(arrowstyle="->", color='green', lw=3))

        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True)
        fig.savefig(cache)
    finally:
        # pyplot keeps every figure alive until it is closed; without this,
        # each call would leak one figure for the lifetime of the process.
        plt.close(fig)

# Markdown text passed to gr.Interface as its `description`.
# The two trailing spaces after "Credits:" are a Markdown hard line break.
# NOTE(review): the credits name Gensim and Glove, while the vectors are loaded
# from 'gpt2-red-1k-words.npy' -- confirm the attribution is still accurate.
description = """
### Word Embedding Demo App
Universidade Federal de São Paulo - Escola Paulista de Medicina
The output is Word3 + (Word2 - Word1)
Credits:  
* Gensim
* Glove
"""

# Three free-text inputs, one per word of the analogy.
Word1, Word2, Word3 = (gr.Textbox() for _ in range(3))

# Label component titled "Word4"; it is not referenced elsewhere in the visible code.
label = gr.Label(label="Word4", show_label=True)

# Image component used as the interface output (shows the saved plot).
sp = gr.Image()


def inference(word1, word2, word3):
    """
    Solves the analogy word3 + (word2 - word1) and plots the result.

    The analogy vector is computed from the embedding table, its nearest
    vocabulary entries are looked up, and the input words, the neighbours and
    the analogy vector are projected to 2-D with PCA and drawn.

    NOTE: the points are collected in a dict, so repeated words (e.g. the same
    word typed twice) collapse into one point; the arrows drawn by
    plot_reduced_data assume the first three rows are word1, word2 and word3.

    :param word1: The word subtracted in the analogy
    :param word2: The word added in the analogy
    :param word3: The word the offset (word2 - word1) is applied to
    :return: The path of the saved plot image (`cache`)
    :raises gr.Error: If any of the words is missing from the vocabulary
    """
    # Fail with a message the UI can show instead of a bare KeyError.
    unknown = [w for w in (word1, word2, word3) if w not in model]
    if unknown:
        raise gr.Error("Not in the vocabulary: " + ", ".join(repr(w) for w in unknown))

    transform = model[word3] + model[word2] - model[word1]

    # find_most_similar_vectors expects a 1xN query, hence the added axis.
    output = keys[find_most_similar_vectors(transform[np.newaxis, ...], data)]
    print(output)

    # Insertion order fixes the row order of the plot:
    # word1, word2, word3, the neighbours, then the analogy vector last.
    words = {key: model[key] for key in [word1, word2, word3, *output]}
    words[f"{word3} + ({word2} - {word1})"] = transform
    vectors = np.asarray(list(words.values()))

    # Project to two components before plotting. Previously the raw vectors
    # were plotted, so the figure titled 'PCA Results' only showed embedding
    # dimensions 0 and 1.
    reduced = reduce_dimensions(vectors, method='PCA')
    plot_reduced_data(reduced, words.keys(), 'PCA Results')
    return cache

# Example inputs offered in the UI, one [Word1, Word2, Word3] row each.
# All rows share the woman -> man offset and vary only the third word.
examples = [
    ["woman", "man", third_word]
    for third_word in ("aunt", "girl", "granddaughter")
]

# Wire the three text inputs to inference() and show the image it returns.
_interface_kwargs = {
    "fn": inference,
    "inputs": [Word1, Word2, Word3],
    "outputs": sp,
    "description": description,
    "examples": examples,
}
iface = gr.Interface(**_interface_kwargs)

# Start the web app.
iface.launch()