File size: 3,786 Bytes
c2b2793
 
 
 
 
 
b065d7a
52be539
b2748e6
c2b2793
 
 
b065d7a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c2b2793
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d66951
c2b2793
 
b065d7a
c2b2793
 
cbb23b4
ef24c39
c2b2793
b065d7a
 
cbb23b4
c2b2793
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
# The .npy file holds a pickled dict (hence the .item() call); inference()
# indexes it by word to obtain embedding vectors.
# NOTE(security): allow_pickle=True makes np.load unpickle arbitrary Python
# objects, which can execute code. Only load this file from a trusted source.
model = np.load('gpt2-red-1k-words.npy', allow_pickle=True).item()

# Values and keys are materialised in the same dict order, so row i of `data`
# belongs to `keys[i]`.
data = np.asarray(list(model.values()))
keys = np.asarray(list(model.keys()))

# Path the rendered plot is written to; inference() returns it for display.
cache = "/home/user/app/d.jpg"

def find_most_similar_vectors(vector, lookup_table, k=3):
    """
    Finds the indices of the k vectors in the lookup table closest to the given vector.

    Similarity is measured by Euclidean distance: the smaller the distance,
    the more similar the vectors.

    :param vector: A 1xN numpy array (the vector to compare against others)
    :param lookup_table: An MxN numpy array (a matrix of vectors)
    :param k: Number of nearest rows to return (defaults to 3). If k exceeds M,
        all M indices are returned.
    :return: A list of at most k row indices into the lookup table, nearest first
    :raises ValueError: If k is negative
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Broadcasting subtracts the query from every row; the norm over axis 1
    # then yields one distance per row of the lookup table.
    distances = np.linalg.norm(lookup_table - vector, axis=1)

    # argsort orders ascending, so the first k entries are the closest rows.
    return np.argsort(distances)[:k].tolist()

    
# Function to reduce dimensions
def reduce_dimensions(data, method='PCA'):
    """
    Projects the given samples onto two components with the selected algorithm.

    :param data: The samples to reduce; passed unchanged to ``fit_transform``
    :param method: Either 'PCA' or 'TSNE'
    :return: Whatever the chosen reducer's ``fit_transform`` returns for ``data``
    :raises ValueError: If ``method`` is not one of the supported names
    """
    # The local is called `reducer` so it does not shadow the module-level
    # `model` embedding table.
    if method == 'PCA':
        reducer = PCA(n_components=2)
    elif method == 'TSNE':
        # NOTE(review): perplexity=3 is presumably chosen because only a
        # handful of points are plotted at a time - confirm before reuse.
        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
    else:
        # Previously an unknown method fell through and crashed with an
        # UnboundLocalError on the return line.
        raise ValueError(f"Unsupported method {method!r}; expected 'PCA' or 'TSNE'")
    return reducer.fit_transform(data)

# Plotting function
def plot_reduced_data(reduced_data, labels, title):
    """
    Draws a labelled scatter plot with two arrows and saves it to ``cache``.

    Only the first two columns of ``reduced_data`` are plotted. Two green
    arrows are drawn: one from row 0 to row 1, and one from row 2 to the last
    row. The image is written to the module-level ``cache`` path.

    :param reduced_data: A 2-D numpy array with at least three rows and two columns
    :param labels: An iterable of strings; the i-th label annotates row i
    :param title: The title shown above the plot
    :return: None
    """
    # An explicit figure/axes pair avoids pyplot's implicit "current figure"
    # state and lets the figure be closed reliably below.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
        for i, label in enumerate(labels):
            # The leading spaces nudge the text off the marker.
            ax.annotate("  " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
        ax.set_title(title)

        arrow_style = dict(arrowstyle="->", color='green', lw=3)

        # Arrow 1: starts at row 0 and points to row 1.
        ax.annotate('',
                    xy=(reduced_data[1, 0], reduced_data[1, 1]),
                    xytext=(reduced_data[0, 0], reduced_data[0, 1]),
                    arrowprops=dict(arrow_style))

        # Arrow 2: starts at row 2 and points to the last row.
        ax.annotate('',
                    xy=(reduced_data[-1, 0], reduced_data[-1, 1]),
                    xytext=(reduced_data[2, 0], reduced_data[2, 1]),
                    arrowprops=dict(arrow_style))

        ax.set_xlabel('Component 1')
        ax.set_ylabel('Component 2')
        ax.grid(True)
        fig.savefig(cache)
    finally:
        # Figures created through pyplot stay registered until closed, so a
        # long-running app would otherwise accumulate one per call.
        plt.close(fig)

# Markdown text handed to the interface as its description.
description = """
### Word Embedding Demo App
Universidade Federal de São Paulo - Escola Paulista de Medicina
The output is Word3 + (Word2 - Word1)
Credits:  
* Gensim
* Glove
"""

# Three plain text inputs, created in the order Word1, Word2, Word3.
Word1, Word2, Word3 = (gr.Textbox() for _ in range(3))

# Output components: a label captioned "Word4" and an image.
label = gr.Label(label="Word4", show_label=True)
sp = gr.Image()


def inference(word1, word2, word3):
    """
    Computes word3 + (word2 - word1), finds its nearest vocabulary words and plots them.

    The plot contains the three input words, the nearest words returned by
    ``find_most_similar_vectors`` and the computed vector itself; it is saved
    to ``cache`` by ``plot_reduced_data``.

    :param word1: The word whose vector is subtracted
    :param word2: The word whose vector is added
    :param word3: The word the offset (word2 - word1) is applied to
    :return: The path of the saved plot image (``cache``)
    :raises gr.Error: If any of the words is not a key of ``model``
    """
    # Report unknown words explicitly instead of failing with a bare KeyError.
    missing = [word for word in (word1, word2, word3) if word not in model]
    if missing:
        raise gr.Error("Unknown word(s): " + ", ".join(repr(word) for word in missing))

    transform = model[word3] + model[word2] - model[word1]
    # The lookup expects a 1xN query, hence the added leading axis.
    output = keys[find_most_similar_vectors(transform[np.newaxis, ...], data)]
    print(output)

    word_list = [word1, word2, word3]
    word_list.extend(output)
    # A dict keeps insertion order and collapses repeated words, so a neighbour
    # that equals one of the inputs is plotted only once.
    words = {key: model[key] for key in word_list}
    # The computed vector goes last; plot_reduced_data points its second
    # arrow at the final row.
    words[word3 + " + (" + word2 + " - " + word1 + ")"] = transform

    data2 = np.stack(list(words.values()))
    labels = list(words)
    # NOTE(review): the title says PCA but no reduction is applied here;
    # presumably the stored vectors are already two-dimensional - confirm.
    plot_reduced_data(data2, labels, 'PCA Results')
    return cache

# Pre-filled analogy prompts, each ordered as [Word1, Word2, Word3].
examples = [
    ["woman", "man", third_word]
    for third_word in ("aunt", "girl", "granddaughter")
]

# Wire the three text inputs to inference() and show its result in the image.
iface = gr.Interface(
    inference,
    [Word1, Word2, Word3],
    sp,
    examples=examples,
    description=description,
)

# Start the web app.
iface.launch()