Spaces:

felipekitamura
/

word_embeddings

Sleeping

App Files Files Community

word_embeddings / app.py

felipekitamura

Update app.py

9d66951 verified 9 months ago

raw

history blame

3.79 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	from sklearn.decomposition import PCA
	from sklearn.manifold import TSNE
	# Load the embedding table. The .npy file holds a pickled ``dict`` wrapped in a
	# 0-d object array, so ``.item()`` is needed to unwrap the dict itself.
	# NOTE(review): ``allow_pickle=True`` lets numpy unpickle arbitrary objects, which
	# can execute code. That is acceptable only for a file bundled with the app
	# (presumably the case here); never point this at an untrusted file.
	model = np.load('gpt2-red-1k-words.npy', allow_pickle=True).item()

	# Row i of ``data`` is the vector stored under ``keys[i]``: dicts preserve
	# insertion order, so ``values()`` and ``keys()`` line up.
	data = np.asarray(list(model.values()))
	keys = np.asarray(list(model.keys()))

	# File the plot is written to by ``plot_reduced_data`` and returned by ``inference``.
	cache = "/home/user/app/d.jpg"

	def find_most_similar_vectors(vector, lookup_table, k=3):
	    """
	    Finds the indices of the ``k`` most similar vectors in the lookup table to the given vector.

	    Similarity is measured by Euclidean distance: the smaller the distance, the
	    more similar the vectors.

	    :param vector: A 1xN (or length-N) numpy array (the vector to compare against others)
	    :param lookup_table: An MxN numpy array (a matrix of vectors, one per row)
	    :param k: How many indices to return (default 3). If ``k`` is larger than M,
	        all M indices are returned.
	    :return: A list of at most ``k`` row indices into the lookup table, ordered
	        from the closest vector to the farthest
	    :raises ValueError: If ``k`` is negative
	    """
	    # A negative k would silently slice from the end instead of failing.
	    if k < 0:
	        raise ValueError("k must be non-negative, got %r" % (k,))

	    # Broadcasting subtracts ``vector`` from every row of the table; taking the
	    # norm along axis 1 then yields one distance per row.
	    distances = np.linalg.norm(lookup_table - vector, axis=1)

	    # Sort row indices by ascending distance and keep the first k.
	    nearest_first = np.argsort(distances)
	    return nearest_first[:k].tolist()


	def reduce_dimensions(data, method='PCA'):
	    """
	    Projects a matrix of vectors down to two components.

	    :param data: The samples to project, one per row, in whatever form the
	        chosen estimator's ``fit_transform`` accepts
	    :param method: Which estimator to use: ``'PCA'`` (default) or ``'TSNE'``
	    :return: The result of ``fit_transform`` for the chosen 2-component estimator
	    :raises ValueError: If ``method`` is not ``'PCA'`` or ``'TSNE'``
	    """
	    # The estimator is deliberately not called ``model``: that name is the
	    # module-level embedding dictionary.
	    if method == 'PCA':
	        reducer = PCA(n_components=2)
	    elif method == 'TSNE':
	        # NOTE(review): perplexity=3 is very low, presumably because only a
	        # handful of points are projected at a time - confirm with callers.
	        reducer = TSNE(n_components=2, learning_rate='auto', init='random', perplexity=3)
	    else:
	        # Without this branch an unknown method surfaced as an UnboundLocalError.
	        raise ValueError(
	            "Unsupported method %r; expected 'PCA' or 'TSNE'" % (method,)
	        )
	    return reducer.fit_transform(data)

	def plot_reduced_data(reduced_data, labels, title):
	    """
	    Draws a labelled 2-D scatter plot with two green arrows and saves it to ``cache``.

	    The first arrow runs from row 0 to row 1 and the second from row 2 to the
	    last row, so ``reduced_data`` needs at least three rows.

	    :param reduced_data: A numpy array whose first two columns are the x and y
	        coordinates of the points, one point per row
	    :param labels: An iterable of strings; the i-th label is drawn next to row i
	    :param title: The plot title
	    :return: None. The image is written to the module-level ``cache`` path.
	    """
	    fig, ax = plt.subplots(figsize=(10, 8))
	    try:
	        ax.scatter(reduced_data[:, 0], reduced_data[:, 1], alpha=0.6)
	        for i, label in enumerate(labels):
	            # The leading space keeps the text from touching the marker.
	            ax.annotate(" " + label, (reduced_data[i, 0], reduced_data[i, 1]), fontsize=18)
	        ax.set_title(title)

	        arrow_style = dict(arrowstyle="->", color='green', lw=3)

	        # Arrow 1: starts at row 0 and ends at row 1.
	        start_point = (reduced_data[0, 0], reduced_data[0, 1])
	        end_point = (reduced_data[1, 0], reduced_data[1, 1])
	        ax.annotate('', xy=end_point, xytext=start_point, arrowprops=dict(arrow_style))

	        # Arrow 2: starts at row 2 and ends at the last row.
	        start_point = (reduced_data[2, 0], reduced_data[2, 1])
	        end_point = (reduced_data[-1, 0], reduced_data[-1, 1])
	        ax.annotate('', xy=end_point, xytext=start_point, arrowprops=dict(arrow_style))

	        ax.set_xlabel('Component 1')
	        ax.set_ylabel('Component 2')
	        ax.grid(True)
	        fig.savefig(cache)
	    finally:
	        # pyplot keeps every figure alive until it is closed; without this each
	        # call would leave one more figure registered in pyplot.
	        plt.close(fig)

	# Text passed to gr.Interface as its ``description``.
	description = """
	### Word Embedding Demo App
	Universidade Federal de São Paulo - Escola Paulista de Medicina
	The output is Word3 + (Word2 - Word1)
	Credits:
	* Gensim
	* Glove
	"""

	# Input components: the three words of the analogy, passed to ``inference``
	# in this order.
	Word1 = gr.Textbox()
	Word2 = gr.Textbox()
	Word3 = gr.Textbox()
	# NOTE(review): ``label`` is created but not passed to the gr.Interface call
	# visible in this file (its only output is ``sp``) - confirm whether it is
	# still needed.
	label = gr.Label(show_label=True, label="Word4")
	# Output component that displays the image file returned by ``inference``.
	sp = gr.Image()


	def inference(word1, word2, word3):
	    """
	    Solves the analogy ``word3 + (word2 - word1)`` and plots the words involved.

	    The plotted points are, in order: ``word1``, ``word2``, ``word3``, the three
	    vocabulary words closest to the computed vector, and the computed vector.
	    ``plot_reduced_data`` draws its arrows word1 -> word2 and word3 -> result
	    based on that ordering.

	    :param word1: The word whose vector is subtracted
	    :param word2: The word whose vector is added
	    :param word3: The word the difference ``word2 - word1`` is applied to
	    :return: The path of the saved plot image (the module-level ``cache``)
	    :raises gr.Error: If any input is blank or not a key of ``model``
	    """
	    # Textbox values may carry stray whitespace (or be None when left empty).
	    word1, word2, word3 = ((w or "").strip() for w in (word1, word2, word3))
	    unknown = [w for w in (word1, word2, word3) if w not in model]
	    if unknown:
	        # gr.Error is shown to the user instead of a bare KeyError traceback.
	        raise gr.Error("Word(s) not in the vocabulary: " + ", ".join(repr(w) for w in unknown))

	    transform = model[word3] + model[word2] - model[word1]
	    output = keys[find_most_similar_vectors(transform[np.newaxis, ...], data)]
	    print(output)

	    # A dict removes duplicates (a nearest neighbour is often one of the inputs)
	    # while keeping first-seen order.
	    # NOTE(review): if the same word is entered twice the rows shift and the
	    # arrows no longer connect the intended points.
	    word_list = [word1, word2, word3]
	    word_list.extend(output.tolist())
	    words = {key: model[key] for key in word_list}
	    words[word3 + " + (" + word2 + " - " + word1 + ")"] = transform

	    # Plot only the selected vectors. The previous code passed the module-level
	    # ``data`` (the whole vocabulary), so labels and arrows landed on unrelated rows.
	    points = np.stack(list(words.values()))
	    if points.shape[1] > 2:
	        # Project wider embeddings to 2-D; vectors that are already 2-D are
	        # plotted as they are.
	        points = reduce_dimensions(points, method='PCA')

	    plot_reduced_data(points, list(words), 'PCA Results')
	    return cache

	# Pre-filled queries offered in the UI; each row is [Word1, Word2, Word3].
	examples = [["woman", "man", third] for third in ("aunt", "girl", "granddaughter")]

	# Wire the three text inputs to ``inference`` and show the image it returns.
	iface = gr.Interface(
	    inference,
	    [Word1, Word2, Word3],
	    sp,
	    examples=examples,
	    description=description,
	)

	# Start serving the app.
	iface.launch()