from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np
# Latent Feature Cluster for Training Data using t-SNE
def TSNE_reduction(latent_points: np.ndarray, perplexity=30, learning_rate=20):
    """
    :param latent_points: [ndarray] - an array of arrays that define the points of an object in the latent space
    :param perplexity: [int] - default perplexity = 30. "Perplexity balances the attention t-SNE gives to local and
        global aspects of the data. It is roughly a guess of the number of close neighbors each point has...
        a denser dataset ... requires a higher perplexity value." Recommended range: 5-50
    :param learning_rate: [int] - default learning_rate = 20. "If the learning rate is too high, the data may look
        like a 'ball' with any point approximately equidistant from its nearest neighbours.
        If the learning rate is too low, most points may look compressed in a dense cloud with few outliers."
        Recommended range: 10-1000
    :return: [tuple] - the x and y coordinates of the reduced latent space, a plot title, and the 2-D embedding
    """
    # n_components is the dimension of the embedded space
    model = TSNE(n_components=2, random_state=0, perplexity=perplexity,
                 learning_rate=learning_rate)
    # For large datasets, subsample to a few hundred points first so t-SNE doesn't take too long
    embedding = model.fit_transform(latent_points)
    x = embedding[:, 0]
    y = embedding[:, 1]
    title = "t-SNE of Data"
    return x, y, title, embedding
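
# --- Example usage of TSNE_reduction (illustrative sketch, not part of the original pipeline) ---
# The random latent points and subsample size below are placeholder assumptions; the subsampling
# step follows the comment above about keeping t-SNE fast on large datasets.
# rng = np.random.default_rng(0)
# latent_points = rng.normal(size=(1000, 8))                              # placeholder latent vectors
# subsample = rng.choice(len(latent_points), size=200, replace=False)     # keep t-SNE fast
# x, y, title, embedding = TSNE_reduction(latent_points[subsample], perplexity=30)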

def plot_dimensionality_reduction(x: np.ndarray, y: np.ndarray, label_set, title: str):
    plt.title(title)
    label_array = np.asarray(label_set)
    # Color points on a continuous scale (e.g. average density)
    if np.issubdtype(label_array.dtype, np.floating):
        plt.scatter(x, y, c=label_array)
        cbar = plt.colorbar()
        cbar.set_label('Average Density', fontsize=12)
    # Color points based on a discrete label
    else:
        for label in set(label_set):
            cond = np.where(label_array == label)
            plt.plot(x[cond], y[cond], marker='o', linestyle='none', label=label)
        plt.legend(numpoints=1)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
########################################################################################################################
"""
# Use for personal plotting
import pandas as pd
import json
df = pd.read_csv('2D_Lattice.csv')
# row = 0
# box = df.iloc[row,1]
# array = np.array(json.loads(box))
# Select a subset of the data to use
number_samples = 10000
perplexity = 300
random_samples = sorted(np.random.randint(0,len(df), number_samples)) # Generates ordered samples
df = df.iloc[random_samples]
print(df)
print(np.shape(df))
# For plotting CSV data
# define a function to flatten a box
def flatten_box(box_str):
box = json.loads(box_str)
return np.array(box).flatten()
# apply the flatten_box function to each row of the dataframe and create a list of flattened arrays
flattened_arrays = df['Array'].apply(flatten_box).tolist()
avg_density = np.sum(flattened_arrays, axis=1)/(len(flattened_arrays[0]))
x, y, title, embedding = TSNE_reduction(flattened_arrays, perplexity=perplexity)
plot_dimensionality_reduction(x, y, avg_density, title)
plt.title(title)
plt.savefig('TSNE_Partial_Factorial_Perplexity_' + str(perplexity) + "_Data_Samples_" + str(number_samples))
"""