File size: 3,464 Bytes
7e70cfa
 
 
 
73d009a
7e70cfa
73d009a
 
 
 
 
 
 
 
 
 
 
 
7e70cfa
73d009a
7e70cfa
73d009a
 
 
 
 
7e70cfa
 
 
 
 
 
73d009a
13b5dca
0d09ea1
13b5dca
 
8522e75
ef08db2
13b5dca
0d09ea1
 
13b5dca
 
 
 
 
 
8522e75
ef08db2
7e70cfa
13b5dca
 
 
7e70cfa
 
 
 
13b5dca
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7e70cfa
 
 
 
 
 
 
 
 
 
13b5dca
7e70cfa
13b5dca
 
7e70cfa
13b5dca
 
7e70cfa
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import numpy as np


# Latent Feature Cluster for Training Data using T-SNE
def TSNE_reduction(latent_points: np.ndarray, perplexity=30, learning_rate=20):
    """
    Project latent-space points onto two dimensions with T-SNE.

    :param latent_points: [ndarray] - an array of arrays, one row per object, holding its latent-space coordinates
    :param perplexity: [int] - default 30. Balances the attention T-SNE gives to local versus global structure;
    roughly a guess of how many close neighbors each point has. Denser datasets call for larger values.
    Recommended: perplexity(5-50)
    :param learning_rate: [int] - default 20. Too high and the result may look like a ball with every point
    about equidistant from its nearest neighbours; too low and most points may collapse into a dense cloud
    with a few outliers. Recommended: learning_rate(10-1000)
    :return: [tuple] - (x, y, title, embedding): the first and second columns of the reduced data, the fixed
    plot title "T-SNE of Data", and the TSNE estimator object that fit_transform was called on
    """
    # n_components is the dimension of the embedded space; random_state is pinned to 0
    reducer = TSNE(
        n_components=2,
        perplexity=perplexity,
        learning_rate=learning_rate,
        random_state=0,
    )

    projected = reducer.fit_transform(latent_points)
    first_axis, second_axis = projected[:, 0], projected[:, 1]
    return first_axis, second_axis, "T-SNE of Data", reducer


def plot_dimensionality_reduction(x: list, y: list, label_set: list, title: str):
    """
    Draw a 2-D embedding on the current pyplot figure, colored by the supplied labels.

    :param x: [sequence] - first embedding coordinate of every point (list or ndarray)
    :param y: [sequence] - second embedding coordinate of every point (list or ndarray)
    :param label_set: [sequence] - one label per point. Floating-point labels (any float width) are drawn with
    plt.scatter on a continuous color scale plus a colorbar titled 'Average Density'; every other kind of label
    is treated as discrete and gets one plt.plot series and legend entry per distinct value
    :param title: [str] - text passed to plt.title
    :return: None - the function only draws; showing or saving the figure is left to the caller
    """
    # Normalise to arrays so plain Python lists work for boolean-mask indexing and dtype inspection
    x = np.asarray(x)
    y = np.asarray(y)
    labels = np.asarray(label_set)

    plt.title(title)
    # Color points based on their density; check the dtype of the whole array rather than the first element,
    # so Python floats and float32 values are recognised too
    if np.issubdtype(labels.dtype, np.floating):
        plt.scatter(x, y, c=labels)
        cbar = plt.colorbar()
        cbar.set_label('Average Density', fontsize=12)
        print("using scatter")

    # Color points based on a discrete label
    else:
        # np.unique yields the distinct labels in sorted order, giving a stable legend order
        for label in np.unique(labels):
            # Compare against the label itself; casting to str would never match integer labels
            mask = labels == label
            plt.plot(x[mask], y[mask], marker='o', linestyle='none', label=label)

        plt.legend(numpoints=1)
    plt.xlabel("Dimension 1")
    plt.ylabel("Dimension 2")
########################################################################################################################
# NOTE: The triple-quoted block below is a bare string literal, so nothing inside it runs on import.
# It preserves an ad-hoc script that reads '2D_Lattice.csv', picks `number_samples` row indices with
# np.random.randint, flattens each JSON-encoded 'Array' cell, divides each row sum by the row length to get
# an average density, and then calls TSNE_reduction and plot_dimensionality_reduction before plt.savefig.
# NOTE(review): removing the quotes would make all of this execute every time the module is imported.
"""
# Use for personal plotting

import pandas as pd
import json

df = pd.read_csv('2D_Lattice.csv')
# row = 0
# box = df.iloc[row,1]
# array = np.array(json.loads(box))

# Select a subset of the data to use
number_samples = 10000
perplexity = 300

random_samples = sorted(np.random.randint(0,len(df), number_samples))  # Generates ordered samples

df = df.iloc[random_samples]

print(df)
print(np.shape(df))


# For plotting CSV data
# define a function to flatten a box
def flatten_box(box_str):
    box = json.loads(box_str)
    return np.array(box).flatten()


# apply the flatten_box function to each row of the dataframe and create a list of flattened arrays
flattened_arrays = df['Array'].apply(flatten_box).tolist()
avg_density = np.sum(flattened_arrays, axis=1)/(len(flattened_arrays[0]))

x, y, title, embedding = TSNE_reduction(flattened_arrays, perplexity=perplexity)
plot_dimensionality_reduction(x, y, avg_density, title)
plt.title(title)
plt.savefig('TSNE_Partial_Factorial_Perplexity_' + str(perplexity) + "_Data_Samples_" + str(number_samples))

"""