"""Scoring, train/test splitting and visual comparison of original vs. synthetic sequences."""

## Necessary Packages
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Directory that receives every exported figure.
_FIGURE_DIR = "./figures"

# Palette used by `visualization` (index 0: original, index 1: generated).
_PAIR_COLORS = ("#F4A582", "#0571B0", "#5E4FA2", "#54278F")

# Palette used by `visualization_control` (index 0: original, then one per key).
_CONTROL_COLORS = (
    "#CA0020",
    "#F4A582",
    "#92C5DE",
    "#0571B0",
    "#5E4FA2",
    "#54278F",
    "#6A3D9A",
    "#9E0142",
    "#D53E4F",
    "#F46D43",
    "#FDAE61",
    "#FEE08B",
)


def display_scores(results):
    """Print and return the mean and the 95% confidence half-width of the scores.

    The half-width is the standard error of the mean scaled by the two-sided
    Student-t quantile with ``len(results) - 1`` degrees of freedom.

    Args:
      - results: sequence of scores (one per run)

    Returns:
      - mean: average score
      - sigma: half-width of the 95% confidence interval
    """
    mean = np.mean(results)
    # Degrees of freedom follow the actual number of runs instead of assuming 5.
    dof = len(results) - 1
    sigma = scipy.stats.sem(results) * scipy.stats.t.ppf((1 + 0.95) / 2.0, dof)
    print("Final Score: ", f"{mean} \xB1 {sigma}")
    return mean, sigma


def _split_indices(count, train_rate):
    """Return shuffled (train, test) index arrays for `count` samples."""
    order = np.random.permutation(count)
    cut = int(count * train_rate)
    return order[:cut], order[cut:]


def train_test_divide(data_x, data_x_hat, data_t, data_t_hat, train_rate=0.8):
    """Divide train and test data for both original and synthetic data.

    The original and the synthetic data are shuffled independently; within each
    of them the data and its time information share the same split.

    Args:
      - data_x: original data
      - data_x_hat: generated data
      - data_t: original time
      - data_t_hat: generated time
      - train_rate: ratio of training data from the original data

    Returns:
      - train_x, train_x_hat, test_x, test_x_hat,
        train_t, train_t_hat, test_t, test_t_hat: lists of selected items
    """
    # Original data is split first so the random draws keep their order.
    train_idx, test_idx = _split_indices(len(data_x), train_rate)
    train_x = [data_x[i] for i in train_idx]
    test_x = [data_x[i] for i in test_idx]
    train_t = [data_t[i] for i in train_idx]
    test_t = [data_t[i] for i in test_idx]

    # Synthetic data gets its own, independent shuffle.
    train_idx, test_idx = _split_indices(len(data_x_hat), train_rate)
    train_x_hat = [data_x_hat[i] for i in train_idx]
    test_x_hat = [data_x_hat[i] for i in test_idx]
    train_t_hat = [data_t_hat[i] for i in train_idx]
    test_t_hat = [data_t_hat[i] for i in test_idx]

    return (
        train_x,
        train_x_hat,
        test_x,
        test_x_hat,
        train_t,
        train_t_hat,
        test_t,
        test_t_hat,
    )


def extract_time(data):
    """Returns Maximum sequence length and each sequence length.

    Args:
      - data: original data (iterable of sequences)

    Returns:
      - time: extracted time information (length of every sequence)
      - max_seq_len: maximum sequence length (0 when `data` is empty)
    """
    time = [len(seq) for seq in data]
    return time, max(time, default=0)


def _sequence_means(samples):
    """Average a (samples, seq_len, dim) array over its last axis."""
    return np.mean(np.asarray(samples), axis=2)


def _sample_indices(sizes, compare):
    """Draw at most `compare` random indices valid for every data set in `sizes`."""
    pool = min(sizes)
    return np.random.permutation(pool)[: min(compare, pool)]


def _make_tsne():
    """Build the t-SNE estimator on both old and new scikit-learn releases."""
    params = {"n_components": 2, "verbose": 1, "perplexity": 40}
    try:
        # Newer scikit-learn renamed `n_iter` to `max_iter`.
        return TSNE(max_iter=300, **params)
    except TypeError:
        return TSNE(n_iter=300, **params)


def _scatter_figure(labels, embeddings, colors, title):
    """Scatter every 2-D embedding on one axis and return the figure."""
    fig, ax = plt.subplots(1, figsize=(8, 6))
    for i, (label, points) in enumerate(zip(labels, embeddings)):
        ax.scatter(
            points[:, 0],
            points[:, 1],
            color=colors[i % len(colors)],
            alpha=0.5,
            label=label,
        )
    ax.legend()
    ax.set_title(title)
    ax.set_xlabel("x")
    ax.set_ylabel("y")
    return fig


def _density_figure(labels, features, colors):
    """Draw one kernel density curve per data set and return the figure."""
    fig, ax = plt.subplots(1, figsize=(8, 6))
    for i, (label, values) in enumerate(zip(labels, features)):
        flat = np.ravel(values)
        flat = flat[~np.isnan(flat)]
        sns.kdeplot(
            flat,
            ax=ax,
            linewidth=2,
            # The first curve (original data) is solid, the others dashed.
            linestyle="-" if i == 0 else "--",
            label=label,
            color=colors[i % len(colors)],
        )
    ax.legend()
    ax.set_xlabel("Data Value")
    ax.set_ylabel("Data Density Estimate")
    return fig


def _show_and_export(fig, path):
    """Show the figure, write it to `path` as PDF, then release it."""
    plt.show()
    os.makedirs(_FIGURE_DIR, exist_ok=True)
    with PdfPages(path) as pdf:
        pdf.savefig(fig)
    plt.close(fig)


def _plot_comparison(groups, analysis, colors, output_label):
    """Plot the (label, features) pairs in `groups`; the first pair is the original data."""
    labels = [label for label, _ in groups]
    features = [values for _, values in groups]

    if analysis == "pca":
        # The projection is fitted on the original data only.
        pca = PCA(n_components=2)
        pca.fit(features[0])
        embeddings = [pca.transform(values) for values in features]
        fig = _scatter_figure(labels, embeddings, colors, "PCA plot")
    elif analysis == "tsne":
        # t-SNE has no transform step, so all data sets are embedded together
        # and cut back into their groups afterwards.
        stacked = np.concatenate(features, axis=0)
        tsne_results = _make_tsne().fit_transform(stacked)
        bounds = np.cumsum([len(values) for values in features])[:-1]
        embeddings = np.split(tsne_results, bounds)
        fig = _scatter_figure(labels, embeddings, colors, "t-SNE plot")
    elif analysis == "kernel":
        fig = _density_figure(labels, features, colors)
    else:
        warnings.warn(
            f"Unknown analysis {analysis!r}; expected 'pca', 'tsne' or 'kernel'. "
            "Nothing was plotted."
        )
        return

    _show_and_export(fig, f"{_FIGURE_DIR}/{output_label}_{analysis}.pdf")


def visualization(ori_data, generated_data, analysis, compare=3000, output_label=""):
    """Using PCA or tSNE for generated and original data visualization.

    Every sample is reduced to its per-time-step mean over the feature axis
    before plotting. The figure is shown and saved to
    ``./figures/{output_label}_{analysis}.pdf``.

    Args:
      - ori_data: original data, array of shape (samples, seq_len, dim)
      - generated_data: generated synthetic data, same layout as `ori_data`
      - analysis: tsne or pca or kernel
      - compare: maximum number of samples used (for faster computation)
      - output_label: prefix of the exported file name
    """
    # Indices are limited to what both arrays can provide.
    idx = _sample_indices((ori_data.shape[0], len(generated_data)), compare)
    groups = [
        ("Original", _sequence_means(ori_data[idx])),
        ("Generated", _sequence_means(generated_data[idx])),
    ]
    _plot_comparison(groups, analysis, _PAIR_COLORS, output_label)


def visualization_control(data, analysis, compare=3000, output_label=""):
    """Using PCA or tSNE for generated and original data visualization.

    Same as `visualization`, but compares the original data with any number of
    generated data sets. The `data` dictionary is left unmodified.

    Args:
      - data: dictionary of original and generated data; the original data is
        stored under "ori_data", every other key labels one generated data set
      - analysis: tsne or pca or kernel
      - compare: maximum number of samples used (for faster computation)
      - output_label: prefix of the exported file name
    """
    ori_data = data["ori_data"]
    keys = [key for key in data if key != "ori_data"]

    sizes = [ori_data.shape[0]] + [len(data[key]) for key in keys]
    idx = _sample_indices(sizes, compare)

    # Subsample into fresh arrays so the caller's dictionary stays intact.
    groups = [("Original", _sequence_means(ori_data[idx]))]
    groups.extend((key, _sequence_means(data[key][idx])) for key in keys)
    _plot_comparison(groups, analysis, _CONTROL_COLORS, output_label)


def save_pdf(fig, path):
    """Save `fig` to `path` as a tightly cropped PDF.

    Args:
      - fig: matplotlib figure
      - path: destination file
    """
    fig.savefig(path, format="pdf", bbox_inches="tight")


if __name__ == "__main__":
    pass