import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from matplotlib.font_manager import FontProperties
from scipy.stats import entropy
from sklearn.manifold import TSNE
import pickle
import pandas as pd
import os
import numpy as np
from fuson_plm.utils.logging import log_update, find_fuson_plm_directory

def set_font():
    # Load and set the font
    fuson_plm_dir = find_fuson_plm_directory()
    
    # Paths for regular, bold, italic fonts
    regular_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Regular.ttf')
    bold_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Bold.ttf')
    italic_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-Italic.ttf')
    bold_italic_font_path = os.path.join(fuson_plm_dir, 'ubuntu_font', 'Ubuntu-BoldItalic.ttf')
    
    # Load the font properties
    regular_font = FontProperties(fname=regular_font_path)
    bold_font = FontProperties(fname=bold_font_path)
    italic_font = FontProperties(fname=italic_font_path)
    bold_italic_font = FontProperties(fname=bold_italic_font_path)
    
    # Add the fonts to the font manager
    fm.fontManager.addfont(regular_font_path)
    fm.fontManager.addfont(bold_font_path)
    fm.fontManager.addfont(italic_font_path)
    fm.fontManager.addfont(bold_italic_font_path)
    
    # Set the font family globally to Ubuntu
    plt.rcParams['font.family'] = regular_font.get_name()
    
    # Set the fonts for math text (like for labels) to use the loaded Ubuntu fonts
    plt.rcParams['mathtext.fontset'] = 'custom'
    plt.rcParams['mathtext.rm'] = regular_font.get_name()
    plt.rcParams['mathtext.it'] = f'{italic_font.get_name()}'
    plt.rcParams['mathtext.bf'] = f'{bold_font.get_name()}'

# Default color for each split (Okabe-Ito colorblind-friendly palette)
default_color_map = {
    'train': '#0072B2',
    'val': '#009E73',
    'test': '#E69F00'
}

def get_avg_embeddings_for_tsne(train_sequences=None, val_sequences=None, test_sequences=None, embedding_path='fuson_db_embeddings/fuson_db_esm2_t33_650M_UR50D_avg_embeddings.pkl'):
    """
    Load precomputed average (per-protein) embeddings and split them by set.
    The pickle at embedding_path is expected to map sequence strings to embedding vectors.
    Returns None in place of any split whose sequence list was not provided, so callers
    can distinguish an absent split from an empty one.
    """
    train_set = set(train_sequences) if train_sequences else set()
    val_set = set(val_sequences) if val_sequences else set()
    test_set = set(test_sequences) if test_sequences else set()
    
    try:
        with open(embedding_path, 'rb') as f:
            embeddings = pickle.load(f)
    except (OSError, pickle.UnpicklingError) as e:
        log_update(f"Could not open embeddings at {embedding_path}: {e}")
        return None, None, None
    
    train_embeddings = [v for k, v in embeddings.items() if k in train_set] if train_set else None
    val_embeddings = [v for k, v in embeddings.items() if k in val_set] if val_set else None
    test_embeddings = [v for k, v in embeddings.items() if k in test_set] if test_set else None
    
    return train_embeddings, val_embeddings, test_embeddings


def calculate_aa_composition(sequences):
    composition = {}
    total_length = sum([len(seq) for seq in sequences])
    
    for seq in sequences:
        for aa in seq:
            if aa in composition:
                composition[aa] += 1
            else:
                composition[aa] = 1
    
    # Convert counts to relative frequency
    for aa in composition:
        composition[aa] /= total_length
    
    return composition
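
# Worked example: calculate_aa_composition(["AAC", "C"]) returns {'A': 0.5, 'C': 0.5},
# since there are two A's and two C's across a total of four residues.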

def calculate_shannon_entropy(sequence):
    """
    Calculate the Shannon entropy for a given sequence.

    Args:
        sequence (str): A sequence of characters (e.g., amino acids or nucleotides).

    Returns:
        float: Shannon entropy value.
    """
    bases = set(sequence)
    counts = [sequence.count(base) for base in bases]
    return entropy(counts, base=2)
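
# Worked example: calculate_shannon_entropy("AAAA") == 0.0 (a single residue type carries
# no uncertainty), while calculate_shannon_entropy("ACDE") == 2.0 bits (four residues,
# each appearing with equal frequency).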

def visualize_splits_hist(train_lengths=None, val_lengths=None, test_lengths=None, colormap=None, savepath='splits/length_distributions.png', axes=None):
    """
    Works to plot train, val, test; train, val; or train, test
    """
    set_font()
    if colormap is None: colormap=default_color_map
    
    log_update('\nMaking histogram of length distributions')
    
    # Get index for test plot
    val_plot_index, test_plot_index, total_plots = 1, 2, 3
    if val_lengths is None:
        val_plot_index = None
        test_plot_index-= 1
        total_plots-=1
    if test_lengths is None:
        test_plot_index = None
        total_plots-=1
        
    # Create a figure with 1 row and one column per provided split
    fig_individual, axes_individual = plt.subplots(1, total_plots, figsize=(6*total_plots, 6))
    
    # Set axes list
    axes_list = [axes_individual] if axes is None else [axes_individual, axes]
    
    # Unpack the labels and titles
    xlabel, ylabel = ['Sequence Length (AA)', 'Frequency']

    for cur_axes in axes_list:
        # Plot the first histogram
        cur_axes[0].hist(train_lengths, bins=20, edgecolor='k',color=colormap['train'])
        cur_axes[0].set_xlabel(xlabel)
        cur_axes[0].set_ylabel(ylabel)
        cur_axes[0].set_title(f'Train Set Length Distribution (n={len(train_lengths)})')
        cur_axes[0].grid(True)
        cur_axes[0].set_axisbelow(True)

        # Plot the second histogram
        if not(val_plot_index is None):
            cur_axes[val_plot_index].hist(val_lengths, bins=20, edgecolor='k',color=colormap['val'])
            cur_axes[val_plot_index].set_xlabel(xlabel)
            cur_axes[val_plot_index].set_ylabel(ylabel)
            cur_axes[val_plot_index].set_title(f'Validation Set Length Distribution (n={len(val_lengths)})')
            cur_axes[val_plot_index].grid(True)
            cur_axes[val_plot_index].set_axisbelow(True)

        # Plot the third histogram
        if not(test_plot_index is None):
            cur_axes[test_plot_index].hist(test_lengths, bins=20, edgecolor='k',color=colormap['test'])
            cur_axes[test_plot_index].set_xlabel(xlabel)
            cur_axes[test_plot_index].set_ylabel(ylabel)
            cur_axes[test_plot_index].set_title(f'Test Set Length Distribution (n={len(test_lengths)})')
            cur_axes[test_plot_index].grid(True)
            cur_axes[test_plot_index].set_axisbelow(True)

    # Adjust layout
    fig_individual.set_tight_layout(True) 

    # Save the figure
    fig_individual.savefig(savepath)
    log_update(f"\tSaved figure to {savepath}") 
    
def visualize_splits_scatter(train_clusters=None, val_clusters=None, test_clusters=None, benchmark_cluster_reps=None, colormap=None, savepath='splits/scatterplot.png', axes=None):
    """
    Scatterplot of cluster-size distributions: for each split, the fraction of its proteins
    that fall in clusters of a given size.
    """
    set_font()
    if colormap is None: colormap=default_color_map
    
    # Create a standalone figure and a single axes for the individual plot
    fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
    
    # Set axes list
    axes_list = [axes_individual] if axes is None else [axes_individual, axes]
    
    log_update("\nMaking scatterplot with distribution of cluster sizes across train, test, and val")
    # Make grouped versions of these DataFrames for size analysis
    train_clustersgb = train_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
    if not(val_clusters is None): 
        val_clustersgb = val_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
    if not(test_clusters is None):
        test_clustersgb = test_clusters.groupby('representative seq_id')['member seq_id'].count().reset_index().rename(columns={'member seq_id':'member count'})
        # Isolate benchmark-containing clusters so their contribution can be plotted separately
        total_test_proteins = sum(test_clustersgb['member count'])
    if not(benchmark_cluster_reps is None) and not(test_clusters is None):
        test_clustersgb['benchmark cluster'] = test_clustersgb['representative seq_id'].isin(benchmark_cluster_reps)
        benchmark_clustersgb = test_clustersgb.loc[test_clustersgb['benchmark cluster']].reset_index(drop=True)
        test_clustersgb = test_clustersgb.loc[test_clustersgb['benchmark cluster']==False].reset_index(drop=True)
        
    # Convert them to value counts (first column = cluster size, second = number of clusters of that size)
    train_clustersgb = train_clustersgb['member count'].value_counts().reset_index()
    train_clustersgb.columns = ['cluster size (n_members)', 'n_clusters']
    if not(val_clusters is None): 
        val_clustersgb = val_clustersgb['member count'].value_counts().reset_index()
        val_clustersgb.columns = ['cluster size (n_members)', 'n_clusters']
    if not(test_clusters is None):
        test_clustersgb = test_clustersgb['member count'].value_counts().reset_index()
        test_clustersgb.columns = ['cluster size (n_members)', 'n_clusters']
    if not(benchmark_cluster_reps is None) and not(test_clusters is None): 
        benchmark_clustersgb = benchmark_clustersgb['member count'].value_counts().reset_index()
        benchmark_clustersgb.columns = ['cluster size (n_members)', 'n_clusters']
    
    # Get the percentage of each dataset that's made of each cluster size
    train_clustersgb['n_proteins'] = train_clustersgb['cluster size (n_members)']*train_clustersgb['n_clusters']    # proteins per cluster * n clusters = # proteins
    train_clustersgb['percent_proteins'] = train_clustersgb['n_proteins']/sum(train_clustersgb['n_proteins'])
    if not(val_clusters is None):
        val_clustersgb['n_proteins'] = val_clustersgb['cluster size (n_members)']*val_clustersgb['n_clusters']
        val_clustersgb['percent_proteins'] = val_clustersgb['n_proteins']/sum(val_clustersgb['n_proteins'])
    if not(test_clusters is None):
        test_clustersgb['n_proteins'] = test_clustersgb['cluster size (n_members)']*test_clustersgb['n_clusters']
        test_clustersgb['percent_proteins'] = test_clustersgb['n_proteins']/total_test_proteins
    if not(benchmark_cluster_reps is None) and not(test_clusters is None):
        benchmark_clustersgb['n_proteins'] = benchmark_clustersgb['cluster size (n_members)']*benchmark_clustersgb['n_clusters']
        benchmark_clustersgb['percent_proteins'] = benchmark_clustersgb['n_proteins']/total_test_proteins
    
    # Specially mark the benchmark clusters because these can't be reallocated
    for ax in axes_list:
        ax.plot(train_clustersgb['cluster size (n_members)'],train_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['train'],label='train')
        if not(val_clusters is None): 
            ax.plot(val_clustersgb['cluster size (n_members)'],val_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['val'],label='val')
        if not(test_clusters is None):
            ax.plot(test_clustersgb['cluster size (n_members)'],test_clustersgb['percent_proteins'],linestyle='None',marker='.',color=colormap['test'],label='test')
        if not(benchmark_cluster_reps is None) and not(test_clusters is None):
            ax.plot(benchmark_clustersgb['cluster size (n_members)'],benchmark_clustersgb['percent_proteins'],
                marker='o',                
                linestyle='None',          
                markerfacecolor=colormap['test'],      # fill same as test
                markeredgecolor='black',    # outline black
                markeredgewidth=1.5,        
                label='benchmark'
            )
        ax.set(ylabel='Percentage of Proteins in Dataset', xlabel='Cluster Size (Number of Members)')
        ax.legend()
        
    # save the figure
    fig_individual.set_tight_layout(True)
    fig_individual.savefig(savepath)
    log_update(f"\tSaved figure to {savepath}") 
    

def visualize_splits_tsne(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, esm_type="esm2_t33_650M_UR50D", embedding_path="fuson_db_embeddings/fuson_db_esm2_t33_650M_UR50D_avg_embeddings.pkl", savepath='splits/tsne_plot.png',axes=None):
    """
    Generate a t-SNE plot of embeddings for the train, validation, and test sets.
    """
    set_font()
    if colormap is None: colormap=default_color_map
    log_update('\nMaking t-SNE plot of train, val, and test embeddings')
    # Create a standalone figure and a single axes for the individual plot
    fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
    
    # Set axes list
    axes_list = [axes_individual] if axes is None else [axes_individual, axes]
    
    # Combine the embeddings into one array
    train_embeddings, val_embeddings, test_embeddings = get_avg_embeddings_for_tsne(train_sequences=train_sequences, 
                                                                                    val_sequences=val_sequences, 
                                                                                    test_sequences=test_sequences, embedding_path=embedding_path)
    if not(val_embeddings is None) and not(test_embeddings is None):
        embeddings = np.concatenate([train_embeddings, val_embeddings, test_embeddings])
        labels = ['train'] * len(train_embeddings) + ['val'] * len(val_embeddings) + ['test'] * len(test_embeddings)
    if not(val_embeddings is None) and (test_embeddings is None):
        embeddings = np.concatenate([train_embeddings, val_embeddings])
        labels = ['train'] * len(train_embeddings) + ['val'] * len(val_embeddings)
    if (val_embeddings is None) and not(test_embeddings is None):
        embeddings = np.concatenate([train_embeddings, test_embeddings])
        labels = ['train'] * len(train_embeddings) + ['test'] * len(test_embeddings)

    # Perform t-SNE
    tsne = TSNE(n_components=2, random_state=42)
    tsne_results = tsne.fit_transform(embeddings)

    # Convert t-SNE results into a DataFrame
    tsne_df = pd.DataFrame(data=tsne_results, columns=['TSNE_1', 'TSNE_2'])
    tsne_df['label'] = labels

    for ax in axes_list:
        # Scatter plot for each set
        for label, color in colormap.items():
            subset = tsne_df[tsne_df['label'] == label].reset_index(drop=True)
            ax.scatter(subset['TSNE_1'], subset['TSNE_2'], c=color, label=label.capitalize(), alpha=0.6)
        
        ax.set_title(f't-SNE of {esm_type} Embeddings')
        ax.set_xlabel('t-SNE Dimension 1')
        ax.set_ylabel('t-SNE Dimension 2')
        ax.legend()
        ax.grid(True)
    
    # Save the figure if savepath is provided
    fig_individual.set_tight_layout(True)
    fig_individual.savefig(savepath)
    log_update(f"\tSaved figure to {savepath}") 

def visualize_splits_shannon_entropy(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, savepath='splits/shannon_entropy_plot.png',axes=None):
    """
    Generate Shannon entropy plots for train, validation, and test sets.
    """
    set_font()
    # Get index for test plot
    val_plot_index, test_plot_index, total_plots = 1, 2, 3
    if val_sequences is None:
        val_plot_index = None
        test_plot_index-= 1
        total_plots-=1
    if test_sequences is None:
        test_plot_index = None
        total_plots-=1
    
    if colormap is None: colormap=default_color_map
    # Create a figure with 1 row and one column per provided split
    fig_individual, axes_individual = plt.subplots(1, total_plots, figsize=(6*total_plots, 6))
    
    # Set axes list
    axes_list = [axes_individual] if axes is None else [axes_individual, axes]

    log_update('\nMaking histogram of Shannon Entropy distributions')
    train_entropy = [calculate_shannon_entropy(seq) for seq in train_sequences]
    if not(val_plot_index is None): 
        val_entropy = [calculate_shannon_entropy(seq) for seq in val_sequences]
    if not(test_plot_index is None):
        test_entropy = [calculate_shannon_entropy(seq) for seq in test_sequences]
    
    for ax in axes_list:
        ax[0].hist(train_entropy, bins=20, edgecolor='k', color=colormap['train'])
        ax[0].set_title(f'Train Set (n={len(train_entropy)})')
        ax[0].set_xlabel('Shannon Entropy')
        ax[0].set_ylabel('Frequency')
        ax[0].grid(True)
        ax[0].set_axisbelow(True)
        
        if not(val_plot_index is None):
            ax[val_plot_index].hist(val_entropy, bins=20, edgecolor='k', color=colormap['val'])
            ax[val_plot_index].set_title(f'Validation Set (n={len(val_entropy)})')
            ax[val_plot_index].set_xlabel('Shannon Entropy')
            ax[val_plot_index].grid(True)
            ax[val_plot_index].set_axisbelow(True)
        
        if not(test_plot_index is None):
            ax[test_plot_index].hist(test_entropy, bins=20, edgecolor='k', color=colormap['test'])
            ax[test_plot_index].set_title(f'Test Set (n={len(test_entropy)})')
            ax[test_plot_index].set_xlabel('Shannon Entropy')
            ax[test_plot_index].grid(True)
            ax[test_plot_index].set_axisbelow(True)
        
    fig_individual.set_tight_layout(True)
    fig_individual.savefig(savepath)
    log_update(f"\tSaved figure to {savepath}") 

def visualize_splits_aa_composition(train_sequences=None, val_sequences=None, test_sequences=None, colormap=None, savepath='splits/aa_comp.png',axes=None):
    set_font()
    if colormap is None: colormap=default_color_map
    
    # Create a standalone figure and a single axes for the individual plot
    fig_individual, axes_individual = plt.subplots(figsize=(18, 6))
    
    # Set axes list
    axes_list = [axes_individual] if axes is None else [axes_individual, axes]
    
    log_update('\nMaking bar plot of AA composition across each set')
    train_comp = calculate_aa_composition(train_sequences)
    if not(val_sequences is None): 
        val_comp = calculate_aa_composition(val_sequences)
    if not(test_sequences is None):
        test_comp = calculate_aa_composition(test_sequences)

    # Create DataFrame
    if not(val_sequences is None) and not(test_sequences is None):
        comp_df = pd.DataFrame([train_comp, val_comp, test_comp], index=['train', 'val', 'test']).T
    if not(val_sequences is None) and (test_sequences is None):
        comp_df = pd.DataFrame([train_comp, val_comp], index=['train', 'val']).T
    if (val_sequences is None) and not(test_sequences is None):
        comp_df = pd.DataFrame([train_comp, test_comp], index=['train', 'test']).T
    colors = [colormap[col] for col in comp_df.columns]

    # Plotting
    for ax in axes_list: 
        comp_df.plot(kind='bar', color=colors, ax=ax)
        ax.set_title('Amino Acid Composition Across Datasets')
        ax.set_xlabel('Amino Acid')
        ax.set_ylabel('Relative Frequency')
    
    fig_individual.set_tight_layout(True)
    fig_individual.savefig(savepath)
    log_update(f"\tSaved figure to {savepath}") 

### Outer methods for visualizing splits
def visualize_splits(train_clusters=None, val_clusters=None, test_clusters=None, benchmark_cluster_reps=None, train_color='#0072B2',val_color='#009E73',test_color='#E69F00',esm_embeddings_path=None, onehot_embeddings_path=None):
    colormap = {
        'train': train_color,
        'val': val_color,
        'test': test_color
    }
    valid_entry = False
    # Dispatch to the appropriate plotting routine based on which splits were provided
    if not(train_clusters is None) and not(val_clusters is None) and not(test_clusters is None):
        visualize_train_val_test_splits(train_clusters, val_clusters, test_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
        valid_entry=True
    if not(train_clusters is None) and (val_clusters is None) and not(test_clusters is None):
        visualize_train_test_splits(train_clusters, test_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
        valid_entry=True
    if not(train_clusters is None) and not(val_clusters is None) and (test_clusters is None):
        visualize_train_val_splits(train_clusters, val_clusters, benchmark_cluster_reps=benchmark_cluster_reps, colormap=colormap, esm_embeddings_path=esm_embeddings_path, onehot_embeddings_path=onehot_embeddings_path)
        valid_entry=True
    
    if not(valid_entry): raise ValueError("Must pass train clusters and at least one of val or test clusters")
    
def visualize_train_val_test_splits(train_clusters, val_clusters, test_clusters, benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
    if colormap is None: colormap=default_color_map
    # Add length column
    train_clusters['member length'] = train_clusters['member seq'].str.len()
    val_clusters['member length'] = val_clusters['member seq'].str.len()
    test_clusters['member length'] = test_clusters['member seq'].str.len()
    
    # Prepare lengths and seqs for plotting
    train_lengths = train_clusters['member length'].tolist()
    val_lengths = val_clusters['member length'].tolist()
    test_lengths = test_clusters['member length'].tolist()
    train_sequences = train_clusters['member seq'].tolist()
    val_sequences = val_clusters['member seq'].tolist()
    test_sequences = test_clusters['member seq'].tolist()
    
    # Create a combined figure with 3 rows and 3 columns
    set_font()
    fig_combined, axs = plt.subplots(3, 3, figsize=(24, 18))
    
    # Make the three visualization plots for saving TOGETHER
    visualize_splits_hist(train_lengths=train_lengths,
                          val_lengths=val_lengths,
                          test_lengths=test_lengths,
                          colormap=colormap, axes=axs[0])
    visualize_splits_shannon_entropy(train_sequences=train_sequences,
                                     val_sequences=val_sequences,
                                     test_sequences=test_sequences,
                                     colormap=colormap,axes=axs[1])
    visualize_splits_scatter(train_clusters=train_clusters, 
                             val_clusters=val_clusters,
                             test_clusters=test_clusters,
                             benchmark_cluster_reps=benchmark_cluster_reps, 
                             colormap=colormap, axes=axs[2, 0])
    visualize_splits_aa_composition(train_sequences=train_sequences,
                                    val_sequences=val_sequences,
                                    test_sequences=test_sequences,
                                    colormap=colormap, axes=axs[2, 1])
    if not(esm_embeddings_path is None) and os.path.exists(esm_embeddings_path):
        visualize_splits_tsne(train_sequences=train_sequences, 
                              val_sequences=val_sequences, 
                              test_sequences=test_sequences, 
                              colormap=colormap, embedding_path=esm_embeddings_path, axes=axs[2, 2])
    else:
        # Leave the last subplot blank
        axs[2, 2].axis('off')

    fig_combined.tight_layout()
    fig_combined.savefig('splits/combined_plot.png')
    log_update("\nSaved combined figure to splits/combined_plot.png")

def visualize_train_test_splits(train_clusters, test_clusters,  benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
    if colormap is None: colormap=default_color_map
    # Add length column
    train_clusters['member length'] = train_clusters['member seq'].str.len()
    test_clusters['member length'] = test_clusters['member seq'].str.len()
    
    # Prepare lengths and seqs for plotting
    train_lengths = train_clusters['member length'].tolist()
    test_lengths = test_clusters['member length'].tolist()
    train_sequences = train_clusters['member seq'].tolist()
    test_sequences = test_clusters['member seq'].tolist()
    
    # Create a combined figure with 4 rows and 2 columns if TSNE plot, 3 x 2 otherwise
    if not(esm_embeddings_path is None) and os.path.exists(esm_embeddings_path):
        set_font()
        fig_combined, axs = plt.subplots(4, 2, figsize=(18, 36))
        visualize_splits_tsne(train_sequences=train_sequences, 
                              val_sequences=None, 
                              test_sequences=test_sequences, 
                              colormap=colormap, embedding_path=esm_embeddings_path, axes=axs[3, 0])
        axs[-1,1].axis('off')
    else:
        set_font()
        fig_combined, axs = plt.subplots(3, 2, figsize=(18, 18))
    
    # Make the three visualization plots for saving TOGETHER
    visualize_splits_hist(train_lengths=train_lengths,
                          val_lengths=None,
                          test_lengths=test_lengths,
                          colormap=colormap, axes=axs[0])
    visualize_splits_shannon_entropy(train_sequences=train_sequences,
                                     val_sequences=None,
                                     test_sequences=test_sequences,
                                     colormap=colormap,axes=axs[1])
    visualize_splits_scatter(train_clusters=train_clusters, 
                             val_clusters=None,
                             test_clusters=test_clusters,
                             benchmark_cluster_reps=benchmark_cluster_reps, 
                             colormap=colormap, axes=axs[2, 0])
    visualize_splits_aa_composition(train_sequences=train_sequences,
                                    val_sequences=None,
                                    test_sequences=test_sequences,
                                    colormap=colormap, axes=axs[2, 1])

    fig_combined.tight_layout()
    fig_combined.savefig('splits/combined_plot.png')
    log_update("\nSaved combined figure to splits/combined_plot.png")

def visualize_train_val_splits(train_clusters, val_clusters, benchmark_cluster_reps=None, colormap=None, esm_embeddings_path=None, onehot_embeddings_path=None):
    if colormap is None: colormap=default_color_map
    # Add length column
    train_clusters['member length'] = train_clusters['member seq'].str.len()
    val_clusters['member length'] = val_clusters['member seq'].str.len()
    
    # Prepare lengths and seqs for plotting
    train_lengths = train_clusters['member length'].tolist()
    val_lengths = val_clusters['member length'].tolist()
    train_sequences = train_clusters['member seq'].tolist()
    val_sequences = val_clusters['member seq'].tolist()
    
    # Create a combined figure with 4 rows and 2 columns if TSNE plot, 3 x 2 otherwise
    if not(esm_embeddings_path is None) and os.path.exists(esm_embeddings_path):
        set_font()
        fig_combined, axs = plt.subplots(4, 2, figsize=(18, 36))
        visualize_splits_tsne(train_sequences=train_sequences, 
                              val_sequences=val_sequences, 
                              test_sequences=None, 
                              colormap=colormap, embedding_path=esm_embeddings_path, axes=axs[3, 0])
        axs[-1,1].axis('off')
    else:
        set_font()
        fig_combined, axs = plt.subplots(3, 2, figsize=(18, 18))
    
    # Make the three visualization plots for saving TOGETHER
    visualize_splits_hist(train_lengths=train_lengths,
                          val_lengths=val_lengths,
                          test_lengths=None,
                          colormap=colormap, axes=axs[0])
    visualize_splits_shannon_entropy(train_sequences=train_sequences,
                                     val_sequences=val_sequences,
                                     test_sequences=None,
                                     colormap=colormap,axes=axs[1])
    visualize_splits_scatter(train_clusters=train_clusters, 
                             val_clusters=val_clusters,
                             test_clusters=None,
                             benchmark_cluster_reps=benchmark_cluster_reps, 
                             colormap=colormap, axes=axs[2, 0])
    visualize_splits_aa_composition(train_sequences=train_sequences,
                                    val_sequences=val_sequences,
                                    test_sequences=None,
                                    colormap=colormap, axes=axs[2, 1])

    fig_combined.tight_layout()
    fig_combined.savefig('splits/combined_plot.png')
    log_update("\nSaved combined figure to splits/combined_plot.png")