File size: 3,656 Bytes
e9744b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
import streamlit as st
import pandas as pd
from os import listdir
import plotly.graph_objects as go

def _list_gene_names(directory, suffix='_HUMAN.csv.gz'):
    """Return the sorted gene names that have a score file in *directory*.

    Only entries whose file name ends with *suffix* are considered, and the
    gene name is the file name with that suffix removed.  The viewer later
    opens ``gene + '_HUMAN.csv.gz'``, so entries that do not follow this
    naming scheme (hidden files, stray artifacts) could never be loaded and
    are left out of the list.

    ``os.listdir`` returns entries in arbitrary order; sorting makes the
    order of the options deterministic across runs and machines.
    """
    return sorted(
        name[:-len(suffix)]
        for name in listdir(directory)
        if name.endswith(suffix) and len(name) > len(suffix)
    )


# Gene names available in each of the three score directories.
gene_names_eve = _list_gene_names('./CPT1_score_EVE_set/')
gene_names_no_eve1 = _list_gene_names('./CPT1_score_no_EVE_set1/')
gene_names_no_eve2 = _list_gene_names('./CPT1_score_no_EVE_set2/')

# Page header: tool name, one-line tagline, and a short description of the
# data set served by this lookup tool.
st.subheader('CPT-1')
st.markdown('Cross-protein transfer learning for variant effect prediction')
st.markdown(
    'This is a lookup tool for the variant effect predictions of CPT-1 for '
    '18,602 human proteins, initially released with the manuscript '
    '"Cross-protein transfer learning substantially improves zero-shot '
    'prediction of disease variant effects (2022)".'
)

# Gene picker: a single dropdown offering every gene from the three score
# sets, EVE set first, followed by the two non-EVE sets.
gene_options = gene_names_eve + gene_names_no_eve1 + gene_names_no_eve2
gene = st.selectbox('Which gene/protein are you interested in?', gene_options)

# Load, plot and offer the scores for download only on the run in which the
# "Show results" button reports a click.
# NOTE(review): pressing the download button presumably triggers a Streamlit
# rerun in which st.button() returns False again, so the results would
# disappear - confirm whether that is acceptable.
if st.button('Show results'):
    # Pick the directory holding this gene's score file: the first set whose
    # name list contains the gene wins, and set 2 is the fallback.
    if gene in gene_names_eve:
        score_dir = './CPT1_score_EVE_set/'
    elif gene in gene_names_no_eve1:
        score_dir = './CPT1_score_no_EVE_set1/'
    else:
        score_dir = './CPT1_score_no_EVE_set2/'
    pred = pd.read_csv(score_dir + gene + '_HUMAN.csv.gz', compression='gzip')

    # Reshape the per-variant table into a grid with one row per mutant
    # amino acid and one column per position label.
    aa_label = 'Mutant amino acid'
    pos_label = 'Position on protein sequence'
    mat = pred.copy()
    # The last character of 'mutant' is taken as the substituted residue and
    # everything before it as the position label.
    positions = mat['mutant'].str[:-1]
    mat[aa_label] = mat['mutant'].str[-1]
    # Ordered categorical with categories in order of first appearance;
    # presumably this keeps the pivoted columns in file order rather than in
    # sorted label order.
    mat[pos_label] = pd.Categorical(
        positions, categories=positions.unique(), ordered=True
    )
    mat = mat.set_index('mutant')
    # After set_index, the first remaining column supplies the cell values.
    mat = mat.pivot(index=aa_label, columns=pos_label, values=mat.columns[0])

    # Heatmap trace: blue-to-white-to-red colour scale.
    heat_colors = [
        [0, '#6FA8DC'],
        [0.3, '#CFE2F3'],
        [0.5, '#FFFFFF'],
        [0.7, '#F4CCCC'],
        [1.0, '#BA1111'],
    ]
    # The leading whitespace on the continuation lines is part of the string
    # literal and is kept exactly as it was.
    hover_text = 'Wild-type amino acid and position: %{x}\
                       <br>Mutant amino acid: %{y}\
                       <br>Prediction: %{z}<extra></extra>'
    heatmap = go.Heatmap(
        z=mat,
        y=mat.index,
        x=mat.columns,
        colorbar={'title': 'Variant effect'},
        colorscale=heat_colors,
        hovertemplate=hover_text,
    )
    fig = go.Figure(data=[heatmap])
    fig.update_layout(
        title_text=f"CPT-1 variant effect prediction for {gene} (red: pathogenic, blue: benign)",
        xaxis={'title': pos_label, 'rangeslider': {'visible': True}},
        yaxis={'title': aa_label},
        # One tick per row of the grid, i.e. per mutant amino acid.
        yaxis_nticks=len(mat.index),
        height=600,
    )
    st.plotly_chart(fig, theme="streamlit", use_container_width=True, height=600)

    # Offer the unpivoted table, indexed by 'mutant', as a UTF-8 CSV download.
    csv_bytes = pred.set_index('mutant').to_csv().encode('utf-8')
    st.download_button('Download CSV', csv_bytes, gene + '_CPT_score.csv', 'text/csv')
    
# Citation
# The pieces below are adjacent string literals joined into one Markdown/HTML
# snippet.  Pieces containing "\*" are raw literals: "\*" is not a valid
# Python escape sequence, so a non-raw literal triggers an invalid-escape
# warning on recent Python versions.  The raw form passes the same backslash
# through to Markdown, where it presumably keeps the asterisk literal.
# A space after "&dagger;" separates the author list from the quoted title.
st.markdown(
    '</br>'
    '<h5> Citation </h5>'
    r'Jagota, M.\*, Ye, C.\*, Rastogi, R., Albors, C., Koehl, A., Ioannidis, N., and Song, Y.S.&dagger; '
    '"Cross-protein transfer learning substantially improves zero-shot prediction of disease variant effects", '
    'bioRxiv (2022)  </br>'
    r'\*These authors contributed equally to this work.  </br>'
    '&dagger; To whom correspondence should be addressed:  yss@berkeley.edu  </br>'
    'DOI: https://doi.org/10.1101/2022.11.15.516532',
    unsafe_allow_html=True,
)