# AI-assisted Rare Disease Diagnosis with SHEPHERD

import gradio as gr
import pandas as pd
from pathlib import Path
import ast

'''
Causal Gene Discovery Model
/home/ema30/zaklab/rare_disease_dx/checkpoints/aligner/04_30_22:13:29:55_lr_1e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_gene_multisimilarity/all_udn_patients_kg_8.9.21_kgsolved_manual_baylor_nobgm_distractor_genes_5_candidates_mapped_only_genes

Patients-Like-Me Model
/home/ema30/zaklab/rare_disease_dx/checkpoints/patient_NCA/04_26_22:17:38:30_lr_5e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_patient_patient_NCA/mygene2_all_sim_all_udn_patients_kg_8.9.21_kgsolved_with_phenotypes

Disease Characterization Model
/home/ema30/zaklab/rare_disease_dx/checkpoints/patient_NCA/05_13_22:08:00:32_lr_1e-05_val_simulated_pats.disease_split_val_sim_pats_kg_8.9.21_kg_losstype_pd_NCA/mygene2_all_sim_all_udn_patients_kg_8.9.21_kgsolved_with_phenotypes
'''


# Similarity-score tables, one per view of the demo. Each table is filtered on
# its `patient_id` column whenever a patient is selected.
(
    gene_scores_df,
    exomiser_gene_scores_df,
    patient_scores_df,
    dx_scores_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'gene_discovery_scores.csv',
        'exomiser_gene_discovery_scores.csv',
        'patients_like_me_scores.csv',
        'dx_characterization_scores.csv',
    )
)

# Phenotype-attention tables matching the score tables above (also keyed by
# `patient_id`).
(
    plm_attn_df,
    dx_attn_df,
    gene_attn_df,
    exomiser_gene_attn_df,
) = (
    pd.read_csv(csv_name)
    for csv_name in (
        'patients_like_me_scores_attn.csv',
        'dx_characterization_scores_attn.csv',
        'gene_discovery_scores_attn.csv',
        'exomiser_gene_discovery_scores_attn.csv',
    )
)

# Hand-written diagnosis shown for each demo patient. `get_patient` uses these
# labels in preference to whatever is flagged as correct in the score tables.
diseases_map = {
    'UDN-P1': 'POLR3-related leukodystrophy',
    'UDN-P2': 'Novel Syndrome',
    'UDN-P3': 'Coffin-Lowry syndrome',
    'UDN-P4': 'autosomal recessive spastic paraplegia type 76',
    'UDN-P5': 'atypical presentation of familial cold autoinflammatory syndrome',
    'UDN-P6': '*GATAD2B*-associated syndrome',
    'UDN-P7': 'AR limb-girdle muscular atrophy type 2D',
    'UDN-P8': '*ATP5PO*-related Leigh syndrome',
    'UDN-P9': 'Spondyloepimetaphyseal dysplasia, Isidor-Toutain type',
}

# Hand-written causal gene(s) per demo patient. Patients missing here
# (UDN-P1, UDN-P2) fall back to the genes labelled correct in `gene_scores_df`.
genes_map = {
    'UDN-P3': 'RPS6KA3',
    'UDN-P4': 'CAPN1',
    'UDN-P5': 'NLRP12, RAPGEFL1',
    'UDN-P6': 'GATAD2B',
    'UDN-P7': 'SGCA',
    'UDN-P8': 'ATP5PO',  # letter O, matching the disease name in diseases_map
    'UDN-P9': 'RPL13',
}


def get_patient(patient_id, attn_df):
    '''
    Build the Markdown summary displayed above a patient's predictions.

    The causal gene and the disease are taken from `genes_map` / `diseases_map`
    when the patient has an entry there. Otherwise they are the comma-joined
    values whose label column equals 1 in `gene_scores_df` / `dx_scores_df`.
    Phenotypes are the comma-joined `phenotypes` values of the patient's rows
    in `attn_df`, in table order.

    Returns the summary as a single Markdown/HTML string.
    '''
    # Causal gene: curated entry first, labelled rows of the score table otherwise.
    if patient_id not in genes_map:
        gene_rows = gene_scores_df[gene_scores_df['patient_id'] == patient_id]
        causal_rows = gene_rows[gene_rows['correct_gene_label'] == 1]
        gene = ', '.join(causal_rows['genes'].tolist())
    else:
        gene = genes_map[patient_id]

    # Disease: same precedence as for the gene.
    if patient_id not in diseases_map:
        dx_rows = dx_scores_df[dx_scores_df['patient_id'] == patient_id]
        correct_rows = dx_rows[dx_rows['correct_label'] == 1]
        disease = ', '.join(correct_rows['diseases'].tolist())
    else:
        disease = diseases_map[patient_id]

    phenotype_rows = attn_df[attn_df['patient_id'] == patient_id]
    phenotypes = ', '.join(phenotype_rows['phenotypes'].tolist())

    return f'''
    **Selected Patient:** {patient_id}<br>
    **Causal Gene:** *{gene}*<br>
    **Disease:** {disease}<br>
    **Phenotypes:** {phenotypes}<br><br>
    '''


def read_file(filename):
    '''
    Return the entire contents of `filename`, opened in text mode with the
    platform's default encoding.
    '''
    with open(filename, 'r') as handle:
        return handle.read()


def causal_gene_discovery(patient_id, prioritization_type):
    '''
    Assemble the outputs of the "Causal Gene Discovery" tab for one patient.

    A `prioritization_type` of 'Variant Filtered' selects the Exomiser score
    and attention tables; any other value selects the default gene tables.

    Returns a 4-tuple:
      - patient summary Markdown (from `get_patient`)
      - candidate-gene table, highest similarity first, with GeneCards links;
        rows whose `correct_gene_label` is 1 are bold and green
      - phenotype-attention table, highest attention first
      - HTML iframe pointing at the patient's knowledge-graph neighborhood page
    '''
    use_exomiser = prioritization_type == 'Variant Filtered'
    all_scores = exomiser_gene_scores_df if use_exomiser else gene_scores_df
    all_attn = exomiser_gene_attn_df if use_exomiser else gene_attn_df

    def highlight(text):
        return f'<span style="color:green">**{text}**</span>'

    #############
    # Gene ranking
    ranked = all_scores.loc[all_scores['patient_id'] == patient_id]
    ranked = ranked.sort_values("similarities", ascending=False)
    # scores are shown as text so the causal row can carry markup
    ranked['similarities'] = ranked['similarities'].round(3).astype(str)
    ranked['genes'] = ranked['genes'].apply(lambda symbol: f'<u>[{symbol}](https://www.genecards.org/cgi-bin/carddisp.pl?gene={symbol})</u>')

    is_causal = ranked['correct_gene_label'] == 1
    for column in ('similarities', 'genes'):
        ranked.loc[is_causal, column] = ranked.loc[is_causal, column].apply(highlight)

    ranked = ranked.drop(columns=['patient_id', 'correct_gene_label'])
    ranked = ranked.rename(columns={'similarities': 'SHEPHERD Score', 'genes': 'Candidate Genes'})

    #############
    # Attention
    attention = all_attn.loc[all_attn['patient_id'] == patient_id]
    attention = attention.sort_values("attention", ascending=False)
    attention['attention'] = attention['attention'].round(4)
    attention = attention.drop(columns=['patient_id', 'degrees'])

    #############
    # KG neighborhood
    html_file = f'https://michellemli.github.io/test_html/{patient_id}.html'
    kg_html = f'''<iframe id="igraph" scrolling="no" style="border:none;  width: 100%; height: 600px" seamless="seamless" src="{html_file}"></iframe>'''

    # NOTE(review): the summary always reads phenotypes from `gene_attn_df`,
    # even in 'Variant Filtered' mode -- confirm this is intended.
    patient = get_patient(patient_id, gene_attn_df)

    return patient, ranked, attention, kg_html


def patients_like_me(patient_id, k=10):
    '''
    Assemble the outputs of the "Patients Like Me" tab for one patient.

    patient_id: value of the `patient_id` column to look up.
    k: number of most similar candidate patients to keep (default 10).

    Returns a 3-tuple:
      - patient summary Markdown (from `get_patient`)
      - table of the `k` candidate patients with the highest similarity, with
        their gene (GeneCards link) and disease (Orphanet link); rows whose
        `correct_label` is 1 are bold and green
      - phenotype-attention table, highest attention first
    '''
    scores_df = patient_scores_df.loc[patient_scores_df['patient_id'] == patient_id]
    # Keep only the rows that will be displayed before doing any formatting;
    # the copy makes the column assignments below act on an independent frame.
    scores_df = scores_df.sort_values("similarities", ascending=False).head(k).copy()

    # add links to disease pages
    scores_df['diseases'] = [
        f'<u>[{disease}](https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert={disease_id})</u>'
        for disease, disease_id in zip(scores_df['diseases'], scores_df['disease_ids'])
    ]
    scores_df['genes'] = scores_df['genes'].apply(lambda x: f'<u>[{x}](https://www.genecards.org/cgi-bin/carddisp.pl?gene={x})</u>')

    # bold/color patients with same causal gene
    same_gene = scores_df['correct_label'] == 1
    for column in ('candidate_patients', 'genes', 'diseases'):
        # Rebuild the column instead of writing strings into it through .loc,
        # so a non-string column (e.g. numeric patient ids) is handled too.
        scores_df[column] = [
            f'<span style="color:green">**{value}**</span>' if matched else value
            for value, matched in zip(scores_df[column], same_gene)
        ]

    scores_df = scores_df.drop(columns=['patient_id', 'similarities', 'correct_label', 'disease_ids'])
    scores_df = scores_df.rename(columns={
        'candidate_patients': 'Candidate Patient',
        'genes': 'Candidate Patient\'s Gene',
        'diseases': 'Candidate Patient\'s Disease',
    })

    #read in phenotype attention
    attn_df = plm_attn_df.loc[plm_attn_df['patient_id'] == patient_id]
    attn_df = attn_df.sort_values("attention", ascending=False)
    attn_df['attention'] = attn_df['attention'].round(4)
    attn_df = attn_df.drop(columns=['patient_id', 'degrees'])

    #patient_info
    patient = get_patient(patient_id, plm_attn_df)

    return patient, scores_df, attn_df


def disease_characterization(patient_id, k=10):
    '''
    Assemble the outputs of the "Disease Characterization" tab for one patient.

    patient_id: value of the `patient_id` column to look up.
    k: number of most similar diseases to keep (default 10).

    Returns a 3-tuple:
      - patient summary Markdown (from `get_patient`)
      - one-column table ('Disease') with a Markdown link for each of the `k`
        diseases with the highest similarity
      - phenotype-attention table, highest attention first
    '''
    import re  # local import: `re` is not among the file-level imports

    orphanet_url = 'https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=en&Expert='

    # Ids whose link is hard-coded instead of being built from the row
    # (the first one could not be mapped to Orphanet). Listed so that the
    # first hit reproduces the precedence 2812 > 2654 > 33657.
    special_links = (
        ('2812', '<u>[Coxa vara](https://omim.org/entry/122750)</u>'),
        ('2654', '<u>[Multiple epiphyseal dysplasia](https://www.orpha.net/consor/cgi-bin/OC_Exp.php?lng=EN&Expert=251)</u>'),
        ('33657', '<u>[leukodystrophy, hypomyelinating, 20](https://www.omim.org/entry/619071)</u>'),
    )

    def parse_disease_id(raw):
        # Entries that hold a disease name instead of an id get a fixed
        # placeholder id; such text cannot go through literal_eval.
        if 'Coxa vara' in raw:
            return 2812
        if 'Multiple epiphyseal dysplasia' in raw:
            return 2654
        parsed = ast.literal_eval(raw)
        # a list of ids is represented by its first element
        return parsed[0] if isinstance(parsed, list) else parsed

    def disease_link(name, disease_id):
        # Compare whole digit runs so that an id merely *containing* a special
        # id (e.g. 126540 vs 2654) is not relabelled.
        digit_runs = re.findall(r'\d+', str(disease_id))
        for special_id, link in special_links:
            if special_id in digit_runs:
                return link
        return f'<u>[{name}]({orphanet_url}{disease_id})</u>'

    scores_df = dx_scores_df.loc[dx_scores_df['patient_id'] == patient_id]
    # copy so the column assignment below acts on an independent frame
    scores_df = scores_df.sort_values("similarities", ascending=False).head(k).copy()

    # add links to disease pages
    scores_df['diseases'] = [
        disease_link(name, parse_disease_id(raw_id))
        for name, raw_id in zip(scores_df['diseases'], scores_df['disease_ids'])
    ]

    scores_df = scores_df.drop(columns=['patient_id', 'similarities', 'correct_label', 'disease_ids']).rename(columns={'diseases' : 'Disease'})

    #read in phenotype attention
    attn_df = dx_attn_df.loc[dx_attn_df['patient_id'] == patient_id]
    attn_df = attn_df.sort_values("attention", ascending=False)
    attn_df['attention'] = attn_df['attention'].round(4)
    attn_df = attn_df.drop(columns=['patient_id', 'degrees'])

    #patient_info
    patient = get_patient(patient_id, dx_attn_df)

    return patient, scores_df, attn_df

def get_umap(umap_type):
    '''
    Return an HTML <embed> tag that displays one of the hosted UMAP pages.

    umap_type: 'disease' for the disease-characterization UMAP or 'patient'
    for the patient UMAP. Any other value raises NotImplementedError.
    '''
    umap_pages = (
        ('disease', 'https://michellemli.github.io/test_html/shepherd_disease_characterization_umap.html'),
        ('patient', 'https://michellemli.github.io/test_html/shepherd_patient_umap.html'),
    )

    for known_type, html_file in umap_pages:
        if umap_type == known_type:
            # An <iframe> (via src= or srcdoc=) was tried as an alternative
            # container; <embed> is what the page currently uses.
            return f'''<embed style="border: none;" src="{html_file}" dpi="300" width="100%" height="750px" />'''

    raise NotImplementedError


# Gradio UI: one tab per SHEPHERD task. Changing a tab's patient dropdown (or,
# on the gene tab, the gene-list radio) re-runs that tab's callback and
# refreshes the components listed as its outputs.
with gr.Blocks() as demo: #css="#gene_attn_accordion {text-align: center}" css="kg_neigh {width: 70%}"
    gr.Markdown('<center><h1>AI-assisted Rare Disease Diagnosis with SHEPHERD</h1></center>')
    #gr.Markdown('<center><h2>A few SHot Explainable Predictor for Hard-to-diagnosE Rare Diseases</h2></center>')

    with gr.Tabs():
        with gr.TabItem("Causal Gene Discovery"):
            with gr.Column():
                gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
                gene_dropdown = gr.Dropdown(choices=['UDN-P1', 'UDN-P2'], label='Rare Disease Patients', type='value') #value='UDN-P1',
                gene_radio = gr.Radio(choices=['Expert Curated', 'Variant Filtered'], value='Expert Curated', label='Type of Gene List')
                patient_info = gr.Markdown() #get_patient('UDN-P1')

                with gr.Accordion(label='SHEPHERD\'s Ranking of Patient\'s Candidate Genes', open=True, elem_id='gene_accordion'):
                    gr.Markdown('Below are SHEPHERD\'s ranking of either all Expert Curated candidate genes or the top 10 Variant Filtered candidate genes. The patient\'s causal gene (i.e. gene harboring a variant that explains the patient\'s symptoms) is colored in green.')
                    # datatype='markdown' so the links and green highlighting render
                    gene_dataframe = gr.Dataframe(elem_id="gene_df", datatype='markdown', headers=['Candidate Genes', 'SHEPHERD Score'], overflow_row_behaviour='paginate') # label='Candidate Genes', show_label=False,
                with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='gene_attn_accordion'):
                    gene_attn_dataframe = gr.Dataframe(elem_id="gene_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate') # label='Candidate Genes', show_label=False,
                with gr.Accordion(label='Visualization of Patient\'s Neighborhood in the Knowledge Graph', open=False, elem_id='kg_neigh_accordion'):
                    #kg_neighborhood_image = gr.Image(elem_id='kg_neigh')#.style(height=200, width=200)
                    kg_neighborhood_image = gr.HTML(elem_id='kg_patient_neighborhood')

                #gene_button = gr.Button("Go")

        with gr.TabItem("Patients Like Me"):
            gr.HTML(get_umap('patient'))
            gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
            patient_dropdown = gr.Dropdown(choices=['UDN-P3', 'UDN-P4', 'UDN-P5', 'UDN-P6'], label='Rare Disease Patients', type='value')
            p_patient_info = gr.Markdown()
            with gr.Accordion(label='Top 10 Most Similar Patients according to SHEPHERD', open=True, elem_id='pt_accordion'):
                patient_dataframe = gr.Dataframe(max_rows=10, datatype='markdown', show_label=False, elem_id="pat_df", headers=['Candidate Patient', 'Candidate Patient\'s Gene', 'Candidate Patient\'s Disease']) #'Candidate Patient\'s Phenotypes'
                #patient_button = gr.Button("Go")
            with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='pt_attn_accordion'):
                pt_attn_dataframe = gr.Dataframe(elem_id="pt_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate')

        with gr.TabItem("Disease Characterization"):
            gr.HTML(get_umap('disease'))
            gr.Markdown('<center><h2>Select a patient to view SHEPHERD\'s predictions</h2></center>')
            dx_dropdown = gr.Dropdown(choices=['UDN-P7', 'UDN-P8', 'UDN-P9', 'UDN-P2'], label='Rare Disease Patients', type='value')
            dx_patient_info = gr.Markdown()
            # elem_id must be unique on the page; 'pt_accordion' belongs to the
            # "Patients Like Me" tab.
            with gr.Accordion(label='Top 10 Most Similar Diseases according to SHEPHERD', open=True, elem_id='dx_accordion'):
                # header matches the 'Disease' column returned by disease_characterization
                dx_dataframe = gr.Dataframe(max_rows=10, datatype='markdown', show_label=False, elem_id="dx_df", headers=['Disease'])
            with gr.Accordion(label='SHEPHERD\'s Attention to Patient\'s Phenotypes', open=False, elem_id='dx_attn_accordion'):
                dx_attn_dataframe = gr.Dataframe(elem_id="dx_attn_df", headers=['Phenotypes', 'Attention'], overflow_row_behaviour='paginate')

            #dx_button = gr.Button("Go")

    # The gene tab depends on two inputs, so both of them trigger the same callback.
    gene_outputs = [patient_info, gene_dataframe, gene_attn_dataframe, kg_neighborhood_image]
    gene_dropdown.change(causal_gene_discovery, inputs=[gene_dropdown, gene_radio], outputs=gene_outputs)
    gene_radio.change(causal_gene_discovery, inputs=[gene_dropdown, gene_radio], outputs=gene_outputs)

    patient_dropdown.change(patients_like_me, inputs=patient_dropdown, outputs=[p_patient_info, patient_dataframe, pt_attn_dataframe])
    dx_dropdown.change(disease_characterization, inputs=dx_dropdown, outputs=[dx_patient_info, dx_dataframe, dx_attn_dataframe])

    #gene_dropdown.change(get_patient, inputs=gene_dropdown, outputs=patient_info)
    #gene_button.click(causal_gene_discovery, inputs=gene_dropdown, outputs=[gene_dataframe,gene_attn_dataframe, kg_neighborhood_image])
    #patient_button.click(patients_like_me, inputs=patient_dropdown, outputs=patient_dataframe)
    #dx_button.click(disease_characterization, inputs=dx_dropdown, outputs=dx_dataframe)

demo.launch( ) #server_port=50018, share=True