Spaces:

wenkai
/

FAPM_demo

Running on Zero

File size: 11,939 Bytes

2b26389
72b0e49
2b26389
 
 
 
 
 
 
aad9fe1
9b993cf
f3ed046
 
 
aad9fe1
 
 
 
 
 
 
07ac117
aad9fe1
 
 
0b30831
aad9fe1
 
 
07ac117
aad9fe1
 
 
 
 
 
 
eb615db
aad9fe1
 
 
c34048a
 
aad9fe1
 
3daa625
 
 
 
f3ed046
 
 
 
 
 
 
 
 
 
 
 
eb615db
 
 
 
 
 
 
 
 
 
 
 
1a0324b
72b0e49
eb615db
3daa625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507babf
3daa625
507babf
3daa625
507babf
c8e59d5
3daa625
 
 
507babf
3daa625
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3705c34
77b966b
3705c34
77b966b
3705c34
cdf31f1
507babf
2b26389
2bc812b
 
 
 
2b26389
 
 
 
 
eb615db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6b16660
aad9fe1
6b16660
aad9fe1
eb615db
e95deab
eb615db
 
507babf
ab4d8af
eb615db
507babf
ab4d8af
eb615db
507babf
ab4d8af
1167137
2b26389
 
 
 
 
222ff4a
 
 
 
 
2b26389
c3846ee
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
892748f
c3846ee
 
507babf
ab4d8af
507babf
aad9fe1
c3846ee
 
bf58ca4
 
 
 
 
 
c3846ee
eb615db
507babf
c3846ee
 
 
 
507babf
c3846ee
 
2b26389

import os
import torch
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from lavis.models.protein_models.protein_function_opt import Blip2ProteinMistral
from lavis.models.base_model import FAPMConfig
import spaces
import gradio as gr
# from esm_scripts.extract import run_demo
from esm import pretrained, FastaBatchedDataset
from data.evaluate_data.utils import Ontology
import difflib
import re
from transformers import MistralForCausalLM

# Checkpoint file and Q-Former BERT weight file for each ontology-specific model.
# 'Cellar Component' is the historical (misspelled) name that existing callers pass;
# it is kept as an alias of the correctly spelled 'Cellular Component'.
_MODEL_FILES = {
    'Molecule Function': ("model/checkpoint_mf2.pth", 'model/mf2_bert.pth'),
    'Biological Process': ("model/checkpoint_bp1.pth", 'model/bp1_bert.pth'),
    'Cellular Component': ("model/checkpoint_cc2.pth", 'model/cc2_bert.pth'),
    'Cellar Component': ("model/checkpoint_cc2.pth", 'model/cc2_bert.pth'),
}


def get_model(type='Molecule Function'):
    """Build a Blip2ProteinMistral model and load the weights for one ontology.

    Args:
        type: Ontology name. One of 'Molecule Function', 'Biological Process',
            'Cellular Component' (or the legacy spelling 'Cellar Component').
            The parameter name shadows the builtin but is kept for
            backward compatibility with keyword callers.

    Returns:
        The model with its checkpoint loaded, its ``Qformer.bert`` replaced by
        the object stored in the matching ``*_bert.pth`` file, moved to 'cuda'.

    Raises:
        ValueError: if ``type`` is not a known ontology name. Previously an
            unknown name silently returned a model without any checkpoint
            loaded and left on the CPU.
    """
    # Validate before the expensive model construction.
    try:
        checkpoint_path, bert_path = _MODEL_FILES[type]
    except KeyError:
        known = ', '.join(repr(name) for name in _MODEL_FILES)
        raise ValueError(f"Unknown model type {type!r}; expected one of: {known}") from None

    model = Blip2ProteinMistral(config=FAPMConfig(), esm_size='3b')
    model.load_checkpoint(checkpoint_path)
    # The BERT weights are deserialized onto the CPU first; model.to('cuda')
    # below moves the whole model in one go.
    model.Qformer.bert = torch.load(bert_path, map_location=torch.device('cpu'))
    model.to('cuda')
    return model


# One ontology-specific model per display name. The display name is the key used
# throughout the app; the second item is the string handed to get_model (note the
# third one is spelled 'Cellar Component').
models = {
    display_name: get_model(model_type)
    for display_name, model_type in (
        ('Molecule Function', 'Molecule Function'),
        ('Biological Process', 'Biological Process'),
        ('Cellular Component', 'Cellar Component'),
    )
}

# Mistral language model (half precision), passed to every model.generate call.
mistral_model = MistralForCausalLM.from_pretrained(
    "teknium/OpenHermes-2.5-Mistral-7B",
    torch_dtype=torch.float16,
)
mistral_model.to('cuda')

# ESM2 protein language model and its alphabet, used to embed input sequences.
model_esm, alphabet = pretrained.load_model_and_alphabet('esm2_t36_3B_UR50D')
model_esm.to('cuda').eval()

# GO ontology graph loaded from the OBO file.
godb = Ontology('data/go1.4-basic.obo', with_rels=True)

# GO id -> description table ('|'-separated, no header row).
go_des = pd.read_csv('data/go_descriptions1.4.txt', sep='|', header=None)
go_des.columns = ['id', 'text']
go_des = go_des.dropna()
# Ids are stored with '_' in the file; convert them to the 'GO:xxxxxxx' form.
go_des['id'] = go_des['id'].apply(lambda go_id: re.sub('_', ':', go_id))
go_obo_set = set(go_des['id'].tolist())
go_des['text'] = go_des['text'].apply(lambda text: text.lower())
# Lookup tables in both directions: description -> id and id -> description.
GO_dict = {text: go_id for go_id, text in zip(go_des['id'], go_des['text'])}
Func_dict = {go_id: text for go_id, text in zip(go_des['id'], go_des['text'])}


def _choices_for(terms):
    """Map lower-cased description -> description for every GO id in ``terms['gos']``."""
    labels = [Func_dict[go_id] for go_id in set(terms['gos'])]
    return {label.lower(): label for label in labels}


# Candidate term vocabularies, one per ontology.
terms_mf = pd.read_pickle('data/terms/mf_terms.pkl')
choices_mf = _choices_for(terms_mf)
terms_bp = pd.read_pickle('data/terms/bp_terms.pkl')
choices_bp = _choices_for(terms_bp)
terms_cc = pd.read_pickle('data/terms/cc_terms.pkl')
choices_cc = _choices_for(terms_cc)

# Keyed by the same display names as `models`.
choices = {
    'Molecule Function': choices_mf,
    'Biological Process': choices_bp,
    'Cellular Component': choices_cc,
}

def _esm_per_token_embedding(protein_seq, repr_layer=36, truncation_seq_length=1024, toks_per_batch=4096):
    """Embed one sequence with the module-level ESM model and return per-token features.

    Args:
        protein_seq: Amino-acid sequence string (no whitespace).
        repr_layer: ESM layer whose representations are returned; negative
            values count from the last layer.
        truncation_seq_length: Maximum number of residues kept.
        toks_per_batch: Token budget handed to ``get_batch_indices``.

    Returns:
        A CPU tensor holding the selected layer's representations for positions
        ``1 .. truncate_len`` of the sequence (position 0 is skipped).

    Raises:
        ValueError: if ``repr_layer`` is outside the model's layer range.
        RuntimeError: if the data loader yields no batch.
    """
    dataset = FastaBatchedDataset(['protein_name'], [protein_seq])
    batches = dataset.get_batch_indices(toks_per_batch, extra_toks_per_seq=1)
    data_loader = torch.utils.data.DataLoader(
        dataset, collate_fn=alphabet.get_batch_converter(truncation_seq_length), batch_sampler=batches
    )

    num_layers = model_esm.num_layers
    # Explicit check instead of `assert`, which is stripped under `python -O`.
    if not -(num_layers + 1) <= repr_layer <= num_layers:
        raise ValueError(f"repr_layer {repr_layer} is out of range for a {num_layers}-layer model")
    layer = (repr_layer + num_layers + 1) % (num_layers + 1)

    embedding = None
    with torch.no_grad():
        for batch_idx, (labels, strs, toks) in enumerate(data_loader):
            print(
                f"Processing {batch_idx + 1} of {len(batches)} batches ({toks.size(0)} sequences)"
            )
            if torch.cuda.is_available():
                toks = toks.to(device="cuda", non_blocking=True)
            out = model_esm(toks, repr_layers=[layer], return_contacts=False)
            per_token = out["representations"][layer].to(device="cpu")
            # The dataset holds exactly one sequence, so it is item 0 of the batch.
            truncate_len = min(truncation_seq_length, len(strs[0]))
            # clone() so the result is not a view into the larger batch tensor
            # (see https://github.com/pytorch/pytorch/issues/1995).
            embedding = per_token[0, 1: truncate_len + 1].clone()
    if embedding is None:
        raise RuntimeError("ESM data loader produced no batches")
    return embedding


def _match_predictions(raw_prediction, vocabulary):
    """Convert one raw generation string into de-duplicated ``term(prob)`` labels.

    ``raw_prediction`` is split on ``'; '``; each piece is parsed as a Python
    literal whose first two items are used as (text, probability). The text is
    fuzzy-matched (cutoff 0.9) against ``vocabulary`` and only the first
    occurrence of each matched term is kept, in generation order.
    """
    import ast  # local import so the file-level import block stays untouched

    labels = []
    seen = set()
    for piece in raw_prediction.split('; '):
        try:
            # SECURITY: this used to be eval(piece). The text comes from a
            # language model conditioned on user input, so it must never be
            # executed; literal_eval only accepts plain Python literals.
            parsed = ast.literal_eval(piece)
            text, prob = parsed[0], parsed[1]
            query = text.lower()
        except (ValueError, SyntaxError, TypeError, LookupError, AttributeError):
            # A malformed item should not abort the whole request.
            print(f"Skipping unparseable prediction item: {piece!r}")
            continue
        close = difflib.get_close_matches(query, vocabulary, n=1, cutoff=0.9)
        if close and close[0] not in seen:
            seen.add(close[0])
            labels.append(f'{close[0]}({prob})')
    return labels


def _format_predictions(terms_by_ontology, prompt_given):
    """Render the matched terms of the three ontologies as the Markdown result text."""
    mf_terms = terms_by_ontology.get('Molecule Function', [])
    bp_terms = terms_by_ontology.get('Biological Process', [])
    cc_terms = terms_by_ontology.get('Cellular Component', [])

    if not (mf_terms or bp_terms or cc_terms):
        # Only suggest removing the prompt when one was actually supplied
        # (the original condition was inverted).
        if prompt_given:
            return "No available predictions for this protein, you can use other two types of model, remove prompt or try another sequence!"
        return "No available predictions for this protein, you can use other two types of model or try another sequence!"

    sections = []
    if mf_terms:
        bullets = ''.join(f'- {term}\n' for term in mf_terms)
        sections.append(f"Based on the given amino acid sequence, the protein appears to have a primary function of \n{bullets} \n")
    if bp_terms:
        bullets = ''.join(f'- {term}\n' for term in bp_terms)
        sections.append(f"It is likely involved in the following process: \n{bullets} \n")
    if cc_terms:
        bullets = ''.join(f'- {term}\n' for term in cc_terms)
        sections.append(f"It's subcellular localization is within the: \n{bullets}")
    return ''.join(sections)


@spaces.GPU
def generate_caption(protein, prompt):
    """Predict GO-term descriptions for a protein sequence.

    Args:
        protein: Amino-acid sequence typed by the user. Whitespace (spaces,
            line breaks) is removed before embedding.
        prompt: Optional taxonomy prompt; ``None`` is sent to the models as
            ``'none'``, any other value is lower-cased.

    Returns:
        A Markdown string listing the matched terms per ontology, or a short
        message when the input is empty or no term could be matched.
    """
    max_len = 1024

    # Whitespace would otherwise be counted in the sequence length used for slicing.
    protein_seq = "".join(protein.split()) if protein else ""
    if not protein_seq:
        return "Please provide a protein sequence."

    esm_emb = _esm_per_token_embedding(protein_seq, repr_layer=36, truncation_seq_length=max_len)
    # Zero-pad along the first (position) dimension up to max_len rows.
    esm_emb = F.pad(esm_emb, (0, 0, 0, max_len - esm_emb.shape[0])).to('cuda')

    # NOTE(review): an empty textbox arrives as '' rather than None and is passed
    # through as '' — confirm whether the models expect 'none' in that case.
    prompt = 'none' if prompt is None else prompt.lower()
    prompt_given = prompt.strip() not in ('', 'none')

    # The embedding is passed under the 'image' key of the sample dict.
    samples = {'name': ['protein_name'],
               'image': torch.unsqueeze(esm_emb, dim=0),
               'text_input': ['none'],
               'prompt': [prompt]}

    terms_by_ontology = {}
    for model_id, model in models.items():
        prediction = model.generate(mistral_model, samples, length_penalty=0., num_beams=15, num_captions=10,
                                    temperature=1., repetition_penalty=1.0)
        terms_by_ontology[model_id] = _match_predictions(prediction[0], choices[model_id])

    return _format_predictions(terms_by_ontology, prompt_given)


# Markdown shown at the top of the FAPM demo page.
description = """Quick demonstration of the FAPM model for protein function prediction. Upload a protein sequence to generate a function description. Modify the Prompt to provide the taxonomy information.

Our paper is available at [BioRxiv](https://www.biorxiv.org/content/10.1101/2024.05.07.593067v1)

The model used in this app is available at [Hugging Face Model Hub](https://huggingface.co/wenkai/FAPM) and the source code can be found on [GitHub](https://github.com/xiangwenkai/FAPM/tree/main).

Thanks for the support from ProtonUnfold Tech.  Co., Ltd (https://www.protonunfold.com/)."""

# iface = gr.Interface(
#     fn=generate_caption,
#     inputs=[gr.Textbox(type="text", label="Upload sequence"), gr.Textbox(type="text", label="Prompt")],
#     outputs=gr.Textbox(label="Generated description"),
#     description=description
# )
# # Launch the interface
# iface.launch()

# Custom CSS passed to gr.Blocks below: gives the element with id "output" a fixed
# 500px height, scrolling overflow and a light grey border.
# NOTE(review): no component visible in this file sets elem_id="output", so this
# rule may currently match nothing — confirm, and set elem_id on the result widget
# if the styling is intended.
css = """
  #output {
    height: 500px; 
    overflow: auto; 
    border: 1px solid #ccc; 
  }
"""

# Example inputs offered in the UI: each row is [protein sequence, taxonomy prompt].
# O14813 train index 127, 266, 738, 1060 test index 4
example_rows = [
    ["MDYSYLNSYDSCVAAMEASAYGDFGACSQPGGFQYSPLRPAFPAAGPPCPALGSSNCALGALRDHQPAPYSAVPYKFFPEPSGLHEKRKQRRIRTTFTSAQLKELERVFAETHYPDIYTREELALKIDLTEARVQVWFQNRRAKFRKQERAASAKGAAGAAGAKKGEARCSSEDDDSKESTCSPTPDSTASLPPPPAPGLASPRLSPSPLPVALGSGPGPGPGPQPLKGALWAGVAGGGGGGPGAGAAELLKAWQPAESGPGPFSGVLSSFHRKPGPALKTNLF", ''],
    ["MKTLALFLVLVCVLGLVQSWEWPWNRKPTKFPIPSPNPRDKWCRLNLGPAWGGRC", ''],
    ["MAAAGGARLLRAASAVLGGPAGRWLHHAGSRAGSSGLLRNRGPGGSAEASRSLSVSARARSSSEDKITVHFINRDGETLTTKGKVGDSLLDVVVENNLDIDGFGACEGTLACSTCHLIFEDHIYEKLDAITDEENDMLDLAYGLTDRSRLGCQICLTKSMDNMTVRVPETVADARQSIDVGKTS", 'Homo'],
    ['MASAELSREENVYMAKLAEQAERYEEMVEFMEKVAKTVDSEELTVEERNLLSVAYKNVIGARRASWRIISSIEQKEEGRGNEDRVTLIKDYRGKIETELTKICDGILKLLETHLVPSSTAPESKVFYLKMKGDYYRYLAEFKTGAERKDAAENTMVAYKAAQDIALAELAPTHPIRLGLALNFSVFYYEILNSPDRACSLAKQAFDEAISELDTLSEESYKDSTLIMQLLRDNLTLWTSDISEDPAEEIREAPKRDSSEGQ', 'Zea'],
    ['MIKAAVTKESLYRMNTLMEAFQGFLGLDLGEFTFKVKPGVFLLTDVKSYLIGDKYDDAFNALIDFVLRNDRDAVEGTETDVSIRLGLSPSDMVVKRQDKTFTFTHGDLEFEVHWINL', 'Bacteriophage'],
    ['MNDLMIQLLDQFEMGLRERAIKVMATINDEKHRFPMELNKKQCSLMLLGTTDTTTFDMRFNSKKDFPRIKGAREKYPRDAVIEWYHQNWMRTEVKQ', 'Bacteriophage'],
]

# Page layout: description on top, then a single tab with the two inputs and the
# submit button on the left and the prediction panel on the right.
with gr.Blocks(css=css) as demo:
    gr.Markdown(description)
    with gr.Tab(label="Protein caption"):
        with gr.Row():
            with gr.Column():
                input_protein = gr.Textbox(type="text", label="Upload sequence")
                prompt = gr.Textbox(type="text", label="Taxonomy Prompt (Optional)")
                submit_btn = gr.Button(value="Submit")
            with gr.Column():
                with gr.Accordion('Prediction:', open=True):
                    output_markdown = gr.Markdown(label="Output")

        # Clickable examples; results are cached by running generate_caption on them.
        gr.Examples(
            examples=example_rows,
            inputs=[input_protein, prompt],
            outputs=[output_markdown],
            fn=generate_caption,
            cache_examples=True,
            label='Try examples',
        )

        # The submit button runs the same function on the current textbox values.
        submit_btn.click(
            fn=generate_caption,
            inputs=[input_protein, prompt],
            outputs=[output_markdown],
        )

demo.launch(debug=True)