Spaces:

abondrn
/

SVM

Sleeping

App Files Files Community

abondrn committed on Jun 4, 2023

Commit

463ec0f

1 Parent(s): a9799e9

First commit

Browse files

Files changed (3) hide show

README.md +11 -5
app.py +225 -73
requirements.txt +10 -4

README.md CHANGED Viewed

@@ -1,12 +1,18 @@
 ---
 title: SVM
-emoji: 🔥
-colorFrom: purple
-colorTo: yellow
-sdk: streamlit
-sdk_version: 1.21.0
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+# https://huggingface.co/docs/hub/spaces-config-reference
 title: SVM
+emoji: 🧬
+colorFrom: green
+colorTo: green
+sdk: gradio
 app_file: app.py
 pinned: false
+models:
+ - InstaDeepAI/nucleotide-transformer-500m-1000g
+ - facebook/esmfold_v1
+ - sentence-transformers/all-mpnet-base-v2
+python_version: 3.10.4
+license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -1,76 +1,228 @@
 import torch
-import streamlit as st
-from transformers import AutoTokenizer, OPTForCausalLM
-@st.cache_resource
-def load_model():
-    tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-30b")
-    model = OPTForCausalLM.from_pretrained("facebook/galactica-30b", device_map='auto', low_cpu_mem_usage=True, torch_dtype=torch.float16)
-    model.gradient_checkpointing_enable()
-    return tokenizer, model
-st.set_page_config(
-    page_title='BioML-SVM',
-    layout="wide"
-)
-with st.spinner("Loading Models and Tokens..."):
-    tokenizer, model = load_model()
-with st.form(key='my_form'):
-    col1, col2 = st.columns([10, 1])
-    text_input = col1.text_input(label='Enter the amino sequence')
-    with col2:
-        st.text('')
-        st.text('')
-        submit_button = st.form_submit_button(label='Submit')
-    if submit_button:
-        st.session_state['result_done'] = False
-    # input_text = "[START_AMINO]GHMQSITAGQKVISKHKNGRFYQCEVVRLTTETFYEVNFDDGSFSDNLYPEDIVSQDCLQFGPPAEGEVVQVRWTDGQVYGAKFVASHPIQMYQVEFEDGSQLVVKRDDVYTLDEELP[END_AMINO]"
-        with st.spinner('Generating...'):
-            # formatted_text = f"[START_AMINO]{text_input}[END_AMINO]"
-            # formatted_text = f"Here is the sequence: [START_AMINO]{text_input}[END_AMINO]"
-            formatted_text = f"{text_input}"
-            input_ids = tokenizer(formatted_text, return_tensors="pt").input_ids.to("cuda")
-            outputs = model.generate(
-                input_ids=input_ids,
-                max_new_tokens=500
-            )
-            result = tokenizer.decode(outputs[0]).replace(formatted_text, "")
-        st.markdown(result)
-        if 'result_done' not in st.session_state or not st.session_state.result_done:
-            st.session_state['result_done'] = True
-            st.session_state['previous_state'] = result
     else:
-        if 'result_done' in st.session_state and st.session_state.result_done:
-            st.markdown(st.session_state.previous_state)
-if 'result_done' in st.session_state and st.session_state.result_done:
-    with st.form(key='ask_more'):
-        col1, col2 = st.columns([10, 1])
-        text_input = col1.text_input(label='Ask more question')
-        with col2:
-            st.text('')
-            st.text('')
-            submit_button = st.form_submit_button(label='Submit')
-        if submit_button:
-            with st.spinner('Generating...'):
-                # formatted_text = f"[START_AMINO]{text_input}[END_AMINO]"
-                formatted_text = f"Q:{text_input}\n\nA:\n\n"
-                input_ids = tokenizer(formatted_text, return_tensors="pt").input_ids.to("cuda")
-                outputs = model.generate(
-                    input_ids=input_ids,
-                    max_length=len(formatted_text) + 500,
-                    do_sample=True,
-                    top_k=40,
-                    num_beams=1,
-                    num_return_sequences=1
                 )
-                result = tokenizer.decode(outputs[0]).replace(formatted_text, "")
-            st.markdown(result)

+# credit: https://huggingface.co/spaces/simonduerr/3dmol.js/blob/main/app.py
+import os
+import sys
+from urllib import request
+import gradio as gr
+import requests
+from transformers import AutoTokenizer, AutoModelForMaskedLM, EsmModel, AutoModel
 import torch
+import progres as pg
+# Tokenizer/model pairs are created once at import time and shared by the
+# embedding helpers below; every model is put into eval() mode.
+# Nucleotide sequences (used by nt_embed).
+tokenizer_nt = AutoTokenizer.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
+model_nt = AutoModelForMaskedLM.from_pretrained("InstaDeepAI/nucleotide-transformer-500m-1000g")
+model_nt.eval()
+# Amino-acid sequences (used by aa_embed).
+# NOTE(review): the README `models:` list names facebook/esmfold_v1, not this
+# ESM-2 checkpoint -- confirm which one is intended.
+tokenizer_aa = AutoTokenizer.from_pretrained("facebook/esm2_t12_35M_UR50D")
+model_aa = EsmModel.from_pretrained("facebook/esm2_t12_35M_UR50D")
+model_aa.eval()
+# Free-text sentences (used by se_embed).
+tokenizer_se = AutoTokenizer.from_pretrained('sentence-transformers/all-mpnet-base-v2')
+model_se = AutoModel.from_pretrained('sentence-transformers/all-mpnet-base-v2')
+model_se.eval()
+def nt_embed(sequence: str):
+    """Return the last-layer, first-token vector for one nucleotide sequence.
+
+    The sequence is tokenized as a batch of one and passed through
+    ``model_nt`` with gradients disabled. The result is the 1-D slice taken
+    from batch item 0, token position 0 of the final hidden state.
+    """
+    input_ids = tokenizer_nt.batch_encode_plus([sequence], return_tensors="pt")["input_ids"]
+    # Mask out padding positions so the model ignores them.
+    not_padding = input_ids != tokenizer_nt.pad_token_id
+    with torch.no_grad():
+        outputs = model_nt(
+            input_ids,
+            attention_mask=not_padding,
+            output_hidden_states=True,
+        )
+    final_hidden = outputs.hidden_states[-1].detach()
+    first_token_per_item = final_hidden[:, 0, :]
+    return first_token_per_item[0]
+def aa_embed(sequence: str):
+    """Run one amino-acid sequence through ``model_aa``.
+
+    The sequence is tokenized as a batch of one and the model is called with
+    gradients disabled. The model's output object is returned unchanged.
+    """
+    encoded = tokenizer_aa([sequence], return_tensors="pt")
+    with torch.no_grad():
+        return model_aa(**encoded)
+def se_embed(sentence: str):
+    """Run one sentence through ``model_se``.
+
+    The sentence is tokenized as a batch of one and the model is called with
+    gradients disabled. The model's output object is returned unchanged.
+    """
+    batch = tokenizer_se([sentence], return_tensors='pt')
+    with torch.no_grad():
+        return model_se(**batch)
+def download_data_if_required():
+    """Download the trained model file used by ``progres`` if it is missing.
+
+    The file is fetched from the Zenodo record named by ``pg.zenodo_record``
+    into ``pg.trained_model_fp`` and then loaded once with ``torch.load`` to
+    check that it contains the expected top-level key.
+
+    On a failed download or failed check the partial file is deleted, a
+    message is written to stderr and the process exits with status 1. An
+    interruption (e.g. Ctrl-C) also deletes the partial file and is then
+    re-raised rather than being reported as a download failure.
+    """
+    url_base = f"https://zenodo.org/record/{pg.zenodo_record}/files"
+    # Only the trained model is fetched; pre-embedded target databases are
+    # not downloaded by this app.
+    downloads = [(pg.trained_model_fp, f"{url_base}/trained_model.pt")]
+    os.makedirs(pg.trained_model_dir, exist_ok=True)
+    printed = False
+    for fp, url in downloads:
+        if os.path.isfile(fp):
+            continue
+        if not printed:
+            print("Downloading data as first time setup (~340 MB) to ", pg.progres_dir,
+                  ", internet connection required, this can take a few minutes",
+                  sep="", file=sys.stderr)
+            printed = True
+        expected_key = "model" if fp == pg.trained_model_fp else "embeddings"
+        try:
+            request.urlretrieve(url, fp)
+            # NOTE(security): torch.load unpickles the downloaded file; only
+            # point this at a source you trust.
+            d = torch.load(fp, map_location="cpu")
+            # Explicit check instead of `assert`, which is stripped under -O.
+            if expected_key not in d:
+                raise ValueError(f"{fp} does not contain {expected_key!r}")
+        except BaseException as err:
+            # Never leave a partial file behind: a later run would see it as
+            # already downloaded and skip the fetch.
+            if os.path.isfile(fp):
+                os.remove(fp)
+            if not isinstance(err, Exception):
+                # KeyboardInterrupt / SystemExit: propagate unchanged.
+                raise
+            print("Failed to download from", url, "and save to", fp, file=sys.stderr)
+            print("Exiting", file=sys.stderr)
+            sys.exit(1)
+    if printed:
+        print("Data downloaded successfully", file=sys.stderr)
+def get_pdb(pdb_code="", filepath=""):
+    """Return PDB text for an RCSB code or an uploaded file, or ``None``.
+
+    Args:
+        pdb_code: PDB identifier; surrounding whitespace is ignored. When it
+            is empty or ``None`` the uploaded file is used instead.
+        filepath: object exposing the upload's path as ``.name``.
+
+    Returns:
+        The structure as text, or ``None`` when no file object was supplied
+        or files.rcsb.org answered with an error status (e.g. unknown code).
+    """
+    pdb_code = (pdb_code or "").strip()
+    if pdb_code == "":
+        try:
+            with open(filepath.name) as f:
+                return f.read()
+        except AttributeError:
+            # filepath has no .name (the "" default or None): nothing uploaded.
+            return None
     else:
+        # A timeout keeps the UI callback from hanging on a stalled request.
+        response = requests.get(f"https://files.rcsb.org/view/{pdb_code}.pdb", timeout=30)
+        if not response.ok:
+            # Do not hand an HTML error page back as if it were PDB data.
+            return None
+        return response.content.decode()
+def molecule(pdb):
+    """Build an ``<iframe>`` whose ``srcdoc`` renders ``pdb`` with 3Dmol.js.
+
+    Args:
+        pdb: structure in PDB text format; ``None`` is treated as empty text.
+
+    Returns:
+        HTML for a sandboxed iframe containing the viewer page.
+
+    The structure text is embedded as a JSON string literal and the whole
+    page is HTML-escaped before being placed in the single-quoted ``srcdoc``
+    attribute. Quotes (e.g. atom names such as ``O5'``), backticks or
+    ``</script>`` in the input therefore cannot terminate the script or the
+    attribute.
+    """
+    import html
+    import json
+
+    # JSON is valid JavaScript string syntax; "<\/" keeps the HTML parser
+    # from seeing a closing tag inside the inline script.
+    pdb_literal = json.dumps(pdb or "").replace("</", "<\\/")
+    x = (
+        """<!DOCTYPE html>
+        <html>
+        <head>
+    <meta http-equiv="content-type" content="text/html; charset=UTF-8" />
+    <style>
+    body{
+        font-family:sans-serif
+    }
+    .mol-container {
+    width: 100%;
+    height: 600px;
+    position: relative;
+    }
+    .mol-container select{
+        background-image:None;
+    }
+    </style>
+     <script src="https://cdnjs.cloudflare.com/ajax/libs/jquery/3.6.3/jquery.min.js" integrity="sha512-STof4xm1wgkfm7heWqFJVn58Hm3EtS31XFaagaa8VMReCXAkQnJZ+jEy8PCC/iT18dFy95WcExNHFTqLyp72eQ==" crossorigin="anonymous" referrerpolicy="no-referrer"></script>
+    <script src="https://3Dmol.csb.pitt.edu/build/3Dmol-min.js"></script>
+    </head>
+    <body>
+    <div id="container" class="mol-container"></div>
+            <script>
+               let pdb = """
+        + pdb_literal
+        + """;
+             $(document).ready(function () {
+                let element = $("#container");
+                let config = { backgroundColor: "black" };
+                let viewer = $3Dmol.createViewer(element, config);
+                viewer.addModel(pdb, "pdb");
+                viewer.getModel(0).setStyle({}, { cartoon: { color:"spectrum" } });
+                viewer.addSurface("MS", { opacity: .5, color: "white" });
+                viewer.zoomTo();
+                viewer.render();
+                viewer.zoom(0.8, 2000);
+              })
+        </script>
+        </body></html>"""
+    )
+    # The browser decodes the entities when it reads the attribute, so the
+    # iframe receives the page exactly as built above.
+    srcdoc_value = html.escape(x, quote=True)
+    return f"""<iframe style="width: 100%; height: 600px" name="result" allow="midi; geolocation; microphone; camera;
+    display-capture; encrypted-media;" sandbox="allow-modals allow-forms
+    allow-scripts allow-same-origin allow-popups
+    allow-top-navigation-by-user-activation allow-downloads" allowfullscreen=""
+    allowpaymentrequest="" frameborder="0" srcdoc='{srcdoc_value}'></iframe>"""
+def str2coords(s):
+    """Extract ``[x, y, z]`` for every ``CA`` atom record in PDB text.
+
+    Lines are scanned in order. ``ATOM``/``HETATM`` records whose atom-name
+    field (columns 13-16) is ``CA`` contribute the three floats read from
+    columns 31-54. Scanning stops at the first ``ENDMDL`` line.
+    """
+    ca_positions = []
+    for record in s.split('\n'):
+        if record.startswith("ENDMDL"):
+            break
+        if not record.startswith(("ATOM  ", "HETATM")):
+            continue
+        if record[12:16].strip() != "CA":
+            continue
+        # Three consecutive 8-character fields: x, y, z.
+        ca_positions.append([float(record[start:start + 8]) for start in (30, 38, 46)])
+    return ca_positions
+def update_st(inp, file):
+    """Gradio callback for the structure tab.
+
+    Args:
+        inp: PDB code typed by the user (may be empty).
+        file: uploaded file object, or ``None``.
+
+    Returns:
+        ``(viewer_html, embedding_text)`` for the HTML and Textbox outputs.
+
+    Raises:
+        gr.Error: when no structure could be obtained or it contains no
+            ``CA`` atom records, so the UI shows a readable message instead
+            of an internal traceback.
+    """
+    pdb = get_pdb(inp, file)
+    if not pdb:
+        raise gr.Error("Enter a valid PDB code or upload a PDB file")
+    coords = str2coords(pdb)
+    if not coords:
+        raise gr.Error('No "CA" atoms found in the structure')
+    # Stringify like the other update_* callbacks feeding a Textbox.
+    return (molecule(pdb), str(pg.embed_coords(coords)))
+def update_nt(inp):
+    """Gradio callback: text form of ``nt_embed`` for the input (empty if falsy)."""
+    sequence = inp if inp else ''
+    embedding = nt_embed(sequence)
+    return str(embedding)
+def update_aa(inp):
+    """Gradio callback: text form of ``aa_embed`` for the input sequence.
+
+    A missing value is treated as the empty string, matching ``update_nt``,
+    so the tokenizer never receives ``None``.
+    """
+    return str(aa_embed(inp or ''))
+def update_se(inp):
+    """Gradio callback: text form of ``se_embed`` for the input sentence.
+
+    A missing value is treated as the empty string, matching ``update_nt``,
+    so the tokenizer never receives ``None``.
+    """
+    return str(se_embed(inp or ''))
+# Gradio UI: one tab per embedding type. The names inp/btn/emb are rebound in
+# every tab, so each btn.click(...) is wired before the next tab reuses them.
+demo = gr.Blocks()
+with demo:
+    with gr.Tabs():
+        # Tab 1: structure input (PDB code or uploaded file) -> 3Dmol.js
+        # viewer plus the output of pg.embed_coords via update_st.
+        with gr.TabItem("PDB Structural Embeddings"):
+            with gr.Row():
+                with gr.Box():
+                    inp = gr.Textbox(
+                        placeholder="PDB Code or upload file below", label="Input structure"
+                    )
+                    file = gr.File(file_count="single")
+                    gr.Examples(["2CBA", "6VXX"], inp)
+                    btn = gr.Button("View structure")
+            gr.Markdown("# PDB viewer using 3Dmol.js")
+            mol = gr.HTML()
+            emb = gr.Textbox(interactive=False)
+            btn.click(fn=update_st, inputs=[inp, file], outputs=[mol, emb])
+        # Tab 2: nucleotide sequence -> update_nt.
+        with gr.TabItem("Nucleotide Sequence Embeddings"):
+            with gr.Box():
+                inp = gr.Textbox(
+                    placeholder="ATCGCTGCCCGTAGATAATAAGAGACACTGAGGCC", label="Input Nucleotide Sequence"
+                )
+                btn = gr.Button("View embeddings")
+                emb = gr.Textbox(interactive=False)
+                btn.click(fn=update_nt, inputs=[inp], outputs=emb)
+        # Tab 3: amino-acid sequence -> update_aa.
+        with gr.TabItem("Amino Acid Sequence Embeddings"):
+            with gr.Box():
+                inp = gr.Textbox(
+                    placeholder="AAGQCYRGRCSGGLCCSKYGYCGSGPAYCG", label="Input Amino Acid Sequence"
                 )
+                btn = gr.Button("View embeddings")
+                emb = gr.Textbox(interactive=False)
+                btn.click(fn=update_aa, inputs=[inp], outputs=emb)
+        # Tab 4: free-text sentence -> update_se.
+        with gr.TabItem("Sentence Embeddings"):
+            with gr.Box():
+                inp = gr.Textbox(
+                    placeholder="Your text here", label="Input Sentence"
+                )
+                btn = gr.Button("View embeddings")
+                emb = gr.Textbox(interactive=False)
+                btn.click(fn=update_se, inputs=[inp], outputs=emb)
+# Script entry point: fetch the trained model file if it is missing, then
+# start the Gradio app. Neither step runs when this module is only imported.
+if __name__ == "__main__":
+    download_data_if_required()
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -1,5 +1,11 @@
-transformers
 accelerate
-streamlit
-# bitsandbytes
-# scipy

 accelerate
+gradio==3.33.1
+pyg-lib==0.2.0+pt20
+requests==2.31.0
+torch==2.0.1
+torch-cluster==1.6.1
+torch-geometric==2.3.1
+torch-scatter==2.1.1
+torch-sparse==0.6.17
+torch-spline-conv==1.2.2
+transformers==4.29.2