# testSt / app.py
# Tadeusz's picture
# Update app.py
# 06bb42e
# pip install streamlit
import pandas as pd
import streamlit as st
from transformers import RobertaTokenizer, pipeline
# One-letter amino-acid codes accepted in a query, plus '*' which marks the
# position to predict.
ALLOWED_CHARS = set('ACDEFGHIKLMNPQRSTVWY*')


def validate_input_seq(seq: str) -> None:
    """Check that *seq* is a usable query; halt the Streamlit run if it is not.

    A sequence is accepted when, compared case-insensitively, it contains only
    characters from ``ALLOWED_CHARS`` and exactly one ``*`` placeholder.

    On failure the problem is reported with ``st.error`` and the current
    script run is ended with ``st.stop()``. On success the function simply
    returns ``None``.
    """
    # Sort the offending characters so the message is the same on every run;
    # plain set iteration order is not stable for strings.
    disallowed_chars = ''.join(sorted(set(seq.upper()) - ALLOWED_CHARS))
    if disallowed_chars:
        st.error(f'Incorrect chars: "{disallowed_chars}"!')
        st.stop()

    if seq.count('*') != 1:
        st.error('Please use exactly one * placeholder!')
        st.stop()
# A model pipeline is a heavyweight shared resource, so cache it with
# Streamlit's resource cache instead of the data cache: the original
# `st.experimental_memo` is deprecated and stores the return value pickled.
# Older decorator names are tried in turn so the app still starts on
# Streamlit versions that predate `st.cache_resource`.
_cache_unmasker = (
    getattr(st, "cache_resource", None)
    or getattr(st, "experimental_singleton", None)
    or st.experimental_memo
)


@_cache_unmasker
def get_unmasker():
    """Build the fill-mask pipeline used for residue prediction.

    Loads the ``antibody-tokenizer`` tokenizer and the ``Tadeusz/testModel1``
    model and wraps them in a ``transformers`` ``fill-mask`` pipeline. The
    result is cached by the decorator above, so the loading work is not
    repeated on every Streamlit rerun.

    Returns:
        The ``fill-mask`` pipeline object.
    """
    # NOTE(review): `return_tensors` is normally a call-time tokenizer
    # argument; kept as in the original -- confirm whether it is needed here.
    tokenizer = RobertaTokenizer.from_pretrained("antibody-tokenizer", return_tensors="pt")
    # top_k=20 asks for the 20 highest-scoring candidates; reduce it to gain
    # speed.
    return pipeline('fill-mask', model="Tadeusz/testModel1", tokenizer=tokenizer, top_k=20)
def calculate_probabilities(seq: str):
    """Predict candidate residues for the ``*`` position in *seq*.

    The sequence is upper-cased before prediction because
    ``validate_input_seq`` accepts input case-insensitively; without this a
    lower-case sequence would pass validation but reach the model as typed.

    Args:
        seq: Amino-acid sequence containing the ``*`` placeholder.

    Returns:
        The raw output of the fill-mask pipeline for the masked sequence.
        The page script consumes it as records with ``token_str``, ``score``
        and ``token`` fields.
    """
    unmasker = get_unmasker()
    # Upper-case first, then insert the mask, so the mask token itself keeps
    # its lower-case spelling.
    # NOTE(review): assumes the tokenizer's mask token is '<mask>' -- confirm.
    masked_seq = seq.upper().replace('*', '<mask>')
    return unmasker(masked_seq)
# --- Page setup -------------------------------------------------------------
st.set_page_config(layout="wide")
st.title('nanoBERT - A model for gene agnostic navigation of the nanobody space')

# --- Query input ------------------------------------------------------------
seq = st.text_input(
    label='Sequence (Use only AA codes and * as placeholder for prediction)',
    value='QLVQSGPEVKKP*ASVKVSCKASGYIFNNYGISWVRQAPGQGLEWMGWISTDNGNTNYAQKVQGRVTMTTDTSTSTAYMELRSLRYDDTAVYYCANNWGSYFEHWGQGTLVTVSS',
)
# Shows an error and stops this script run when the sequence is not usable.
validate_input_seq(seq)

# --- Prediction -------------------------------------------------------------
residueProbability = calculate_probabilities(seq)

# --- Result table -----------------------------------------------------------
# One row per predicted residue, indexed by the residue and without the
# token column.
df = pd.DataFrame(residueProbability)
df = df.rename(columns={'token_str': 'aa', 'score': 'probability'})
df = df.set_index('aa')
df = df.drop(columns=['token'])

# Show probabilities with three decimals and an in-cell bar scaled to [0, 1].
df_style = df.style.format('{:.3f}', subset=['probability'])
df_style = df_style.bar(subset=['probability'], vmin=0, vmax=1)

# Render the styled table as raw HTML.
st.write(df_style.to_html(escape=False), unsafe_allow_html=True)