# NOTE: removed "Spaces: Sleeping" hosting-page status banner accidentally
# captured by the page scrape — it was not part of the program.
# --- Imports and global configuration -------------------------------------
# (Fixed: stripped the " | |" scrape artifacts that broke the syntax, merged
# the two separate `transformers` import lines, grouped imports per PEP 8.)
import random
import warnings
from collections import OrderedDict

import gradio as gr
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          set_seed)

warnings.filterwarnings('ignore')

# Seed the transformers-level RNG for reproducible behavior.
set_seed(4)

device = "cpu"  # inference-only app; CPU is sufficient
# Pre-trained ESM-2 protein language model (35M params) used as the backbone.
model_checkpoint = "facebook/esm2_t12_35M_UR50D"
dropout = 0.1  # dropout rate for the classification head
def setup_seed(seed):
    """Seed every RNG source (stdlib, NumPy, torch CPU/CUDA) for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # no-op on CPU-only machines
    # Force deterministic cuDNN kernels (trades speed for repeatability).
    torch.backends.cudnn.deterministic = True


setup_seed(4)
class MyModel(nn.Module):
    """Frozen ESM-2 backbone followed by a small MLP classification head.

    The pretrained sequence-classification model is configured with
    ``num_labels=320`` so its "logits" act as a 320-dim feature vector,
    which the head maps 320 -> 256 -> 128 -> 64 -> 2 and converts to class
    probabilities with softmax.
    """

    def __init__(self):
        super().__init__()
        # NOTE: attribute names and creation order are kept exactly as in the
        # trained checkpoint so state-dict keys and seeded init still match.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint, num_labels=320)
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(320, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output_layer = nn.Linear(64, 2)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        # The backbone is used purely as a feature extractor (no gradients).
        with torch.no_grad():
            features = self.bert(
                input_ids=x['input_ids'].to(device),
                attention_mask=x['attention_mask'].to(device))
        h = self.dropout(features["logits"])
        # Three Linear -> BatchNorm -> ReLU -> Dropout stages.
        for fc, bn in ((self.fc1, self.bn1),
                       (self.fc2, self.bn2),
                       (self.fc3, self.bn3)):
            h = self.dropout(self.relu(bn(fc(h))))
        h = self.dropout(self.output_layer(h))
        return torch.softmax(h, dim=1)
# Instantiate the network and restore the trained weights (CPU inference).
model = MyModel()
# strict=False tolerates key mismatches between checkpoint and architecture;
# any keys not found in the checkpoint keep their randomly initialized values.
state = torch.load("best_model.pth", map_location=torch.device('cpu'))
model.load_state_dict(state, strict=False)
model = model.to(device)
model.eval()  # disable dropout / freeze batch-norm statistics for inference

# Tokenizer matching the ESM-2 checkpoint used by the backbone.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
def pre(file):
    """Classify a single peptide sequence.

    Args:
        file: amino-acid sequence string.

    Returns:
        Tuple ``(label, probability)`` where label is ``"nAChRs"`` or
        ``"non-nAChRs"`` and probability is the softmax score of the
        predicted class.
    """
    max_len = 30  # sequences are padded/truncated to 30 tokens
    encoded = tokenizer(file, max_length=max_len, padding="max_length",
                        truncation=True, return_tensors='pt')
    with torch.no_grad():
        probs = model(encoded)
    probs_np = np.array(probs.cpu())
    predicted = int(np.argmax(probs_np, axis=1)[0])
    confidence = float(np.max(probs_np, axis=1)[0])
    id2str = {0: "non-nAChRs", 1: "nAChRs"}
    return id2str[predicted], confidence
def conotoxinfinder(files):
    """Run nAChRs classification on every sequence in an uploaded file.

    Args:
        files: path to a text/FASTA file; lines starting with '>' are
            treated as headers and skipped.

    Returns:
        Path of the generated CSV ('output.csv') with columns
        Seq / Class / Probability.
    """
    # Collect sequence lines; `with` guarantees the handle is closed (the
    # original leaked it), and stripping drops trailing newlines that were
    # previously fed into the tokenizer and written into the CSV.
    with open(files, 'r') as fr:
        seqs = [line.strip() for line in fr
                if line.strip() and not line.startswith('>')]

    seq_all = []
    output_all = []
    probability_all = []
    for seq in seqs:
        label, probability = pre(seq)
        seq_all.append(seq)
        output_all.append(label)
        probability_all.append(probability)

    summary = OrderedDict()
    summary['Seq'] = seq_all
    summary['Class'] = output_all
    summary['Probability'] = probability_all
    summary_df = pd.DataFrame(summary)
    summary_df.to_csv('output.csv', index=False)
    # BUG FIX: the original returned 'outputs.csv' while writing
    # 'output.csv', so Gradio tried to serve a nonexistent file.
    return 'output.csv'
# Load the app description rendered on the Gradio page.
with open("conotoxinfinder.md", "r") as f:
    description = f.read()

# BUG FIX: `gr.Input` is not a Gradio component (the original raised
# AttributeError at startup). `gr.File` is the upload component;
# type="filepath" hands the callback a path string it can open() directly.
iface = gr.Interface(
    fn=conotoxinfinder,
    title="ConotoxinFinder nAChRs",
    inputs=[gr.File(type="filepath",
                    label="upload a file (.txt, .fasta) containing sequences")],
    outputs="file",
    description=description,
)
iface.launch()