|
import pandas as pd |
|
import streamlit as st |
|
import torch |
|
from transformers import AutoModelForSequenceClassification, AutoTokenizer |
|
|
|
|
|
def combine_title_summary(title: str, summary: str) -> str:
    """Merge a title and a summary into the single text fed to the tokenizer.

    Args:
        title: Paper title (may be empty).
        summary: Paper summary / abstract (may be empty).

    Returns:
        The string ``"title: <title> summary: <summary>"``.
    """
    return f"title: {title} summary: {summary}"
|
|
|
|
|
# Theme name -> position of that theme in the classifier's output vector.
# The tuple order below defines the indices (0, 1, 2, 3).
# NOTE(review): presumably this matches the label ids used when the checkpoint
# was fine-tuned -- confirm against the training code.
tag2ind = {
    tag: ind
    for ind, tag in enumerate(("Biology", "Physics", "Math", "Computer Science"))
}
|
|
|
|
|
@st.cache_resource
def load_model():
    """Load the tokenizer and the sequence-classification model from disk.

    Both are read from the local checkpoint directory
    ``./microsoft/deberta-v3-small/checkpoint-4915``. The model is moved to
    CUDA when ``torch.cuda.is_available()`` reports a GPU, otherwise to the
    CPU. The function is wrapped in ``st.cache_resource`` so Streamlit can
    hand back the already-loaded objects instead of reloading them.

    Returns:
        tuple: ``(tokenizer, model)``.
    """
    device_name = "cuda" if torch.cuda.is_available() else "cpu"
    target_device = torch.device(device_name)

    checkpoint_dir = "./microsoft/deberta-v3-small/checkpoint-4915"

    # use_fast=False requests the slow (pure-Python) tokenizer implementation.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir, use_fast=False)

    classifier = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir)
    classifier = classifier.to(target_device)

    return tokenizer, classifier
|
|
|
|
|
|
|
|
|
|
|
def run_model(title, summary):
    """Classify a paper and return its most probable themes.

    The title and summary are merged with ``combine_title_summary``, tokenized
    (truncated to at most 512 tokens) and scored by the model returned from
    ``load_model``. Themes are then taken in order of decreasing probability
    until their cumulative probability reaches 0.95.

    Args:
        title: Paper title.
        summary: Paper summary / abstract.

    Returns:
        tuple[list[str], list[float]]: Theme names and their probabilities,
        aligned element-wise and sorted from most to least probable. The
        probabilities are plain Python floats.
    """
    tokenizer, model = load_model()

    text = combine_title_summary(title, summary)

    encoded = tokenizer(
        text,
        padding=False,
        truncation=True,
        return_tensors="pt",
        max_length=512,
    )
    # Send the inputs to the device the cached model already lives on rather
    # than calling model.cpu(): that would mutate the shared cached model on
    # every request and throw away the GPU placement done at load time.
    encoded = encoded.to(model.device)

    model.eval()
    with torch.no_grad():
        logits = model(**encoded).logits
        # Single input text, so take row 0; bring the result back to the CPU
        # so the values can be turned into Python floats below.
        probs = torch.nn.functional.softmax(logits, dim=-1)[0].cpu()

    # Look themes up by their stored index instead of relying on the
    # insertion order of tag2ind, and build the lookup once, not per step.
    ind2tag = {ind: tag for tag, ind in tag2ind.items()}

    best_tags, best_probs = [], []
    cumulative = 0.0
    for ind in torch.argsort(probs, descending=True).tolist():
        prob = probs[ind].item()
        cumulative += prob

        best_tags.append(ind2tag[ind])
        best_probs.append(prob)

        # Stop once the selected themes cover 95% of the probability mass.
        if cumulative >= 0.95:
            break

    return best_tags, best_probs
|
|
|
|
|
def main():
    """Render the Streamlit page: input form, validation and result table.

    Shows a "Title" text input and an "Abstract" text area. When the
    "Classify" button is pressed, the inputs are validated and, if at least
    one of them contains non-whitespace text, passed to ``run_model``; the
    returned themes and probabilities are displayed as a two-column table.
    """
    title = st.text_input(label="Title", value="")
    abstract = st.text_area(label="Abstract", value="", height=200)

    if not st.button("Classify"):
        return

    # Whitespace-only input carries no text to classify, so treat it the
    # same as an empty field.
    if not title.strip() and not abstract.strip():
        st.error("At least one of title or abstract must be provided")
        return

    best_tags, best_probs = run_model(title, abstract)

    df = pd.DataFrame(
        {
            "Theme": best_tags,
            # float() accepts both Python floats and 0-d tensors, so the
            # column is numeric instead of holding tensor objects.
            "Probability": [float(prob) for prob in best_probs],
        }
    )
    st.table(df)
|
|
|
|
|
# Build the page only when this file is executed as the main script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|