Spaces:
Runtime error
Runtime error
import streamlit as st | |
import numpy as np | |
import pandas as pd | |
import torch | |
import tokenizers # for streamlit caching | |
import transformers | |
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification | |
def load_tok_and_model():
    """Load the tokenizer and the sequence-classification model for the app.

    The tokenizer is fetched by the name ``'distilbert-base-cased'``; the
    classification model is loaded from the current directory (``"."``).

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    # NOTE(review): no Streamlit cache decorator is applied here, so this
    # presumably runs again on every script rerun -- confirm that is intended.
    return (
        AutoTokenizer.from_pretrained('distilbert-base-cased'),
        AutoModelForSequenceClassification.from_pretrained("."),
    )
# Display labels for the classifier outputs; they are paired positionally
# with the probabilities returned by forward_pass when the result table is built.
CATEGORIES = [
    "Computer Science",
    "Economics",
    "Electrical Engineering",
    "Mathematics",
    "Q. Biology",
    "Q. Finances",
    "Statistics",
    "Physics",
]
def forward_pass(title, abstract, tokenizer, model):
    """Return the class probabilities for a single article.

    The title and the abstract are tokenized separately, each padded or
    truncated to a fixed number of token ids (32 and 480 respectively), and
    the two id sequences are concatenated into one 512-id model input.

    Args:
        title: Article title text.
        abstract: Article abstract text.
        tokenizer: Callable invoked with the text plus ``padding``,
            ``truncation`` and ``max_length`` keyword arguments; its result
            must expose the token ids under ``'input_ids'``.
        model: Callable taking a ``(1, 512)`` tensor of token ids and
            returning a mapping whose ``'logits'`` entry has a leading batch
            dimension followed by 8 class scores.

    Returns:
        A NumPy array of shape ``(8,)`` holding the softmax probabilities.

    Raises:
        AssertionError: If the concatenated input is not 512 ids long or the
            model does not produce exactly 8 logits.
    """
    def _encode(text, length):
        # Fixed-length ids so the concatenated input always has 512 entries.
        encoded = tokenizer(text, padding="max_length", truncation=True, max_length=length)
        return torch.tensor(encoded['input_ids'])

    token_ids = torch.cat((_encode(title, 32), _encode(abstract, 480)))
    assert token_ids.shape == (512,)

    with torch.no_grad():
        # unsqueeze(0) adds the batch dimension; [0] removes it again.
        logits = model(token_ids.unsqueeze(0))['logits'][0]
    assert logits.shape == (8,)

    return torch.softmax(logits, dim=0).detach().cpu().numpy()
# --- Streamlit page: collect a title/abstract and show topic probabilities ---
st.title("Classification of arXiv articles' main topic")
st.markdown("Please provide both summary and title when possible")

tokenizer, model = load_tok_and_model()

# Recent Streamlit releases reject text areas shorter than 68 px, so the
# title box uses that minimum instead of 50.
title = st.text_area(label='Title', height=68)
abstract = st.text_area(label='Abstract', height=250)
button = st.button('Run classifier')

if button:
    probs = forward_pass(title, abstract, tokenizer, model)
    percents = np.round(probs * 100, 3)

    # Rank on the numeric values. Sorting the formatted "<number>%" strings
    # compares them lexicographically (e.g. "9.5%" would rank above "85.0%").
    ranking = np.argsort(-percents, kind='stable')

    micro_df = pd.DataFrame({
        'Categories': CATEGORIES,
        'Cat. Probability': [str(percent) + '%' for percent in percents],
    })
    # iloc reorders the rows but keeps each row's original position as its
    # index label, which is what is displayed as 'Internal ID'.
    micro_df = micro_df.iloc[ranking]
    micro_df.index.name = 'Internal ID'
    st.write(micro_df)