"""Streamlit app that predicts the main arXiv topic of a paper.

The user enters a title and an abstract; both are tokenized, concatenated
into one fixed-length sequence of token ids and fed to a sequence
classification model loaded from the current directory.
"""

import streamlit as st
import numpy as np
import pandas as pd
import torch
import tokenizers  # for streamlit caching
import transformers
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification

# Token budget of each text field; together they form the model input.
TITLE_MAX_TOKENS = 32
ABSTRACT_MAX_TOKENS = 480


@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def load_tok_and_model():
    """Load the tokenizer and the classification model.

    Returns:
        A ``(tokenizer, model)`` tuple: the 'distilbert-base-cased' tokenizer
        and the sequence classification model stored in the current directory.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    model = AutoModelForSequenceClassification.from_pretrained(".")
    return tokenizer, model


# Display names of the classes, in the order of the model's output logits.
CATEGORIES = [
    "Computer Science",
    "Economics",
    "Electrical Engineering",
    "Mathematics",
    "Q. Biology",
    "Q. Finances",
    "Statistics",
    "Physics",
]


@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def forward_pass(title, abstract, tokenizer, model):
    """Return the per-category probabilities for one article.

    Args:
        title: Article title text.
        abstract: Article abstract text.
        tokenizer: Tokenizer returned by ``load_tok_and_model``.
        model: Classification model returned by ``load_tok_and_model``.

    Returns:
        A NumPy array with one probability per entry of ``CATEGORIES``.

    Raises:
        ValueError: If the tokenized input or the model output does not have
            the expected length.
    """
    # Each field is padded/truncated to its own fixed budget so that the
    # concatenation always has the same length.
    title_ids = tokenizer(
        title, padding="max_length", truncation=True, max_length=TITLE_MAX_TOKENS
    )['input_ids']
    abstract_ids = tokenizer(
        abstract, padding="max_length", truncation=True, max_length=ABSTRACT_MAX_TOKENS
    )['input_ids']
    input_ids = torch.cat((torch.tensor(title_ids), torch.tensor(abstract_ids)))

    expected_len = TITLE_MAX_TOKENS + ABSTRACT_MAX_TOKENS
    if input_ids.shape != (expected_len,):
        raise ValueError(
            f"expected {expected_len} input ids, got shape {tuple(input_ids.shape)}"
        )

    # NOTE(review): no attention mask is passed, so padding ids of the title
    # segment are fed to the model like ordinary tokens. Presumably this
    # matches how the checkpoint was fine-tuned - confirm before changing.
    with torch.no_grad():
        logits = model(input_ids[None])['logits'][0]  # [None] adds the batch axis

    if logits.shape != (len(CATEGORIES),):
        raise ValueError(
            f"model returned logits of shape {tuple(logits.shape)}, "
            f"expected ({len(CATEGORIES)},) to match CATEGORIES"
        )

    probs = torch.softmax(logits, dim=0).data.cpu().numpy()
    return probs


st.title("Classification of arXiv articles' main topic")
st.markdown("Please provide both summary and title when possible")

tokenizer, model = load_tok_and_model()

title = st.text_area(label='Title', height=50)
abstract = st.text_area(label='Abstract', height=250)
button = st.button('Run classifier')

if button:
    if not title.strip() and not abstract.strip():
        # Nothing to classify: the model would only see padding.
        st.warning("Please enter a title or an abstract before running the classifier.")
    else:
        probs = forward_pass(title, abstract, tokenizer, model)
        # str() keeps NumPy's own scalar formatting for the rounded values.
        prob_strings = [(str(prob) + '%') for prob in np.round(probs * 100, 3)]
        micro_df = pd.DataFrame({
            'Categories': CATEGORIES,
            'Cat. Probability': prob_strings,
            'sort_probs': probs,  # numeric helper column, used only for sorting
        })
        micro_df = micro_df.sort_values(by='sort_probs', ascending=False)
        micro_df = micro_df[['Categories', 'Cat. Probability']]
        # The index still holds each category's position in CATEGORIES.
        micro_df.index.name = 'Internal ID'
        st.write(micro_df)