File size: 2,131 Bytes
6c8ab76
 
 
5e99dea
6c8ab76
571d116
6c8ab76
 
 
571d116
6c8ab76
dcbf77d
6c8ab76
 
 
 
 
 
 
 
571d116
6c8ab76
 
 
 
 
 
 
 
 
28d5381
6c8ab76
 
 
 
 
 
 
 
93da556
 
6c8ab76
 
 
 
73dcd7e
6b0b28e
 
78b329c
4e05b94
ad35274
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import streamlit as st
import numpy as np
import pandas as pd
import torch

import tokenizers  # for streamlit caching
import transformers
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification   

@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def load_tok_and_model(model_path="."):
    """Load (and cache) the tokenizer and the fine-tuned classification model.

    The tokenizer is excluded from Streamlit's cache key via ``hash_funcs``
    because ``tokenizers.Tokenizer`` objects are not hashable by st.cache.

    Parameters
    ----------
    model_path : str, default "."
        Directory containing the fine-tuned model weights/config. The default
        preserves the original behavior of loading from the working directory.

    Returns
    -------
    tuple
        ``(tokenizer, model)`` ready for :func:`forward_pass`.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return tokenizer, model


# Output classes, in the order the model's 8 logits are produced.
CATEGORIES = [
    "Computer Science",
    "Economics",
    "Electrical Engineering",
    "Mathematics",
    "Q. Biology",
    "Q. Finances",
    "Statistics",
    "Physics",
]
    
    
@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def forward_pass(title, abstract, tokenizer, model):
    """Tokenize title + abstract, run the classifier, return class probabilities.

    The title gets a fixed budget of 32 tokens and the abstract 480, so the
    concatenation always forms exactly one 512-token input window.

    Parameters
    ----------
    title, abstract : str
        Raw input texts; each is padded/truncated to its fixed length.
    tokenizer : tokenizer callable (excluded from the cache key via hash_funcs)
    model : sequence-classification model producing 8 logits.
        NOTE(review): unlike the tokenizer, the model is NOT covered by
        hash_funcs, so st.cache will attempt to hash it on every call —
        confirm this is acceptable / cheap for this model.

    Returns
    -------
    numpy.ndarray
        Shape ``(8,)`` softmax probabilities, aligned with ``CATEGORIES``.
    """
    # These are token ids, not embeddings — the model does the embedding lookup.
    title_ids = torch.tensor(tokenizer(title, padding="max_length", truncation=True, max_length=32)['input_ids'])
    abstract_ids = torch.tensor(tokenizer(abstract, padding="max_length", truncation=True, max_length=480)['input_ids'])

    input_ids = torch.cat((title_ids, abstract_ids))
    assert input_ids.shape == (512,)  # sanity check: full 32 + 480 token window
    with torch.no_grad():
        logits = model(input_ids[None])['logits'][0]
        assert logits.shape == (8,)  # one logit per category
        # `.data` is redundant inside no_grad(); direct .cpu().numpy() is safe here.
        probs = torch.softmax(logits, dim=0).cpu().numpy()

    return probs

# --- Streamlit UI: single-page arXiv topic classifier ---
st.title("Classification of arXiv articles' main topic")
st.markdown("Please provide both summary and title when possible")

tokenizer, model = load_tok_and_model()

title = st.text_area(label='Title', height=50)
abstract = st.text_area(label='Abstract', height=250)
button = st.button('Run classifier')

if button:
    probs = forward_pass(title, abstract, tokenizer, model)
    # Render probabilities as "NN.NNN%" strings; keep the raw floats in a
    # hidden column purely for sorting, then drop it before display.
    prob_strings = [f"{pct}%" for pct in np.round(probs * 100, 3)]
    micro_df = pd.DataFrame({'Categories': CATEGORIES, 'Cat. Probability': prob_strings, 'sort_probs': probs})
    micro_df = micro_df.sort_values(by='sort_probs', ascending=False)
    micro_df = micro_df[['Categories', 'Cat. Probability']]
    micro_df.index.name = 'Internal ID'
    st.write(micro_df)