File size: 2,131 Bytes
6c8ab76
 
 
5e99dea
6c8ab76
571d116
6c8ab76
 
 
571d116
6c8ab76
dcbf77d
6c8ab76
 
 
 
 
 
 
 
571d116
6c8ab76
 
 
 
 
 
 
 
 
28d5381
6c8ab76
 
 
 
 
 
 
 
93da556
 
6c8ab76
 
 
 
73dcd7e
6b0b28e
 
78b329c
4e05b94
ad35274
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import streamlit as st
import numpy as np
import pandas as pd
import torch

import tokenizers  # for streamlit caching
import transformers
from transformers import TextClassificationPipeline, AutoTokenizer, AutoModelForSequenceClassification   

@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def load_tok_and_model(model_path="."):
    """Load (and cache) the tokenizer and the fine-tuned classification model.

    The tokenizer is excluded from Streamlit's cache key via ``hash_funcs``
    because ``tokenizers.Tokenizer`` objects are not hashable by st.cache.

    Parameters
    ----------
    model_path : str, default "."
        Directory containing the fine-tuned model weights/config. The default
        preserves the original behavior of loading from the working directory.

    Returns
    -------
    tuple
        ``(tokenizer, model)`` ready for :func:`forward_pass`.
    """
    tokenizer = AutoTokenizer.from_pretrained('distilbert-base-cased')
    model = AutoModelForSequenceClassification.from_pretrained(model_path)
    return tokenizer, model


# Output classes, in the order the model's 8 logits are produced.
CATEGORIES = [
    "Computer Science",
    "Economics",
    "Electrical Engineering",
    "Mathematics",
    "Q. Biology",
    "Q. Finances",
    "Statistics",
    "Physics",
]
    
    
@st.cache(suppress_st_warning=True, hash_funcs={tokenizers.Tokenizer: lambda _: None})
def forward_pass(title, abstract, tokenizer, model):
    """Tokenize title + abstract, run the classifier, return class probabilities.

    The title gets a fixed budget of 32 tokens and the abstract 480, so the
    concatenation always forms exactly one 512-token input window.

    Parameters
    ----------
    title, abstract : str
        Raw input texts; each is padded/truncated to its fixed length.
    tokenizer : tokenizer callable (excluded from the cache key via hash_funcs)
    model : sequence-classification model producing 8 logits.
        NOTE(review): unlike the tokenizer, the model is NOT covered by
        hash_funcs, so st.cache will attempt to hash it on every call —
        confirm this is acceptable / cheap for this model.

    Returns
    -------
    numpy.ndarray
        Shape ``(8,)`` softmax probabilities, aligned with ``CATEGORIES``.
    """
    # These are token ids, not embeddings — the model does the embedding lookup.
    title_ids = torch.tensor(tokenizer(title, padding="max_length", truncation=True, max_length=32)['input_ids'])
    abstract_ids = torch.tensor(tokenizer(abstract, padding="max_length", truncation=True, max_length=480)['input_ids'])

    input_ids = torch.cat((title_ids, abstract_ids))
    assert input_ids.shape == (512,)  # sanity check: full 32 + 480 token window
    with torch.no_grad():
        logits = model(input_ids[None])['logits'][0]
        assert logits.shape == (8,)  # one logit per category
        # `.data` is redundant inside no_grad(); direct .cpu().numpy() is safe here.
        probs = torch.softmax(logits, dim=0).cpu().numpy()

    return probs

# --- Streamlit UI: single-page arXiv topic classifier ---
st.title("Classification of arXiv articles' main topic")
st.markdown("Please provide both summary and title when possible")

tokenizer, model = load_tok_and_model()

title = st.text_area(label='Title', height=50)
abstract = st.text_area(label='Abstract', height=250)
button = st.button('Run classifier')

if button:
    probs = forward_pass(title, abstract, tokenizer, model)
    # Render probabilities as "NN.NNN%" strings; keep the raw floats in a
    # hidden column purely for sorting, then drop it before display.
    prob_strings = [f"{pct}%" for pct in np.round(probs * 100, 3)]
    micro_df = pd.DataFrame({'Categories': CATEGORIES, 'Cat. Probability': prob_strings, 'sort_probs': probs})
    micro_df = micro_df.sort_values(by='sort_probs', ascending=False)
    micro_df = micro_df[['Categories', 'Cat. Probability']]
    micro_df.index.name = 'Internal ID'
    st.write(micro_df)