File size: 3,899 Bytes
dabf7ab
 
 
 
 
 
 
 
 
 
a432184
 
 
 
 
 
 
dabf7ab
 
 
 
a432184
dabf7ab
 
 
a432184
dabf7ab
 
 
 
a432184
dabf7ab
 
 
 
 
 
 
 
a432184
dabf7ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a432184
dabf7ab
 
 
 
 
 
 
 
 
a432184
 
 
 
 
 
 
 
dabf7ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a432184
 
 
 
dabf7ab
66700ad
dabf7ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import streamlit as st
import transformers
import pickle
import seaborn as sns
from pandas import DataFrame
from transformers import AutoTokenizer, AutoModelForSequenceClassification

st.markdown("# Hello, friend!")
st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")

try:
    model_name_global = "allenai/scibert_scivocab_uncased"
    tokenizer_ = AutoTokenizer.from_pretrained(model_name_global)
    # decode_dict maps a class index to its human-readable topic label.
    with open('./models/scibert/decode_dict.pkl', 'rb') as f:
        decode_dict = pickle.load(f)
except Exception:
    # Loading raises OSError/FileNotFoundError/UnpicklingError, not ValueError,
    # so the original narrow clause never fired for real failures.
    st.error("Load tokenizer or decode answer dict goes wrong! Pls contact author alekseystepin13@gmail.com")
    # Without tokenizer_/decode_dict the rest of the script would crash
    # with NameError, so halt the Streamlit run here.
    st.stop()

with st.form(key="my_form"):
    st.markdown("### 🎈 Do you want a little magic?  ")
    st.markdown(" Write your article title and abstract to textboxes bellow and I'll gues topic of your paper!  ")
    ce, c2, c3 = st.columns([0.07, 7, 0.07])

    with c2:
        doc_title = st.text_area(
            "Paste your abstract title below (1 to 50 words)",
            height=210,
        )

        doc_abstract = st.text_area(
            "Paste your abstract text below (1 to 500 words)",
            height=410,
        )

        MAX_WORDS_TITLE, MAX_WORDS_ABSTRACT = 50, 500
        import re

        # Word counts used both for the limit warnings and the
        # empty-input validation after submit.
        len_title = len(re.findall(r"\w+", doc_title))
        len_abstract = len(re.findall(r"\w+", doc_abstract))

        if len_title > MAX_WORDS_TITLE:
            st.warning(
                "⚠️ Your title contains "
                + str(len_title)
                + " words."
                + " Only the first 50 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

            # BUG FIX: the original sliced the first 50 *characters*
            # (doc_title[:MAX_WORDS_TITLE]); keep the first 50 words instead,
            # matching the warning text above.
            doc_title = " ".join(doc_title.split()[:MAX_WORDS_TITLE])

        if len_abstract > MAX_WORDS_ABSTRACT:
            st.warning(
                "⚠️ Your abstract contains "
                + str(len_abstract)
                + " words."
                + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

            # BUG FIX: same character-vs-word slicing defect as the title above.
            doc_abstract = " ".join(doc_abstract.split()[:MAX_WORDS_ABSTRACT])

        submit_button = st.form_submit_button(label="✨ Let's play, try it!")

# Guard clauses: halt the run until the form is submitted with
# a non-empty title and a non-empty abstract.
if not submit_button:
    st.stop()

for word_count, empty_msg in (
    (len_title, "Article without any words in title? Pls give me correct title!"),
    (len_abstract, "Article without any words in abstract? Pls give me correct abstract!"),
):
    if word_count < 1:
        st.error(empty_msg)
        st.stop()


@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def load_model():
    """Load the fine-tuned SciBERT classifier once and cache it across reruns.

    allow_output_mutation=True (restored from the commented-out hint that was
    left above this function) stops Streamlit from re-hashing the mutable
    torch model on every rerun, which is slow and triggers
    CachedObjectMutationWarning.

    Returns:
        The AutoModelForSequenceClassification loaded from models/scibert/.
    """
    st.write("Loading big model")
    return AutoModelForSequenceClassification.from_pretrained("models/scibert/")


def make_predict(tokens, decode_dict, threshold=0.1):
    """Classify the tokenized text and return its likely topics.

    Args:
        tokens: tokenizer output exposing an ``input_ids`` tensor.
        decode_dict: mapping from class index to human-readable topic label.
        threshold: minimum probability for a topic to be included in the
            result (default 0.1, the previously hard-coded cutoff).

    Returns:
        dict mapping topic label -> probability, for topics above threshold.
    """
    model_ = load_model()
    outs = model_(tokens.input_ids)

    # Single input -> take the first (only) row of softmaxed logits.
    probs = outs["logits"].softmax(dim=-1).tolist()[0]
    # Keep only confidently-predicted topics, decoded to readable labels.
    return {decode_dict[i]: p for i, p in enumerate(probs) if p > threshold}


# Tokenize the concatenated title + abstract for the classifier.
# (The unused model_local / title / abstract aliases were removed.)
try:
    # A separating space keeps the last title word from fusing with the
    # first abstract word into one bogus token.
    tokens = tokenizer_(doc_title + " " + doc_abstract, return_tensors="pt")
except Exception:
    # Tokenizer failures are not limited to ValueError.
    st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
    # `tokens` is undefined on this path; stopping avoids the NameError
    # the original hit on the next line.
    st.stop()

predicts = make_predict(tokens, decode_dict)

st.markdown("## 🎈 Your article probably about:  ")
st.header("")

# Topics sorted by descending probability, re-indexed from 1 for display.
df = (
    DataFrame(predicts.items(), columns=["Topic", "Prob"])
        .sort_values(by="Prob", ascending=False)
        .reset_index(drop=True)
)

df.index += 1

# Green background gradient highlights the most probable topics.
# (The unused red palette the original also built was removed.)
cmGreen = sns.light_palette("green", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Prob",
    ],
)

c1, c2, c3 = st.columns([1, 3, 1])

# Render probabilities as percentages with one decimal place.
format_dictionary = {
    "Prob": "{:.1%}",
}

df = df.format(format_dictionary)

with c2:
    st.table(df)