File size: 4,551 Bytes
dabf7ab
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import streamlit as st
import transformers
import pickle
import seaborn as sns
from pandas import DataFrame
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Page header / greeting shown before any model work happens.
st.markdown("# Hello, friend!")
st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
# st.markdown("<img width=200px src='https://rozetked.me/images/uploads/dwoilp3BVjlE.jpg'>", unsafe_allow_html=True)

# Load the SciBERT tokenizer (downloaded/cached by transformers) and the
# pickled index->topic-name mapping used to decode model outputs.
# NOTE(review): pickle.load on a bundled file is fine; never do this on
# untrusted input.
st.write("Loading tokenizer and dict")
model_name_global = "allenai/scibert_scivocab_uncased"
tokenizer_ = AutoTokenizer.from_pretrained(model_name_global)
with open('./models/scibert/decode_dict.pkl', 'rb') as f:
    decode_dict = pickle.load(f)

with st.form(key="my_form"):
    # Input form: collects a paper title and abstract, enforces word limits,
    # and exposes `doc_title`, `doc_abstract`, `submit_button` to the rest
    # of the script.
    st.markdown("### 🎈 Do you want a little magic?  ")
    st.markdown(" Write your article title and abstract to textboxes below and I'll guess the topic of your paper!  ")
    ce, c2, c3 = st.columns([0.07, 5, 0.07])

    with c2:
        doc_title = st.text_area(
            "Paste your abstract title below (max 100 words)",
            height=210,
        )

        doc_abstract = st.text_area(
            "Paste your abstract text below (max 100500 words)",
            height=410,
        )

        # Word limits actually enforced (the text-area labels above are
        # looser, purely cosmetic hints).
        MAX_WORDS_TITLE, MAX_WORDS_ABSTRACT = 50, 500
        import re

        len_title = len(re.findall(r"\w+", doc_title))
        len_abstract = len(re.findall(r"\w+", doc_abstract))
        if len_title > MAX_WORDS_TITLE:
            st.warning(
                "⚠️ Your title contains "
                + str(len_title)
                + " words."
                + " Only the first 50 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

            # BUG FIX: slice by words, not characters — the old
            # `doc_title[:MAX_WORDS_TITLE]` kept only the first 50 *chars*.
            doc_title = " ".join(doc_title.split()[:MAX_WORDS_TITLE])

        if len_abstract > MAX_WORDS_ABSTRACT:
            st.warning(
                "⚠️ Your abstract contains "
                + str(len_abstract)
                + " words."
                # BUG FIX: message said "first 50 words" but the limit is 500.
                + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
            )

            # BUG FIX: slice by words, not characters (same as the title).
            doc_abstract = " ".join(doc_abstract.split()[:MAX_WORDS_ABSTRACT])

        submit_button = st.form_submit_button(label="✨ Let's play, try it!")

# Halt the script run here until the form has been submitted; everything
# below (model load + prediction) only executes after a click.
if not submit_button:
    st.stop()


#  allow_output_mutation=True
@st.cache(suppress_st_warning=True)
def load_model():
    """Load the fine-tuned SciBERT classifier from local disk.

    Cached by Streamlit so the (slow) load happens once per session.
    NOTE(review): `st.cache` is deprecated in newer Streamlit releases in
    favour of `st.cache_resource`; kept as-is for compatibility.
    """
    st.write("Loading big model")
    model = AutoModelForSequenceClassification.from_pretrained("models/scibert/")
    return model


def make_predict(tokens, decode_dict, threshold=0.1):
    """Run the classifier and return likely topics for one document.

    Args:
        tokens: tokenizer output carrying an ``input_ids`` tensor
            (assumes a batch of exactly one sequence — TODO confirm).
        decode_dict: mapping from class index to human-readable topic name.
        threshold: minimum probability for a topic to be included
            (generalized from the previously hard-coded 0.1).

    Returns:
        dict mapping topic name -> probability, for probabilities above
        ``threshold``.
    """
    model_ = load_model()
    outs = model_(tokens.input_ids)

    # Softmax over logits, then take the single example from the batch.
    probs = outs["logits"].softmax(dim=-1).tolist()[0]
    return {decode_dict[i]: p for i, p in enumerate(probs) if p > threshold}


model_local = "models/scibert/"  # local checkpoint path (loaded inside load_model)

# Tokenize title + abstract together as one input sequence.
title = doc_title
abstract = doc_abstract
tokens = tokenizer_(title + abstract, return_tensors="pt")

# BUG FIX: make_predict takes (tokens, decode_dict); the old call passed six
# positional args (model names, title, abstract) and raised TypeError.
predicts = make_predict(tokens, decode_dict)

st.markdown("## 🎈 Your article is probably about:  ")
st.header("")

# Build a Topic/Prob table sorted by descending probability, 1-indexed.
df = (
    DataFrame(predicts.items(), columns=["Topic", "Prob"])
        .sort_values(by="Prob", ascending=False)
        .reset_index(drop=True)
)

df.index += 1

# Add styling: green gradient on the probability column.
cmGreen = sns.light_palette("green", as_cmap=True)
cmRed = sns.light_palette("red", as_cmap=True)
df = df.style.background_gradient(
    cmap=cmGreen,
    subset=[
        "Prob",
    ],
)

c1, c2, c3 = st.columns([1, 3, 1])

# Render probabilities as percentages with one decimal place.
format_dictionary = {
    "Prob": "{:.1%}",
}

df = df.format(format_dictionary)

with c2:
    st.table(df)