# ysda_hw / app.py
# Hosting-page metadata: last commit "Update app.py" (a008ff9) by planetearth79; file size 3.05 kB.
import streamlit as st
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import pandas as pd
import numpy as np
# Static page header: title, arXiv logo, and the list of the nine classes
# that remain in the dataset after preprocessing.
PAGE_TITLE_MD = "# Arxiv Papers Classifier"
ARXIV_LOGO_HTML = "<img width=200px src='https://blog.arxiv.org/files/2021/02/arxiv-logo.svg'>"
CLASSES_INTRO_MD = "После обработки и фильтрации датасета у каждой статьи остался один или несколько классов из 9:"
CLASSES_LIST_MD = """
1) ai
2) cs
3) cv
4) lg
5) math
6) ml
7) phys
8) q-bio
9) stat
"""

st.markdown(PAGE_TITLE_MD)
# The logo is raw HTML, so it is the only element that needs HTML rendering.
st.markdown(ARXIV_LOGO_HTML, unsafe_allow_html=True)
st.markdown(CLASSES_INTRO_MD)
st.markdown(CLASSES_LIST_MD)
# Maps a class index (a position in the models' output vector) to its short
# category label; the insertion order is also used to label the chart rows.
id2label = dict(
    enumerate(
        ("ai", "cs", "cv", "lg", "math", "ml", "phys", "q-bio", "stat")
    )
)
# User inputs: both classifiers below receive the title and the summary
# joined by a single space.
title_text = st.text_input("ENTER TITLE HERE")
summary_text = st.text_area("ENTER SUMMARY HERE")
text = " ".join((title_text, summary_text))
# 1
# Plain st.cache hashes the cached return value on every hit to detect
# mutation, which is slow (or fails outright) for tokenizer/model objects, and
# the decorator is deprecated in newer Streamlit releases. Use the resource
# cache when this Streamlit version provides it; otherwise fall back to
# st.cache with output hashing disabled.
_cache_model = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


@_cache_model
def load_first_model():
    """Load the tokenizer and model stored under ``multi_class_model``.

    The result is cached so that Streamlit script reruns reuse the already
    loaded objects instead of reading the checkpoint again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_class_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_class_model")
    return loaded_tokenizer, loaded_model


tokenizer_1, model_1 = load_first_model()
st.markdown("## multi-class classification")

# A single text is classified at a time, so no padding is needed: padding to
# max_length only adds masked tokens that the model still has to process.
text_input = tokenizer_1(text, truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_1(**text_input)
# Softmax over the class axis; [0] selects the only item in the batch.
text_probs = torch.softmax(text_res.logits, dim=1).cpu().numpy()[0]

# Top-95% set: the shortest prefix of classes, sorted by descending
# probability, whose cumulative probability reaches 0.95. Filtering with
# `cumsum <= 0.95` would drop the class that crosses the threshold and return
# nothing at all when the top class alone exceeds 0.95, so the prefix length
# is computed explicitly and is always at least one.
order = np.argsort(text_probs)[::-1]
ordered_text_probs = text_probs[order]
top_count = int(np.searchsorted(np.cumsum(ordered_text_probs), 0.95)) + 1
idxs = order[:top_count]
st.markdown("Топ-95 классов: " + ", ".join([id2label[i] for i in idxs]))

# One bar per class, labelled with the class names in id2label order.
chart_data = pd.DataFrame(
    text_probs,
    columns=['class probability'])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.bar_chart(chart_data)
# 2
# Plain st.cache hashes the cached return value on every hit to detect
# mutation, which is slow (or fails outright) for tokenizer/model objects, and
# the decorator is deprecated in newer Streamlit releases. Use the resource
# cache when this Streamlit version provides it; otherwise fall back to
# st.cache with output hashing disabled.
_cache_model = (
    st.cache_resource
    if hasattr(st, "cache_resource")
    else st.cache(allow_output_mutation=True)
)


# Named load_second_model (it used to reuse the name of the multi-class
# loader) so that it no longer shadows that function; its only call site is
# the line right below.
@_cache_model
def load_second_model():
    """Load the tokenizer and model stored under ``multi_label_model``.

    The result is cached so that Streamlit script reruns reuse the already
    loaded objects instead of reading the checkpoint again.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_label_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_label_model")
    return loaded_tokenizer, loaded_model


tokenizer_2, model_2 = load_second_model()
st.markdown("## multi-label classification")

# A single text is classified at a time, so no padding is needed: padding to
# max_length only adds masked tokens that the model still has to process.
text_input = tokenizer_2(text, truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_2(**text_input)
# Sigmoid is applied element-wise, so each class gets its own probability;
# the logits are passed directly, without re-wrapping them in torch.Tensor.
# [0] selects the only item in the batch.
text_probs = torch.sigmoid(text_res.logits).cpu().numpy()[0]

# Two columns per class: probability of belonging and its complement.
probs = np.stack([text_probs, 1 - text_probs], axis=1)
chart_data = pd.DataFrame(
    probs,
    columns=['belong', "not belong"])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.markdown("Probabilities for each class")
st.bar_chart(chart_data)