Spaces:
Sleeping
Sleeping
import streamlit as st | |
from transformers import AutoModelForSequenceClassification | |
from transformers import AutoTokenizer | |
import torch | |
import pandas as pd | |
import numpy as np | |
# Static page header: title, arXiv logo and a legend describing the nine
# target classes the models were trained on.
PAGE_TITLE = "# Arxiv Papers Classifier"
ARXIV_LOGO_HTML = "<img width=200px src='https://blog.arxiv.org/files/2021/02/arxiv-logo.svg'>"
CLASSES_INTRO = "После обработки и фильтрации датасета у каждой статьи остался один или несколько классов из 9:"
CLASSES_LEGEND = """
1) ai - cs.AI (Artificial Intelligence)
2) cs - все подгруппы из класса Computer Science, кроме cs.AI, cs.CV, cs.LG
3) cv - cs.CV (Computer Vision and Pattern Recognition)
4) lg - cs.LG (Machine Learning)
5) math - все подгруппы из класса Mathematics
6) ml - stat.ML (Machine Learning)
7) phys - все подгруппы из класса Physics
8) q-bio - все подгруппы из класса Quantitative Biology
9) stat - все подгруппы из класса Statistics, кроме stat.ML
"""

st.markdown(PAGE_TITLE)
# The logo is a raw <img> tag, so HTML rendering has to be enabled for it.
st.markdown(ARXIV_LOGO_HTML, unsafe_allow_html=True)
st.markdown(CLASSES_INTRO)
st.markdown(CLASSES_LEGEND)
# Maps the position of a model output to its short class name; the order
# follows the numbered legend shown at the top of the page.
id2label = dict(enumerate((
    "ai",
    "cs",
    "cv",
    "lg",
    "math",
    "ml",
    "phys",
    "q-bio",
    "stat",
)))

# User input: the paper title and abstract are collected separately and then
# joined with a single space into the one string that both models classify.
title_text = st.text_input("ENTER TITLE HERE")
summary_text = st.text_area("ENTER SUMMARY HERE")
text = " ".join((title_text, summary_text))
# 1. Multi-class model
@st.cache_resource
def load_first_model():
    """Load the tokenizer and model of the multi-class classifier.

    Both objects are created with ``from_pretrained("multi_class_model")``.

    Streamlit re-executes the whole script on every widget interaction, so
    an undecorated loader would rebuild the tokenizer and model on each
    rerun. ``st.cache_resource`` keeps one loaded copy and returns it on
    subsequent calls. The cached objects are shared rather than copied,
    which is fine here because they are only used for inference.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_class_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_class_model")
    return loaded_tokenizer, loaded_model


tokenizer_1, model_1 = load_first_model()
# Multi-class prediction: softmax over the logits gives one probability
# distribution across all classes.
st.markdown("## multi-class classification")

text_input = tokenizer_1(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():  # inference only, no gradients needed
    text_res = model_1(**text_input)

# [0] selects the single row of the batch (one input text).
text_probs = torch.softmax(text_res.logits, dim=1).cpu().numpy()[0]

# Classes sorted from most to least probable.
order = np.argsort(text_probs)[::-1]
ordered_text_probs = text_probs[order]

# Keep the smallest prefix of classes whose total probability reaches 0.95.
# A class is kept when the mass accumulated *before* it is still below the
# threshold, so the class that crosses 0.95 is included and the selection
# is never empty (a plain `cumsum <= 0.95` mask selects nothing when the top
# class alone already exceeds 0.95).
mass_before = np.cumsum(ordered_text_probs) - ordered_text_probs
idxs = order[mass_before < 0.95]
st.markdown("Топ-95 классов: " + ", ".join(id2label[i] for i in idxs))

# One bar per class, labelled with the short class name.
chart_data = pd.DataFrame(
    text_probs,
    index=pd.Index(list(id2label.values()), name="index"),
    columns=['class probability'],
)
st.bar_chart(chart_data)
# 2. Multi-label model
@st.cache_resource
def load_second_model():
    """Load the tokenizer and model of the multi-label classifier.

    Both objects are created with ``from_pretrained("multi_label_model")``.

    The function has its own name so that it does not rebind
    ``load_first_model``, which loads the multi-class model.

    Streamlit re-executes the whole script on every widget interaction, so
    an undecorated loader would rebuild the tokenizer and model on each
    rerun. ``st.cache_resource`` keeps one loaded copy and returns it on
    subsequent calls. The cached objects are shared rather than copied,
    which is fine here because they are only used for inference.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_label_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_label_model")
    return loaded_tokenizer, loaded_model


tokenizer_2, model_2 = load_second_model()
# Multi-label prediction: each class gets its own independent sigmoid score,
# so the scores do not have to sum to one.
st.markdown("## multi-label classification")

text_input = tokenizer_2(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():  # inference only, no gradients needed
    text_res = model_2(**text_input)

# The logits are passed to the activation directly, exactly as the
# multi-class section does with softmax; wrapping them in the legacy
# `torch.Tensor(...)` constructor first is redundant.
# [0] selects the single row of the batch (one input text).
text_probs = torch.sigmoid(text_res.logits).cpu().numpy()[0]

# Two columns per class: probability of belonging and its complement.
probs = np.stack([text_probs, 1 - text_probs], axis=1)
chart_data = pd.DataFrame(
    probs,
    index=pd.Index(list(id2label.values()), name="index"),
    columns=['belong', "not belong"],
)
st.markdown("Probabilities for each class")
st.bar_chart(chart_data)