"""Streamlit page that classifies an arXiv paper from its title and summary.

Two sequence-classification checkpoints are loaded by name
(``multi_class_model`` and ``multi_label_model``). The first is read out with a
softmax over the classes, the second with an independent sigmoid per class.
"""

import numpy as np
import pandas as pd
import streamlit as st
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Cumulative probability mass that the listed "top" classes must cover.
TOP_P_THRESHOLD = 0.95

# Maps the model's output position to the short class name shown in the UI.
id2label = {
    0: "ai",
    1: "cs",
    2: "cv",
    3: "lg",
    4: "math",
    5: "ml",
    6: "phys",
    7: "q-bio",
    8: "stat",
}

# Tokenizers and models are mutable and not hashable by Streamlit's legacy
# cache, so prefer the resource cache and fall back to st.cache with output
# mutation checks disabled on Streamlit versions that predate cache_resource.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource
else:
    _cache_model = st.cache(allow_output_mutation=True)


@_cache_model
def load_model(model_dir):
    """Load the tokenizer and classification model stored under *model_dir*.

    The result is cached per ``model_dir`` so the weights are not re-read on
    every Streamlit rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    loaded_tokenizer = AutoTokenizer.from_pretrained(model_dir)
    loaded_model = AutoModelForSequenceClassification.from_pretrained(model_dir)
    return loaded_tokenizer, loaded_model


def predict_logits(tokenizer, model, text):
    """Tokenize *text* and return the model's logits, with autograd disabled."""
    encoded = tokenizer(
        text, padding="max_length", truncation=True, return_tensors="pt"
    )
    with torch.no_grad():
        return model(**encoded).logits


def top_p_indices(probs, threshold=TOP_P_THRESHOLD):
    """Return the most probable class indices covering *threshold* of the mass.

    Indices are ordered from most to least probable. The class whose
    probability makes the running total reach *threshold* is included, so the
    result is never empty (a single class above the threshold yields just that
    class).
    """
    order = np.argsort(probs)[::-1]
    cumulative = np.cumsum(probs[order])
    # searchsorted gives the first position where the running total reaches
    # the threshold; +1 converts that position into a prefix length.
    count = int(np.searchsorted(cumulative, threshold)) + 1
    return order[:count]


st.markdown("# Arxiv Papers Classifier")
# NOTE(review): this renders nothing; it looks like HTML content was lost here - confirm.
st.markdown("", unsafe_allow_html=True)
st.markdown("После обработки и фильтрации датасета у каждой статьи остался один или несколько классов из 9:")
st.markdown("""
1) ai - cs.AI (Artificial Intelligence)
2) cs - все подгруппы из класса Computer Science, кроме cs.AI, cs.CV, cs.LG
3) cv - cs.CV (Computer Vision and Pattern Recognition)
4) lg - cs.LG (Machine Learning)
5) math - все подгруппы из класса Mathematics
6) ml - stat.ML (Machine Learning)
7) phys - все подгруппы из класса Physics
8) q-bio - все подгруппы из класса Quantitative Biology
9) stat - все подгруппы из класса Statistics, кроме stat.ML
""")

title_text = st.text_input("ENTER TITLE HERE")
summary_text = st.text_area("ENTER SUMMARY HERE")
# Both models receive the title and summary joined into a single string.
text = title_text + " " + summary_text

# Class names in output-position order, used to label both charts.
class_names = [id2label[i] for i in range(len(id2label))]

# --- Model 1: multi-class (softmax over all classes) ---
tokenizer_1, model_1 = load_model("multi_class_model")

st.markdown("## multi-class classification")

multi_class_logits = predict_logits(tokenizer_1, model_1, text)
# Single input, so take row 0 of the batch dimension.
text_probs = torch.softmax(multi_class_logits, dim=1).cpu().numpy()[0]

idxs = top_p_indices(text_probs)
st.markdown("Топ-95 классов: " + ", ".join([id2label[i] for i in idxs]))

chart_data = pd.DataFrame(
    {"class probability": text_probs},
    index=pd.Index(class_names, name="index"),
)
st.bar_chart(chart_data)

# --- Model 2: multi-label (independent sigmoid per class) ---
tokenizer_2, model_2 = load_model("multi_label_model")

st.markdown("## multi-label classification")

multi_label_logits = predict_logits(tokenizer_2, model_2, text)
text_probs = torch.sigmoid(multi_label_logits).cpu().numpy()[0]

# Show each class's probability next to its complement.
chart_data = pd.DataFrame(
    {"belong": text_probs, "not belong": 1 - text_probs},
    index=pd.Index(class_names, name="index"),
)
st.markdown("Probabilities for each class")
st.bar_chart(chart_data)