Commit 8d358a1
planetearth79 committed
Parent(s): b982018
Upload 14 files
Files changed:
- app.py +107 -0
- multi_class_model/config.json +48 -0
- multi_class_model/pytorch_model.bin +3 -0
- multi_class_model/special_tokens_map.json +1 -0
- multi_class_model/tokenizer.json +0 -0
- multi_class_model/tokenizer_config.json +1 -0
- multi_class_model/trainer_state.json +46 -0
- multi_class_model/vocab.txt +0 -0
- multi_label_model/config.json +48 -0
- multi_label_model/pytorch_model.bin +3 -0
- multi_label_model/special_tokens_map.json +1 -0
- multi_label_model/tokenizer.json +0 -0
- multi_label_model/tokenizer_config.json +1 -0
- multi_label_model/vocab.txt +0 -0
app.py
ADDED
@@ -0,0 +1,107 @@
import streamlit as st
from transformers import AutoModelForSequenceClassification
from transformers import AutoTokenizer
import torch
import pandas as pd
import numpy as np


st.markdown("# Arxiv Papers Classifier")
st.markdown("<img width=200px src='https://blog.arxiv.org/files/2021/02/arxiv-logo.svg'>", unsafe_allow_html=True)

st.markdown("After processing and filtering the dataset, each paper is left with one or more of 9 classes:")
st.markdown("""
1) ai

2) cs

3) cv

4) lg

5) math

6) ml

7) phys

8) q-bio

9) stat

""")

# Mapping from model output index to arXiv category label.
id2label = {
    0: "ai",
    1: "cs",
    2: "cv",
    3: "lg",
    4: "math",
    5: "ml",
    6: "phys",
    7: "q-bio",
    8: "stat"
}

title_text = st.text_input("ENTER TITLE HERE")
summary_text = st.text_area("ENTER SUMMARY HERE")
text = title_text + " " + summary_text


# 1. Multi-class model: exactly one category per paper (softmax over logits).
@st.cache_resource
def load_multi_class_model():
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_class_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_class_model")
    return loaded_tokenizer, loaded_model

tokenizer_1, model_1 = load_multi_class_model()

st.markdown("## multi-class classification")

text_input = tokenizer_1(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_1(**text_input)
text_probs = torch.softmax(text_res.logits, dim=1).cpu().numpy()[0]

# Keep the smallest set of classes whose cumulative probability reaches 95%
# (including the class that crosses the threshold, so the set is never empty).
order = np.argsort(text_probs)[::-1]
ordered_text_probs = text_probs[order]
top_k = np.searchsorted(np.cumsum(ordered_text_probs), 0.95) + 1
idxs = order[:top_k]
st.markdown("Top-95% classes: " + ", ".join([id2label[i] for i in idxs]))

chart_data = pd.DataFrame(
    text_probs,
    columns=['class probability'])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.bar_chart(chart_data)


# 2. Multi-label model: an independent probability per category (sigmoid over logits).
@st.cache_resource
def load_multi_label_model():
    loaded_tokenizer = AutoTokenizer.from_pretrained("multi_label_model")
    loaded_model = AutoModelForSequenceClassification.from_pretrained("multi_label_model")
    return loaded_tokenizer, loaded_model

tokenizer_2, model_2 = load_multi_label_model()

st.markdown("## multi-label classification")

text_input = tokenizer_2(text, padding="max_length", truncation=True, return_tensors='pt')
with torch.no_grad():
    text_res = model_2(**text_input)

text_probs = torch.sigmoid(text_res.logits).cpu().numpy()[0]

probs = np.stack([text_probs, 1 - text_probs], axis=1)

chart_data = pd.DataFrame(
    probs,
    columns=['belong', "not belong"])
chart_data["index"] = np.array(list(id2label.values()))
chart_data = chart_data.set_index("index")
st.markdown("Probabilities for each class")
st.bar_chart(chart_data)
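The "Top-95%" readout above keeps the smallest prefix of classes (sorted by probability) whose cumulative probability reaches 0.95. A minimal standalone sketch of that cutoff, on a made-up probability vector rather than real model output:

import numpy as np

# Hypothetical softmax output over the 9 classes (sums to 1).
probs = np.array([0.02, 0.55, 0.01, 0.25, 0.02, 0.08, 0.03, 0.01, 0.03])
order = np.argsort(probs)[::-1]          # class indices, most probable first
cum = np.cumsum(probs[order])            # running probability mass
top_k = np.searchsorted(cum, 0.95) + 1   # smallest prefix reaching 95%
print(order[:top_k])                     # indices of the classes kept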
multi_class_model/config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "single_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.14.0",
  "vocab_size": 28996
}
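Note that this config keeps the generic LABEL_0..LABEL_8 names, which is why app.py carries its own id2label dict. A minimal sketch to confirm, assuming the multi_class_model directory from this commit is available locally:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("multi_class_model")
print(config.problem_type)  # "single_label_classification"
print(config.id2label[0])   # "LABEL_0", not "ai"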
multi_class_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:be39938c1ff4164d67f61b50aec81d799e07f297de6d627e0701dee0f2594aa5
size 263193713
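This is a Git LFS pointer, not the weights themselves; the actual ~263 MB file is fetched separately by LFS. A minimal sketch, assuming a local clone with the LFS object downloaded, to check the file against the sha256 recorded above:

import hashlib

expected = "be39938c1ff4164d67f61b50aec81d799e07f297de6d627e0701dee0f2594aa5"
h = hashlib.sha256()
with open("multi_class_model/pytorch_model.bin", "rb") as f:
    for chunk in iter(lambda: f.read(1 << 20), b""):
        h.update(chunk)
assert h.hexdigest() == expected, "downloaded weights do not match the LFS pointer"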
multi_class_model/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
multi_class_model/tokenizer.json
ADDED
The diff for this file is too large to render.
multi_class_model/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilbert-base-cased", "tokenizer_class": "DistilBertTokenizer"}
multi_class_model/trainer_state.json
ADDED
@@ -0,0 +1,46 @@
{
  "best_metric": 1.0723919868469238,
  "best_model_checkpoint": "clf_trainer/checkpoint-6560",
  "epoch": 2.0,
  "global_step": 6560,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "learning_rate": 4.5e-05,
      "loss": 1.1784,
      "step": 3280
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.6109756097560975,
      "eval_loss": 1.100429654121399,
      "eval_runtime": 43.9741,
      "eval_samples_per_second": 186.474,
      "eval_steps_per_second": 18.647,
      "step": 3280
    },
    {
      "epoch": 2.0,
      "learning_rate": 4e-05,
      "loss": 1.0022,
      "step": 6560
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.6201219512195122,
      "eval_loss": 1.0723919868469238,
      "eval_runtime": 43.854,
      "eval_samples_per_second": 186.984,
      "eval_steps_per_second": 18.698,
      "step": 6560
    }
  ],
  "max_steps": 32800,
  "num_train_epochs": 10,
  "total_flos": 8690946151219200.0,
  "trial_name": null,
  "trial_params": null
}
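The trainer state shows training stopped after 2 of the planned 10 epochs, with eval accuracy improving from about 0.611 to 0.620. A minimal sketch for pulling the eval curve out of this file, assuming a local clone:

import json

with open("multi_class_model/trainer_state.json") as f:
    state = json.load(f)

for entry in state["log_history"]:
    if "eval_accuracy" in entry:
        print(entry["epoch"], entry["eval_accuracy"], entry["eval_loss"])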
multi_class_model/vocab.txt
ADDED
The diff for this file is too large to render.
multi_label_model/config.json
ADDED
@@ -0,0 +1,48 @@
{
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "ai",
    "1": "cs",
    "2": "cv",
    "3": "lg",
    "4": "math",
    "5": "ml",
    "6": "phys",
    "7": "q-bio",
    "8": "stat"
  },
  "initializer_range": 0.02,
  "label2id": {
    "ai": 0,
    "cs": 1,
    "cv": 2,
    "lg": 3,
    "math": 4,
    "ml": 5,
    "phys": 6,
    "q-bio": 7,
    "stat": 8
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "problem_type": "multi_label_classification",
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.14.0",
  "vocab_size": 28996
}
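Unlike the multi-class config, this one stores the real category names, so the mapping hardcoded in app.py could come straight from the model. A minimal sketch, assuming the multi_label_model directory from this commit is available locally:

from transformers import AutoConfig

config = AutoConfig.from_pretrained("multi_label_model")
print(config.id2label)  # {0: 'ai', 1: 'cs', ..., 8: 'stat'}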
multi_label_model/pytorch_model.bin
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0252ffb53aac0c0bf85efe4253b68e1756ab2241ef152c28384256cd4a856828
size 263193713
multi_label_model/special_tokens_map.json
ADDED
@@ -0,0 +1 @@
{"unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]"}
multi_label_model/tokenizer.json
ADDED
The diff for this file is too large to render.
multi_label_model/tokenizer_config.json
ADDED
@@ -0,0 +1 @@
{"do_lower_case": false, "unk_token": "[UNK]", "sep_token": "[SEP]", "pad_token": "[PAD]", "cls_token": "[CLS]", "mask_token": "[MASK]", "tokenize_chinese_chars": true, "strip_accents": null, "model_max_length": 512, "special_tokens_map_file": null, "name_or_path": "distilbert-base-cased", "tokenizer_class": "DistilBertTokenizer"}
multi_label_model/vocab.txt
ADDED
The diff for this file is too large to render.