AleksBlacky commited on
Commit
8cf1f84
β€’
1 Parent(s): de73359

added second model

Browse files
__pycache__/model.cpython-39.pyc CHANGED
Binary files a/__pycache__/model.cpython-39.pyc and b/__pycache__/model.cpython-39.pyc differ
 
app.py CHANGED
@@ -1,13 +1,12 @@
1
  import streamlit as st
2
  from pandas import DataFrame
3
  import seaborn as sns
4
- from model import ArxivClassifierModel
5
 
6
  st.markdown("# Hello, friend!")
7
  st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
8
 
9
- # st.write("Loading model")
10
- model = ArxivClassifierModel()
11
 
12
  with st.form(key="my_form"):
13
  st.markdown("### 🎈 Do you want a little magic? ")
@@ -63,24 +62,24 @@ abstract = doc_abstract
63
  # except ValueError:
64
  # st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
65
 
66
- predicts = model.make_predict(title + abstract)
67
 
68
  st.markdown("## 🎈 Yor article probably about: ")
69
  st.header("")
70
 
71
  df = (
72
- DataFrame(predicts.items(), columns=["Topic", "Prob"])
73
  .sort_values(by="Prob", ascending=False)
74
  .reset_index(drop=True)
75
  )
76
  df.index += 1
77
 
78
  df2 = (
79
- DataFrame(predicts.items(), columns=["Topic", "Prob"])
80
  .sort_values(by="Prob", ascending=False)
81
  .reset_index(drop=True)
82
  )
83
- # df2.index += 1
84
 
85
  # Add styling
86
  cmGreen = sns.light_palette("green", as_cmap=True)
@@ -91,6 +90,12 @@ df = df.style.background_gradient(
91
  "Prob",
92
  ],
93
  )
 
 
 
 
 
 
94
 
95
  c1, c2, c3 = st.columns([1, 3, 1])
96
 
@@ -99,10 +104,10 @@ format_dictionary = {
99
  }
100
 
101
  df = df.format(format_dictionary)
102
- df2 = df.format(format_dictionary)
103
 
104
  with c2:
105
  st.markdown("#### We suppose your research about: ")
106
- st.table(df)
107
- st.markdown("##### More detailed, it's about topic: ")
108
  st.table(df2)
 
 
 
1
  import streamlit as st
2
  from pandas import DataFrame
3
  import seaborn as sns
4
+ from model import ArxivClassifierModel, ArxivClassifierModelsPipeline
5
 
6
  st.markdown("# Hello, friend!")
7
  st.markdown(" This magic application going to help you with understanding of science paper topic! Cool? Yeah! ")
8
 
9
+ model = ArxivClassifierModelsPipeline()
 
10
 
11
  with st.form(key="my_form"):
12
  st.markdown("### 🎈 Do you want a little magic? ")
 
62
  # except ValueError:
63
  # st.error("Word parsing into tokens went wrong! Is input valid? If yes, pls contact author alekseystepin13@gmail.com")
64
 
65
+ preds_topic, preds_maintopic = model.make_predict(title + abstract)
66
 
67
  st.markdown("## 🎈 Yor article probably about: ")
68
  st.header("")
69
 
70
  df = (
71
+ DataFrame(preds_topic.items(), columns=["Topic", "Prob"])
72
  .sort_values(by="Prob", ascending=False)
73
  .reset_index(drop=True)
74
  )
75
  df.index += 1
76
 
77
  df2 = (
78
+ DataFrame(preds_maintopic.items(), columns=["Topic", "Prob"])
79
  .sort_values(by="Prob", ascending=False)
80
  .reset_index(drop=True)
81
  )
82
+ df2.index += 1
83
 
84
  # Add styling
85
  cmGreen = sns.light_palette("green", as_cmap=True)
 
90
  "Prob",
91
  ],
92
  )
93
+ df2 = df2.style.background_gradient(
94
+ cmap=cmGreen,
95
+ subset=[
96
+ "Prob",
97
+ ],
98
+ )
99
 
100
  c1, c2, c3 = st.columns([1, 3, 1])
101
 
 
104
  }
105
 
106
  df = df.format(format_dictionary)
107
+ df2 = df2.format(format_dictionary)
108
 
109
  with c2:
110
  st.markdown("#### We suppose your research about: ")
 
 
111
  st.table(df2)
112
+ st.markdown("##### More detailed, it's about topic: ")
113
+ st.table(df)
model.py CHANGED
@@ -29,4 +29,55 @@ class ArxivClassifierModel():
29
  @st.cache(suppress_st_warning=True)
30
  def __load_model(self):
31
  st.write("Loading big model")
32
- return AutoModelForSequenceClassification.from_pretrained("models/scibert/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  @st.cache(suppress_st_warning=True)
30
  def __load_model(self):
31
  st.write("Loading big model")
32
+ return AutoModelForSequenceClassification.from_pretrained("models/scibert/")
33
+
34
+
35
+
36
class ArxivClassifierModelsPipeline():
    """Two-model classification pipeline for arXiv papers.

    Combines a fine-tuned SciBERT fine-grained topic classifier with a
    DistilBERT main-topic classifier. Each model has its own tokenizer
    (loaded from the hub checkpoint it was fine-tuned from, keeping the
    vocabulary consistent) and its own pickled index->label decode dict.
    """

    # Labels whose softmax probability does not exceed this cutoff are
    # dropped from the returned dicts (same hard-coded 0.1 as before,
    # now named once instead of repeated in two loops).
    PROB_THRESHOLD = 0.1

    def __init__(self):
        self.model_topic_clf = self.__load_topic_clf()
        self.model_maintopic_clf = self.__load_maintopic_clf()

        topic_clf_default_model = "allenai/scibert_scivocab_uncased"
        self.topic_tokenizer = AutoTokenizer.from_pretrained(topic_clf_default_model)

        maintopic_clf_default_model = "Wi/arxiv-topics-distilbert-base-cased"
        self.maintopic_tokenizer = AutoTokenizer.from_pretrained(maintopic_clf_default_model)

        with open('models/scibert/decode_dict_topic.pkl', 'rb') as f:
            self.decode_dict_topic = pickle.load(f)

        with open('models/maintopic_clf/decode_dict_maintopic.pkl', 'rb') as f:
            self.decode_dict_maintopic = pickle.load(f)

    def make_predict(self, text):
        """Classify *text* with both models.

        Returns a ``(topic_probs, maintopic_probs)`` tuple of dicts mapping
        decoded label -> probability, keeping only labels above
        ``PROB_THRESHOLD``. Note either dict may be empty when no class
        clears the cutoff.
        """
        topic_probs = self.__predict(
            self.topic_tokenizer, self.model_topic_clf,
            self.decode_dict_topic, text,
        )
        maintopic_probs = self.__predict(
            self.maintopic_tokenizer, self.model_maintopic_clf,
            self.decode_dict_maintopic, text,
        )
        return topic_probs, maintopic_probs

    def __predict(self, tokenizer, model, decode_dict, text):
        """Run one model on *text* and decode its above-threshold classes.

        Shared helper for the two previously duplicated prediction paths.
        """
        tokens = tokenizer(text, return_tensors="pt")
        outs = model(tokens.input_ids)
        probs = outs["logits"].softmax(dim=-1).tolist()[0]
        return {
            decode_dict[i]: p
            for i, p in enumerate(probs)
            if p > self.PROB_THRESHOLD
        }

    # NOTE(review): st.cache on instance methods hashes `self` on every
    # call — consider @st.experimental_singleton / module-level loaders.
    @st.cache(suppress_st_warning=True)
    def __load_topic_clf(self):
        """Load the fine-grained topic classifier from its local checkpoint."""
        st.write("Loading model")
        return AutoModelForSequenceClassification.from_pretrained("models/scibert/")

    @st.cache(suppress_st_warning=True)
    def __load_maintopic_clf(self):
        """Load the main-topic classifier from its local checkpoint."""
        st.write("Loading second model")
        return AutoModelForSequenceClassification.from_pretrained("models/maintopic_clf/")
models/maintopic_clf/config.json ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "Wi/arxiv-topics-distilbert-base-cased",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "Astrophysics",
13
+ "1": "Condensed Matter",
14
+ "2": "Computer Science",
15
+ "3": "Economics",
16
+ "4": "Electrical Engineering and Systems Science",
17
+ "5": "General Relativity and Quantum Cosmology",
18
+ "6": "High Energy Physics - Experiment",
19
+ "7": "High Energy Physics - Lattice",
20
+ "8": "High Energy Physics - Phenomenology",
21
+ "9": "High Energy Physics - Theory",
22
+ "10": "Mathematics",
23
+ "11": "Mathematical Physics",
24
+ "12": "Nonlinear Sciences",
25
+ "13": "Nuclear Experiment",
26
+ "14": "Nuclear Theory",
27
+ "15": "Physics",
28
+ "16": "Quantitative Biology",
29
+ "17": "Quantitative Finance",
30
+ "18": "Quantum Physics",
31
+ "19": "Statistics",
32
+ "20": "Other"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "label2id": {
36
+ "Astrophysics": 0,
37
+ "Computer Science": 2,
38
+ "Condensed Matter": 1,
39
+ "Economics": 3,
40
+ "Electrical Engineering and Systems Science": 4,
41
+ "General Relativity and Quantum Cosmology": 5,
42
+ "High Energy Physics - Experiment": 6,
43
+ "High Energy Physics - Lattice": 7,
44
+ "High Energy Physics - Phenomenology": 8,
45
+ "High Energy Physics - Theory": 9,
46
+ "Mathematical Physics": 11,
47
+ "Mathematics": 10,
48
+ "Nonlinear Sciences": 12,
49
+ "Nuclear Experiment": 13,
50
+ "Nuclear Theory": 14,
51
+ "Other": 20,
52
+ "Physics": 15,
53
+ "Quantitative Biology": 16,
54
+ "Quantitative Finance": 17,
55
+ "Quantum Physics": 18,
56
+ "Statistics": 19
57
+ },
58
+ "max_position_embeddings": 512,
59
+ "model_type": "distilbert",
60
+ "n_heads": 12,
61
+ "n_layers": 6,
62
+ "output_past": true,
63
+ "pad_token_id": 0,
64
+ "problem_type": "single_label_classification",
65
+ "qa_dropout": 0.1,
66
+ "seq_classif_dropout": 0.2,
67
+ "sinusoidal_pos_embds": false,
68
+ "tie_weights_": true,
69
+ "torch_dtype": "float32",
70
+ "transformers_version": "4.23.1",
71
+ "vocab_size": 28996
72
+ }
models/maintopic_clf/decode_dict_maintopic.pkl ADDED
Binary file (230 Bytes). View file
 
models/maintopic_clf/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:af3e1c904bab3e773dfabebc016952ab4aac12dd9e30db35272eb908b461eba9
3
+ size 263224881
models/scibert/{decode_dict.pkl β†’ decode_dict_topic.pkl} RENAMED
File without changes