MauriceV2021 committed on
Commit
6f70897
1 Parent(s): 00f161b

Upload 5 files

predict.py ADDED
@@ -0,0 +1,168 @@
# IMPORTS

import pandas as pd
import glob
import nltk
nltk.download("punkt")  # sentence tokenizer used by tokenize.sent_tokenize below
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA="..."  # DATA needs to be a list of texts

MODELS=".../"  # directory that contains the trained .h5 models

SAVE_PREDICTIONS_TO="..."


# PREPROCESS TEXTS

def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in a BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in a BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads the given sequences of IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for the given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# PREDICT

def float_to_percent(value, decimal=3):
    """Takes a float in the range 0. to 0.9... as input
    and converts it to a percentage with the specified number of decimal places.
    """
    return str(value*100)[:(decimal+3)]+"%"


def models_predict(directory, inputs, attention_masks, as_percent=False):
    """Loads separate .h5 models from a given directory.
    For predictions, inputs are expected to be:
    tensors of token IDs (BERT vocab) and tensors of attention masks.
    If as_percent is True, probabilities are converted to percentage
    strings with float_to_percent().
    Output is of the format:
    {'model/target N': [the probability of text N dealing with target N, ...], ...}
    """
    models=glob.glob(f"{directory}*.h5")
    predictions_dict={}
    for _ in models:
        model=load_model(_)
        predictions=model.predict_step([inputs, attention_masks])
        predictions=[float(_) for _ in predictions]
        if as_percent:
            predictions=[float_to_percent(_) for _ in predictions]
        predictions_dict[model.name]=predictions
        del predictions, model
    return predictions_dict


def predictions_dict_to_df(predictions_dictionary):
    """Converts models' predictions of the format:
    {'model/target N': [the probability of text N dealing with target N, ...], ...}
    to a dataframe of the format:
    | text N | the probability of text N dealing with target N | ... |
    """
    predictions_df=pd.DataFrame(predictions_dictionary)
    predictions_df.columns=[_.replace("model_", "").replace("_", ".") for _ in predictions_df.columns]
    predictions_df.insert(0, column="text", value=[_ for _ in range(len(predictions_df))])
    return predictions_df


def predictions_above_treshold(predictions_dataframe, treshold=0.95):
    """Filters predictions above the specified threshold.
    Input is expected to be a dataframe of the format:
    | text N | the probability of text N dealing with target N | ... |
    Output is of the format:
    {text N: [targets dealing with text N with probability > threshold, ...], ...}
    """
    above_treshold_dict={}
    above_treshold=predictions_dataframe.iloc[:,1:].apply(lambda row: row[row > treshold].index, axis=1)
    for _ in range(len(above_treshold)):
        above_treshold_dict[_]=list(above_treshold[_])
    return above_treshold_dict


# RUN

abstracts=DATA

ids=abstracts_to_ids(abstracts)

padded_ids=pad_ids(ids)

masks=create_attention_masks(padded_ids)

masks=convert_to_tensor(masks)

inputs=convert_to_tensor(padded_ids)

predictions=models_predict(MODELS, inputs, masks)

predictions_df=predictions_dict_to_df(predictions)

predictions_df.to_excel(SAVE_PREDICTIONS_TO+"/predictions.xlsx", index=False)
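Note: predictions_above_treshold() is defined above but never called in the RUN block. A minimal sketch of how it could be applied to the resulting dataframe (the 0.95 cut-off is the function's default; the print loop is only illustrative):

# Sketch: for every text, keep only the targets predicted with probability > 0.95.
above_treshold=predictions_above_treshold(predictions_df, treshold=0.95)
for text_idx, targets in above_treshold.items():
    print(f"text {text_idx}: {targets}")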
predict_in_batches.py ADDED
@@ -0,0 +1,186 @@
# IMPORTS

import pandas as pd
import glob
import nltk
nltk.download("punkt")  # sentence tokenizer used by tokenize.sent_tokenize below
from nltk import tokenize
from transformers import BertTokenizer, TFBertModel, BertConfig
from transformers.utils.dummy_tf_objects import TFBertMainLayer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import load_model, Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA="..."  # DATA needs to be a list of texts

MODELS=".../"  # directory that contains the trained .h5 models

SAVE_PREDICTIONS_TO="..."


# PREPROCESS TEXTS

def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in a BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in a BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads the given sequences of IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for the given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# PREDICT

def float_to_percent(value, decimal=3):
    """Takes a float in the range 0. to 0.9... as input
    and converts it to a percentage with the specified number of decimal places.
    """
    return str(value*100)[:(decimal+3)]+"%"


def models_predict(directory, inputs, attention_masks, as_percent=False):
    """Loads separate .h5 models from a given directory.
    For predictions, inputs are expected to be:
    tensors of token IDs (BERT vocab) and tensors of attention masks.
    If as_percent is True, probabilities are converted to percentage
    strings with float_to_percent().
    Output is of the format:
    {'model/target N': [the probability of text N dealing with target N, ...], ...}
    """
    models=glob.glob(f"{directory}*.h5")
    predictions_dict={}
    for _ in models:
        model=load_model(_)
        print(f"Model {_} is loaded.")
        predictions=model.predict_step([inputs, attention_masks])
        print(f"Predictions from the model {_} are finished.")
        predictions=[float(_) for _ in predictions]
        if as_percent:
            predictions=[float_to_percent(_) for _ in predictions]
        predictions_dict[model.name]=predictions
        print(f"Predictions from the model {_} are saved.")
        del predictions, model
    return predictions_dict


def predictions_dict_to_df(predictions_dictionary):
    """Converts models' predictions of the format:
    {'model/target N': [the probability of text N dealing with target N, ...], ...}
    to a dataframe of the format:
    | text N | the probability of text N dealing with target N | ... |
    """
    predictions_df=pd.DataFrame(predictions_dictionary)
    predictions_df.columns=[_.replace("model_", "").replace("_", ".") for _ in predictions_df.columns]
    predictions_df.insert(0, column="text", value=[_ for _ in range(len(predictions_df))])
    return predictions_df


def predictions_above_treshold(predictions_dataframe, treshold=0.95):
    """Filters predictions above the specified threshold.
    Input is expected to be a dataframe of the format:
    | text N | the probability of text N dealing with target N | ... |
    Output is of the format:
    {text N: [targets dealing with text N with probability > threshold, ...], ...}
    """
    above_treshold_dict={}
    above_treshold=predictions_dataframe.iloc[:,1:].apply(lambda row: row[row > treshold].index, axis=1)
    for _ in range(len(above_treshold)):
        above_treshold_dict[_]=list(above_treshold[_])
    return above_treshold_dict


# RUN

marks=[_ for _ in range(int(len(DATA)/100))]

output=pd.DataFrame()

# process the texts in batches of 100
for _ in marks:
    abstracts=DATA[_*100:(_+1)*100]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    output=output.append(predictions_df)
    del abstracts, predictions, predictions_df

# predict for the remaining texts (fewer than 100)
if len(DATA)%100!=0:
    rest_idx=len(marks)*100
    abstracts=DATA[rest_idx:]
    ids=abstracts_to_ids(abstracts)
    padded_ids=pad_ids(ids)
    masks=create_attention_masks(padded_ids)
    masks=convert_to_tensor(masks)
    inputs=convert_to_tensor(padded_ids)
    predictions=models_predict(MODELS, inputs, masks)
    predictions_df=predictions_dict_to_df(predictions)
    output=output.append(predictions_df)
    del abstracts, predictions, predictions_df


output.to_excel(SAVE_PREDICTIONS_TO+"/predictions.xlsx", index=False)
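A caveat on the batching loop above: DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. On newer pandas the same accumulation can be done by collecting the per-batch frames in a plain list and concatenating once; a minimal, self-contained sketch (the dummy frames stand in for the real per-batch predictions):

import pandas as pd

# Sketch for pandas >= 2.0: collect per-batch prediction frames, concatenate at the end.
batch_frames=[]
for batch_df in (pd.DataFrame({"text": [0, 1], "1.1": [0.2, 0.9]}),
                 pd.DataFrame({"text": [0, 1], "1.1": [0.7, 0.1]})):
    batch_frames.append(batch_df)   # list.append, not DataFrame.append
output=pd.concat(batch_frames, ignore_index=True)
print(output)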
train_multiclass_model.py ADDED
@@ -0,0 +1,193 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODEL_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts data to values.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in a BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in a BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads the given sequences of IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for the given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model():
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=16,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    target_layer=Dense(
        units=16,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="softmax")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="mbert_multiclass_16")
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="categorical_crossentropy",
        metrics=[CategoricalAccuracy(), Precision(), Recall()])
    return model


abstracts, labels=data_to_values(tab)
ids=abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to ids.")

padded_ids=pad_ids(ids)
print("Sequences padded.")

train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
print("Data split into train, validation, test sets.")

train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Attention masks created.")
train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Inputs converted to tensors.")
train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
print("Labels converted to tensors.")
train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
print("Masks converted to tensors.")


model=create_model()
print("Model initialized.")


history=model.fit([train_inputs, train_masks], train_labels,
                  batch_size=8,
                  epochs=2,
                  validation_data=([validation_inputs, validation_masks], validation_labels))


model.save(SAVE_MODEL_TO+"mbert_multiclass_16.h5")
print("Model saved.")

test_score=model.evaluate([test_inputs, test_masks], test_labels,
                          batch_size=8)

print("Model tested.")


stats=pd.DataFrame([test_score], columns=["loss", "accuracy", "precision", "recall"])
stats.to_excel(SAVE_MODEL_TO+"mbert_multiclass_16_stats.xlsx", index=False)

print("Stats saved.")
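One assumption worth spelling out: the model is compiled with categorical_crossentropy and CategoricalAccuracy, so the Label column read by data_to_values() must already hold 16-dimensional one-hot vectors. If it holds integer class ids instead, a conversion sketch (the numpy import and the dummy ids are illustrative, not part of the committed script):

import numpy as np

# one-hot encode integer class ids 0..15 into 16-dim targets
labels_int=np.array([0, 3, 15])                     # dummy class ids for illustration
labels_onehot=np.eye(16, dtype="float32")[labels_int]
print(labels_onehot.shape)                          # (3, 16)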
train_multilabel_model.py ADDED
@@ -0,0 +1,193 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import CategoricalAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODELS_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts data to values.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in a BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in a BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads the given sequences of IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for the given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model():
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=17,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    target_layer=Dense(
        units=17,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="aurora_sdg_mbert_multilabel")
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[Precision(), Recall()])
    return model


abstracts, labels=data_to_values(tab)
ids=abstracts_to_ids(abstracts)
print("Abstracts tokenized, tokens converted to ids.")

padded_ids=pad_ids(ids)
print("Sequences padded.")

train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
print("Data split into train, validation, test sets.")

train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Attention masks created.")
train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
print("Inputs converted to tensors.")
train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
print("Labels converted to tensors.")
train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
print("Masks converted to tensors.")


model=create_model()
print("Model initialized.")


history=model.fit([train_inputs, train_masks], train_labels,
                  batch_size=16,
                  epochs=4,
                  validation_data=([validation_inputs, validation_masks], validation_labels))


model.save(SAVE_MODELS_TO+"mbert_multilabel.h5")
print("Model saved.")

test_score=model.evaluate([test_inputs, test_masks], test_labels,
                          batch_size=8)

print("Model tested.")


stats=pd.DataFrame([test_score], columns=["loss", "precision", "recall"])
stats.to_excel(SAVE_MODELS_TO+"mbert_multilabel_stats.xlsx", index=False)

print("Stats saved.")
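Similarly, the multi-label head above has 17 sigmoid units trained with binary_crossentropy, so each entry of the Label column is assumed to be a 17-dimensional multi-hot vector. If the data instead stores lists of target indices per text, a conversion sketch (the helper name and dummy rows are illustrative, not part of the committed script):

import numpy as np

def to_multi_hot(index_lists, n_labels=17):
    # one row per text, 1.0 at every target index that applies to that text
    multi_hot=np.zeros((len(index_lists), n_labels), dtype="float32")
    for row, indices in enumerate(index_lists):
        multi_hot[row, indices]=1.0
    return multi_hot

print(to_multi_hot([[0, 4], [16]]))                 # dummy rows for illustration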
train_multilabel_models.py ADDED
@@ -0,0 +1,221 @@
# IMPORTS

import pandas as pd
import nltk
nltk.download("punkt")  # sentence tokenizer used by tokenize.sent_tokenize below
from nltk import tokenize
import time
from sklearn.model_selection import train_test_split
from transformers import BertConfig, BertTokenizer, TFBertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow import convert_to_tensor
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import BinaryAccuracy, Precision, Recall


# SET PARAMETERS

DATA_PATH="..."

SAVE_MODELS_TO=".../"


# READ DATA

tab=pd.read_hdf(DATA_PATH)


# SLICE DATA

def slice_data(dataframe, label):
    """Slices a dataframe of the structure:
    | text/abstract | label |
    Prepares data for binary classification
    training. For a given label, creates a new
    dataset where the number of items belonging
    to the given label equals the number of items
    randomly sampled from all the other labels.
    """
    label_data=dataframe[dataframe[label]==1]
    label_data_len=len(label_data)
    temp_data=dataframe.copy()[dataframe[label]!=1].sample(n=label_data_len)
    label_data=label_data[["Abstract", label]]
    label_data=label_data.append(temp_data[["Abstract", label]])
    label_data.columns=["Abstract", "Label"]
    return label_data


# PREPARE DATA FOR BERT

def data_to_values(dataframe):
    """Converts data to values.
    """
    abstracts=dataframe.Abstract.values
    labels=dataframe.Label.values
    return abstracts, labels


def tokenize_abstracts(abstracts):
    """For given texts, adds '[CLS]' and '[SEP]' tokens
    at the beginning and the end of each sentence, respectively.
    """
    t_abstracts=[]
    for abstract in abstracts:
        t_abstract="[CLS] "
        for sentence in tokenize.sent_tokenize(abstract):
            t_abstract=t_abstract + sentence + " [SEP] "
        t_abstracts.append(t_abstract)
    return t_abstracts


tokenizer=BertTokenizer.from_pretrained('bert-base-multilingual-uncased')


def b_tokenize_abstracts(t_abstracts, max_len=512):
    """Tokenizes sentences with the help
    of a 'bert-base-multilingual-uncased' tokenizer.
    """
    b_t_abstracts=[tokenizer.tokenize(_)[:max_len] for _ in t_abstracts]
    return b_t_abstracts


def convert_to_ids(b_t_abstracts):
    """Converts tokens to their specific
    IDs in a BERT vocabulary.
    """
    input_ids=[tokenizer.convert_tokens_to_ids(_) for _ in b_t_abstracts]
    return input_ids


def abstracts_to_ids(abstracts):
    """Tokenizes abstracts and converts
    tokens to their specific IDs
    in a BERT vocabulary.
    """
    tokenized_abstracts=tokenize_abstracts(abstracts)
    b_tokenized_abstracts=b_tokenize_abstracts(tokenized_abstracts)
    ids=convert_to_ids(b_tokenized_abstracts)
    return ids


def pad_ids(input_ids, max_len=512):
    """Pads the given sequences of IDs.
    """
    p_input_ids=pad_sequences(input_ids,
                              maxlen=max_len,
                              dtype="long",
                              truncating="post",
                              padding="post")
    return p_input_ids


def create_attention_masks(inputs):
    """Creates attention masks
    for the given sequences.
    """
    masks=[]
    for sequence in inputs:
        sequence_mask=[float(_>0) for _ in sequence]
        masks.append(sequence_mask)
    return masks


# CREATE MODEL

def create_model(label):
    config=BertConfig.from_pretrained(
        "bert-base-multilingual-uncased",
        num_labels=2,
        hidden_dropout_prob=0.2,
        attention_probs_dropout_prob=0.2)
    bert=TFBertModel.from_pretrained(
        "bert-base-multilingual-uncased",
        config=config)
    bert_layer=bert.layers[0]
    input_ids_layer=Input(
        shape=(512,),
        name="input_ids",
        dtype="int32")
    input_attention_masks_layer=Input(
        shape=(512,),
        name="attention_masks",
        dtype="int32")
    bert_model=bert_layer(
        input_ids_layer,
        input_attention_masks_layer)
    target_layer=Dense(
        units=1,
        kernel_initializer=TruncatedNormal(stddev=config.initializer_range),
        name="target_layer",
        activation="sigmoid")(bert_model[1])
    model=Model(
        inputs=[input_ids_layer, input_attention_masks_layer],
        outputs=target_layer,
        name="model_"+label.replace(".", "_"))
    optimizer=Adam(
        learning_rate=5e-05,
        epsilon=1e-08,
        decay=0.01,
        clipnorm=1.0)
    model.compile(
        optimizer=optimizer,
        loss="binary_crossentropy",
        metrics=[BinaryAccuracy(), Precision(), Recall()])
    return model


# THE LOOP

histories=[]
test_scores=[]
elapsed_times=[]

for _ in tab.columns[4:]:  # here you have to specify the index where the label columns start
    print(f"PROCESSING TARGET {_}...")
    start_time=time.process_time()
    data=slice_data(tab, _)
    print("Data sliced.")
    abstracts, labels=data_to_values(data)
    ids=abstracts_to_ids(abstracts)
    print("Abstracts tokenized, tokens converted to ids.")
    padded_ids=pad_ids(ids)
    print("Sequences padded.")
    train_inputs, temp_inputs, train_labels, temp_labels=train_test_split(padded_ids, labels, random_state=1993, test_size=0.3)
    validation_inputs, test_inputs, validation_labels, test_labels=train_test_split(temp_inputs, temp_labels, random_state=1993, test_size=0.5)
    print("Data split into train, validation, test sets.")
    train_masks, validation_masks, test_masks=[create_attention_masks(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Attention masks created.")
    train_inputs, validation_inputs, test_inputs=[convert_to_tensor(_) for _ in [train_inputs, validation_inputs, test_inputs]]
    print("Inputs converted to tensors.")
    train_labels, validation_labels, test_labels=[convert_to_tensor(_) for _ in [train_labels, validation_labels, test_labels]]
    print("Labels converted to tensors.")
    train_masks, validation_masks, test_masks=[convert_to_tensor(_) for _ in [train_masks, validation_masks, test_masks]]
    print("Masks converted to tensors.")
    model=create_model(_)
    print("Model initialized.")
    history=model.fit([train_inputs, train_masks], train_labels,
                      batch_size=3,
                      epochs=3,
                      validation_data=([validation_inputs, validation_masks], validation_labels))
    histories.append(history)
    print(f"Model for {_} target trained.")
    model.save(SAVE_MODELS_TO+_.replace(".", "_")+".h5")
    print(f"Model for target {_} saved.")
    test_score=model.evaluate([test_inputs, test_masks], test_labels,
                              batch_size=3)
    elapsed_times.append(time.process_time()-start_time)
    test_scores.append(test_score)
    print(f"""Model for target {_} tested.
    .
    .
    .""")


# SAVE STATISTICS

stats=pd.DataFrame(test_scores, columns=["loss", "accuracy", "precision", "recall"])
stats.insert(loc=0, column="target", value=tab.columns[4:])
stats.insert(loc=5, column="elapsed_time", value=elapsed_times)
stats.to_excel(SAVE_MODELS_TO+"_stats.xlsx", index=False)
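A final note on reusing the saved .h5 files: they contain a custom BERT layer, so loading them outside these scripts typically needs that layer to be resolvable, e.g. via custom_objects. A sketch under the assumption of a transformers v4 install; the import path and the file name below are illustrative and may differ per version and per training run:

from tensorflow.keras.models import load_model
from transformers.models.bert.modeling_tf_bert import TFBertMainLayer   # path varies by transformers version

# hypothetical file produced by the per-target loop above
model=load_model("model_1_1.h5",
                 custom_objects={"TFBertMainLayer": TFBertMainLayer})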