Upload compAnIonv1.py
compAnIonv1.py  +154 -0
ADDED
@@ -0,0 +1,154 @@
# compAnIonv1.py
# Set up the environment for Spark
import os
#os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
#os.environ["SPARK_HOME"] = '/home/ubuntu/spark-3.5.1-bin-hadoop3'

# Import pandas
import pandas as pd

# Install spark-nlp
#!pip install spark-nlp
#import sparknlp
#from sparknlp.base import *
#from sparknlp.annotator import *
#from sparknlp.common import *
#from pyspark.sql.functions import *
#from pyspark.sql.functions import lit
#from pyspark.sql.window import Window
#from pyspark.sql.types import *
#from pyspark.ml.classification import LogisticRegression
#from pyspark.ml.evaluation import BinaryClassificationEvaluator
#from pyspark.mllib.evaluation import MulticlassMetrics
#from pyspark.ml import Pipeline
#from pyspark.ml.feature import StandardScaler, VectorAssembler, Imputer, OneHotEncoder, StringIndexer
#from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel, TrainValidationSplit, TrainValidationSplitModel
#from pyspark.ml.evaluation import MulticlassClassificationEvaluator
#from pyspark.ml.linalg import Vectors, VectorUDT
#import pyspark.pandas as ps

# Import TensorFlow and the BERT tokenizer/model from Hugging Face transformers
import tensorflow as tf
from transformers import BertTokenizer
from transformers import TFBertModel

MAX_SEQUENCE_LENGTH = 400

def create_bert_classification_model(bert_model,
                                     num_train_layers=0,
                                     max_sequence_length=MAX_SEQUENCE_LENGTH,
                                     num_filters=[100, 100, 50, 25],
                                     kernel_sizes=[3, 4, 5, 10],
                                     hidden_size=200,
                                     hidden2_size=100,
                                     dropout=0.1,
                                     learning_rate=0.001,
                                     label_smoothing=0.03
                                     ):
    """
    Build a binary classification model on top of a pretrained BERT model.
    The pooler output, CLS token, and mean-pooled embeddings are extracted
    for reference, but classification runs a CNN (Conv1D filters of several
    kernel sizes, max-pooled and concatenated) over the full token embedding
    sequence, followed by two dense layers and a sigmoid output.
    """
    if num_train_layers == 0:
        # Freeze all layers of the pre-trained BERT model
        bert_model.trainable = False

    elif num_train_layers == 12:
        # Train all layers of the BERT model
        bert_model.trainable = True

    else:
        # Restrict training to the num_train_layers outermost transformer
        # layers. Hugging Face's TF BERT names its encoder blocks
        # 'layer_._0' through 'layer_._11', so the suffixes '_11', '_10', ...
        # select the outermost layers.
        retrain_layers = []

        for retrain_layer_number in range(num_train_layers):
            layer_code = '_' + str(11 - retrain_layer_number)
            retrain_layers.append(layer_code)

        #print('retrain layers: ', retrain_layers)

        # Freeze every weight whose name does not match a retained layer.
        # Note: this flips the private _trainable flag on individual weights.
        for w in bert_model.weights:
            if not any([x in w.name for x in retrain_layers]):
                #print('freezing: ', w)
                w._trainable = False

    # The three standard BERT inputs, padded/truncated to max_sequence_length
    input_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='input_ids')
    token_type_ids = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='token_type_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_sequence_length,), dtype=tf.int64, name='attention_mask')

    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}

    bert_out = bert_model(bert_inputs)

    # Candidate sentence representations; only cnn_token (the full token
    # embedding sequence) feeds the CNN head below.
    pooler_token = bert_out[1]
    cls_token = bert_out[0][:, 0, :]
    bert_out_avg = tf.math.reduce_mean(bert_out[0], axis=1)
    cnn_token = bert_out[0]

    # CNN head: one Conv1D + global max-pool per kernel size, concatenated
    conv_layers_for_all_kernel_sizes = []
    for kernel_size, filters in zip(kernel_sizes, num_filters):
        conv_layer = tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, activation='relu')(cnn_token)
        conv_layer = tf.keras.layers.GlobalMaxPooling1D()(conv_layer)
        conv_layers_for_all_kernel_sizes.append(conv_layer)

    conv_output = tf.keras.layers.concatenate(conv_layers_for_all_kernel_sizes, axis=1)

    # Classification layers
    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(conv_output)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    hidden = tf.keras.layers.Dense(hidden2_size, activation='relu', name='hidden_layer2')(hidden)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    classification = tf.keras.layers.Dense(1, activation='sigmoid', name='classification_layer')(hidden)

    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])

    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 # Loss function: focal loss down-weights easy
                                 # examples, which helps with class imbalance
                                 loss=tf.keras.losses.BinaryFocalCrossentropy(
                                     gamma=2.0, from_logits=False, apply_class_balancing=True, label_smoothing=label_smoothing
                                 ),
                                 # Metric functions
                                 metrics=['accuracy']
                                 )
    return classification_model
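
# Usage sketch (hypothetical; kept commented out so importing this module has
# no side effects): unfreeze the four outermost transformer layers instead of
# freezing all of BERT.
#   model = create_bert_classification_model(bert_model, num_train_layers=4)
#   model.summary()
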
# Map a sigmoid probability to a hard 0/1 label
f_one_or_zero = lambda x: 1 if x > 0.5 else 0

def run_inference_model(conversations):
    # Tokenize the conversations with the BERT tokenizer
    tokenized_input = tokenizer(conversations,
                                max_length=MAX_SEQUENCE_LENGTH,
                                truncation=True,
                                padding='max_length',
                                return_tensors='tf')
    bert_inputs = [tokenized_input.input_ids,
                   tokenized_input.token_type_ids,
                   tokenized_input.attention_mask]

    # Apply the model to the tokenized conversations. predict returns an
    # array of shape (batch_size, 1), so threshold each score individually;
    # thresholding the whole array at once raises a ValueError for batches.
    y_pred = inference_model.predict(bert_inputs)
    predictions = [f_one_or_zero(float(score)) for score in y_pred.ravel()]
    return predictions[0] if len(predictions) == 1 else predictions

model_checkpoint = "bert-base-uncased"
# Step 1: Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained(model_checkpoint)
# Step 2: Load the pretrained BERT model
bert_model = TFBertModel.from_pretrained(model_checkpoint)
# Step 3: Create the custom classification model on top of the pretrained model
inference_model = create_bert_classification_model(bert_model=bert_model)
# Step 4: Load the saved fine-tuned weights into the inference model
save_path = 'bert_cnn_ensemble_resample_uncased_mdl.h5'
inference_model.load_weights(save_path)
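
# End-to-end sketch. Assumptions: the weights file loaded above is present
# locally, and the sample conversation is a made-up placeholder rather than
# data from any dataset.
if __name__ == "__main__":
    sample_conversation = "I've been feeling really overwhelmed lately and I don't know who to talk to."
    # run_inference_model returns 1 if the sigmoid score exceeds 0.5, else 0
    print("Predicted label:", run_inference_model([sample_conversation]))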