Upload Text_classification_with_BERT_embeddings.py
Browse files
Text_classification_with_BERT_embeddings.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python
|
2 |
+
# coding: utf-8
|
3 |
+
|
4 |
+
# In[13]:
|
5 |
+
|
6 |
+
|
7 |
+
# Loading the libraries.
|
8 |
+
import pandas as pd
|
9 |
+
import numpy as np
|
10 |
+
import tensorflow as tf
|
11 |
+
from sklearn.model_selection import train_test_split
|
12 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
13 |
+
|
14 |
+
# Load the CSV that holds, per row, a pre-computed BERT embedding (column
# "embeddings") and a relevance annotation (column "choose_one").
file_path = 'bert_embeddings_tweets.csv'
data = pd.read_csv(file_path)

# Each "embeddings" cell is parsed as a bracketed, whitespace-separated list
# of floats; the parsed vectors are stacked into one array with one row per
# sample and one column per embedding dimension.
embeddings = np.array([np.fromstring(x.strip('[]'), dtype=float, sep=' ') for x in data['embeddings']])

# Standardise every embedding dimension (column) to zero mean and unit
# variance. A constant column has a standard deviation of exactly 0, which
# would turn the division into NaN/inf and poison training, so such columns
# are divided by 1 instead and simply end up as all zeros.
# NOTE(review): the statistics are computed over the whole dataset, i.e.
# before the train/validation/test split -- confirm this is acceptable.
feature_mean = np.mean(embeddings, axis=0)
feature_std = np.std(embeddings, axis=0)
feature_std = np.where(feature_std == 0, 1.0, feature_std)
embeddings = (embeddings - feature_mean) / feature_std

# Binary target: "Relevant" -> 0, every other "choose_one" value -> 1.
labels = np.array([0 if x == "Relevant" else 1 for x in data['choose_one']])
|
24 |
+
|
25 |
+
# Splitting the data: first hold out 20% of the samples, then cut that
# hold-out in half to obtain the validation and the test set (80/10/10).
x_train, x_val_test, y_train, y_val_test = train_test_split(
    embeddings,
    labels,
    test_size=0.2,
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test,
    y_val_test,
    test_size=0.5,
)
|
28 |
+
|
29 |
+
# Small feed-forward binary classifier built layer by layer:
#   Dense(8, relu) -> Dropout(0.5) -> Dense(1, sigmoid)
model = tf.keras.Sequential()

# Hidden layer; its input width equals the number of embedding dimensions.
model.add(tf.keras.layers.Dense(8, activation='relu', input_dim=embeddings.shape[1]))

# Dropout randomly zeroes half of the hidden activations during training to
# reduce overfitting.
model.add(tf.keras.layers.Dropout(0.5))

# Single sigmoid unit: outputs one value in [0, 1] for the binary decision.
model.add(tf.keras.layers.Dense(1, activation='sigmoid'))

# Compiling the model with the Adam optimizer and binary cross-entropy loss,
# tracking accuracy as the reported metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
|
37 |
+
|
38 |
+
# Training the model for at most 10 epochs in mini-batches of 8 samples.
# The early-stopping callback watches the validation loss and ends training
# once it has failed to improve for 5 consecutive epochs.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
)
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=8,
    callbacks=[early_stop],
)
|
41 |
+
|
42 |
+
# Evaluating the model on each split separately.
# The validation figures must come from the validation split itself
# (x_val / y_val): x_val_test / y_val_test is the combined validation + test
# hold-out, so evaluating on it would mix test samples into the
# "validation" score.
test_loss, test_acc = model.evaluate(x_test, y_test)
train_loss, train_acc = model.evaluate(x_train, y_train)
val_loss, val_acc = model.evaluate(x_val, y_val)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
print("Training Accuracy:", train_acc)
print("Validation Accuracy:", val_acc)
|
50 |
+
|
51 |
+
|
52 |
+
# In[40]:
|
53 |
+
|
54 |
+
|
55 |
+
# TRY THE MODEL
|
56 |
+
|
57 |
+
from tensorflow.keras.preprocessing.sequence import pad_sequences
|
58 |
+
|
59 |
+
# Preprocess the input sentence.
# NOTE(review): `tokenizer` is not defined anywhere in this script; it
# presumably survives from an earlier notebook session -- load it explicitly
# before running this cell.
# NOTE(review): the classifier above is fitted on standardised BERT
# embedding vectors, whereas this cell feeds it padded token ids. For a
# meaningful prediction the sentence should be embedded with the same BERT
# model and standardised with the training mean/std -- TODO confirm and fix
# once that embedding model is available here.
input_sentence = input("Write a sentence: ")

# The padded sequence must be exactly as wide as the model input, so derive
# the width from the training features instead of repeating a literal.
input_width = embeddings.shape[1]

# Keep the raw text in `input_sentence`; the tokenizer output gets its own
# name so the original sentence can still be echoed back at the end.
encoded_sentence = tokenizer.encode_plus(
    input_sentence,
    add_special_tokens=True,
    max_length=input_width,
    padding="longest",
    truncation=True,
    return_attention_mask=True,
    return_tensors="tf",
)

# Pad the input sequence with trailing zeros (or cut it at the end) so it
# matches the model's input width.
input_ids = pad_sequences(
    encoded_sentence["input_ids"],
    maxlen=input_width,
    dtype="float32",
    value=0,
    truncating="post",
    padding="post",
)

# Make the prediction: the sigmoid output is a score in [0, 1] for class 1
# ("Not Relevant").
prediction = model.predict(input_ids)[0][0]

# Convert the prediction to a label. A sigmoid output is practically never
# exactly 0, so compare against the 0.5 decision threshold rather than
# testing for equality with 0.
label = "Relevant" if prediction < 0.5 else "Not Relevant"

print("Input Sentence:", input_sentence)
print("Prediction:", label)
|
89 |
+
|
90 |
+
|
91 |
+
|
92 |
+
# In[ ]:
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
|
97 |
+
|
98 |
+
# In[ ]:
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|