dnzblgn committed on
Commit
041750b
1 Parent(s): e18ff00

Upload Text_classification_with_BERT_embeddings.py

Text_classification_with_BERT_embeddings.py ADDED
#!/usr/bin/env python
# coding: utf-8

# In[13]:


# Loading the libraries.
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Loading the CSV file of precomputed BERT embeddings into a pandas DataFrame.
file_path = 'bert_embeddings_tweets.csv'
data = pd.read_csv(file_path)

# Parsing the "embeddings" column (a bracketed string of floats) into a float NumPy array.
# np.fromstring is deprecated for text parsing, so the string is split instead.
embeddings = np.array([np.array(x.strip('[]').split(), dtype=float) for x in data['embeddings']])

# Normalizing each embedding dimension to zero mean and unit variance.
# The statistics are saved so the same normalization can be applied at inference time.
emb_mean = np.mean(embeddings, axis=0)
emb_std = np.std(embeddings, axis=0)
embeddings = (embeddings - emb_mean) / emb_std

# Converting the "choose_one" column to numeric labels: "Relevant" -> 0, "Not Relevant" -> 1.
labels = np.array([0 if x == "Relevant" else 1 for x in data['choose_one']])
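
# (Added sanity check, not part of the original upload.) Confirms the parsed
# shapes, assuming 768-dimensional bert-base embeddings:
print("embeddings shape:", embeddings.shape)  # expected (num_tweets, 768)
print("label counts:", np.bincount(labels))   # counts of 0 ("Relevant") and 1 ("Not Relevant")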

# Splitting the data into training (80%), validation (10%), and test (10%) sets.
x_train, x_val_test, y_train, y_val_test = train_test_split(embeddings, labels, test_size=0.2)
x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, test_size=0.5)

# Defining a sequential Keras model: one hidden Dense layer with 8 units and ReLU
# activation, a Dropout layer, and a single-unit sigmoid output for binary classification.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(8, activation='relu', input_dim=embeddings.shape[1]),
    tf.keras.layers.Dropout(0.5),  # randomly drops half the hidden units during training to reduce overfitting
    tf.keras.layers.Dense(1, activation='sigmoid')  # outputs a probability, suitable for binary classification
])

# Compiling the model with binary cross-entropy loss and the Adam optimizer.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
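
# (Added check, not part of the original upload.) Printing layer shapes and
# parameter counts confirms the architecture matches the 768-dimensional input:
model.summary()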

# Training the model for up to 10 epochs with early stopping.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)  # stop if validation loss doesn't improve for 5 epochs
history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=8, callbacks=[early_stop])
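
# (Added note, not part of the original upload.) history.history holds per-epoch
# metrics; comparing train and validation loss helps spot overfitting:
for epoch, (tl, vl) in enumerate(zip(history.history['loss'], history.history['val_loss']), start=1):
    print(f"epoch {epoch}: train loss {tl:.4f}, val loss {vl:.4f}")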

# Evaluating the model on the test, training, and validation sets.
test_loss, test_acc = model.evaluate(x_test, y_test)
train_loss, train_acc = model.evaluate(x_train, y_train)
val_loss, val_acc = model.evaluate(x_val, y_val)  # the validation split, not the combined val+test data
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
print("Training Accuracy:", train_acc)
print("Validation Accuracy:", val_acc)


# In[40]:


# TRY THE MODEL

# NOTE: the original upload referenced an undefined `tokenizer` and fed padded
# token IDs straight into the classifier, which was trained on 768-dimensional
# BERT embeddings (768 is the embedding width, not a sequence length). The
# version below embeds the sentence with BERT first. The upload does not show
# how the CSV embeddings were produced, so the bert-base-uncased checkpoint and
# mean pooling here are assumptions.
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Preprocess the input sentence
text = input("Write a sentence: ")
encoded = tokenizer(
    text,
    add_special_tokens=True,
    max_length=128,  # truncate long inputs; a single sentence needs no padding
    truncation=True,
    return_tensors="tf",
)

# Embed the sentence with BERT and mean-pool the token vectors into a single
# 768-dimensional vector, matching the input the classifier was trained on.
outputs = bert_model(encoded["input_ids"], attention_mask=encoded["attention_mask"])
embedding = tf.reduce_mean(outputs.last_hidden_state, axis=1).numpy()

# Apply the same normalization statistics computed on the training embeddings.
embedding = (embedding - emb_mean) / emb_std

# Make the prediction
prediction = model.predict(embedding)[0][0]

# Convert the sigmoid probability to a label: 0 was "Relevant", 1 was "Not Relevant".
label = "Relevant" if prediction < 0.5 else "Not Relevant"

print("Input Sentence:", text)
print("Prediction:", label)


# In[ ]:




# In[ ]: