dnzblgn
/

BERT_Text_Classification

TF-Keras

Model card Files Files and versions Community

dnzblgn committed on Jun 23, 2023

Commit

041750b

•

1 Parent(s): e18ff00

Upload Text_classification_with_BERT_embeddings.py

Browse files

Files changed (1) hide show

Text_classification_with_BERT_embeddings.py +102 -0

Text_classification_with_BERT_embeddings.py ADDED Viewed

	@@ -0,0 +1,102 @@

+#!/usr/bin/env python
+# coding: utf-8
+# In[13]:
+# Loading the libraries.
+import pandas as pd
+import numpy as np
+import tensorflow as tf
+from sklearn.model_selection import train_test_split
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+# Loading the new CSV file that includes BERT embeddings into a pandas dataframe.
+file_path = 'bert_embeddings_tweets.csv'
+data = pd.read_csv(file_path)
+# Converting the "embeddings" column to a numeric float format to numpy array
+embeddings = np.array([np.fromstring(x.strip('[]'), dtype=float, sep=' ') for x in data['embeddings']])
+# Normalizing the embeddings column by subtracting the mean of the embeddings column from each value and then dividing by the standard deviation of the embeddings column.
+embeddings = (embeddings - np.mean(embeddings, axis=0)) / np.std(embeddings, axis=0)
+# Converting the "choose_one" column to a numeric with "Relevant" as 0 and "Not Relevant" as 1.
+labels = np.array([0 if x == "Relevant" else 1 for x in data['choose_one']])
+# Spliting the data into training, validation, and test sets
+x_train, x_val_test, y_train, y_val_test = train_test_split(embeddings, labels, test_size=0.2)
+x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, test_size=0.5)
+# Defining a sequential Keras model with one input layer, one hidden layer with 8 units and relu activation function
+model = tf.keras.Sequential([
+    tf.keras.layers.Dense(8, activation='relu', input_dim=embeddings.shape[1]),
+    tf.keras.layers.Dropout(0.5), # adding Dropout layer to the model to prevent overfitting by randomly dropping out a fraction during training
+    tf.keras.layers.Dense(1, activation='sigmoid')#adding single dense layer with 1 neuron and sigmoid activation function which is useful for binary classification output.
+])
+# Compiling the model
+model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
+# Training the model for 10 epochs with applying early stopping.
+early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5) #using an early stopping callback to stop the model training if the validation loss doesn't improve for 5 epochs.
+history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=10, batch_size=8, callbacks=[early_stop])#training the model using an epoch 10 and batch size of 8, and adding the early stopping callback.
+# Evaluating the model
+test_loss, test_acc = model.evaluate(x_test, y_test)
+train_loss, train_acc = model.evaluate(x_train, y_train)
+val_loss, val_acc = model.evaluate(x_val_test, y_val_test)
+print("Test Loss:", test_loss)
+print("Test Accuracy:", test_acc)
+print("Training Accuracy:", train_acc)
+print("Validation Accuracy:", val_acc)
+# In[40]:
+# TRY THE MODEL
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+# Preprocess the input sentence
+input_sentence = input("Write a sentence: ")
+input_sentence = tokenizer.encode_plus(
+    input_sentence,
+    add_special_tokens=True,
+    max_length=768,
+    padding="longest",
+    truncation=True,
+    return_attention_mask=True,
+    return_tensors="tf",
+)
+# Pad the input sequence
+input_ids = pad_sequences(
+    input_sentence["input_ids"],
+    maxlen=768,
+    dtype="float32",
+    value=0,
+    truncating="post",
+    padding="post",
+)
+# Make the prediction
+prediction = model.predict(input_ids)[0][0]
+# Convert the prediction to a label
+label = "Relevant" if prediction == 0 else "Not Relevant"
+print("Input Sentence:", input_sentence)
+print("Prediction:", label)
+# In[ ]:
+# In[ ]: