#!/usr/bin/env python
# coding: utf-8

# In[13]:


# Loading the libraries.
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Loading the CSV file that includes BERT embeddings into a pandas DataFrame.
file_path = 'bert_embeddings_tweets.csv'
data = pd.read_csv(file_path)

# Parsing the "embeddings" column from its string form into a float NumPy array.
# (np.fromstring is deprecated for text parsing, so the strings are split explicitly instead.)
embeddings = np.array([np.array(x.strip('[]').split(), dtype=float) for x in data['embeddings']])

# Standardizing the embeddings: subtracting the per-dimension mean and dividing by the
# per-dimension standard deviation. The statistics are kept so that new inputs can be
# standardized the same way at prediction time.
emb_mean = np.mean(embeddings, axis=0)
emb_std = np.std(embeddings, axis=0)
embeddings = (embeddings - emb_mean) / emb_std

# Converting the "choose_one" column to numeric labels: "Relevant" -> 0, "Not Relevant" -> 1.
labels = np.array([0 if x == "Relevant" else 1 for x in data['choose_one']])

# Splitting the data into training (80%), validation (10%), and test (10%) sets.
# A fixed random_state keeps the splits reproducible across runs.
x_train, x_val_test, y_train, y_val_test = train_test_split(embeddings, labels, test_size=0.2, random_state=42)
x_val, x_test, y_val, y_test = train_test_split(x_val_test, y_val_test, test_size=0.5, random_state=42)

# Defining a sequential Keras model: an input layer, one hidden layer with 8 units and a
# ReLU activation, dropout, and a sigmoid output for binary classification.
model = tf.keras.Sequential([
    tf.keras.Input(shape=(embeddings.shape[1],)),
    tf.keras.layers.Dense(8, activation='relu'),
    tf.keras.layers.Dropout(0.5),  # Dropout layer to reduce overfitting by randomly dropping half of the hidden units during training.
    tf.keras.layers.Dense(1, activation='sigmoid')  # Single neuron with sigmoid activation, suitable for binary classification output.
])

# Compiling the model.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Training the model for up to 10 epochs with early stopping.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)  # Stops training if the validation loss doesn't improve for 5 epochs.
history = model.fit(x_train, y_train, validation_data=(x_val, y_val),
                    epochs=10, batch_size=8, callbacks=[early_stop])

# Evaluating the model on the test, training, and validation sets.
test_loss, test_acc = model.evaluate(x_test, y_test)
train_loss, train_acc = model.evaluate(x_train, y_train)
val_loss, val_acc = model.evaluate(x_val, y_val)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
print("Training Accuracy:", train_acc)
print("Validation Accuracy:", val_acc)


# In[40]:


# TRY THE MODEL
# The classifier was trained on 768-dimensional sentence embeddings, so an input sentence has
# to be embedded the same way before prediction; raw token ids cannot be fed to the model
# directly. It is assumed here that the embeddings in the CSV are the [CLS] vectors of
# bert-base-uncased; if the file was built with a different model or pooling, use that instead.
from transformers import BertTokenizer, TFBertModel

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = TFBertModel.from_pretrained("bert-base-uncased")

# Reading and tokenizing the input sentence. (max_length limits the token sequence length;
# 768 is the embedding dimension, not the sequence length.)
input_sentence = input("Write a sentence: ")
encoded = tokenizer(
    input_sentence,
    add_special_tokens=True,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="tf",
)

# Computing the [CLS] embedding and standardizing it with the training statistics.
cls_embedding = bert(encoded).last_hidden_state[:, 0, :].numpy()
cls_embedding = (cls_embedding - emb_mean) / emb_std

# Making the prediction. The sigmoid output is a probability in (0, 1), so it is thresholded at 0.5.
prediction = model.predict(cls_embedding)[0][0]

# Converting the prediction to a label ("Relevant" was encoded as 0, "Not Relevant" as 1).
label = "Relevant" if prediction < 0.5 else "Not Relevant"

print("Input Sentence:", input_sentence)
print("Prediction:", label)
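# In[ ]:


# Optional follow-up (a minimal sketch, not part of the original pipeline): plotting the loss
# curves stored in `history` shows whether early stopping triggered and how far the training
# and validation losses diverge.
import matplotlib.pyplot as plt

plt.plot(history.history['loss'], label='training loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('Epoch')
plt.ylabel('Binary cross-entropy')
plt.legend()
plt.show()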
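# In[ ]:


# Optional follow-up (a minimal sketch): accuracy alone can hide class imbalance, so a
# confusion matrix and per-class precision/recall on the held-out test set give a fuller
# picture. Reuses model, x_test, and y_test from the training cell above.
from sklearn.metrics import classification_report, confusion_matrix

# Thresholding the sigmoid outputs at 0.5 to obtain hard 0/1 predictions.
y_pred = (model.predict(x_test)[:, 0] > 0.5).astype(int)

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, target_names=["Relevant", "Not Relevant"]))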
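# In[ ]:


# For reference, a minimal sketch of how bert_embeddings_tweets.csv could have been produced.
# Everything here is an assumption rather than the original preprocessing: the source file name
# "tweets.csv", its "text" column, and the choice of bert-base-uncased [CLS] vectors are all
# hypothetical and should be replaced with whatever actually generated the file.
from transformers import BertTokenizer, TFBertModel

raw = pd.read_csv('tweets.csv')  # hypothetical source file containing the raw tweet text
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert = TFBertModel.from_pretrained("bert-base-uncased")

rows = []
for text in raw['text']:  # "text" is an assumed column name
    enc = tokenizer(text, max_length=128, padding="max_length", truncation=True, return_tensors="tf")
    vec = bert(enc).last_hidden_state[0, 0, :].numpy()  # 768-dim [CLS] embedding
    # Stored as a space-separated "[...]" string so the parsing in the training cell can read it back.
    rows.append(np.array2string(vec, max_line_width=100000))

raw['embeddings'] = rows
raw[['embeddings', 'choose_one']].to_csv('bert_embeddings_tweets.csv', index=False)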