|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
import numpy as np |
|
import tensorflow as tf |
|
from sklearn.model_selection import train_test_split |
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
|
|
|
# --- Load precomputed BERT tweet embeddings and binary relevance labels ---
file_path = 'bert_embeddings_tweets.csv'

data = pd.read_csv(file_path)

# Each entry of the 'embeddings' column is a string like "[0.12 0.34 ...]".
# np.fromstring is deprecated for text input (and silently returns garbage on
# malformed strings); strip the brackets and split instead.
embeddings = np.array(
    [np.array(x.strip('[]').split(), dtype=float) for x in data['embeddings']]
)

# Standardize each embedding dimension (zero mean, unit variance).
# Guard zero-variance columns so we never divide by zero and produce NaNs.
feat_std = np.std(embeddings, axis=0)
feat_std[feat_std == 0] = 1.0
embeddings = (embeddings - np.mean(embeddings, axis=0)) / feat_std

# Binary target: 0 = "Relevant", 1 = everything else (i.e. "Not Relevant").
labels = np.array([0 if x == "Relevant" else 1 for x in data['choose_one']])

# 80/10/10 train/val/test split. random_state makes the split reproducible;
# stratify preserves the class balance in every subset.
x_train, x_val_test, y_train, y_val_test = train_test_split(
    embeddings, labels, test_size=0.2, random_state=42, stratify=labels
)
x_val, x_test, y_val, y_test = train_test_split(
    x_val_test, y_val_test, test_size=0.5, random_state=42, stratify=y_val_test
)
|
|
|
|
|
# --- Small MLP classifier over the standardized embedding vectors ---
model = tf.keras.Sequential([
    # input_dim pins the expected feature width to the embedding size.
    tf.keras.layers.Dense(8, activation='relu', input_dim=embeddings.shape[1]),
    # Aggressive dropout to limit overfitting on the tiny hidden layer.
    tf.keras.layers.Dropout(0.5),
    # Single sigmoid unit -> P(class 1, i.e. "Not Relevant").
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop when val_loss stalls for 5 epochs; restore_best_weights rolls the model
# back to the best epoch instead of keeping the last (possibly worse) weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss', patience=5, restore_best_weights=True
)
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=8,
    callbacks=[early_stop],
)
|
|
|
|
|
# --- Report final metrics on the three disjoint splits ---
test_loss, test_acc = model.evaluate(x_test, y_test)
train_loss, train_acc = model.evaluate(x_train, y_train)
# BUG FIX: this previously evaluated on x_val_test (validation + test
# combined), so the reported "Validation Accuracy" leaked the test set.
# Evaluate on the validation split only.
val_loss, val_acc = model.evaluate(x_val, y_val)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_acc)
print("Training Accuracy:", train_acc)
print("Validation Accuracy:", val_acc)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from tensorflow.keras.preprocessing.sequence import pad_sequences |
|
|
|
|
|
# --- Interactive single-sentence prediction ---
# NOTE(review): `tokenizer` is never defined in this file, so this section
# raises NameError as written. Presumably a HuggingFace BERT tokenizer
# (e.g. BertTokenizer.from_pretrained("bert-base-uncased")) must be created
# earlier — confirm and add that setup.
# NOTE(review): the model was trained on 768-d BERT *embeddings*, but this
# feeds padded token *ids* — predictions are unlikely to be meaningful unless
# the sentence is first run through BERT to obtain its embedding. Verify.
raw_sentence = input("Write a sentence: ")
encoded = tokenizer.encode_plus(
    raw_sentence,
    add_special_tokens=True,
    max_length=768,
    padding="longest",
    truncation=True,
    return_attention_mask=True,
    return_tensors="tf",
)

# Pad/truncate the token-id sequence to the model's fixed 768-wide input.
input_ids = pad_sequences(
    encoded["input_ids"],
    maxlen=768,
    dtype="float32",
    value=0,
    truncating="post",
    padding="post",
)

# Sigmoid output: probability of class 1 ("Not Relevant").
prediction = model.predict(input_ids)[0][0]

# BUG FIX: the sigmoid output is a float in (0, 1) and is essentially never
# exactly 0, so `prediction == 0` labeled every input "Not Relevant".
# Threshold at 0.5 instead (consistent with labels: 0=Relevant, 1=Not Relevant).
label = "Relevant" if prediction < 0.5 else "Not Relevant"

# BUG FIX: print the user's original text — the previous code overwrote
# `input_sentence` with the tokenizer's encoding dict before printing it.
print("Input Sentence:", raw_sentence)
print("Prediction:", label)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|