import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
SEQUENCE_LENGTH = 500  # length of every padded sequence (number of words per sample)
EMBEDDING_SIZE = 100   # size of GloVe-style embedding vectors (not used by the simple model below)
TEST_SIZE = 0.25       # fraction of the data held out for testing
BATCH_SIZE = 32        # mini-batch size for training
EPOCHS = 2             # number of training epochs

# mappings between string sentiment labels and integer class indices
label2int = {"frustrated": 0, "negative": 1, "neutral": 2, "positive": 3, "satisfied": 4}
int2label = {0: "frustrated", 1: "negative", 2: "neutral", 3: "positive", 4: "satisfied"}
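# e.g. label2int["neutral"] == 2 and int2label[2] == "neutral"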

def load_data():
    """
    Loads the customer-feedback sentiment dataset
    (train.csv with 'feedback' text and 'sentiment' label columns)
    """
    data = pd.read_csv("train.csv", encoding="latin-1")
    texts = data["feedback"].values
    labels = data["sentiment"].values
    return texts, labels

def dl_evaluation_process():
    print("Loading data...")
    X, y = load_data()
    # text tokenization: vectorize the texts, turning each one into a
    # sequence of integer word indices
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(X)
    # dump the fitted tokenizer to a file so the same vocabulary can be
    # reused at test time
    pickle.dump(tokenizer, open("tokenizer.pickle", "wb"))
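    # it can later be restored with:
    #   tokenizer = pickle.load(open("tokenizer.pickle", "rb"))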
    # convert the texts to sequences of integers; pad_sequences below accepts
    # the ragged list directly and returns a 2-D numpy array, so no separate
    # np.array(X) conversion is needed (it would fail on ragged input anyway)
    X = tokenizer.texts_to_sequences(X)
    y = np.array(y)
    # pad (or truncate) each sequence to SEQUENCE_LENGTH, adding 0's at the
    # beginning; for example, if SEQUENCE_LENGTH=4:
    # [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
    # is transformed to:
    # [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
    X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
    # encode the string labels as integers, then as one-hot vectors
    y = [label2int[label] for label in y]
    y = to_categorical(y)
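    # e.g. with all 5 classes present, ["neutral", "positive"] becomes
    # [2, 3] and then [[0, 0, 1, 0, 0], [0, 0, 0, 1, 0]]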
    # split into training and test sets, shuffling with a fixed seed
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
# print our data shapes | |
'''print("X_train.shape:", X_train.shape) | |
print("X_test.shape:", X_test.shape) | |
print("y_train.shape:", y_train.shape) | |
print("y_test.shape:", y_test.shape)''' | |
#print("EMD Matrix") | |
print("Starting...") | |
# Define the model | |
print('Build model...') | |
    model = Sequential()
    # Flatten is a pass-through here, since the input is already a flat
    # vector of SEQUENCE_LENGTH token ids
    model.add(Flatten(input_shape=(SEQUENCE_LENGTH,)))
    model.add(Dense(128, activation="relu"))
    model.add(Dense(5, activation="softmax"))  # one output per sentiment class

    # compile the model; categorical cross-entropy matches the one-hot labels
    model.compile(loss="categorical_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"])
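    # A variant that actually uses EMBEDDING_SIZE would learn dense word
    # vectors instead of feeding raw token ids into Dense layers. A minimal
    # sketch (an assumption, not the architecture above; it needs Embedding
    # and GlobalMaxPooling1D from tensorflow.keras.layers):
    #   model = Sequential()
    #   model.add(Embedding(len(tokenizer.word_index) + 1, EMBEDDING_SIZE,
    #                       input_length=SEQUENCE_LENGTH))
    #   model.add(GlobalMaxPooling1D())
    #   model.add(Dense(5, activation="softmax"))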
    # train on the training split only; the held-out test split is used
    # for validation
    print("Train...")
    model.fit(X_train, y_train,
              batch_size=BATCH_SIZE,
              epochs=EPOCHS,
              validation_data=(X_test, y_test))
    # collapse one-hot test labels and softmax outputs back to class indices
    y_test = np.argmax(y_test, axis=1)
    y_pred = np.argmax(model.predict(X_test), axis=1)
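    # "macro" averaging below weights all 5 sentiment classes equally,
    # regardless of how many test samples each class has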
    acc = accuracy_score(y_test, y_pred) * 100
    precsn = precision_score(y_test, y_pred, average="macro") * 100
    recall = recall_score(y_test, y_pred, average="macro") * 100
    f1score = f1_score(y_test, y_pred, average="macro") * 100
print("acc=", acc) | |
print("precsn=", precsn) | |
print("recall=", recall) | |
print("f1score=", f1score) | |
return acc, precsn, recall, f1score | |
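
# Minimal inference sketch: reload the pickled tokenizer, apply the same
# preprocessing as training, and map the prediction back to a string label.
# Assumes `model` is an already-trained Keras model such as the one built
# in dl_evaluation_process().
def predict_sentiment(model, text):
    # reuse the vocabulary fitted during training
    tokenizer = pickle.load(open("tokenizer.pickle", "rb"))
    seq = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    class_idx = int(np.argmax(model.predict(seq), axis=1)[0])
    return int2label[class_idx]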

if __name__ == '__main__':
    dl_evaluation_process()