Spaces:

AkshayKatukojwala
/

Fine_Grained_Sentiment_Analysis_using_customer_feedback

No application file

App Files Files Community

AkshayKatukojwala committed on May 1

Commit

2f5e00f

•

1 Parent(s): d549a0f

Upload 10 files

Browse files

Files changed (10) hide show

LSTM.py +82 -0
Train_ANN.py +139 -0
Train_CNN.py +184 -0
Train_LSTM2.py +185 -0
cnn_model.h5 +3 -0
g1.jpg +0 -0
lstm_model.h5 +3 -0
manage.py +22 -0
tokenizer.pickle +3 -0
train.csv +0 -0

LSTM.py ADDED Viewed

	@@ -0,0 +1,82 @@

+import time
+import pickle
+import tensorflow as tf
+import pandas as pd
+import tqdm
+import numpy as np
+import os
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from sklearn.model_selection import train_test_split
+#from tensorflow.keras.layers import Embedding, Dropout, Dense
+from tensorflow.keras.models import Sequential
+from keras.models import load_model
+from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
+from tensorflow.keras.layers import LSTM, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten,BatchNormalization
+SEQUENCE_LENGTH = 100 # the length of all sequences (number of words per sample)
+EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
+TEST_SIZE = 0.25 # ratio of testing set
+BATCH_SIZE = 64
+EPOCHS = 20 # number of epochs
+label2int = {"frustrated": 0, "negative": 1,"neutral":2,"positive":3,"satisfied":4}
+int2label = {0: "frustrated", 1: "negative",2:"neutral",3:"positive",4:"satisfied"}
+def get_embedding_vectors(tokenizer, dim=100):
+    embedding_index = {}
+    with open(f"data/glove.6B.{dim}d.txt", encoding='utf8') as f:
+        for line in tqdm.tqdm(f, "Reading GloVe"):
+            values = line.split()
+            word = values[0]
+            vectors = np.asarray(values[1:], dtype='float32')
+            embedding_index[word] = vectors
+    word_index = tokenizer.word_index
+    embedding_matrix = np.zeros((len(word_index) + 1, dim))
+    for word, i in word_index.items():
+        embedding_vector = embedding_index.get(word)
+        if embedding_vector is not None:
+            # words not found will be 0s
+            embedding_matrix[i] = embedding_vector
+    return embedding_matrix
+def get_predictions(text):
+    tokenizer = Tokenizer()
+    model_path = 'lstm_model.h5'
+    model = load_model(model_path)
+    sequence = tokenizer.texts_to_sequences(text)
+    # pad the sequence
+    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
+    # get the prediction
+    prediction = model.predict(sequence)
+    res=[]
+    for p1 in prediction:
+        res.append(int2label[np.argmax(p1)])
+    return res
+if __name__ == '__main__':
+    t=[' Sooo SAD I will miss you here in San Diego!!!', 'Stolen iPhone 15 pro', 'iPhone 15 Pro and iPhone 15 Pro Max Feature Increased 8GB of RAM', 'Apple announces iPhone 15 Pro and Pro Max', 'Temperature of my iPhone 15 Pro Max while on the phone for 5 mins.', 'I traded in my iPhone 14 Pro for the iPhone 15 Pro Max, then FedEx lost the old phone', 'iPhone 15 Pro Max crushes Google Pixel 8 Pro in speed test', 'Apple Design Team Making The New iPhone 15 Pro Max', 'iPhone 15 Pro Could Be Most Lightweight Pro Model Since iPhone XS', ' iPhone 15 Pro/Pro Max is so sad']
+    print(get_predictions(t))

Train_ANN.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import time
+import pickle
+import tensorflow as tf
+import pandas as pd
+import tqdm
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from sklearn.model_selection import train_test_split
+#from tensorflow.keras.layers import Embedding, Dropout, Dense
+from tensorflow.keras.models import Sequential
+#from tensorflow.keras.metrics import Recall, Precision
+from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
+from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten
+SEQUENCE_LENGTH = 500 # the length of all sequences (number of words per sample)
+EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
+TEST_SIZE = 0.25 # ratio of testing set
+BATCH_SIZE = 64
+EPOCHS = 10 # number of epochs
+maxlen = 80
+batch_size = 32
+label2int = {"frustrated": 0, "negative": 1,"neutral":2,"positive":3,"satisfied":4}
+int2label = {0: "frustrated", 1: "negative",2:"neutral",3:"positive",4:"satisfied"}
+def load_data():
+    """
+    Loads SMS Spam Collection dataset
+    """
+    data = pd.read_csv("train.csv",encoding='latin-1')
+    texts = data['feedback'].values
+    labels=data['sentiment'].values
+    return texts, labels
+def dl_evaluation_process():
+    print("loading data")
+    X, y = load_data()
+    # Text tokenization
+    # vectorizing text, turning each text into sequence of integers
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(X)
+    # lets dump it to a file, so we can use it in testing
+    pickle.dump(tokenizer, open("tokenizer.pickle", "wb"))
+    # convert to sequence of integers
+    X = tokenizer.texts_to_sequences(X)
+    # convert to numpy arrays
+    X = np.array(X)
+    y = np.array(y)
+    # pad sequences at the beginning of each sequence with 0's
+    # for example if SEQUENCE_LENGTH=4:
+    # [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
+    # will be transformed to:
+    # [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
+    X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
+    # One Hot encoding labels
+    # [spam, ham, spam, ham, ham] will be converted to:
+    # [1, 0, 1, 0, 1] and then to:
+    # [[0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]
+    y = [label2int[label] for label in y]
+    y = to_categorical(y)
+    # split and shuffle
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
+    # print our data shapes
+    '''print("X_train.shape:", X_train.shape)
+    print("X_test.shape:", X_test.shape)
+    print("y_train.shape:", y_train.shape)
+    print("y_test.shape:", y_test.shape)'''
+    #print("EMD Matrix")
+    print("Starting...")
+    # Define the model
+    print('Build model...')
+    model = Sequential()
+    model.add(Flatten(input_shape=(500,)))
+    model.add(Dense(128, activation='relu'))
+    model.add(Dense(5, activation='softmax'))
+    # Compile the model
+    model.compile(loss='categorical_crossentropy',
+                  optimizer='adam',
+                  metrics=['accuracy'])
+    # Train the model
+    print('Train...')
+    model.fit(X, y,
+              batch_size=batch_size,
+              epochs=2,
+              validation_data=(X_test, y_test))
+    y_test = np.argmax(y_test, axis=1)
+    y_pred = np.argmax(model.predict(X_test), axis=1)
+    acc = accuracy_score(y_test, y_pred) * 100
+    precsn = precision_score(y_test, y_pred, average="macro") * 100
+    recall = recall_score(y_test, y_pred, average="macro") * 100
+    f1score = f1_score(y_test, y_pred, average="macro") * 100
+    print("acc=", acc)
+    print("precsn=", precsn)
+    print("recall=", recall)
+    print("f1score=", f1score)
+    return acc, precsn, recall, f1score
+if __name__ == '__main__':
+  dl_evaluation_process()

Train_CNN.py ADDED Viewed

	@@ -0,0 +1,184 @@

+import time
+import pickle
+import tensorflow as tf
+import pandas as pd
+import tqdm
+import numpy as np
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from sklearn.model_selection import train_test_split
+#from tensorflow.keras.layers import Embedding, Dropout, Dense
+from tensorflow.keras.models import Sequential
+#from tensorflow.keras.metrics import Recall, Precision
+from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
+from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten
+SEQUENCE_LENGTH = 100 # the length of all sequences (number of words per sample)
+EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
+TEST_SIZE = 0.25 # ratio of testing set
+BATCH_SIZE = 64
+EPOCHS = 10 # number of epochs
+label2int = {"frustrated": 0, "negative": 1,"neutral":2,"positive":3,"satisfied":4}
+int2label = {0: "frustrated", 1: "negative",2:"neutral",3:"positive",4:"satisfied"}
+def load_data():
+    """
+    Loads SMS Spam Collection dataset
+    """
+    data = pd.read_csv("train.csv",encoding='latin-1')
+    texts = data['feedback'].values
+    labels=data['sentiment'].values
+    return texts, labels
+def dl_evaluation_process():
+    print("loading data")
+    X, y = load_data()
+    # Text tokenization
+    # vectorizing text, turning each text into sequence of integers
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(X)
+    # lets dump it to a file, so we can use it in testing
+    pickle.dump(tokenizer, open("tokenizer.pickle", "wb"))
+    # convert to sequence of integers
+    X = tokenizer.texts_to_sequences(X)
+    # convert to numpy arrays
+    X = np.array(X)
+    y = np.array(y)
+    # pad sequences at the beginning of each sequence with 0's
+    # for example if SEQUENCE_LENGTH=4:
+    # [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
+    # will be transformed to:
+    # [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
+    X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
+    # One Hot encoding labels
+    # [spam, ham, spam, ham, ham] will be converted to:
+    # [1, 0, 1, 0, 1] and then to:
+    # [[0, 1], [1, 0], [0, 1], [1, 0], [0, 1]]
+    y = [label2int[label] for label in y]
+    y = to_categorical(y)
+    # split and shuffle
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
+    # print our data shapes
+    '''print("X_train.shape:", X_train.shape)
+    print("X_test.shape:", X_test.shape)
+    print("y_train.shape:", y_train.shape)
+    print("y_test.shape:", y_test.shape)'''
+    #print("EMD Matrix")
+    embedding_matrix = get_embedding_vectors(tokenizer)
+    print("Starting...",len(tokenizer.word_index))
+    model = Sequential()
+    model.add(Embedding(len(tokenizer.word_index) + 1,
+                        EMBEDDING_SIZE,
+                        weights=[embedding_matrix],
+                        trainable=False,
+                        input_length=SEQUENCE_LENGTH))
+    model.add(Conv1D(128, 3, activation='relu'))
+    model.add(GlobalMaxPooling1D())
+    model.add(Dense(64, activation='relu'))
+    model.add(Dense(5, activation="softmax"))
+    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
+    model.fit(X, y, epochs=20, verbose=1, validation_data=(X_test, y_test), batch_size=128)
+    #print("saving")
+    model.save('cnn_model.h5')
+    #model.summary()
+    y_test = np.argmax(y_test, axis=1)
+    y_pred = np.argmax(model.predict(X_test), axis=1)
+    acc = accuracy_score(y_test, y_pred) * 100
+    precsn = precision_score(y_test, y_pred, average="macro") * 100
+    recall = recall_score(y_test, y_pred, average="macro") * 100
+    f1score = f1_score(y_test, y_pred, average="macro") * 100
+    print("acc=", acc)
+    print("precsn=", precsn)
+    print("recall=", recall)
+    print("f1score=", f1score)
+    accuracy_list = [acc,precsn,recall,f1score]
+    '''bars = ('Accuracy', 'Precision', 'Recall', 'F1_Score')
+    y_pos = np.arange(len(bars))
+    plt.bar(y_pos, accuracy_list, color=['red', 'green', 'blue', 'orange'])
+    plt.xticks(y_pos, bars)
+    plt.xlabel('Performance Metrics')
+    plt.ylabel('Scores')
+    plt.title('DL Model Evaluation')
+    plt.savefig('static/accuracy.png')
+    plt.clf()'''
+    return acc, precsn, recall, f1score
+def get_embedding_vectors(tokenizer, dim=100):
+    embedding_index = {}
+    with open(f"data/glove.6B.{dim}d.txt", encoding='utf8') as f:
+        for line in tqdm.tqdm(f, "Reading GloVe"):
+            values = line.split()
+            word = values[0]
+            vectors = np.asarray(values[1:], dtype='float32')
+            embedding_index[word] = vectors
+    word_index = tokenizer.word_index
+    embedding_matrix = np.zeros((len(word_index) + 1, dim))
+    for word, i in word_index.items():
+        embedding_vector = embedding_index.get(word)
+        if embedding_vector is not None:
+            # words not found will be 0s
+            embedding_matrix[i] = embedding_vector
+    return embedding_matrix
+# get the loss and metrics
+#result = model.evaluate(X_test, y_test)
+# extract those
+#loss = result[0]
+#accuracy = result[1]
+#precision = result[2]
+#recall = result[3]
+#print(f"[+] Accuracy: {accuracy*100:.2f}%")
+#print("Model created")
+'''def get_predictions(text):
+    sequence = tokenizer.texts_to_sequences([text])
+    # pad the sequence
+    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
+    # get the prediction
+    prediction = model.predict(sequence)[0]
+    # one-hot encoded vector, revert using np.argmax
+    return int2label[np.argmax(prediction)]
+text = "Need a loan? We offer quick and easy approval. Apply now for cash in minutes!."
+print(get_predictions(text))'''
+dl_evaluation_process()

Train_LSTM2.py ADDED Viewed

	@@ -0,0 +1,185 @@

+import time
+import pickle
+import tensorflow as tf
+import pandas as pd
+import tqdm
+import numpy as np
+import os
+import matplotlib
+matplotlib.use('Agg')
+import matplotlib.pyplot as plt
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.preprocessing.sequence import pad_sequences
+from tensorflow.keras.utils import to_categorical
+from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard
+from sklearn.model_selection import train_test_split
+#from tensorflow.keras.layers import Embedding, Dropout, Dense
+from tensorflow.keras.models import Sequential
+from keras.models import load_model
+from sklearn.metrics import f1_score, precision_score, accuracy_score, recall_score
+from tensorflow.keras.layers import LSTM, GlobalMaxPooling1D, Dropout, Dense, Input, Embedding, MaxPooling1D, Flatten,BatchNormalization
+SEQUENCE_LENGTH = 100 # the length of all sequences (number of words per sample)
+EMBEDDING_SIZE = 100  # Using 100-Dimensional GloVe embedding vectors
+TEST_SIZE = 0.25 # ratio of testing set
+BATCH_SIZE = 64
+EPOCHS = 20 # number of epochs
+label2int = {"frustrated": 0, "negative": 1,"neutral":2,"positive":3,"satisfied":4}
+int2label = {0: "frustrated", 1: "negative",2:"neutral",3:"positive",4:"satisfied"}
+def load_data():
+    data = pd.read_csv("train.csv",encoding='latin-1')
+    texts = data['feedback'].values
+    labels=data['sentiment'].values
+    return texts, labels
+def dl_evaluation_process():
+    print("loading data")
+    X, y = load_data()
+    # Text tokenization
+    # vectorizing text, turning each text into sequence of integers
+    tokenizer = Tokenizer()
+    tokenizer.fit_on_texts(X)
+    # lets dump it to a file, so we can use it in testing
+    pickle.dump(tokenizer, open("tokenizer.pickle", "wb"))
+    # convert to sequence of integers
+    X = tokenizer.texts_to_sequences(X)
+    # convert to numpy arrays
+    X = np.array(X)
+    y = np.array(y)
+    # pad sequences at the beginning of each sequence with 0's
+    # for example if SEQUENCE_LENGTH=4:
+    # [[5, 3, 2], [5, 1, 2, 3], [3, 4]]
+    # will be transformed to:
+    # [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]
+    X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)
+    y = [label2int[label] for label in y]
+    y = to_categorical(y)
+    # split and shuffle
+    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, random_state=7)
+    #print("EMD Matrix")
+    print("Starting...")
+    embedding_matrix = get_embedding_vectors(tokenizer)
+    if os.path.exists("lstm_model.h5"):
+        model_path = 'lstm_model.h5'
+        model = load_model(model_path)
+        y_test = np.argmax(y_test, axis=1)
+        y_pred = np.argmax(model.predict(X_test), axis=1)
+        acc = accuracy_score(y_test, y_pred) * 100
+        precsn = precision_score(y_test, y_pred, average="macro") * 100
+        recall = recall_score(y_test, y_pred, average="macro") * 100
+        f1score = f1_score(y_test, y_pred, average="macro") * 100
+        print("acc=", acc)
+        print("precsn=", precsn)
+        print("recall=", recall)
+        print("f1score=", f1score)
+    else:
+        model = Sequential()
+        model.add(Embedding(len(tokenizer.word_index) + 1,
+                            EMBEDDING_SIZE,
+                            weights=[embedding_matrix],
+                            trainable=False,
+                            input_length=SEQUENCE_LENGTH))
+        model.add(LSTM(32, return_sequences=True))
+        model.add(BatchNormalization())
+        model.add(LSTM(64))
+        model.add(BatchNormalization())
+        model.add(Dense(64, activation='relu'))
+        model.add(Dense(5, activation="softmax"))
+        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
+        model.fit(X, y, epochs=50, verbose=1, validation_data=(X_test, y_test), batch_size=64)
+        #print("saving")
+        #model.save('lstm_model.h5')
+        #model.summary()
+        y_test = np.argmax(y_test, axis=1)
+        y_pred = np.argmax(model.predict(X_test), axis=1)
+        acc = accuracy_score(y_test, y_pred) * 100
+        precsn = precision_score(y_test, y_pred, average="macro") * 100
+        recall = recall_score(y_test, y_pred, average="macro") * 100
+        f1score = f1_score(y_test, y_pred, average="macro") * 100
+        print("acc=", acc)
+        print("precsn=", precsn)
+        print("recall=", recall)
+        print("f1score=", f1score)
+    return acc, precsn, recall, f1score
+def get_embedding_vectors(tokenizer, dim=100):
+    embedding_index = {}
+    with open(f"data/glove.6B.{dim}d.txt", encoding='utf8') as f:
+        for line in tqdm.tqdm(f, "Reading GloVe"):
+            values = line.split()
+            word = values[0]
+            vectors = np.asarray(values[1:], dtype='float32')
+            embedding_index[word] = vectors
+    word_index = tokenizer.word_index
+    embedding_matrix = np.zeros((len(word_index) + 1, dim))
+    for word, i in word_index.items():
+        embedding_vector = embedding_index.get(word)
+        if embedding_vector is not None:
+            # words not found will be 0s
+            embedding_matrix[i] = embedding_vector
+    return embedding_matrix
+'''def get_predictions(text):
+    sequence = tokenizer.texts_to_sequences([text])
+    # pad the sequence
+    sequence = pad_sequences(sequence, maxlen=SEQUENCE_LENGTH)
+    # get the prediction
+    prediction = model.predict(sequence)[0]
+    # one-hot encoded vector, revert using np.argmax
+    return int2label[np.argmax(prediction)]
+text = "Need a loan? We offer quick and easy approval. Apply now for cash in minutes!."
+print(get_predictions(text))'''
+if __name__ == '__main__':
+    dl_evaluation_process()

cnn_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3a713c55572fb6127d0b32bf73b82cc31f4292c1bdbe3f8196d5c98aafade83
+size 11241160

g1.jpg ADDED Viewed

lstm_model.h5 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:013719aceb3bf3dd458ac32b0fc6e19683543f8247f1da1709fb44fc6811ca24
+size 11269336

manage.py ADDED Viewed

	@@ -0,0 +1,22 @@

+#!/usr/bin/env python
+"""Django's command-line utility for administrative tasks."""
+import os
+import sys
+def main():
+    """Run administrative tasks."""
+    os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'Sentiment.settings')
+    try:
+        from django.core.management import execute_from_command_line
+    except ImportError as exc:
+        raise ImportError(
+            "Couldn't import Django. Are you sure it's installed and "
+            "available on your PYTHONPATH environment variable? Did you "
+            "forget to activate a virtual environment?"
+        ) from exc
+    execute_from_command_line(sys.argv)
+if __name__ == '__main__':
+    main()

tokenizer.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d6c42fa972b1570608c2571aa0d9409bc9c5775ec97e3312cd409a09c8d2c4f6
+size 1234700

train.csv ADDED Viewed

The diff for this file is too large to render. See raw diff