|
|
|
import numpy as np |
|
import pandas as pd |
|
import seaborn as sns; sns.set() |
|
from nltk.corpus import stopwords |
|
from nltk.stem import PorterStemmer |
|
from nltk.tokenize import word_tokenize |
|
from sklearn.feature_extraction.text import TfidfVectorizer |
|
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score, classification_report |
|
from sklearn.naive_bayes import MultinomialNB |
|
from skmultilearn.problem_transform import ClassifierChain |
|
import matplotlib.pyplot as plt |
|
import pickle |
|
from sklearn.pipeline import Pipeline |
|
|
|
|
|
|
|
train_data = pd.read_csv('Train.csv') |
|
test_data = pd.read_csv('Test.csv') |
|
|
|
abstract_list_train = [] |
|
abstract_list_test = [] |
|
stemmer = PorterStemmer() |
|
stop_words = stopwords.words('english') |
|
|
|
|
|
def remove_stopwords(data = []): |
|
data_list = [] |
|
for name in data: |
|
words = word_tokenize(name) |
|
stem_word = "" |
|
for a in words: |
|
if a.lower() not in stop_words: |
|
stem_word += stemmer.stem(a) + ' ' |
|
data_list.append(stem_word.lower()) |
|
return data_list |
|
|
|
|
|
def remove_special_character(data = []): |
|
abstract_list_wo_sc = [] |
|
special_characters = '''!()-β[]{};:'"\, <>./?@#$%^&*_~0123456789+=ββ''' |
|
for file in data: |
|
word_wo_sc = "" |
|
if len(file.split()) == 1: |
|
abstract_list_wo_sc.append(file) |
|
else: |
|
for a in file: |
|
if a in special_characters: |
|
word_wo_sc += ' ' |
|
else: |
|
word_wo_sc += a |
|
abstract_list_wo_sc.append(word_wo_sc) |
|
return abstract_list_wo_sc |
|
|
|
|
|
data_train = np.array(train_data['ABSTRACT']) |
|
abstract_list_train = remove_stopwords(data_train) |
|
|
|
|
|
data_test = np.array(test_data['ABSTRACT']) |
|
abstract_list_test = remove_stopwords(data_test) |
|
|
|
|
|
abstract_list_wo_sc_train = remove_special_character(abstract_list_train) |
|
abstract_list_wo_sc_test = remove_special_character(abstract_list_test) |
|
|
|
categories=['Engineering', 'Business', 'Art'] |
|
|
|
x_train = abstract_list_wo_sc_train |
|
y_train = train_data[categories] |
|
x_test = abstract_list_wo_sc_test |
|
y_test = test_data[categories] |
|
|
|
print("There are ", len(x_train), " input training samples") |
|
print("There are ", len(x_test), " input testing samples") |
|
print("There are ", y_train.shape, " output training samples") |
|
print("There are ", y_test.shape, " output testing samples") |
|
|
|
|
|
|
|
|
|
|
|
|
|
parameters = Pipeline([('tfidf', TfidfVectorizer(stop_words=stop_words)),('clf', ClassifierChain(MultinomialNB())),]) |
|
|
|
|
|
parameters.fit(x_train, y_train) |
|
|
|
|
|
|
|
predictions = parameters.predict(x_test) |
|
|
|
|
|
|
|
|
|
accuracy = accuracy_score(y_test, predictions) |
|
print('Accuracy:', accuracy) |
|
|
|
|
|
f1 = f1_score(y_test, predictions, average='micro') |
|
print('F1 score:', f1) |
|
|
|
|
|
report = classification_report(y_test, predictions) |
|
print('Classification Report:') |
|
print(report) |
|
|
|
|
|
|
|
|
|
|
|
mat = confusion_matrix(y_test.values.argmax(axis=1), predictions.argmax(axis=1)) |
|
|
|
plt.figure(figsize=(8, 6)) |
|
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False, cmap='Blues') |
|
plt.xlabel('True Label') |
|
plt.ylabel('Predicted Label') |
|
plt.title('Confusion Matrix') |
|
|
|
plt.show() |
|
|
|
|
|
|
|
with open('model_MultiNB.pkl', 'wb') as picklefile: |
|
pickle.dump(parameters.named_steps['clf'], picklefile) |
|
|