# MalwareDetection / model_trainer.py
# (Hosting-page residue kept for provenance: "brdhaker3's picture",
# "Upload 11 files", "4438927 verified".)
#Importing the dataset
import pandas
# Load the labelled samples; the 'legitimate' column is the class label.
dataset = pandas.read_csv('datasets/dataset_1.csv', sep=',', low_memory=False)

# About the dataset
print(dataset.head())  # first 5 rows of the dataset
# A bare `dataset.tail()` only displays in a notebook; in a script the
# result is discarded, so it is printed explicitly.
print(dataset.tail())  # last 5 rows of the dataset
print(dataset.columns)  # names of the columns
print(dataset.describe())  # summary statistics of the numeric columns
# Count of malware (0) and benign (1) files in the dataset
print(dataset.groupby(dataset['legitimate']).size())
#Dataset Visualization
import matplotlib.pyplot as plt
# Pie chart of how many samples fall in each 'legitimate' class, saved to
# pie_chart.png; the figure is closed so later plots start from a clean state.
class_counts = dataset["legitimate"].value_counts()
class_counts.plot.pie(autopct="%1.1f%%")
plt.savefig('pie_chart.png')
plt.close()
import pickle
import joblib
import numpy
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# Feature matrix: every column except the two identifier columns and the
# label. (Original note: the classifiers accept only int/float values, not
# object-typed columns.)
non_feature_columns = ['ID', 'md5', 'legitimate']
X = dataset.drop(non_feature_columns, axis=1).values

# Target variable
y = dataset['legitimate'].values

# Fit an ExtraTrees ensemble on all features, then let SelectFromModel keep
# only the columns that the fitted ensemble ranks as important enough.
extratrees = ek.ExtraTreesClassifier()
extratrees.fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)

# Number of columns that survived the selection.
nbfeatures = X_new.shape[1]
print("The number of feature is ", nbfeatures)
#splitting the data (80% - training and 20% - testing)
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the other 80% train the models.
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)

# Names of the selected features, in the same column order as X_new.
# The previous loop appended dataset.columns[2 + f], i.e. simply the first
# `nbfeatures` columns after the first two, regardless of which columns the
# selector actually kept. The selector's boolean support mask, applied to
# the same column set X was built from, gives the right names and keeps them
# aligned with the columns the classifiers are trained on.
feature_columns = dataset.drop(['ID', 'md5', 'legitimate'], axis=1).columns
features = list(feature_columns[model.get_support()])

# Positions (within X's columns) of the `nbfeatures` most important features,
# most important first. Kept for inspection, e.g.:
# for rank, col in enumerate(index, start=1):
#     print("%d. feature %s (%f)" % (rank, feature_columns[col], extratrees.feature_importances_[col]))
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
# Compare candidate classifiers on the same train/test split and remember
# each one's accuracy on the test part.
model = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
}
results = {}
for algo, clf in model.items():
    score = clf.fit(X_train, y_train).score(X_test, y_test)
    print("%s : %s " % (algo, score * 100))
    results[algo] = score

# Name of the classifier with the highest test accuracy.
winner = max(results, key=results.get)
print("Using", winner, "for classification, with", len(features), 'features.')
# Disabled search for the best number of estimators (trees); it trains one
# forest per candidate value, so it is left commented out:
# mi = 0
# mp = 0
# for i in range(1, 100):
#     model = ek.RandomForestClassifier(n_estimators=i)
#     model.fit(X_train, y_train)
#     score = model.score(X_test, y_test)
#     if mp < score:
#         mi = i
#         mp = score
# print(mi, ':', mp)

# Train the final classifier: a 50-tree random forest.
# NOTE(review): this is always a RandomForest, whatever `winner` was above.
model = ek.RandomForestClassifier(n_estimators=50).fit(X_train, y_train)
score = model.score(X_test, y_test)
print("Accuracy:", score * 100, '%')
# Save the trained classifier as model/model.pkl and the list of feature
# names to be extracted as model/features.pkl.
joblib.dump(model, "model/model.pkl")
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open('model/features.pkl', 'wb') as features_file:
    pickle.dump(features, features_file)
# False positives and negatives.
# These are measured on the held-out test split only: predicting on the full
# dataset would include the rows the model was trained on and make the rates
# look better than they are.
results = model.predict(X_test)
# Rows of the matrix are true labels, columns are predicted labels. With the
# labels used in this dataset (0 = malware, 1 = legitimate), mt[0][1] counts
# malware predicted as legitimate and mt[1][0] legitimate predicted as malware.
mt = confusion_matrix(y_test, results)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0]))) * 100))
print('False negative rate : %f %%' % (mt[1][0] / float(sum(mt[1])) * 100))
# Plot the confusion matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Draw the confusion matrix `mt` as an annotated heatmap (integer counts in
# each cell, no colour bar) and write it to confusion_matrix.png.
predicted_ticks = ['Predicted Negative', 'Predicted Positive']
actual_ticks = ['Actual Negative', 'Actual Positive']

plt.figure(figsize=(8, 6))
sns.heatmap(
    mt,
    annot=True,
    fmt="d",
    cmap="Blues",
    cbar=False,
    xticklabels=predicted_ticks,
    yticklabels=actual_ticks,
)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.savefig('confusion_matrix.png')
plt.close()
from sklearn.metrics import precision_score, recall_score, f1_score
# Predictions on the held-out test split. Scoring on the full dataset would
# mix in the training rows and overstate precision, recall and F1.
results = model.predict(X_test)

# Calculate precision, recall, and F1 score. The scorers' default positive
# class is label 1, which in this dataset means 'legitimate'.
precision = precision_score(y_test, results)
recall = recall_score(y_test, results)
f1 = f1_score(y_test, results)
print("Precision: {:.2f}%".format(precision * 100))
print("Recall: {:.2f}%".format(recall * 100))
print("F1 Score: {:.2f}%".format(f1 * 100))