# Importing the dataset
import pandas

dataset = pandas.read_csv('datasets/dataset_1.csv', sep=',', low_memory=False)
# About the dataset
print(dataset.head())      # First five rows of the dataset
print(dataset.tail())      # Last five rows of the dataset
print(dataset.columns)     # Names of the columns
print(dataset.describe())  # Summary statistics for the numeric columns
# Count of malware (0) and benign (1) files in the dataset
print(dataset.groupby('legitimate').size())
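# A quick sanity check before modeling (an added sketch, not part of the
# original flow): confirm the dataset has no missing values that would
# trip up the classifiers below.
print("Missing values:", dataset.isnull().sum().sum())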
# Dataset visualization: share of benign vs. malware samples
import matplotlib.pyplot as plt

dataset["legitimate"].value_counts().plot(kind="pie", autopct="%1.1f%%")
plt.savefig('pie_chart.png')
plt.close()
import pickle

import joblib
import numpy
import sklearn.ensemble as ek
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier
# Feature matrix: drop 'ID', 'md5' and the label, since the classifiers
# accept numeric (float/int) inputs only
X = dataset.drop(['ID', 'md5', 'legitimate'], axis=1).values
# Target variable
y = dataset['legitimate'].values
# Fit an ExtraTrees model and keep only the features it finds important
extratrees = ek.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
nbfeatures = X_new.shape[1]
print("Number of selected features:", nbfeatures)
# Splitting the data (80% training, 20% testing)
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.2)
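# For reproducible runs one could fix the random seed and keep the class
# proportions identical in both splits (an optional variant, not used above):
# X_train, X_test, y_train, y_test = train_test_split(
#     X_new, y, test_size=0.2, random_state=42, stratify=y)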
features = []
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
# Names of the selected features, in decreasing order of importance.
# The "2 +" offset assumes 'ID' and 'md5' are the first two dataset
# columns and that 'legitimate' comes after all the feature columns.
for f in range(nbfeatures):
    # print("%d. feature %s (%f)" % (f + 1, dataset.columns[2 + index[f]], extratrees.feature_importances_[index[f]]))
    features.append(dataset.columns[2 + index[f]])
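# A more direct way to recover the selected names is the fitted selector's
# get_support() mask, which avoids the manual offset arithmetic above
# (a sketch; note it returns names in column order, not importance order):
feature_columns = dataset.drop(['ID', 'md5', 'legitimate'], axis=1).columns
print(list(feature_columns[model.get_support()]))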
# Testing which classifier gives the better result on this split
model = {"DecisionTree": DecisionTreeClassifier(max_depth=10),
         "RandomForest": ek.RandomForestClassifier(n_estimators=50)}
results = {}
for algo in model:
    clf = model[algo]
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print("%s : %s " % (algo, score * 100))
    results[algo] = score
winner = max(results, key=results.get)  # Classifier with the best score
print("Using", winner, "for classification, with", len(features), 'features.')
# # Find the optimal number of estimators (trees)
# mi = 0
# mp = 0
# for i in range(1, 100):
#     model = ek.RandomForestClassifier(n_estimators=i)
#     model.fit(X_train, y_train)
#     score = model.score(X_test, y_test)
#     if mp < score:
#         mi = i
#         mp = score
#         print(mi, ':', mp)
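# The commented-out search above could also be done with GridSearchCV,
# which cross-validates each candidate value (an optional sketch):
# from sklearn.model_selection import GridSearchCV
# grid = GridSearchCV(ek.RandomForestClassifier(),
#                     {'n_estimators': [10, 25, 50, 75, 100]}, cv=3)
# grid.fit(X_train, y_train)
# print(grid.best_params_, grid.best_score_)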
# Train the final model
model = ek.RandomForestClassifier(n_estimators=50)
model.fit(X_train, y_train)
score = model.score(X_test, y_test)
print("Accuracy:", (score * 100), '%')
# Saving the trained model as model/model.pkl and the selected feature
# names to be extracted as model/features.pkl
joblib.dump(model, "model/model.pkl")
with open('model/features.pkl', 'wb') as f:
    pickle.dump(features, f)
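# Loading the artifacts back for later inference would look like this
# (a sketch; same paths as above):
# loaded_model = joblib.load("model/model.pkl")
# with open('model/features.pkl', 'rb') as f:
#     loaded_features = pickle.load(f)
# print(loaded_model.predict(X_new[:5]), loaded_features)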
# False positive and false negative rates. Note these are computed on the
# full dataset (training rows included), so they are optimistic estimates.
results = model.predict(X_new)
mt = confusion_matrix(y, results)
print("False positive rate : %f %%" % (mt[0][1] / float(sum(mt[0])) * 100))
print("False negative rate : %f %%" % (mt[1][0] / float(sum(mt[1])) * 100))
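# For binary labels the four confusion-matrix cells can also be unpacked
# directly; this sketch is equivalent to the two prints above:
# tn, fp, fn, tp = confusion_matrix(y, results).ravel()
# print(fp / (fp + tn) * 100, fn / (fn + tp) * 100)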
# Plot the confusion matrix
import seaborn as sns

plt.figure(figsize=(8, 6))
sns.heatmap(mt, annot=True, fmt="d", cmap="Blues", cbar=False,
            xticklabels=['Predicted Negative', 'Predicted Positive'],
            yticklabels=['Actual Negative', 'Actual Positive'])
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()
from sklearn.metrics import precision_score, recall_score, f1_score

# Precision, recall, and F1 score on the same full-dataset predictions
precision = precision_score(y, results)
recall = recall_score(y, results)
f1 = f1_score(y, results)
print("Precision: {:.2f}%".format(precision * 100)) | |
print("Recall: {:.2f}%".format(recall * 100)) | |
print("F1 Score: {:.2f}%".format(f1 * 100)) | |