Hugging Face Space — running on CPU Upgrade.
import os | |
import pandas as pd | |
import numpy | |
import pickle | |
import pefile | |
import sklearn.ensemble as ek | |
from sklearn.feature_selection import SelectFromModel | |
import joblib | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.metrics import confusion_matrix | |
from sklearn import svm | |
import sklearn.metrics as metrics | |
from sklearn.model_selection import train_test_split | |
import pdb | |
from tqdm import tqdm | |
# Load the labeled PE-header dataset; the CSV is pipe-delimited.
dataset = pd.read_csv("data.csv", sep="|")
# Feature matrix: drop the two identifier columns and the label.
# NOTE(review): this X is immediately recomputed below from the explicit
# `ugly` column list, so this drop-based version appears to be dead code —
# confirm before removing (it does at least validate the columns exist).
X = dataset.drop(
    ["Name", "md5", "legitimate"], axis=1
).values  # Dropping these because the classification model will not accept object-type elements (float and int only)
# Target variable
# Explicit list of numeric PE-header / section / import / resource feature
# columns to use from data.csv. The strings must match the CSV header names
# exactly (note the dataset's own inconsistent spelling of
# "SectionMaxVirtualsize" — do not "fix" it, it is the real column name).
ugly = [
    "Machine",
    "SizeOfOptionalHeader",
    "Characteristics",
    "MajorLinkerVersion",
    "MinorLinkerVersion",
    "SizeOfCode",
    "SizeOfInitializedData",
    "SizeOfUninitializedData",
    "AddressOfEntryPoint",
    "BaseOfCode",
    "BaseOfData",
    "ImageBase",
    "SectionAlignment",
    "FileAlignment",
    "MajorOperatingSystemVersion",
    "MinorOperatingSystemVersion",
    "MajorImageVersion",
    "MinorImageVersion",
    "MajorSubsystemVersion",
    "MinorSubsystemVersion",
    "SizeOfImage",
    "SizeOfHeaders",
    "CheckSum",
    "Subsystem",
    "DllCharacteristics",
    "SizeOfStackReserve",
    "SizeOfStackCommit",
    "SizeOfHeapReserve",
    "SizeOfHeapCommit",
    "LoaderFlags",
    "NumberOfRvaAndSizes",
    "SectionsNb",
    "SectionsMeanEntropy",
    "SectionsMinEntropy",
    "SectionsMaxEntropy",
    "SectionsMeanRawsize",
    "SectionsMinRawsize",
    # "SectionsMaxRawsize" is deliberately excluded; because of this, feature
    # index i of X below is ugly[i], NOT CSV column 2+i.
    #"SectionsMaxRawsize",
    "SectionsMeanVirtualsize",
    "SectionsMinVirtualsize",
    "SectionMaxVirtualsize",
    "ImportsNbDLL",
    "ImportsNb",
    "ImportsNbOrdinal",
    "ExportNb",
    "ResourcesNb",
    "ResourcesMeanEntropy",
    "ResourcesMinEntropy",
    "ResourcesMaxEntropy",
    "ResourcesMeanSize",
    "ResourcesMinSize",
    "ResourcesMaxSize",
    "LoadConfigurationSize",
    "VersionInformationSize",
]
# Rebuild the design matrix from the explicit feature list (overwrites the
# drop-based X above) and extract the binary target.
X = dataset[ugly].values
y = dataset["legitimate"].values
# Rank features with an ExtraTrees ensemble. Fitting on only the first 1000
# rows is presumably a speed hack for the Space — TODO confirm; no
# random_state is set, so the selected feature set varies between runs.
extratrees = ek.ExtraTreesClassifier().fit(X[:1000], y[:1000])
# SelectFromModel with its default threshold keeps features whose importance
# is >= the mean importance; prefit=True reuses the estimator fitted above.
model = SelectFromModel(extratrees, prefit=True)
X_new = model.transform(X)
# Number of features that survived selection.
nbfeatures = X_new.shape[1]
# Split the selected-feature matrix for evaluation. test_size=0.29 gives a
# 71%/29% train/test split (the original comment claimed 70/30 — the code,
# not the comment, is what runs); stratify=y preserves the
# legitimate/malware class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_new, y, test_size=0.29, stratify=y
)
# Report the selected features in decreasing importance and record their
# names for persistence alongside the classifier.
features = []
# Indices of the nbfeatures most important columns of X; since X was built
# as dataset[ugly].values, these index directly into `ugly`.
index = numpy.argsort(extratrees.feature_importances_)[::-1][:nbfeatures]
for f in range(nbfeatures):
    # BUG FIX: the original printed dataset.columns[2 + index[f]] and
    # appended dataset.columns[2 + f]. Both are wrong: (1) appending 2 + f
    # stored the first nbfeatures CSV columns in file order, not the ranked
    # features; (2) the 2+ offset misaligns with feature_importances_
    # because `ugly` omits the commented-out "SectionsMaxRawsize" column.
    # Feature i of X is ugly[i], so index `ugly` directly.
    print(
        "%d. feature %s (%f)"
        % (
            f + 1,
            ugly[index[f]],
            extratrees.feature_importances_[index[f]],
        )
    )
    features.append(ugly[index[f]])
# Candidate classifiers trained on the reduced feature set; held-out
# accuracy for each is collected so the best one can be persisted below.
# (Note: this rebinds `model`, previously the SelectFromModel instance.)
model = {
    "DecisionTree": DecisionTreeClassifier(max_depth=10),
    "RandomForest": ek.RandomForestClassifier(n_estimators=50),
}
results = {}
for algo, clf in model.items():
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_test, y_test)
    print("%s : %s " % (algo, accuracy))
    results[algo] = accuracy
# Pick the classifier with the best held-out accuracy and persist both the
# model and the list of feature names it expects.
winner = max(results, key=results.get)  # Selecting the classifier with the best score
print("Using", winner, "for classification, with", len(features), "features.")
joblib.dump(model[winner], "classifier.pkl")
# BUG FIX: the original wrote features.pkl through an unclosed handle
# (open(...).write(pickle.dumps(...))); a context manager guarantees the
# file is flushed and closed even if serialization raises.
with open("features.pkl", "wb") as fh:
    pickle.dump(features, fh)
from fhe_utils import ( | |
client_server_interaction, train_zama, | |
setup_network, | |
copy_directory, | |
setup_client, | |
) | |
# --- Fully Homomorphic Encryption deployment via project-local fhe_utils
# helpers. Semantics below are inferred from the helper names and the
# network attributes used here — TODO confirm against fhe_utils. ---
# Train an FHE-compatible model on the same training split.
model_dev_fhe = train_zama(X_train, y_train)
#pdb.set_trace()
# Build the dev/server/client network object around the trained model.
network, _ = setup_network(model_dev_fhe)
# Keep a copy of the dev-side artifacts; a failed copy is reported but
# deliberately non-fatal (best effort — the pipeline continues regardless).
copied, error_message = copy_directory(network.dev_dir.name, destination="fhe_model")
if not copied:
    print(f"Error copying directory: {error_message}")
# Distribute the model to the server and the client/model specs to the
# client; order matters — the client is set up from these specs next.
network.dev_send_model_to_server()
network.dev_send_clientspecs_and_modelspecs_to_client()
fhemodel_client, serialized_evaluation_keys = setup_client(
    network, network.client_dir.name
)
print(f"Evaluation keys size: {len(serialized_evaluation_keys)} B")
# The server needs the client's evaluation keys before encrypted inference.
network.client_send_evaluation_key_to_server(serialized_evaluation_keys)
# Run the encrypted inference round-trip on only the first 100 test rows
# (presumably because FHE inference is slow — confirm).
decrypted_predictions, execution_time = client_server_interaction(network, fhemodel_client, X_test[:100])