VaultChem committed
Commit 9063b08
1 parent: c0565b3

Upload PE_main.py

Files changed (1):
PE_main.py  +201 -0
PE_main.py ADDED
'''
This is the main script of the PE classification part of this program.
The library used to extract the features from the PE file is pefile; you can find it here:
https://pypi.org/project/pefile/

The program first extracts the features from the PE file and then feeds those features to the
saved machine-learning model, which predicts whether the PE file is malicious or not.
'''

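# Example invocation (a sketch; the sample path is a placeholder, and the pickled model and
# feature list are expected under Classifier/ as loaded in the __main__ block below):
#   python PE_main.py path/to/sample.exe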
import pefile
import os
import array
import math
import pickle
import joblib
import sys
import argparse


# For calculating the Shannon entropy of a byte sequence
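# The entropy computed here is H = -sum(p_x * log2(p_x)) over the 256 possible byte values;
# it ranges from 0 (constant data) to 8 bits (uniformly random data), so high values often
# point to packed or encrypted content.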
def get_entropy(data):
    if len(data) == 0:
        return 0.0
    occurrences = array.array('L', [0] * 256)
    for x in data:
        occurrences[x if isinstance(x, int) else ord(x)] += 1

    entropy = 0
    for x in occurrences:
        if x:
            p_x = float(x) / len(data)
            entropy -= p_x * math.log(p_x, 2)

    return entropy

# For extracting the resources section of the PE file
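# The resource directory is a three-level tree (type -> id -> language); the leaf entries hold
# the raw data whose entropy and size are recorded below.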
def get_resources(pe):
    """Extract resources: a list of [entropy, size] pairs."""
    resources = []
    if hasattr(pe, 'DIRECTORY_ENTRY_RESOURCE'):
        try:
            for resource_type in pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                data = pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                size = resource_lang.data.struct.Size
                                entropy = get_entropy(data)

                                resources.append([entropy, size])
        except Exception:
            return resources
    return resources

# For getting the version information
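# Note: depending on the installed pefile version, pe.FileInfo may be exposed as a list of
# lists and the Key values as bytes rather than str; this helper assumes the older flat layout
# and may need adjusting for newer releases.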
def get_version_info(pe):
    """Return version infos"""
    res = {}
    for fileinfo in pe.FileInfo:
        if fileinfo.Key == 'StringFileInfo':
            for st in fileinfo.StringTable:
                for entry in st.entries.items():
                    res[entry[0]] = entry[1]
        if fileinfo.Key == 'VarFileInfo':
            for var in fileinfo.Var:
                # dict.items() cannot be indexed directly in Python 3, so materialise it first
                key, value = list(var.entry.items())[0]
                res[key] = value
    if hasattr(pe, 'VS_FIXEDFILEINFO'):
        res['flags'] = pe.VS_FIXEDFILEINFO.FileFlags
        res['os'] = pe.VS_FIXEDFILEINFO.FileOS
        res['type'] = pe.VS_FIXEDFILEINFO.FileType
        res['file_version'] = pe.VS_FIXEDFILEINFO.FileVersionLS
        res['product_version'] = pe.VS_FIXEDFILEINFO.ProductVersionLS
        res['signature'] = pe.VS_FIXEDFILEINFO.Signature
        res['struct_version'] = pe.VS_FIXEDFILEINFO.StrucVersion
    return res

# Extract the info for a given file using pefile
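# The resulting dictionary mixes raw COFF/optional-header fields with aggregate statistics over
# sections, imports, exports, resources and version info; the __main__ block below selects and
# orders these values according to the feature list the classifier was trained with.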
def extract_infos(fpath):
    res = {}
    pe = pefile.PE(fpath)
    res['Machine'] = pe.FILE_HEADER.Machine
    res['SizeOfOptionalHeader'] = pe.FILE_HEADER.SizeOfOptionalHeader
    res['Characteristics'] = pe.FILE_HEADER.Characteristics
    res['MajorLinkerVersion'] = pe.OPTIONAL_HEADER.MajorLinkerVersion
    res['MinorLinkerVersion'] = pe.OPTIONAL_HEADER.MinorLinkerVersion
    res['SizeOfCode'] = pe.OPTIONAL_HEADER.SizeOfCode
    res['SizeOfInitializedData'] = pe.OPTIONAL_HEADER.SizeOfInitializedData
    res['SizeOfUninitializedData'] = pe.OPTIONAL_HEADER.SizeOfUninitializedData
    res['AddressOfEntryPoint'] = pe.OPTIONAL_HEADER.AddressOfEntryPoint
    res['BaseOfCode'] = pe.OPTIONAL_HEADER.BaseOfCode
    try:
        res['BaseOfData'] = pe.OPTIONAL_HEADER.BaseOfData
    except AttributeError:
        # PE32+ (64-bit) optional headers have no BaseOfData field
        res['BaseOfData'] = 0
    res['ImageBase'] = pe.OPTIONAL_HEADER.ImageBase
    res['SectionAlignment'] = pe.OPTIONAL_HEADER.SectionAlignment
    res['FileAlignment'] = pe.OPTIONAL_HEADER.FileAlignment
    res['MajorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MajorOperatingSystemVersion
    res['MinorOperatingSystemVersion'] = pe.OPTIONAL_HEADER.MinorOperatingSystemVersion
    res['MajorImageVersion'] = pe.OPTIONAL_HEADER.MajorImageVersion
    res['MinorImageVersion'] = pe.OPTIONAL_HEADER.MinorImageVersion
    res['MajorSubsystemVersion'] = pe.OPTIONAL_HEADER.MajorSubsystemVersion
    res['MinorSubsystemVersion'] = pe.OPTIONAL_HEADER.MinorSubsystemVersion
    res['SizeOfImage'] = pe.OPTIONAL_HEADER.SizeOfImage
    res['SizeOfHeaders'] = pe.OPTIONAL_HEADER.SizeOfHeaders
    res['CheckSum'] = pe.OPTIONAL_HEADER.CheckSum
    res['Subsystem'] = pe.OPTIONAL_HEADER.Subsystem
    res['DllCharacteristics'] = pe.OPTIONAL_HEADER.DllCharacteristics
    res['SizeOfStackReserve'] = pe.OPTIONAL_HEADER.SizeOfStackReserve
    res['SizeOfStackCommit'] = pe.OPTIONAL_HEADER.SizeOfStackCommit
    res['SizeOfHeapReserve'] = pe.OPTIONAL_HEADER.SizeOfHeapReserve
    res['SizeOfHeapCommit'] = pe.OPTIONAL_HEADER.SizeOfHeapCommit
    res['LoaderFlags'] = pe.OPTIONAL_HEADER.LoaderFlags
    res['NumberOfRvaAndSizes'] = pe.OPTIONAL_HEADER.NumberOfRvaAndSizes

    # Sections
    res['SectionsNb'] = len(pe.sections)
    entropy = list(map(lambda x: x.get_entropy(), pe.sections))
    res['SectionsMeanEntropy'] = sum(entropy) / float(len(entropy))
    res['SectionsMinEntropy'] = min(entropy)
    res['SectionsMaxEntropy'] = max(entropy)
    raw_sizes = list(map(lambda x: x.SizeOfRawData, pe.sections))
    res['SectionsMeanRawsize'] = sum(raw_sizes) / float(len(raw_sizes))
    res['SectionsMinRawsize'] = min(raw_sizes)
    #res['SectionsMaxRawsize'] = max(raw_sizes)
    virtual_sizes = list(map(lambda x: x.Misc_VirtualSize, pe.sections))
    res['SectionsMeanVirtualsize'] = sum(virtual_sizes) / float(len(virtual_sizes))
    res['SectionsMinVirtualsize'] = min(virtual_sizes)
    res['SectionMaxVirtualsize'] = max(virtual_sizes)

    # Imports
    try:
        res['ImportsNbDLL'] = len(pe.DIRECTORY_ENTRY_IMPORT)
        imports = sum([x.imports for x in pe.DIRECTORY_ENTRY_IMPORT], [])
        res['ImportsNb'] = len(imports)
        res['ImportsNbOrdinal'] = 0
    except AttributeError:
        res['ImportsNbDLL'] = 0
        res['ImportsNb'] = 0
        res['ImportsNbOrdinal'] = 0

    # Exports
    try:
        res['ExportNb'] = len(pe.DIRECTORY_ENTRY_EXPORT.symbols)
    except AttributeError:
        # No export
        res['ExportNb'] = 0
    # Resources
    resources = get_resources(pe)
    res['ResourcesNb'] = len(resources)
    if len(resources) > 0:
        entropy = list(map(lambda x: x[0], resources))
        res['ResourcesMeanEntropy'] = sum(entropy) / float(len(entropy))
        res['ResourcesMinEntropy'] = min(entropy)
        res['ResourcesMaxEntropy'] = max(entropy)
        sizes = list(map(lambda x: x[1], resources))
        res['ResourcesMeanSize'] = sum(sizes) / float(len(sizes))
        res['ResourcesMinSize'] = min(sizes)
        res['ResourcesMaxSize'] = max(sizes)
    else:
        res['ResourcesNb'] = 0
        res['ResourcesMeanEntropy'] = 0
        res['ResourcesMinEntropy'] = 0
        res['ResourcesMaxEntropy'] = 0
        res['ResourcesMeanSize'] = 0
        res['ResourcesMinSize'] = 0
        res['ResourcesMaxSize'] = 0

    # Load configuration size
    try:
        res['LoadConfigurationSize'] = pe.DIRECTORY_ENTRY_LOAD_CONFIG.struct.Size
    except AttributeError:
        res['LoadConfigurationSize'] = 0

    # Version information size
    try:
        version_infos = get_version_info(pe)
        res['VersionInformationSize'] = len(version_infos.keys())
    except AttributeError:
        res['VersionInformationSize'] = 0
    return res


if __name__ == '__main__':

    # Load the trained classifier (classifier.pkl) and the saved feature list (features.pkl)
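    # features.pkl is expected to hold the list of feature names used during training, in the
    # same order as the columns the classifier was fitted on; extract_infos() must produce a
    # value for every one of those names.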
    clf = joblib.load('Classifier/classifier.pkl')
    with open(os.path.join('Classifier', 'features.pkl'), 'rb') as f:
        features = pickle.load(f)

    # Extract features from the PE file given as the first command-line argument
    data = extract_infos(sys.argv[1])

    # Match them against the feature names saved in features.pkl
    pe_features = list(map(lambda x: data[x], features))
    print("Features used for classification: ", pe_features)

    # Predict whether the PE file is malicious or not based on the extracted features
    res = clf.predict([pe_features])[0]
    print('The file %s is %s' % (os.path.basename(sys.argv[1]), ['malicious', 'legitimate'][res]))