RYU-KASH committed on
Commit 902cf4a
1 Parent(s): ccae435

Upload 3 files

Files changed (3)
  1. app.py +728 -0
  2. phishing.csv +0 -0
  3. phishing.txt +0 -0
app.py ADDED
@@ -0,0 +1,728 @@
+
+ import numpy as np   # linear algebra
+ import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
+
+ import matplotlib.pyplot as plt
+ #%matplotlib inline
+ import seaborn as sns
+ from sklearn import metrics
+ import warnings
+ warnings.filterwarnings('ignore')
+
+ data = pd.read_csv('phishing.csv')
+ data.head(20)
+
+ data.columns
+ len(data.columns)
+ data.isnull().sum()
+ X = data.drop(["class", "Index"], axis=1)
+ y = data["class"]
+
+ fig, ax = plt.subplots(1, 1, figsize=(15, 9))
+ sns.heatmap(data.corr(), annot=True, cmap='viridis')
+ plt.title('Correlation between different features', fontsize=15, c='black')
+ plt.show()
+
+ corr = data.corr()
+ corr.head()
+
+ corr['class'] = abs(corr['class'])
+ corr.head()
+
+ incCorr = corr.sort_values(by='class', ascending=False)
+ incCorr.head()
+
+ incCorr['class']
+
+ # Skip index 0 ('class' correlates perfectly with itself) and keep the 10 / 20 most correlated features
+ tenfeatures = incCorr[1:11].index
+ twenfeatures = incCorr[1:21].index
+
+ # Structure to store metrics
+ ML_Model = []
+ accuracy = []
+ f1_score = []
+ precision = []
+
+ def storeResults(model, a, b, c):
+     ML_Model.append(model)
+     accuracy.append(round(a, 3))
+     f1_score.append(round(b, 3))
+     precision.append(round(c, 3))
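+ # storeResults() is called once per model below, so these lists stay aligned row-for-row
+ # when they are combined into the comparison DataFrame at the end.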
+
+ def KNN(X):
+     x = [a for a in range(1, 10, 2)]
+     knntrain = []
+     knntest = []
+     from sklearn.model_selection import train_test_split
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+     X_train.shape, y_train.shape, X_test.shape, y_test.shape
+     for i in range(1, 10, 2):
+         from sklearn.neighbors import KNeighborsClassifier
+         knn = KNeighborsClassifier(n_neighbors=i)
+         knn.fit(X_train, y_train)
+         y_train_knn = knn.predict(X_train)
+         y_test_knn = knn.predict(X_test)
+         acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
+         acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)
+         print("K-Nearest Neighbors with k={}: Accuracy on training Data: {:.3f}".format(i, acc_train_knn))
+         print("K-Nearest Neighbors with k={}: Accuracy on test Data: {:.3f}".format(i, acc_test_knn))
+         knntrain.append(acc_train_knn)
+         knntest.append(acc_test_knn)
+         print()
+     import matplotlib.pyplot as plt
+     plt.plot(x, knntrain, label="Train accuracy")
+     plt.plot(x, knntest, label="Test accuracy")
+     plt.legend()
+     plt.show()
+
+ Xmain = X
+ Xten = X[tenfeatures]
+ Xtwen = X[twenfeatures]
+
+ KNN(Xmain)
+
+ KNN(Xten)
+
+ KNN(Xtwen)
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.neighbors import KNeighborsClassifier
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+ X_train.shape, y_train.shape, X_test.shape, y_test.shape
+
+ knn = KNeighborsClassifier(n_neighbors=5)
+ knn.fit(X_train, y_train)
+
+ y_train_knn = knn.predict(X_train)
+ y_test_knn = knn.predict(X_test)
+
+ acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
+ acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)
+
+ f1_score_train_knn = metrics.f1_score(y_train, y_train_knn)
+ f1_score_test_knn = metrics.f1_score(y_test, y_test_knn)
+
+ precision_score_train_knn = metrics.precision_score(y_train, y_train_knn)
+ precision_score_test_knn = metrics.precision_score(y_test, y_test_knn)
+
+ # store the test-set metrics for the summary table
+ storeResults('K-Nearest Neighbors', acc_test_knn, f1_score_test_knn, precision_score_test_knn)
+
+ def SVM(X, y):
+     x = [a for a in range(1, 10, 2)]
+     svmtrain = []
+     svmtest = []
+     from sklearn.model_selection import train_test_split
+     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+     X_train.shape, y_train.shape, X_test.shape, y_test.shape
+     from sklearn.svm import SVC
+     for i in range(1, 10, 2):
+         svm = SVC(kernel='linear', C=i)
+         svm.fit(X_train, y_train)
+         y_train_svm = svm.predict(X_train)
+         y_test_svm = svm.predict(X_test)
+         acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
+         acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)
+         print("SVM with C={}: Accuracy on training Data: {:.3f}".format(i, acc_train_svm))
+         print("SVM with C={}: Accuracy on test Data: {:.3f}".format(i, acc_test_svm))
+         svmtrain.append(acc_train_svm)
+         svmtest.append(acc_test_svm)
+         print()
+     import matplotlib.pyplot as plt
+     plt.plot(x, svmtrain, label="Train accuracy")
+     plt.plot(x, svmtest, label="Test accuracy")
+     plt.legend()
+     plt.show()
+
+
+ Xmain = X
+ Xten = X[tenfeatures]
+ Xtwen = X[twenfeatures]
+
+ SVM(Xmain, y)
+ SVM(Xten, y)
+ SVM(Xtwen, y)
+
+ from sklearn.model_selection import train_test_split
+ from sklearn.svm import SVC
+ from sklearn import metrics
+
+
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+
+ svm = SVC(kernel='linear', C=1, random_state=42)
+ svm.fit(X_train, y_train)
+
+
+ y_train_svm = svm.predict(X_train)
+ y_test_svm = svm.predict(X_test)
+
+
+ acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
+ acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)
+
+ f1_score_train_svm = metrics.f1_score(y_train, y_train_svm)
+ f1_score_test_svm = metrics.f1_score(y_test, y_test_svm)
+
+ precision_score_train_svm = metrics.precision_score(y_train, y_train_svm)
+ precision_score_test_svm = metrics.precision_score(y_test, y_test_svm)
+
+ print("SVM with C={}: Accuracy on training data: {:.3f}".format(1, acc_train_svm))
+ print("SVM with C={}: Accuracy on test data: {:.3f}".format(1, acc_test_svm))
+ print("SVM with C={}: F1 score on training data: {:.3f}".format(1, f1_score_train_svm))
+ print("SVM with C={}: F1 score on test data: {:.3f}".format(1, f1_score_test_svm))
+ print("SVM with C={}: Precision on training data: {:.3f}".format(1, precision_score_train_svm))
+ print("SVM with C={}: Precision on test data: {:.3f}".format(1, precision_score_test_svm))
+
+ # store the test-set metrics for the summary table
+ storeResults('Support Vector Machines', acc_test_svm, f1_score_test_svm, precision_score_test_svm)
+
+ from sklearn.model_selection import train_test_split
+ X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+ X_train.shape, y_train.shape, X_test.shape, y_test.shape
+
+ from sklearn.ensemble import GradientBoostingClassifier
+ gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
+ gbc.fit(X_train, y_train)
+
+ y_train_gbc = gbc.predict(X_train)
+ y_test_gbc = gbc.predict(X_test)
+
+ acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
+ acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
+ print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
+ print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))
+ print()
+
+ f1_score_train_gbc = metrics.f1_score(y_train, y_train_gbc)
+ f1_score_test_gbc = metrics.f1_score(y_test, y_test_gbc)
+
+ precision_score_train_gbc = metrics.precision_score(y_train, y_train_gbc)
+ precision_score_test_gbc = metrics.precision_score(y_test, y_test_gbc)
+
+ # store the test-set metrics for the summary table
+ storeResults('Gradient Boosting Classifier', acc_test_gbc, f1_score_test_gbc, precision_score_test_gbc)
+
+ df = pd.DataFrame({
+     'Modelname': ML_Model,
+     'Accuracy Score': accuracy,
+     'F1 Score': f1_score,
+     'Precision Score': precision
+ })
+ df.set_index('Modelname', inplace=True)
+
+ # plot the scores for each model
+
+ fig, ax = plt.subplots(figsize=(10, 10))
+ df.plot(kind='bar', ax=ax)
+ ax.set_xticklabels(df.index, rotation=0)
+ ax.set_ylim([0.9, 1])
+ ax.set_yticks([0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1])
+ ax.set_xlabel('Model')
+ ax.set_ylabel('Score')
+ ax.set_title('Model Scores')
+ plt.show()
+
+ # Libraries used to extract the same 30 features from a live URL
+ import ipaddress
+ import re
+ import urllib.request
+ from bs4 import BeautifulSoup
+ import socket
+ import requests
+ import whois
+ from googlesearch import search
+ from datetime import date, datetime
+ import time
+ from dateutil.parser import parse as date_parse
+ from urllib.parse import urlparse
+
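+ # The FeatureExtraction class below recreates the dataset's 30 feature columns for a live URL,
+ # following the encoding used in the training data: 1 = legitimate, 0 = suspicious, -1 = phishing.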
+ class FeatureExtraction:
+     features = []
+
+     def __init__(self, url):
+         self.features = []
+         self.url = url
+         self.domain = ""
+         self.whois_response = ""
+         self.urlparse = ""
+         self.response = ""
+         self.soup = ""
+
+         # Fetch the page, parse the domain and look up WHOIS once; failures fall through
+         # and the individual checks return their default values.
+         try:
+             self.response = requests.get(url)
+             self.soup = BeautifulSoup(self.response.text, 'html.parser')
+         except:
+             pass
+
+         try:
+             self.urlparse = urlparse(url)
+             self.domain = self.urlparse.netloc
+         except:
+             pass
+
+         try:
+             self.whois_response = whois.whois(self.domain)
+         except:
+             pass
+
+         self.features.append(self.UsingIp())
+         self.features.append(self.longUrl())
+         self.features.append(self.shortUrl())
+         self.features.append(self.symbol())
+         self.features.append(self.redirecting())
+         self.features.append(self.prefixSuffix())
+         self.features.append(self.SubDomains())
+         self.features.append(self.Hppts())
+         self.features.append(self.DomainRegLen())
+         self.features.append(self.Favicon())
+
+         self.features.append(self.NonStdPort())
+         self.features.append(self.HTTPSDomainURL())
+         self.features.append(self.RequestURL())
+         self.features.append(self.AnchorURL())
+         self.features.append(self.LinksInScriptTags())
+         self.features.append(self.ServerFormHandler())
+         self.features.append(self.InfoEmail())
+         self.features.append(self.AbnormalURL())
+         self.features.append(self.WebsiteForwarding())
+         self.features.append(self.StatusBarCust())
+
+         self.features.append(self.DisableRightClick())
+         self.features.append(self.UsingPopupWindow())
+         self.features.append(self.IframeRedirection())
+         self.features.append(self.AgeofDomain())
+         self.features.append(self.DNSRecording())
+         self.features.append(self.WebsiteTraffic())
+         self.features.append(self.PageRank())
+         self.features.append(self.GoogleIndex())
+         self.features.append(self.LinksPointingToPage())
+         self.features.append(self.StatsReport())
+
+     # 1. UsingIp
+     def UsingIp(self):
+         try:
+             ipaddress.ip_address(self.url)
+             return -1
+         except:
+             return 1
+
+     # 2. longUrl
+     def longUrl(self):
+         if len(self.url) < 54:
+             return 1
+         if len(self.url) >= 54 and len(self.url) <= 75:
+             return 0
+         return -1
+
+     # 3. shortUrl
+     def shortUrl(self):
+         match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
+                           r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
+                           r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
+                           r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
+                           r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
+                           r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
+                           r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net', self.url)
+         if match:
+             return -1
+         return 1
+
+     # 4. Symbol@
+     def symbol(self):
+         if re.findall("@", self.url):
+             return -1
+         return 1
+
+     # 5. Redirecting//
+     def redirecting(self):
+         if self.url.rfind('//') > 6:
+             return -1
+         return 1
+
+     # 6. prefixSuffix
+     def prefixSuffix(self):
+         try:
+             match = re.findall(r'\-', self.domain)
+             if match:
+                 return -1
+             return 1
+         except:
+             return -1
+
+     # 7. SubDomains
+     def SubDomains(self):
+         dot_count = len(re.findall(r"\.", self.url))
+         if dot_count == 1:
+             return 1
+         elif dot_count == 2:
+             return 0
+         return -1
+
+     # 8. HTTPS (checks the URL scheme)
+     def Hppts(self):
+         try:
+             https = self.urlparse.scheme
+             if 'https' in https:
+                 return 1
+             return -1
+         except:
+             return 1
+
+     # 9. DomainRegLen
+     def DomainRegLen(self):
+         try:
+             expiration_date = self.whois_response.expiration_date
+             creation_date = self.whois_response.creation_date
+             try:
+                 if len(expiration_date):
+                     expiration_date = expiration_date[0]
+             except:
+                 pass
+             try:
+                 if len(creation_date):
+                     creation_date = creation_date[0]
+             except:
+                 pass
+
+             age = (expiration_date.year - creation_date.year) * 12 + (expiration_date.month - creation_date.month)
+             if age >= 12:
+                 return 1
+             return -1
+         except:
+             return -1
+
+     # 10. Favicon
+     def Favicon(self):
+         try:
+             for head in self.soup.find_all('head'):
+                 for head.link in self.soup.find_all('link', href=True):
+                     dots = [x.start(0) for x in re.finditer(r'\.', head.link['href'])]
+                     if self.url in head.link['href'] or len(dots) == 1 or self.domain in head.link['href']:
+                         return 1
+             return -1
+         except:
+             return -1
+
+     # 11. NonStdPort
+     def NonStdPort(self):
+         try:
+             port = self.domain.split(":")
+             if len(port) > 1:
+                 return -1
+             return 1
+         except:
+             return -1
+
+     # 12. HTTPSDomainURL
+     def HTTPSDomainURL(self):
+         try:
+             if 'https' in self.domain:
+                 return -1
+             return 1
+         except:
+             return -1
+
+     # 13. RequestURL
+     def RequestURL(self):
+         try:
+             i, success = 0, 0
+             for img in self.soup.find_all('img', src=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', img['src'])]
+                 if self.url in img['src'] or self.domain in img['src'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             for audio in self.soup.find_all('audio', src=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', audio['src'])]
+                 if self.url in audio['src'] or self.domain in audio['src'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             for embed in self.soup.find_all('embed', src=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', embed['src'])]
+                 if self.url in embed['src'] or self.domain in embed['src'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             for iframe in self.soup.find_all('iframe', src=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', iframe['src'])]
+                 if self.url in iframe['src'] or self.domain in iframe['src'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             try:
+                 percentage = success / float(i) * 100
+                 if percentage < 22.0:
+                     return 1
+                 elif (percentage >= 22.0) and (percentage < 61.0):
+                     return 0
+                 else:
+                     return -1
+             except:
+                 return 0
+         except:
+             return -1
+
+     # 14. AnchorURL
+     def AnchorURL(self):
+         try:
+             i, unsafe = 0, 0
+             for a in self.soup.find_all('a', href=True):
+                 if "#" in a['href'] or "javascript" in a['href'].lower() or "mailto" in a['href'].lower() or not (self.url in a['href'] or self.domain in a['href']):
+                     unsafe = unsafe + 1
+                 i = i + 1
+
+             try:
+                 percentage = unsafe / float(i) * 100
+                 if percentage < 31.0:
+                     return 1
+                 elif (percentage >= 31.0) and (percentage < 67.0):
+                     return 0
+                 else:
+                     return -1
+             except:
+                 return -1
+
+         except:
+             return -1
+
+     # 15. LinksInScriptTags
+     def LinksInScriptTags(self):
+         try:
+             i, success = 0, 0
+
+             for link in self.soup.find_all('link', href=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', link['href'])]
+                 if self.url in link['href'] or self.domain in link['href'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             for script in self.soup.find_all('script', src=True):
+                 dots = [x.start(0) for x in re.finditer(r'\.', script['src'])]
+                 if self.url in script['src'] or self.domain in script['src'] or len(dots) == 1:
+                     success = success + 1
+                 i = i + 1
+
+             try:
+                 percentage = success / float(i) * 100
+                 if percentage < 17.0:
+                     return 1
+                 elif (percentage >= 17.0) and (percentage < 81.0):
+                     return 0
+                 else:
+                     return -1
+             except:
+                 return 0
+         except:
+             return -1
+
+     # 16. ServerFormHandler
+     def ServerFormHandler(self):
+         try:
+             if len(self.soup.find_all('form', action=True)) == 0:
+                 return 1
+             else:
+                 for form in self.soup.find_all('form', action=True):
+                     if form['action'] == "" or form['action'] == "about:blank":
+                         return -1
+                     elif self.url not in form['action'] and self.domain not in form['action']:
+                         return 0
+                     else:
+                         return 1
+         except:
+             return -1
+
+     # 17. InfoEmail
+     def InfoEmail(self):
+         try:
+             if re.findall(r"[mail\(\)|mailto:?]", self.response.text):
+                 return -1
+             else:
+                 return 1
+         except:
+             return -1
+
+     # 18. AbnormalURL
+     def AbnormalURL(self):
+         try:
+             if self.response.text == self.whois_response:
+                 return 1
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 19. WebsiteForwarding
+     def WebsiteForwarding(self):
+         try:
+             if len(self.response.history) <= 1:
+                 return 1
+             elif len(self.response.history) <= 4:
+                 return 0
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 20. StatusBarCust
+     def StatusBarCust(self):
+         try:
+             if re.findall("<script>.+onmouseover.+</script>", self.response.text):
+                 return 1
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 21. DisableRightClick
+     def DisableRightClick(self):
+         try:
+             if re.findall(r"event.button ?== ?2", self.response.text):
+                 return 1
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 22. UsingPopupWindow
+     def UsingPopupWindow(self):
+         try:
+             if re.findall(r"alert\(", self.response.text):
+                 return 1
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 23. IframeRedirection
+     def IframeRedirection(self):
+         try:
+             if re.findall(r"[<iframe>|<frameBorder>]", self.response.text):
+                 return 1
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 24. AgeofDomain
+     def AgeofDomain(self):
+         try:
+             creation_date = self.whois_response.creation_date
+             try:
+                 if len(creation_date):
+                     creation_date = creation_date[0]
+             except:
+                 pass
+
+             today = date.today()
+             age = (today.year - creation_date.year) * 12 + (today.month - creation_date.month)
+             if age >= 6:
+                 return 1
+             return -1
+         except:
+             return -1
+
+     # 25. DNSRecording
+     def DNSRecording(self):
+         try:
+             creation_date = self.whois_response.creation_date
+             try:
+                 if len(creation_date):
+                     creation_date = creation_date[0]
+             except:
+                 pass
+
+             today = date.today()
+             age = (today.year - creation_date.year) * 12 + (today.month - creation_date.month)
+             if age >= 6:
+                 return 1
+             return -1
+         except:
+             return -1
+
+     # 26. WebsiteTraffic
+     def WebsiteTraffic(self):
+         try:
+             # If the Alexa endpoint no longer responds, the except branch returns -1
+             rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + self.url).read(), "xml").find("REACH")['RANK']
+             if int(rank) < 100000:
+                 return 1
+             return 0
+         except:
+             return -1
+
+     # 27. PageRank
+     def PageRank(self):
+         try:
+             rank_checker_response = requests.post("https://www.checkpagerank.net/index.php", {"name": self.domain})
+
+             global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
+             if global_rank > 0 and global_rank < 100000:
+                 return 1
+             return -1
+         except:
+             return -1
+
+     # 28. GoogleIndex
+     def GoogleIndex(self):
+         try:
+             site = search(self.url, 5)
+             if site:
+                 return 1
+             else:
+                 return -1
+         except:
+             return 1
+
+     # 29. LinksPointingToPage
+     def LinksPointingToPage(self):
+         try:
+             number_of_links = len(re.findall(r"<a href=", self.response.text))
+             if number_of_links == 0:
+                 return 1
+             elif number_of_links <= 2:
+                 return 0
+             else:
+                 return -1
+         except:
+             return -1
+
+     # 30. StatsReport
+     def StatsReport(self):
+         try:
+             url_match = re.search(
+                 r'at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly', self.url)
+             ip_address = socket.gethostbyname(self.domain)
+             ip_match = re.search(r'146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|'
+                                  r'107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|'
+                                  r'118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|'
+                                  r'216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|'
+                                  r'34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|'
+                                  r'216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42', ip_address)
+             if url_match:
+                 return -1
+             elif ip_match:
+                 return -1
+             return 1
+         except:
+             return 1
+
+     def getFeaturesList(self):
+         return self.features
+
+ # Fit a Gradient Boosting model on the training split and classify a user-supplied URL
+ gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
+ gbc.fit(X_train, y_train)
+
+ url = input("Enter the Url:")
+ # Any URL can be provided; the example used for testing was taken from PhishTank
+ obj = FeatureExtraction(url)
+ x = np.array(obj.getFeaturesList()).reshape(1, 30)
+ y_pred = gbc.predict(x)[0]
+ if y_pred == 1:
+     print("We guess it is a safe website")
+ else:
+     print("Caution! Suspicious website detected")
phishing.csv ADDED
The diff for this file is too large to render. See raw diff
 
phishing.txt ADDED
The diff for this file is too large to render. See raw diff