RYU-KASH committed on
Commit
0685ea8
1 Parent(s): e509ecf

Upload 2 files

Browse files
Files changed (2) hide show
  1. app1.py +26 -0
  2. phishingdetection.py +722 -0
app1.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# Streamlit front end: asks for a URL, extracts its features with
# FeatureExtraction and classifies it with the gradient-boosting model
# exported by phishingdetection.py.
import sys

# Raw string: "\P" and "\p" are not valid escape sequences, so the plain
# literal only worked by accident and triggers an invalid-escape warning on
# recent Python versions. The resulting path value is unchanged.
sys.path.append(r'G:\Project\phishing-detection')

from phishingdetection import FeatureExtraction
import numpy as np
from phishingdetection import gbc
import streamlit as st

st.title("Phishing Website Detection")

# User input for URL (any URL can be provided, e.g. one taken from PhishTank)
url = st.text_input("Enter the Url:", key="url_input")

# Predict and display the result
if st.button("Check"):
    if url:
        obj = FeatureExtraction(url)
        # The model expects a single row holding the 30 extracted features.
        x = np.array(obj.getFeaturesList()).reshape(1, 30)
        y_pred = gbc.predict(x)[0]
        if y_pred == 1:
            st.write("We guess it is a safe website")
        else:
            st.write("Caution! Suspicious website detected")
            st.write(y_pred)
    else:
        st.write("Please enter a URL.")
phishingdetection.py ADDED
@@ -0,0 +1,722 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os
import subprocess
import sys

# Install the third-party packages this module needs at import time.
# - "python-whois" (not "whois") is the distribution that provides the
#   whois.whois(domain) call used by FeatureExtraction.
# - pip is run through the current interpreter so the packages land in the
#   environment that is actually executing this file.
# A failed install is not fatal here (check=False); the later imports will
# report a missing package.
for _package in ("seaborn", "scikit-learn", "python-whois", "googlesearch-python"):
    subprocess.run([sys.executable, "-m", "pip", "install", _package], check=False)
6
+ import numpy as np # linear algebra
7
+ import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
8
+
9
+ import matplotlib.pyplot as plt
10
+ #%matplotlib inline
11
+ import seaborn as sns
12
+ from sklearn import metrics
13
+ import warnings
14
+ warnings.filterwarnings('ignore')
15
+
# Load the dataset. "class" is the prediction target; "class" and "Index"
# are both excluded from the feature matrix.
data = pd.read_csv('phishing.csv')
y = data["class"]
X = data.drop(columns=["class", "Index"])

# Pairwise correlation of every column, shown as an annotated heatmap.
corr = data.corr()
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
sns.heatmap(corr, annot=True, cmap='viridis')
plt.title('Correlation between different features', fontsize = 15, c='black')
plt.show()

# Rank columns by the absolute value of their correlation with the target.
corr['class'] = corr['class'].abs()
incCorr = corr.sort_values(by='class', ascending=False)

# Row 0 is skipped when picking the top 10 / top 20 column names
# (presumably it is "class" itself, which correlates perfectly with itself).
tenfeatures = incCorr.index[1:11]
twenfeatures = incCorr.index[1:21]
43
+
# Structure to store metrics: four parallel lists, one entry per model.
ML_Model = []
accuracy = []
f1_score = []
precision = []


def storeResults(model, a, b, c):
    """Append one model's scores to the module-level metric lists.

    model is the display name; a, b and c are the accuracy, F1 and precision
    scores, each stored rounded to three decimals.
    """
    ML_Model.append(model)
    for column, score in zip((accuracy, f1_score, precision), (a, b, c)):
        column.append(round(score, 3))
55
+
def KNN(X):
    """Fit K-Nearest Neighbors for k = 1, 3, 5, 7, 9 and plot the accuracies.

    X is the feature matrix; the labels come from the module-level ``y``.
    The data is split 80/20 with a fixed random state, train and test
    accuracy are printed for every k, and both curves are plotted at the end.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    neighbor_counts = list(range(1, 10, 2))
    train_scores = []
    test_scores = []
    for k in neighbor_counts:
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        train_acc = metrics.accuracy_score(y_train, model.predict(X_train))
        test_acc = metrics.accuracy_score(y_test, model.predict(X_test))
        print("K-Nearest Neighbors with k={}: Accuracy on training Data: {:.3f}".format(k, train_acc))
        print("K-Nearest Neighbors with k={}: Accuracy on test Data: {:.3f}".format(k, test_acc))
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        print()

    plt.plot(neighbor_counts, train_scores, label="Train accuracy")
    plt.plot(neighbor_counts, test_scores, label="Test accuracy")
    plt.legend()
    plt.show()
81
+
# Feature subsets: all features, and the 10 / 20 columns ranked highest by
# absolute correlation with the target.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

# Accuracy-vs-k sweep on each feature subset.
KNN(Xmain)
KNN(Xten)
KNN(Xtwen)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Final KNN model: all features, k=5, 80/20 split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)

acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)

f1_score_train_knn = metrics.f1_score(y_train, y_train_knn)
f1_score_test_knn = metrics.f1_score(y_test, y_test_knn)

precision_score_train_knn = metrics.precision_score(y_train, y_train_knn)
precision_score_test_knn = metrics.precision_score(y_test, y_test_knn)

# Record test-set scores for all three metrics. The precision slot used to
# receive the training-set value, which made the comparison inconsistent.
storeResults('K-Nearest Neighbors', acc_test_knn, f1_score_test_knn, precision_score_test_knn)
113
+
def SVM(X, y):
    """Fit a linear SVM for C = 1, 3, 5, 7, 9 and plot the accuracies.

    X is the feature matrix and y the labels. The data is split 80/20 with a
    fixed random state, train and test accuracy are printed for every C, and
    both curves are plotted at the end.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

    penalties = list(range(1, 10, 2))
    train_scores = []
    test_scores = []
    for penalty in penalties:
        model = SVC(kernel='linear', C=penalty)
        model.fit(X_train, y_train)
        train_acc = metrics.accuracy_score(y_train, model.predict(X_train))
        test_acc = metrics.accuracy_score(y_test, model.predict(X_test))
        print("SVM with C={}: Accuracy on training Data: {:.3f}".format(penalty, train_acc))
        print("SVM with C={}: Accuracy on test Data: {:.3f}".format(penalty, test_acc))
        train_scores.append(train_acc)
        test_scores.append(test_acc)
        print()

    plt.plot(penalties, train_scores, label="Train accuracy")
    plt.plot(penalties, test_scores, label="Test accuracy")
    plt.legend()
    plt.show()
139
+
140
+
# Feature subsets: all features, and the 10 / 20 columns ranked highest by
# absolute correlation with the target.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

# Accuracy-vs-C sweep on each feature subset.
SVM(Xmain, y)
SVM(Xten, y)
SVM(Xtwen, y)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

# Final SVM model: all features, linear kernel, C=1.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)

y_train_svm = svm.predict(X_train)
y_test_svm = svm.predict(X_test)

acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)

f1_score_train_svm = metrics.f1_score(y_train, y_train_svm)
f1_score_test_svm = metrics.f1_score(y_test, y_test_svm)

precision_score_train_svm = metrics.precision_score(y_train, y_train_svm)
precision_score_test_svm = metrics.precision_score(y_test, y_test_svm)

print("SVM with C={}: Accuracy on training data: {:.3f}".format(1, acc_train_svm))
print("SVM with C={}: Accuracy on test data: {:.3f}".format(1, acc_test_svm))
print("SVM with C={}: F1 score on training data: {:.3f}".format(1, f1_score_train_svm))
print("SVM with C={}: F1 score on test data: {:.3f}".format(1, f1_score_test_svm))
print("SVM with C={}: Precision on training data: {:.3f}".format(1, precision_score_train_svm))
print("SVM with C={}: Precision on test data: {:.3f}".format(1, precision_score_test_svm))

# Record test-set scores for all three metrics. The precision slot used to
# receive the training-set value, which made the comparison inconsistent.
storeResults('Support Vector Machines', acc_test_svm, f1_score_test_svm, precision_score_test_svm)
181
+
from sklearn.model_selection import train_test_split

# Gradient-boosting model on all features, 80/20 split with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
gbc.fit(X_train, y_train)

y_train_gbc = gbc.predict(X_train)
y_test_gbc = gbc.predict(X_test)

acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))
print()

f1_score_train_gbc = metrics.f1_score(y_train, y_train_gbc)
f1_score_test_gbc = metrics.f1_score(y_test, y_test_gbc)

precision_score_train_gbc = metrics.precision_score(y_train, y_train_gbc)
precision_score_test_gbc = metrics.precision_score(y_test, y_test_gbc)

# Record test-set scores for all three metrics. The precision slot used to
# receive the training-set value, which made the comparison inconsistent.
storeResults('Gradient Boosting Classifier', acc_test_gbc, f1_score_test_gbc, precision_score_test_gbc)
206
+
# Collect the stored scores into a table indexed by model name.
df = pd.DataFrame(
    {
        'Accuracy Score': accuracy,
        'F1 Score': f1_score,
        'Precision Score': precision,
    },
    index=pd.Index(ML_Model, name='Modelname'),
)

# Plot the scores for each model as grouped bars; the y axis is limited to
# the 0.9-1.0 range with a tick every 0.01.
fig, ax = plt.subplots(figsize=(10,10))
df.plot(kind='bar', ax=ax)
ax.set_xticklabels(df.index, rotation=0)
ax.set_ylim([0.9, 1])
ax.set_yticks([0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1])
ax.set_title('Model Scores')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
plt.show()
226
+
227
+ import whois
228
+
229
+ import googlesearch
230
+
231
+ import ipaddress
232
+ import re
233
+ import urllib.request
234
+ from bs4 import BeautifulSoup
235
+ import socket
236
+ import requests
237
+ import google
238
+ import whois
239
+ from datetime import date, datetime
240
+ import time
241
+ from dateutil.parser import parse as date_parse
242
+ from urllib.parse import urlparse
243
+
class FeatureExtraction:
    """Extract 30 phishing-detection features for one URL.

    The constructor fetches the page, parses the URL and runs a WHOIS lookup
    (each step is best-effort: on failure the attribute keeps its empty-string
    placeholder), then evaluates the 30 feature methods in a fixed order.
    Every feature method returns 1, 0 or -1 and never raises; a method that
    cannot evaluate its signal falls back to the value hard-coded in its
    ``except`` branch.

    NOTE(review): which value means "legitimate" is not stated in this file;
    the caller treats a model prediction of 1 as safe. Several page-content
    checks (StatusBarCust, DisableRightClick, UsingPopupWindow,
    IframeRedirection) return 1 when the suspicious pattern IS found --
    confirm this matches the encoding of the training data.
    """

    # Class-level default kept for backward compatibility; every instance
    # replaces it with its own list in __init__.
    features = []

    # Seconds to wait on any single outbound HTTP request.
    REQUEST_TIMEOUT = 10

    # Known URL-shortener hosts (feature 3).
    _SHORTENER_RE = re.compile(
        r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
        r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
        r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
        r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
        r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
        r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
        r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net'
    )

    # Blacklisted hosts and IP addresses (feature 30).
    _BAD_HOST_RE = re.compile(
        r'at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|sweddy\.com|myjino\.ru|96\.lt|ow\.ly'
    )
    _BAD_IP_RE = re.compile(
        r'146\.112\.61\.108|213\.174\.157\.151|121\.50\.168\.88|192\.185\.217\.116|78\.46\.211\.158|181\.174\.165\.13|46\.242\.145\.103|121\.50\.168\.40|83\.125\.22\.219|46\.242\.145\.98|'
        r'107\.151\.148\.44|107\.151\.148\.107|64\.70\.19\.203|199\.184\.144\.27|107\.151\.148\.108|107\.151\.148\.109|119\.28\.52\.61|54\.83\.43\.69|52\.69\.166\.231|216\.58\.192\.225|'
        r'118\.184\.25\.86|67\.208\.74\.71|23\.253\.126\.58|104\.239\.157\.210|175\.126\.123\.219|141\.8\.224\.221|10\.10\.10\.10|43\.229\.108\.32|103\.232\.215\.140|69\.172\.201\.153|'
        r'216\.218\.185\.162|54\.225\.104\.146|103\.243\.24\.98|199\.59\.243\.120|31\.170\.160\.61|213\.19\.128\.77|62\.113\.226\.131|208\.100\.26\.234|195\.16\.127\.102|195\.16\.127\.157|'
        r'34\.196\.13\.28|103\.224\.212\.222|172\.217\.4\.225|54\.72\.9\.51|192\.64\.147\.141|198\.200\.56\.183|23\.253\.164\.103|52\.48\.191\.26|52\.214\.197\.72|87\.98\.255\.18|209\.99\.17\.27|'
        r'216\.38\.62\.18|104\.130\.124\.96|47\.89\.58\.141|78\.46\.211\.158|54\.86\.225\.156|54\.82\.156\.19|37\.157\.192\.102|204\.11\.56\.48|110\.34\.231\.42'
    )

    def __init__(self, url):
        """Fetch and parse ``url``, then compute all 30 features."""
        self.features = []
        self.url = url
        self.domain = ""
        self.whois_response = ""
        self.urlparse = ""
        self.response = ""
        self.soup = ""

        # Page fetch. The soup must be built from self.response: the bare name
        # ``response`` was undefined, so the soup was never actually created.
        try:
            self.response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except Exception:
            pass

        try:
            self.urlparse = urlparse(url)
            self.domain = self.urlparse.netloc
        except Exception:
            pass

        try:
            self.whois_response = whois.whois(self.domain)
        except Exception:
            pass

        # The order below defines the feature vector layout.
        extractors = (
            self.UsingIp,
            self.longUrl,
            self.shortUrl,
            self.symbol,
            self.redirecting,
            self.prefixSuffix,
            self.SubDomains,
            self.Hppts,
            self.DomainRegLen,
            self.Favicon,
            self.NonStdPort,
            self.HTTPSDomainURL,
            self.RequestURL,
            self.AnchorURL,
            self.LinksInScriptTags,
            self.ServerFormHandler,
            self.InfoEmail,
            self.AbnormalURL,
            self.WebsiteForwarding,
            self.StatusBarCust,
            self.DisableRightClick,
            self.UsingPopupWindow,
            self.IframeRedirection,
            self.AgeofDomain,
            self.DNSRecording,
            self.WebsiteTraffic,
            self.PageRank,
            self.GoogleIndex,
            self.LinksPointingToPage,
            self.StatsReport,
        )
        for extract in extractors:
            self.features.append(extract())

    # ------------------------------------------------------------------ helpers

    @staticmethod
    def _first_date(value):
        """Return the first element when WHOIS gives a list of dates, else the value itself."""
        if isinstance(value, (list, tuple)) and value:
            return value[0]
        return value

    def _domain_age_months(self):
        """Return whole months between the WHOIS creation date and today (may raise)."""
        creation_date = self._first_date(self.whois_response.creation_date)
        today = date.today()
        return (today.year - creation_date.year) * 12 + (today.month - creation_date.month)

    @staticmethod
    def _same_site_counts(tags, attr, url, domain):
        """Count tags whose ``attr`` mentions url/domain or contains exactly one dot.

        Returns (matching, total).
        """
        matching = total = 0
        for tag in tags:
            target = tag[attr]
            if url in target or domain in target or target.count('.') == 1:
                matching += 1
            total += 1
        return matching, total

    # ----------------------------------------------------------------- features

    # 1.UsingIp
    def UsingIp(self):
        """Return -1 when the URL's host is a literal IP address, else 1."""
        # Check the host, not the whole URL: a URL with a scheme or path never
        # parses as an IP address. Fall back to the raw string for bare input.
        host = getattr(self.urlparse, "hostname", None) or self.url
        try:
            ipaddress.ip_address(host)
            return -1
        except ValueError:
            return 1

    # 2.longUrl
    def longUrl(self):
        """Return 1 for URLs shorter than 54 chars, 0 for 54-75, -1 for longer."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    # 3.shortUrl
    def shortUrl(self):
        """Return -1 when the URL contains a known URL-shortener host, else 1."""
        return -1 if self._SHORTENER_RE.search(self.url) else 1

    # 4.Symbol@
    def symbol(self):
        """Return -1 when the URL contains '@', else 1."""
        return -1 if "@" in self.url else 1

    # 5.Redirecting//
    def redirecting(self):
        """Return -1 when the last '//' in the URL starts after index 6, else 1."""
        return -1 if self.url.rfind('//') > 6 else 1

    # 6.prefixSuffix
    def prefixSuffix(self):
        """Return -1 when the domain contains '-', else 1."""
        try:
            return -1 if '-' in self.domain else 1
        except Exception:
            return -1

    # 7.SubDomains
    def SubDomains(self):
        """Return 1 for one dot in the URL, 0 for two, -1 otherwise."""
        dot_count = self.url.count(".")
        if dot_count == 1:
            return 1
        if dot_count == 2:
            return 0
        return -1

    # 8.HTTPS
    def Hppts(self):
        """Return 1 when the URL scheme contains 'https', else -1 (1 if the URL was not parsed)."""
        try:
            return 1 if 'https' in self.urlparse.scheme else -1
        except Exception:
            return 1

    # 9.DomainRegLen
    def DomainRegLen(self):
        """Return 1 when WHOIS expiration is at least 12 months after creation, else -1."""
        try:
            expiration_date = self._first_date(self.whois_response.expiration_date)
            creation_date = self._first_date(self.whois_response.creation_date)
            age = ((expiration_date.year - creation_date.year) * 12
                   + (expiration_date.month - creation_date.month))
            return 1 if age >= 12 else -1
        except Exception:
            return -1

    # 10. Favicon
    def Favicon(self):
        """Return 1 when some <link href> mentions url/domain or has exactly one dot, else -1."""
        try:
            # Links are only considered when the page has a <head> element.
            if not self.soup.find_all('head'):
                return -1
            for link in self.soup.find_all('link', href=True):
                href = link['href']
                # self.domain: the bare name ``domain`` was undefined here.
                if self.url in href or href.count('.') == 1 or self.domain in href:
                    return 1
            return -1
        except Exception:
            return -1

    # 11. NonStdPort
    def NonStdPort(self):
        """Return -1 when the network location contains ':', else 1."""
        try:
            return -1 if ":" in self.domain else 1
        except Exception:
            return -1

    # 12. HTTPSDomainURL
    def HTTPSDomainURL(self):
        """Return -1 when 'https' appears inside the domain, else 1."""
        try:
            return -1 if 'https' in self.domain else 1
        except Exception:
            return -1

    # 13. RequestURL
    def RequestURL(self):
        """Score the share of img/audio/embed/iframe sources that look same-site.

        Returns 1 below 22%, 0 from 22% up to (not including) 61%, -1 otherwise;
        0 when the page has no such elements and -1 when the page is unavailable.
        NOTE(review): thresholds kept as written; confirm that a LOW same-site
        share should map to 1.
        """
        try:
            # Counters must start at zero; they were never initialised before,
            # so any page with media elements fell into the except branch.
            success = i = 0
            for tag_name in ('img', 'audio', 'embed', 'iframe'):
                tags = self.soup.find_all(tag_name, src=True)
                matched, seen = self._same_site_counts(tags, 'src', self.url, self.domain)
                success += matched
                i += seen

            if i == 0:
                return 0
            percentage = success / float(i) * 100
            if percentage < 22.0:
                return 1
            if percentage < 61.0:
                return 0
            return -1
        except Exception:
            return -1

    # 14. AnchorURL
    def AnchorURL(self):
        """Score the share of <a href> targets considered unsafe.

        An anchor is unsafe when it contains '#', 'javascript' or 'mailto', or
        mentions neither the URL nor the domain. Returns 1 below 31%, 0 from
        31% up to (not including) 67%, -1 otherwise or when there are no anchors.
        """
        try:
            i, unsafe = 0, 0
            for a in self.soup.find_all('a', href=True):
                href = a['href']
                lowered = href.lower()
                # self.url: the bare name ``url`` was undefined here.
                same_site = self.url in href or self.domain in href
                if "#" in href or "javascript" in lowered or "mailto" in lowered or not same_site:
                    unsafe += 1
                i += 1

            if i == 0:
                return -1
            percentage = unsafe / float(i) * 100
            if percentage < 31.0:
                return 1
            if percentage < 67.0:
                return 0
            return -1
        except Exception:
            return -1

    # 15. LinksInScriptTags
    def LinksInScriptTags(self):
        """Score the share of <link href>/<script src> targets that look same-site.

        Returns 1 below 17%, 0 from 17% up to (not including) 81%, -1 otherwise;
        0 when the page has no such elements and -1 when the page is unavailable.
        """
        try:
            links = self.soup.find_all('link', href=True)
            scripts = self.soup.find_all('script', src=True)
            link_hits, link_total = self._same_site_counts(links, 'href', self.url, self.domain)
            script_hits, script_total = self._same_site_counts(scripts, 'src', self.url, self.domain)
            success = link_hits + script_hits
            i = link_total + script_total

            if i == 0:
                return 0
            percentage = success / float(i) * 100
            if percentage < 17.0:
                return 1
            if percentage < 81.0:
                return 0
            return -1
        except Exception:
            return -1

    # 16. ServerFormHandler
    def ServerFormHandler(self):
        """Classify the first <form action>: -1 if empty/about:blank, 0 if off-site, 1 otherwise.

        Returns 1 when the page has no form with an action attribute.
        """
        try:
            forms = self.soup.find_all('form', action=True)
            if not forms:
                return 1
            # Only the first form decides the result.
            action = forms[0]['action']
            if action == "" or action == "about:blank":
                return -1
            if self.url not in action and self.domain not in action:
                return 0
            return 1
        except Exception:
            return -1

    # 17. InfoEmail
    def InfoEmail(self):
        """Return -1 when the page source contains 'mail()' or 'mailto', else 1."""
        try:
            # Alternation instead of a character class (which matched almost any
            # page), searched in the page text (``self.soap`` did not exist).
            if re.search(r"mail\(\)|mailto:?", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 18. AbnormalURL
    def AbnormalURL(self):
        """Return 1 when the page text equals the WHOIS response, else -1.

        NOTE(review): this compares the page body with the WHOIS result object,
        which looks unlikely to ever be equal; confirm the intended check.
        """
        try:
            if self.response.text == self.whois_response:
                return 1
            return -1
        except Exception:
            return -1

    # 19. WebsiteForwarding
    def WebsiteForwarding(self):
        """Return 1 for at most one redirect, 0 for up to four, -1 for more."""
        try:
            redirects = len(self.response.history)
            if redirects <= 1:
                return 1
            if redirects <= 4:
                return 0
            return -1
        except Exception:
            return -1

    # 20. StatusBarCust
    def StatusBarCust(self):
        """Return 1 when a <script> block mentions onmouseover, else -1."""
        try:
            if re.findall("<script>.+onmouseover.+</script>", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 21. DisableRightClick
    def DisableRightClick(self):
        """Return 1 when the page source matches ``event.button == 2``, else -1."""
        try:
            if re.findall(r"event.button ?== ?2", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 22. UsingPopupWindow
    def UsingPopupWindow(self):
        """Return 1 when the page source contains ``alert(``, else -1."""
        try:
            if re.findall(r"alert\(", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 23. IframeRedirection
    def IframeRedirection(self):
        """Return 1 when the page source contains <iframe> or <frameBorder>, else -1."""
        try:
            # Alternation instead of a character class: the bracketed form
            # matched any single one of the listed characters, i.e. almost
            # every page.
            if re.search(r"<iframe>|<frameBorder>", self.response.text):
                return 1
            return -1
        except Exception:
            return -1

    # 24. AgeofDomain
    def AgeofDomain(self):
        """Return 1 when the domain was created at least 6 months ago, else -1."""
        try:
            return 1 if self._domain_age_months() >= 6 else -1
        except Exception:
            return -1

    # 25. DNSRecording
    def DNSRecording(self):
        """Return 1 when the domain was created at least 6 months ago, else -1.

        Uses the same WHOIS-age check as AgeofDomain.
        """
        try:
            return 1 if self._domain_age_months() >= 6 else -1
        except Exception:
            return -1

    # 26. WebsiteTraffic
    def WebsiteTraffic(self):
        """Return 1 when the Alexa rank is below 100000, 0 otherwise, -1 on failure."""
        try:
            # self.url: the bare name ``url`` was undefined here.
            alexa_xml = urllib.request.urlopen(
                "http://data.alexa.com/data?cli=10&dat=s&url=" + self.url,
                timeout=self.REQUEST_TIMEOUT,
            ).read()
            rank = BeautifulSoup(alexa_xml, "xml").find("REACH")['RANK']
            if int(rank) < 100000:
                return 1
            return 0
        except Exception:
            return -1

    # 27. PageRank
    def PageRank(self):
        """Return 1 when checkpagerank.net reports a global rank in (0, 100000), else -1."""
        try:
            # One name for the response: it was stored under one name and read
            # under another, so the rank was never parsed.
            rank_checker_response = requests.post(
                "https://www.checkpagerank.net/index.php",
                {"name": self.domain},
                timeout=self.REQUEST_TIMEOUT,
            )
            global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
            if 0 < global_rank < 100000:
                return 1
            return -1
        except Exception:
            return -1

    # 28. GoogleIndex
    def GoogleIndex(self):
        """Return 1 when a Google search for the URL yields a result, else -1 (1 on failure)."""
        try:
            # The function lives on the imported ``googlesearch`` module; the
            # bare name ``search`` was undefined. Results may be lazy, so look
            # for a first hit rather than testing the object's truthiness.
            results = googlesearch.search(self.url, num_results=5)
            for _ in results:
                return 1
            return -1
        except Exception:
            return 1

    # 29. LinksPointingToPage
    def LinksPointingToPage(self):
        """Return 1 when the page has no ``<a href=`` occurrences, 0 for one or two, -1 for more."""
        try:
            number_of_links = len(re.findall(r"<a href=", self.response.text))
            if number_of_links == 0:
                return 1
            if number_of_links <= 2:
                return 0
            return -1
        except Exception:
            return -1

    # 30. StatsReport
    def StatsReport(self):
        """Return -1 when the URL or its resolved IP is blacklisted, else 1 (1 on failure)."""
        try:
            # self.url: the bare name ``url`` was undefined here. The URL is
            # checked before the DNS lookup so a failed lookup cannot hide a
            # blacklist hit.
            if self._BAD_HOST_RE.search(self.url):
                return -1
            ip_address = socket.gethostbyname(self.domain)
            if self._BAD_IP_RE.search(ip_address):
                return -1
            return 1
        except Exception:
            return 1

    def getFeaturesList(self):
        """Return the list of 30 feature values in extraction order."""
        return self.features
720
+
# Gradient-boosting model fitted on the training split; app1.py imports
# ``gbc`` from this module to classify user-supplied URLs.
gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7).fit(X_train, y_train)