RYU-KASH committed on
Commit
79bff6b
1 Parent(s): e16c2d6

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -743
app.py DELETED
@@ -1,743 +0,0 @@
1
import os
import subprocess
import sys

# Install the runtime dependencies with the interpreter that is running this
# script, so the packages land in the environment that imports them (a bare
# "pip" on PATH may belong to a different Python).  Installation stays best
# effort: a failed install is not fatal here and surfaces at import time.
# NOTE(review): the class below calls ``whois.whois(...)``; confirm that the
# "whois" distribution exposes that function (it may need "python-whois").
for _package in ("seaborn", "scikit-learn", "whois", "googlesearch-python"):
    subprocess.run([sys.executable, "-m", "pip", "install", _package], check=False)

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
from sklearn import metrics
import warnings

# Silence library warnings for the whole script.
warnings.filterwarnings('ignore')
15
-
16
# Load the labelled dataset; "class" is the target and "Index" is a row id.
data = pd.read_csv('phishing.csv')

# Model inputs: every column except the target and the row id.
X = data.drop(["class", "Index"], axis=1)
y = data["class"]

# Correlation heat map over all columns of the dataset.
corr = data.corr()
fig, ax = plt.subplots(1, 1, figsize=(15, 9))
sns.heatmap(corr, annot=True, cmap='viridis')
plt.title('Correlation between different features', fontsize = 15, c='black')
plt.show()

# Rank the columns by the absolute value of their correlation with the target.
corr['class'] = abs(corr['class'])
incCorr = corr.sort_values(by='class', ascending=False)

# Keep only real feature columns in the ranking.  "class" correlates perfectly
# with itself, and "Index" is not part of X, so selecting it would make
# X[tenfeatures] / X[twenfeatures] raise a KeyError.
_ranked_features = incCorr.index.drop(["class", "Index"], errors="ignore")
tenfeatures = _ranked_features[:10]
twenfeatures = _ranked_features[:20]
44
# Structure to store metrics: four parallel lists, one entry per model.
ML_Model = []
accuracy = []
f1_score = []
precision = []


def storeResults(model, a, b, c):
    """Append one model's name and scores to the module-level result lists.

    ``a``, ``b`` and ``c`` go to ``accuracy``, ``f1_score`` and ``precision``
    respectively, each rounded to three decimal places.
    """
    ML_Model.append(model)
    for bucket, score in ((accuracy, a), (f1_score, b), (precision, c)):
        bucket.append(round(score, 3))
55
-
56
def KNN(X):
    """Fit k-nearest-neighbour models for k = 1, 3, 5, 7, 9 and plot accuracy.

    ``X`` is split 80/20 (random_state=42) together with the module-level
    target ``y``.  For each k the train and test accuracy are printed, and
    both curves are plotted against k at the end.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    import matplotlib.pyplot as plt

    neighbour_counts = list(range(1, 10, 2))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 42
    )

    knntrain, knntest = [], []
    for k in neighbour_counts:
        model = KNeighborsClassifier(n_neighbors=k)
        model.fit(X_train, y_train)
        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        train_accuracy = metrics.accuracy_score(y_train, train_predictions)
        test_accuracy = metrics.accuracy_score(y_test, test_predictions)
        print("K-Nearest Neighbors with k={}: Accuracy on training Data: {:.3f}".format(k, train_accuracy))
        print("K-Nearest Neighbors with k={}: Accuracy on test Data: {:.3f}".format(k, test_accuracy))
        knntrain.append(train_accuracy)
        knntest.append(test_accuracy)
        print()

    plt.plot(neighbour_counts, knntrain, label="Train accuracy")
    plt.plot(neighbour_counts, knntest, label="Test accuracy")
    plt.legend()
    plt.show()
81
-
82
# Three views of the inputs: all features, and the 10 / 20 highest-ranked ones.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

KNN(Xmain)

KNN(Xten)

KNN(Xtwen)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Final K-Nearest Neighbors model on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)

acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)

f1_score_train_knn = metrics.f1_score(y_train, y_train_knn)
f1_score_test_knn = metrics.f1_score(y_test, y_test_knn)

precision_score_train_knn = metrics.precision_score(y_train, y_train_knn)
precision_score_test_knn = metrics.precision_score(y_test, y_test_knn)

# Record held-out scores only.  The training precision used to be stored
# here, next to the test accuracy and test F1, which mixed two data splits
# in one comparison row.
storeResults('K-Nearest Neighbors', acc_test_knn, f1_score_test_knn, precision_score_test_knn)
113
-
114
def SVM(X, y):
    """Fit linear-kernel SVMs for C = 1, 3, 5, 7, 9 and plot their accuracy.

    ``X`` and ``y`` are split 80/20 (random_state=42).  For each C the train
    and test accuracy are printed, and both curves are plotted against C at
    the end.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    import matplotlib.pyplot as plt

    penalties = list(range(1, 10, 2))
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size = 0.2, random_state = 42
    )

    svmtrain, svmtest = [], []
    for penalty in penalties:
        model = SVC(kernel='linear', C=penalty)
        model.fit(X_train, y_train)
        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        train_accuracy = metrics.accuracy_score(y_train, train_predictions)
        test_accuracy = metrics.accuracy_score(y_test, test_predictions)
        print("SVM with C={}: Accuracy on training Data: {:.3f}".format(penalty, train_accuracy))
        print("SVM with C={}: Accuracy on test Data: {:.3f}".format(penalty, test_accuracy))
        svmtrain.append(train_accuracy)
        svmtest.append(test_accuracy)
        print()

    plt.plot(penalties, svmtrain, label="Train accuracy")
    plt.plot(penalties, svmtest, label="Test accuracy")
    plt.legend()
    plt.show()
139
-
140
-
141
# Three views of the inputs: all features, and the 10 / 20 highest-ranked ones.
Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]

SVM(Xmain, y)
SVM(Xten, y)
SVM(Xtwen, y)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

# Final linear SVM on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)

y_train_svm = svm.predict(X_train)
y_test_svm = svm.predict(X_test)

acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)

f1_score_train_svm = metrics.f1_score(y_train, y_train_svm)
f1_score_test_svm = metrics.f1_score(y_test, y_test_svm)

precision_score_train_svm = metrics.precision_score(y_train, y_train_svm)
precision_score_test_svm = metrics.precision_score(y_test, y_test_svm)

print("SVM with C={}: Accuracy on training data: {:.3f}".format(1, acc_train_svm))
print("SVM with C={}: Accuracy on test data: {:.3f}".format(1, acc_test_svm))
print("SVM with C={}: F1 score on training data: {:.3f}".format(1, f1_score_train_svm))
print("SVM with C={}: F1 score on test data: {:.3f}".format(1, f1_score_test_svm))
print("SVM with C={}: Precision on training data: {:.3f}".format(1, precision_score_train_svm))
print("SVM with C={}: Precision on test data: {:.3f}".format(1, precision_score_test_svm))

# Record held-out scores only.  The training precision used to be stored
# here, next to the test accuracy and test F1, which mixed two data splits
# in one comparison row.
storeResults('Support Vector Machines', acc_test_svm, f1_score_test_svm, precision_score_test_svm)
181
-
182
from sklearn.model_selection import train_test_split

# Gradient boosting model on the full feature set.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
gbc.fit(X_train, y_train)

y_train_gbc = gbc.predict(X_train)
y_test_gbc = gbc.predict(X_test)

acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))
print()

f1_score_train_gbc = metrics.f1_score(y_train, y_train_gbc)
f1_score_test_gbc = metrics.f1_score(y_test, y_test_gbc)

precision_score_train_gbc = metrics.precision_score(y_train, y_train_gbc)
precision_score_test_gbc = metrics.precision_score(y_test, y_test_gbc)

# Record held-out scores only.  The training precision used to be stored
# here, next to the test accuracy and test F1, which mixed two data splits
# in one comparison row.
storeResults('Gradient Boosting Classifier', acc_test_gbc, f1_score_test_gbc, precision_score_test_gbc)
206
-
207
# Collect the stored scores into one table, one row per model.
_score_columns = {
    'Modelname': ML_Model,
    'Accuracy Score': accuracy,
    'F1 Score': f1_score,
    'Precision Score': precision,
}
df = pd.DataFrame(_score_columns)
df.set_index('Modelname', inplace=True)

# Plot the scores for each model as grouped bars, zoomed in on 0.90 - 1.00.
fig, ax = plt.subplots(figsize=(10,10))
df.plot(kind='bar', ax=ax)
ax.set_xticklabels(df.index, rotation=0)
ax.set_ylim([0.9, 1])
ax.set_yticks([tick / 100 for tick in range(90, 101)])
ax.set_title('Model Scores')
ax.set_xlabel('Model')
ax.set_ylabel('Score')
plt.show()
226
-
227
- import whois
228
-
229
- import googlesearch
230
-
231
- import ipaddress
232
- import re
233
- import urllib.request
234
- from bs4 import BeautifulSoup
235
- import socket
236
- import requests
237
- import google
238
- import whois
239
- from datetime import date, datetime
240
- import time
241
- from dateutil.parser import parse as date_parse
242
- from urllib.parse import urlparse
243
-
244
class FeatureExtraction:
    """Turn a URL into the 30-value feature vector passed to the classifier.

    Every extractor returns 1, 0 or -1.  The caller in this file treats a
    predicted class of 1 as "safe", and the URL-only checks return -1 when a
    suspicious trait is present, so the convention applied throughout is:
    1 = looks legitimate, 0 = inconclusive, -1 = looks like phishing.

    Constructing an instance performs network I/O (page download, WHOIS
    lookup and several third-party look-ups).  Each step is best effort: a
    failure leaves the matching attribute at its empty-string default and
    the extractors that depend on it return their fallback value.

    NOTE(review): several extractors used to return the opposite sign of the
    convention above, or crashed on undefined names and therefore always
    returned their fallback.  They are corrected here; re-validate the model
    against live URLs after deploying this version.
    """

    # Kept for backward compatibility; each instance sets its own list.
    features = []

    # Seconds to wait for any single HTTP request before giving up.
    REQUEST_TIMEOUT = 10

    # Hosts of known URL-shortening services (lower case).
    _SHORTENER_HOSTS = frozenset({
        "bit.ly", "goo.gl", "shorte.st", "go2l.ink", "x.co", "ow.ly", "t.co",
        "tinyurl.com", "tr.im", "is.gd", "cli.gs", "yfrog.com", "migre.me",
        "ff.im", "tiny.cc", "url4.eu", "twit.ac", "su.pr", "twurl.nl",
        "snipurl.com", "short.to", "budurl.com", "ping.fm", "post.ly",
        "just.as", "bkite.com", "snipr.com", "fic.kr", "loopt.us",
        "doiop.com", "short.ie", "kl.am", "wp.me", "rubyurl.com", "om.ly",
        "to.ly", "bit.do", "lnkd.in", "db.tt", "qr.ae", "adf.ly",
        "bitly.com", "cur.lv", "ity.im", "q.gs", "po.st", "bc.vc",
        "twitthis.com", "u.to", "j.mp", "buzurl.com", "cutt.us", "u.bb",
        "yourls.org", "prettylinkpro.com", "scrnch.me", "filoops.info",
        "vzturl.com", "qr.net", "1url.com", "tweez.me", "v.gd",
        "link.zip.net",
    })

    # Hosting domains treated as suspicious by StatsReport.
    _SUSPICIOUS_HOST_RE = re.compile(
        r"at\.ua|usa\.cc|baltazarpresentes\.com\.br|pe\.hu|esy\.es|hol\.es|"
        r"sweddy\.com|myjino\.ru|96\.lt|ow\.ly"
    )

    # IP addresses treated as suspicious by StatsReport.
    _BLACKLISTED_IPS = frozenset({
        "146.112.61.108", "213.174.157.151", "121.50.168.88",
        "192.185.217.116", "78.46.211.158", "181.174.165.13",
        "46.242.145.103", "121.50.168.40", "83.125.22.219", "46.242.145.98",
        "107.151.148.44", "107.151.148.107", "64.70.19.203",
        "199.184.144.27", "107.151.148.108", "107.151.148.109",
        "119.28.52.61", "54.83.43.69", "52.69.166.231", "216.58.192.225",
        "118.184.25.86", "67.208.74.71", "23.253.126.58", "104.239.157.210",
        "175.126.123.219", "141.8.224.221", "10.10.10.10", "43.229.108.32",
        "103.232.215.140", "69.172.201.153", "216.218.185.162",
        "54.225.104.146", "103.243.24.98", "199.59.243.120", "31.170.160.61",
        "213.19.128.77", "62.113.226.131", "208.100.26.234",
        "195.16.127.102", "195.16.127.157", "34.196.13.28",
        "103.224.212.222", "172.217.4.225", "54.72.9.51", "192.64.147.141",
        "198.200.56.183", "23.253.164.103", "52.48.191.26", "52.214.197.72",
        "87.98.255.18", "209.99.17.27", "216.38.62.18", "104.130.124.96",
        "47.89.58.141", "54.86.225.156", "54.82.156.19", "37.157.192.102",
        "204.11.56.48", "110.34.231.42",
    })

    def __init__(self, url):
        """Download ``url``, look up its domain and compute all 30 features."""
        self.features = []
        self.url = url
        self.domain = ""
        self.whois_response = ""
        self.urlparse = ""
        self.response = ""
        self.soup = ""

        try:
            self.response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            # Parse the downloaded page.  This used to reference an undefined
            # ``response``, so the page was never parsed.
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except Exception:
            pass

        try:
            self.urlparse = urlparse(url)
            self.domain = self.urlparse.netloc
        except Exception:
            pass

        try:
            self.whois_response = whois.whois(self.domain)
        except Exception:
            pass

        # The order below defines the layout of the feature vector.
        extractors = (
            self.UsingIp, self.longUrl, self.shortUrl, self.symbol,
            self.redirecting, self.prefixSuffix, self.SubDomains, self.Hppts,
            self.DomainRegLen, self.Favicon,
            self.NonStdPort, self.HTTPSDomainURL, self.RequestURL,
            self.AnchorURL, self.LinksInScriptTags, self.ServerFormHandler,
            self.InfoEmail, self.AbnormalURL, self.WebsiteForwarding,
            self.StatusBarCust,
            self.DisableRightClick, self.UsingPopupWindow,
            self.IframeRedirection, self.AgeofDomain, self.DNSRecording,
            self.WebsiteTraffic, self.PageRank, self.GoogleIndex,
            self.LinksPointingToPage, self.StatsReport,
        )
        for extract in extractors:
            self.features.append(extract())

    # ----- private helpers -------------------------------------------------

    def _host(self):
        """Return the lower-cased host name of the URL, or "" if unknown."""
        try:
            host = getattr(getattr(self, "urlparse", None), "hostname", None)
            if not host:
                # URL without a scheme: parse what follows any "://" as a
                # network location.
                host = urlparse("//" + self.url.split("://", 1)[-1]).hostname
        except ValueError:
            host = None
        return (host or "").lower()

    def _is_internal(self, reference):
        """Tell whether a link/src value points back at this site.

        A reference counts as internal when it contains the full URL or the
        domain, or when it contains exactly one dot.
        """
        if self.url and self.url in reference:
            return True
        if self.domain and self.domain in reference:
            return True
        return reference.count(".") == 1

    @staticmethod
    def _grade(percentage, low, high):
        """Map a percentage to 1 (< low), 0 (< high) or -1 (otherwise)."""
        if percentage < low:
            return 1
        if percentage < high:
            return 0
        return -1

    @staticmethod
    def _first(value):
        """Return the first element of a non-empty list/tuple, else ``value``."""
        if isinstance(value, (list, tuple)) and value:
            return value[0]
        return value

    def _domain_age_months(self):
        """Return whole months between the WHOIS creation date and today."""
        created = self._first(self.whois_response.creation_date)
        today = date.today()
        return (today.year - created.year) * 12 + (today.month - created.month)

    # ----- features ----------------------------------------------------------

    # 1.UsingIp
    def UsingIp(self):
        """Return -1 when the host part of the URL is an IP address, else 1.

        The whole URL used to be parsed as an address, which fails for
        anything containing a scheme or a path.
        """
        try:
            ipaddress.ip_address(self._host())
        except ValueError:
            return 1
        return -1

    # 2.longUrl
    def longUrl(self):
        """Grade the URL length: < 54 -> 1, 54..75 -> 0, longer -> -1."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    # 3.shortUrl
    def shortUrl(self):
        """Return -1 when the host is a known URL-shortening service, else 1.

        Only the host is compared.  Searching the whole URL flagged ordinary
        sites whose address merely contains a shortener name, for example
        "t.co" inside "microsoft.com".
        """
        host = self._host()
        for shortener in self._SHORTENER_HOSTS:
            if host == shortener or host.endswith("." + shortener):
                return -1
        return 1

    # 4.Symbol@
    def symbol(self):
        """Return -1 when the URL contains "@", else 1."""
        return -1 if "@" in self.url else 1

    # 5.Redirecting//
    def redirecting(self):
        """Return -1 when "//" occurs after position 6 of the URL, else 1."""
        return -1 if self.url.rfind('//') > 6 else 1

    # 6.prefixSuffix
    def prefixSuffix(self):
        """Return -1 when the domain contains "-" (or cannot be read), else 1."""
        try:
            return -1 if "-" in self.domain else 1
        except Exception:
            return -1

    # 7.SubDomains
    def SubDomains(self):
        """Grade the number of dots in the URL: 1 -> 1, 2 -> 0, else -1.

        NOTE(review): dots are counted over the whole URL, including the
        path; confirm that this is intended rather than the host only.
        """
        dot_count = self.url.count(".")
        if dot_count == 1:
            return 1
        if dot_count == 2:
            return 0
        return -1

    # 8.HTTPS
    def Hppts(self):
        """Return 1 when the scheme contains "https", else -1.

        Falls back to 1 when the URL was never parsed.
        """
        try:
            return 1 if 'https' in self.urlparse.scheme else -1
        except Exception:
            return 1

    # 9.DomainRegLen
    def DomainRegLen(self):
        """Return 1 when the WHOIS registration spans >= 12 months, else -1.

        Returns -1 when the WHOIS dates are unavailable.
        """
        try:
            expires = self._first(self.whois_response.expiration_date)
            created = self._first(self.whois_response.creation_date)
            months = (expires.year - created.year) * 12 + (expires.month - created.month)
            return 1 if months >= 12 else -1
        except Exception:
            return -1

    # 10. Favicon
    def Favicon(self):
        """Return 1 when the page has a <head> and an internal <link href>.

        Returns -1 otherwise, including when the page was not downloaded.
        """
        try:
            if not self.soup.find_all('head'):
                return -1
            for link in self.soup.find_all('link', href=True):
                if self._is_internal(link['href']):
                    return 1
            return -1
        except Exception:
            return -1

    # 11. NonStdPort
    def NonStdPort(self):
        """Return -1 when the domain carries an explicit ":port", else 1."""
        try:
            return -1 if len(self.domain.split(":")) > 1 else 1
        except Exception:
            return -1

    # 12. HTTPSDomainURL
    def HTTPSDomainURL(self):
        """Return -1 when the domain itself contains "https", else 1."""
        try:
            return -1 if 'https' in self.domain else 1
        except Exception:
            return -1

    # 13. RequestURL
    def RequestURL(self):
        """Grade the share of img/audio/embed/iframe sources that are external.

        < 22 % -> 1, < 61 % -> 0, otherwise -1.  Returns 0 when the page has
        no such elements and -1 when the page was not downloaded.

        The counters used to be uninitialised (so the method always hit its
        fallback) and the internal share was graded, which rewarded pages
        that load most of their objects from elsewhere.
        """
        try:
            total = internal = 0
            for tag_name in ('img', 'audio', 'embed', 'iframe'):
                for element in self.soup.find_all(tag_name, src=True):
                    total += 1
                    if self._is_internal(element['src']):
                        internal += 1
            if total == 0:
                return 0
            external_share = (total - internal) / float(total) * 100
            return self._grade(external_share, 22.0, 61.0)
        except Exception:
            return -1

    # 14. AnchorURL
    def AnchorURL(self):
        """Grade the share of <a href> values considered unsafe.

        An anchor is unsafe when it contains "#", "javascript" or "mailto",
        or mentions neither the URL nor the domain.  < 31 % -> 1, < 67 % ->
        0, otherwise -1.  Returns -1 when there are no anchors or the page
        was not downloaded.
        """
        try:
            total = unsafe = 0
            for anchor in self.soup.find_all('a', href=True):
                href = anchor['href']
                lowered = href.lower()
                points_home = self.url in href or self.domain in href
                if "#" in href or "javascript" in lowered or "mailto" in lowered or not points_home:
                    unsafe += 1
                total += 1
            if total == 0:
                return -1
            return self._grade(unsafe / float(total) * 100, 31.0, 67.0)
        except Exception:
            return -1

    # 15. LinksInScriptTags
    def LinksInScriptTags(self):
        """Grade the share of <link href> / <script src> values that are external.

        < 17 % -> 1, < 81 % -> 0, otherwise -1.  Returns 0 when the page has
        no such elements and -1 when the page was not downloaded.

        The internal share used to be graded, which rewarded pages that load
        most of their scripts and styles from elsewhere.
        """
        try:
            total = internal = 0
            for tag_name, attribute in (('link', 'href'), ('script', 'src')):
                for element in self.soup.find_all(tag_name, **{attribute: True}):
                    total += 1
                    if self._is_internal(element[attribute]):
                        internal += 1
            if total == 0:
                return 0
            external_share = (total - internal) / float(total) * 100
            return self._grade(external_share, 17.0, 81.0)
        except Exception:
            return -1

    # 16. ServerFormHandler
    def ServerFormHandler(self):
        """Grade the action of the first <form action=...> on the page.

        No form -> 1; empty or "about:blank" action -> -1; action mentioning
        neither the URL nor the domain -> 0; otherwise 1.  Returns -1 when
        the page was not downloaded.

        NOTE(review): only the first form is inspected, as before; confirm
        whether later forms should be considered as well.
        """
        try:
            forms = self.soup.find_all('form', action=True)
            if not forms:
                return 1
            action = forms[0]['action']
            if action == "" or action == "about:blank":
                return -1
            if self.url not in action and self.domain not in action:
                return 0
            return 1
        except Exception:
            return -1

    # 17. InfoEmail
    def InfoEmail(self):
        """Return -1 when the page text contains "mail()" or "mailto:", else 1.

        Returns -1 when the page was not downloaded.  The previous pattern
        was a character class that matched almost any text, and it was
        applied to a misspelled attribute, so the result was always -1.
        """
        try:
            if re.search(r"mail\(\)|mailto:", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 18. AbnormalURL
    def AbnormalURL(self):
        """Return 1 when a WHOIS-registered domain name occurs in the URL.

        Returns -1 otherwise, including when no WHOIS record is available.
        The previous check compared the page text with the WHOIS record,
        which can never match.
        """
        try:
            names = self.whois_response.domain_name
            if isinstance(names, str):
                names = [names]
            lowered_url = self.url.lower()
            if any(name and name.lower() in lowered_url for name in names):
                return 1
            return -1
        except Exception:
            return -1

    # 19. WebsiteForwarding
    def WebsiteForwarding(self):
        """Grade the number of redirects followed: <= 1 -> 1, <= 4 -> 0, else -1."""
        try:
            hops = len(self.response.history)
            if hops <= 1:
                return 1
            if hops <= 4:
                return 0
            return -1
        except Exception:
            return -1

    # 20. StatusBarCust
    def StatusBarCust(self):
        """Return -1 when an inline <script> uses "onmouseover", else 1.

        Returns -1 when the page was not downloaded.  The two outcomes used
        to be swapped, so the script's presence scored as legitimate.
        """
        try:
            if re.findall("<script>.+onmouseover.+</script>", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 21. DisableRightClick
    def DisableRightClick(self):
        """Return -1 when the page tests for "event.button == 2", else 1.

        Returns -1 when the page was not downloaded.  The two outcomes used
        to be swapped.
        """
        try:
            if re.findall(r"event.button ?== ?2", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 22. UsingPopupWindow
    def UsingPopupWindow(self):
        """Return -1 when the page calls "alert(", else 1.

        Returns -1 when the page was not downloaded.  The two outcomes used
        to be swapped.
        """
        try:
            if re.findall(r"alert\(", self.response.text):
                return -1
            return 1
        except Exception:
            return -1

    # 23. IframeRedirection
    def IframeRedirection(self):
        """Return -1 when the page contains an <iframe> or a frameBorder, else 1.

        Returns -1 when the page was not downloaded.  The previous pattern
        was a character class that matched almost any text, and its two
        outcomes were swapped.
        """
        try:
            if re.search(r"<iframe|frameborder", self.response.text, re.IGNORECASE):
                return -1
            return 1
        except Exception:
            return -1

    # 24. AgeofDomain
    def AgeofDomain(self):
        """Return 1 when the domain was created >= 6 months ago, else -1.

        Returns -1 when the WHOIS creation date is unavailable.
        """
        try:
            return 1 if self._domain_age_months() >= 6 else -1
        except Exception:
            return -1

    # 25. DNSRecording
    def DNSRecording(self):
        """Return 1 when the domain was created >= 6 months ago, else -1.

        Same computation as AgeofDomain; returns -1 when the WHOIS creation
        date is unavailable.
        """
        try:
            return 1 if self._domain_age_months() >= 6 else -1
        except Exception:
            return -1

    # 26. WebsiteTraffic
    def WebsiteTraffic(self):
        """Return 1 when the Alexa rank is below 100000, 0 otherwise.

        Returns -1 when the rank cannot be retrieved.
        NOTE(review): the Alexa data endpoint may no longer be available, in
        which case this always returns -1 - confirm and replace if needed.
        """
        try:
            endpoint = "http://data.alexa.com/data?cli=10&dat=s&url=" + self.url
            payload = urllib.request.urlopen(endpoint, timeout=self.REQUEST_TIMEOUT).read()
            rank = BeautifulSoup(payload, "xml").find("REACH")['RANK']
            if int(rank) < 100000:
                return 1
            return 0
        except Exception:
            return -1

    # 27. PageRank
    def PageRank(self):
        """Return 1 when checkpagerank.net reports a global rank in 1..99999.

        Returns -1 otherwise, including when the lookup fails.  The response
        used to be read through a misspelled variable name, so the result
        was always -1.
        """
        try:
            rank_checker_response = requests.post(
                "https://www.checkpagerank.net/index.php",
                {"name": self.domain},
                timeout=self.REQUEST_TIMEOUT,
            )
            global_rank = int(re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)[0])
            if 0 < global_rank < 100000:
                return 1
            return -1
        except Exception:
            return -1

    # 28. GoogleIndex
    def GoogleIndex(self):
        """Return 1 when a Google search for the URL yields a result, else -1.

        Falls back to 1 when the search itself fails.
        """
        try:
            # Imported here because the module header imports only the
            # ``googlesearch`` package, which left ``search`` undefined.
            from googlesearch import search

            # ``search`` returns a generator, and a generator object is
            # always truthy, so pull one result to learn whether any exist.
            for _result in search(self.url, num_results=5):
                return 1
            return -1
        except Exception:
            return 1

    # 29. LinksPointingToPage
    def LinksPointingToPage(self):
        """Grade the number of "<a href=" occurrences in the page text.

        0 -> 1, 1..2 -> 0, more -> -1; -1 when the page was not downloaded.

        NOTE(review): this counts links found on the page, not links that
        point to it; the grading is kept as it was - confirm the intent.
        """
        try:
            number_of_links = len(re.findall(r"<a href=", self.response.text))
            if number_of_links == 0:
                return 1
            if number_of_links <= 2:
                return 0
            return -1
        except Exception:
            return -1

    # 30. StatsReport
    def StatsReport(self):
        """Return -1 when the URL or its resolved IP is on a block list, else 1.

        The URL check runs before the DNS lookup, so a listed host is
        reported even when it does not resolve.  Falls back to 1 when the
        lookup fails for an unlisted host.
        """
        try:
            if self._SUSPICIOUS_HOST_RE.search(self.url):
                return -1
            ip_address = socket.gethostbyname(self.domain)
            if ip_address in self._BLACKLISTED_IPS:
                return -1
            return 1
        except Exception:
            return 1

    def getFeaturesList(self):
        """Return the list of 30 feature values computed at construction."""
        return self.features
720
-
721
# Train the gradient boosting model that serves the predictions.
gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
gbc.fit(X_train, y_train)

import streamlit as st

st.title("Phishing Website Detection")

# User input for URL (any URL can be provided).
url = st.text_input("Enter the Url:", key="url_input")

# Predict and display the result once the button is pressed.
if st.button("Check"):
    if not url:
        st.write("Please enter a URL.")
    else:
        obj = FeatureExtraction(url)
        # The model expects a single row holding the 30 extracted features.
        x = np.array(obj.getFeaturesList()).reshape(1, 30)
        y_pred = gbc.predict(x)[0]
        if y_pred == 1:
            st.write("We guess it is a safe website")
        else:
            st.write("Caution! Suspicious website detected")
            st.write(y_pred)
743
- st.write("Please enter a URL.")