"""Phishing-URL detection: model comparison script and URL feature extraction.

The first half loads ``phishing.csv``, ranks the features by absolute
correlation with ``class`` and compares K-Nearest Neighbors, a linear SVM and a
Gradient Boosting classifier.  The second half starts the ``FeatureExtraction``
class, which derives -1 / 0 / 1 feature values from a live URL.
"""
import os

# Notebook-style dependency bootstrap (kept from the original script).
os.system("pip install seaborn")
os.system("pip install scikit-learn")
# NOTE(review): the code below calls ``whois.whois(...)``; confirm that the
# ``whois`` distribution (rather than ``python-whois``) provides that function.
os.system("pip install whois")
os.system("pip install googlesearch-python")

import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
#%matplotlib inline
import seaborn as sns
from sklearn import metrics
import warnings

warnings.filterwarnings('ignore')

data = pd.read_csv('phishing.csv')
data.head(20)
data.columns
len(data.columns)
data.isnull().sum()

# Feature matrix / target: everything except the label and the row index.
X = data.drop(["class", "Index"], axis=1)
y = data["class"]

fig, ax = plt.subplots(1, 1, figsize=(15, 9))
sns.heatmap(data.corr(), annot=True, cmap='viridis')
plt.title('Correlation between different features', fontsize=15, c='black')
plt.show()

corr = data.corr()
corr.head()
corr['class'] = abs(corr['class'])
corr.head()
# "Index" was dropped from X above, so it must not be selectable as a feature;
# otherwise X[tenfeatures] / X[twenfeatures] would raise a KeyError.
incCorr = corr.drop(index="Index", errors="ignore").sort_values(by='class', ascending=False)
incCorr.head()
incCorr['class']
# Row 0 is "class" itself (|corr| == 1), so the ranking starts at row 1.
tenfeatures = incCorr[1:11].index
twenfeatures = incCorr[1:21].index

# Structure to store metrics
ML_Model = []
accuracy = []
f1_score = []
precision = []


def storeResults(model, a, b, c):
    """Record one model's scores in the module-level result lists.

    model: display name of the model.
    a, b, c: accuracy, F1 score and precision; each is rounded to 3 decimals.
    """
    ML_Model.append(model)
    accuracy.append(round(a, 3))
    f1_score.append(round(b, 3))
    precision.append(round(c, 3))


def KNN(X):
    """Fit K-Nearest Neighbors for k = 1, 3, 5, 7, 9 and plot the accuracies.

    X is split 80/20 against the module-level target ``y``; train and test
    accuracy are printed for every k and then plotted against k.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier

    x = list(range(1, 10, 2))
    knntrain = []
    knntest = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train.shape, y_train.shape, X_test.shape, y_test.shape
    for i in x:
        knn = KNeighborsClassifier(n_neighbors=i)
        knn.fit(X_train, y_train)
        y_train_knn = knn.predict(X_train)
        y_test_knn = knn.predict(X_test)
        acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
        acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)
        print("K-Nearest Neighbors with k={}: Accuracy on training Data: {:.3f}".format(i, acc_train_knn))
        print("K-Nearest Neighbors with k={}: Accuracy on test Data: {:.3f}".format(i, acc_test_knn))
        knntrain.append(acc_train_knn)
        knntest.append(acc_test_knn)
        print()
    plt.plot(x, knntrain, label="Train accuracy")
    plt.plot(x, knntest, label="Test accuracy")
    plt.legend()
    plt.show()


Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]
KNN(Xmain)
KNN(Xten)
KNN(Xtwen)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_train_knn = knn.predict(X_train)
y_test_knn = knn.predict(X_test)
acc_train_knn = metrics.accuracy_score(y_train, y_train_knn)
acc_test_knn = metrics.accuracy_score(y_test, y_test_knn)
f1_score_train_knn = metrics.f1_score(y_train, y_train_knn)
f1_score_test_knn = metrics.f1_score(y_test, y_test_knn)
precision_score_train_knn = metrics.precision_score(y_train, y_train_knn)
precision_score_test_knn = metrics.precision_score(y_test, y_test_knn)
# All three stored scores are test-set scores (the original stored the
# training precision next to test accuracy / F1).
storeResults('K-Nearest Neighbors', acc_test_knn, f1_score_test_knn, precision_score_test_knn)


def SVM(X, y):
    """Fit a linear SVM for C = 1, 3, 5, 7, 9 and plot the accuracies.

    X and y are split 80/20; train and test accuracy are printed for every C
    and then plotted against C.
    """
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    x = list(range(1, 10, 2))
    svmtrain = []
    svmtest = []
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train.shape, y_train.shape, X_test.shape, y_test.shape
    for i in x:
        svm = SVC(kernel='linear', C=i)
        svm.fit(X_train, y_train)
        y_train_svm = svm.predict(X_train)
        y_test_svm = svm.predict(X_test)
        acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
        acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)
        print("SVM with C={}: Accuracy on training Data: {:.3f}".format(i, acc_train_svm))
        print("SVM with C={}: Accuracy on test Data: {:.3f}".format(i, acc_test_svm))
        svmtrain.append(acc_train_svm)
        svmtest.append(acc_test_svm)
        print()
    plt.plot(x, svmtrain, label="Train accuracy")
    plt.plot(x, svmtest, label="Test accuracy")
    plt.legend()
    plt.show()


Xmain = X
Xten = X[tenfeatures]
Xtwen = X[twenfeatures]
SVM(Xmain, y)
SVM(Xten, y)
SVM(Xtwen, y)

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn import metrics

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
svm = SVC(kernel='linear', C=1, random_state=42)
svm.fit(X_train, y_train)
y_train_svm = svm.predict(X_train)
y_test_svm = svm.predict(X_test)
acc_train_svm = metrics.accuracy_score(y_train, y_train_svm)
acc_test_svm = metrics.accuracy_score(y_test, y_test_svm)
f1_score_train_svm = metrics.f1_score(y_train, y_train_svm)
f1_score_test_svm = metrics.f1_score(y_test, y_test_svm)
precision_score_train_svm = metrics.precision_score(y_train, y_train_svm)
precision_score_test_svm = metrics.precision_score(y_test, y_test_svm)
print("SVM with C={}: Accuracy on training data: {:.3f}".format(1, acc_train_svm))
print("SVM with C={}: Accuracy on test data: {:.3f}".format(1, acc_test_svm))
print("SVM with C={}: F1 score on training data: {:.3f}".format(1, f1_score_train_svm))
print("SVM with C={}: F1 score on test data: {:.3f}".format(1, f1_score_test_svm))
print("SVM with C={}: Precision on training data: {:.3f}".format(1, precision_score_train_svm))
print("SVM with C={}: Precision on test data: {:.3f}".format(1, precision_score_test_svm))
storeResults('Support Vector Machines', acc_test_svm, f1_score_test_svm, precision_score_test_svm)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(max_depth=4, learning_rate=0.7)
gbc.fit(X_train, y_train)
y_train_gbc = gbc.predict(X_train)
y_test_gbc = gbc.predict(X_test)
acc_train_gbc = metrics.accuracy_score(y_train, y_train_gbc)
acc_test_gbc = metrics.accuracy_score(y_test, y_test_gbc)
print("Gradient Boosting Classifier : Accuracy on training Data: {:.3f}".format(acc_train_gbc))
print("Gradient Boosting Classifier : Accuracy on test Data: {:.3f}".format(acc_test_gbc))
print()
f1_score_train_gbc = metrics.f1_score(y_train, y_train_gbc)
f1_score_test_gbc = metrics.f1_score(y_test, y_test_gbc)
precision_score_train_gbc = metrics.precision_score(y_train, y_train_gbc)
precision_score_test_gbc = metrics.precision_score(y_test, y_test_gbc)
storeResults('Gradient Boosting Classifier', acc_test_gbc, f1_score_test_gbc, precision_score_test_gbc)

df = pd.DataFrame({
    'Modelname': ML_Model,
    'Accuracy Score': accuracy,
    'F1 Score': f1_score,
    'Precision Score': precision,
})
df.set_index('Modelname', inplace=True)

# plot the scores for each model
fig, ax = plt.subplots(figsize=(10, 10))
df.plot(kind='bar', ax=ax)
ax.set_xticklabels(df.index, rotation=0)
ax.set_ylim([0.9, 1])
ax.set_yticks([0.9, 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99, 1])
ax.set_xlabel('Model')
ax.set_ylabel('Score')
ax.set_title('Model Scores')
plt.show()

import whois
import googlesearch
import ipaddress
import re
import urllib.request
from bs4 import BeautifulSoup
import socket
import requests
import google
from datetime import date, datetime
import time
from dateutil.parser import parse as date_parse
from urllib.parse import urlparse


class FeatureExtraction:
    """Derive phishing-detection features (each -1, 0 or 1) from a URL.

    Constructing an instance fetches the page, parses the URL, runs a WHOIS
    lookup and fills ``self.features`` by calling every feature method in a
    fixed order.
    """

    # Class-level default; every instance rebinds its own list in __init__.
    features = []

    def __init__(self, url):
        """Collect the page, URL parts and WHOIS data, then compute all features.

        Each lookup is best-effort: when it fails the attribute keeps its
        empty-string placeholder and the feature methods fall back to their
        error value.
        """
        self.features = []
        self.url = url
        self.domain = ""
        self.whois_response = ""
        self.urlparse = ""
        self.response = ""
        self.soup = ""

        try:
            # A timeout keeps an unresponsive host from blocking forever.
            self.response = requests.get(url, timeout=10)
            # The original read the undefined name ``response`` here, so the
            # swallowed NameError left ``self.soup`` empty for every URL.
            self.soup = BeautifulSoup(self.response.text, 'html.parser')
        except Exception:
            pass

        try:
            self.urlparse = urlparse(url)
            self.domain = self.urlparse.netloc
        except Exception:
            pass

        try:
            self.whois_response = whois.whois(self.domain)
        except Exception:
            pass

        # The order defines the position of each value in ``self.features``.
        # Methods from RequestURL onwards are expected further down the class.
        extractors = (
            self.UsingIp,
            self.longUrl,
            self.shortUrl,
            self.symbol,
            self.redirecting,
            self.prefixSuffix,
            self.SubDomains,
            self.Hppts,
            self.DomainRegLen,
            self.Favicon,
            self.NonStdPort,
            self.HTTPSDomainURL,
            self.RequestURL,
            self.AnchorURL,
            self.LinksInScriptTags,
            self.ServerFormHandler,
            self.InfoEmail,
            self.AbnormalURL,
            self.WebsiteForwarding,
            self.StatusBarCust,
            self.DisableRightClick,
            self.UsingPopupWindow,
            self.IframeRedirection,
            self.AgeofDomain,
            self.DNSRecording,
            self.WebsiteTraffic,
            self.PageRank,
            self.GoogleIndex,
            self.LinksPointingToPage,
            self.StatsReport,
        )
        for extractor in extractors:
            self.features.append(extractor())

    # 1.UsingIp
    def UsingIp(self):
        """Return -1 when the URL's host is an IP address literal, otherwise 1."""
        try:
            # ip_address() only accepts a bare address, so test the host part;
            # fall back to the raw string for inputs without a scheme.
            host = urlparse(self.url).hostname or self.url
            ipaddress.ip_address(host)
            return -1
        except (ValueError, TypeError, AttributeError):
            return 1

    # 2.longUrl
    def longUrl(self):
        """Return 1 for URLs shorter than 54 chars, 0 up to 75 chars, else -1."""
        length = len(self.url)
        if length < 54:
            return 1
        if length <= 75:
            return 0
        return -1

    # 3.shortUrl
    def shortUrl(self):
        """Return -1 when the URL contains a known URL-shortener name, else 1."""
        # NOTE(review): the search is unanchored, so e.g. "t.co" also matches
        # inside longer host names - confirm whether that is intended.
        match = re.search(r'bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                          r'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                          r'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                          r'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                          r'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                          r'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                          r'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|tr\.im|link\.zip\.net',
                          self.url)
        if match:
            return -1
        return 1

    # 4.Symbol@
    def symbol(self):
        """Return -1 when the URL contains an '@', otherwise 1."""
        if "@" in self.url:
            return -1
        return 1

    # 5.Redirecting//
    def redirecting(self):
        """Return -1 when the last '//' in the URL starts after index 6, else 1."""
        if self.url.rfind('//') > 6:
            return -1
        return 1

    # 6.prefixSuffix
    def prefixSuffix(self):
        """Return -1 when the domain contains a '-', otherwise 1 (-1 on error)."""
        try:
            if '-' in self.domain:
                return -1
            return 1
        except TypeError:
            return -1

    # 7.SubDomains
    def SubDomains(self):
        """Score the number of dots in the URL: one -> 1, two -> 0, otherwise -1."""
        dot_count = self.url.count(".")
        if dot_count == 1:
            return 1
        elif dot_count == 2:
            return 0
        return -1

    # 8.HTTPS
    def Hppts(self):
        """Return 1 when the URL scheme contains 'https', otherwise -1.

        Returns 1 as well when the parsed URL is unavailable.
        """
        try:
            if 'https' in self.urlparse.scheme:
                return 1
            return -1
        except (AttributeError, TypeError):
            return 1

    # 9.DomainRegLen
    def DomainRegLen(self):
        """Return 1 when the WHOIS registration spans at least 12 months, else -1.

        The span is counted in calendar months between creation and expiration
        date; any missing or unusable WHOIS data yields -1.
        """
        try:
            expiration_date = self.whois_response.expiration_date
            creation_date = self.whois_response.creation_date
            # WHOIS may report several dates as a list; use the first entry.
            if isinstance(expiration_date, (list, tuple)) and expiration_date:
                expiration_date = expiration_date[0]
            if isinstance(creation_date, (list, tuple)) and creation_date:
                creation_date = creation_date[0]

            age = ((expiration_date.year - creation_date.year) * 12
                   + (expiration_date.month - creation_date.month))
            if age >= 12:
                return 1
            return -1
        except Exception:
            return -1

    # 10. Favicon
    def Favicon(self):
        """Return 1 when some <link href> points at this site, otherwise -1.

        A link counts when its href contains the URL or the domain, or holds
        exactly one dot.  Pages without a <head>, or that could not be parsed,
        yield -1.
        """
        try:
            if not self.soup.find_all('head'):
                return -1
            for link in self.soup.find_all('link', href=True):
                href = link['href']
                # The original referenced the undefined name ``domain`` here.
                if self.url in href or href.count('.') == 1 or self.domain in href:
                    return 1
            return -1
        except Exception:
            return -1

    # 11. NonStdPort
    def NonStdPort(self):
        """Return -1 when the network location carries a ':' (port), else 1."""
        try:
            if len(self.domain.split(":")) > 1:
                return -1
            return 1
        except (AttributeError, TypeError):
            return -1

    # 12. HTTPSDomainURL
    def HTTPSDomainURL(self):
        """Return -1 when 'https' appears inside the domain itself, otherwise 1."""
        try:
            if 'https' in self.domain:
                return -1
            return 1
        except TypeError:
            return -1
# 13. RequestURL
# NOTE: RequestURL and AnchorURL are methods of the FeatureExtraction class
# (they take ``self``).  The collapsed source carries no indentation, so they
# are laid out from column 0 here.
def RequestURL(self):
    """Score how many embedded media resources are loaded from this site.

    Looks at every <img>, <audio>, <embed> and <iframe> that has a ``src``.
    A resource counts as internal when its ``src`` contains the URL or the
    domain, or holds exactly one dot.

    Returns 1 when fewer than 22% are internal, 0 for 22% up to (not
    including) 61%, and -1 otherwise.  Returns 0 when the page has no such
    tags and -1 when the parsed page is unavailable.
    """
    # NOTE(review): a low share of internal resources maps to 1 here - confirm
    # the intended polarity against the training data.
    try:
        # The original never initialised these counters, so the first tag
        # raised UnboundLocalError and the method always fell through to -1.
        total = 0
        internal = 0
        for tag_name in ('img', 'audio', 'embed', 'iframe'):
            for tag in self.soup.find_all(tag_name, src=True):
                src = tag['src']
                if self.url in src or self.domain in src or src.count('.') == 1:
                    internal += 1
                total += 1
    except Exception:
        # Best-effort contract: never raise, report -1 instead.
        return -1

    if total == 0:
        return 0
    percentage = internal / total * 100
    if percentage < 22.0:
        return 1
    if percentage < 61.0:
        return 0
    return -1


# 14. AnchorURL
def AnchorURL(self):
    """Score the share of <a href> anchors that do not lead back to this site.

    An anchor is unsafe when its href contains '#', 'javascript' or 'mailto'
    (the latter two case-insensitively), or contains neither the URL nor the
    domain.

    Returns 1 when fewer than 31% are unsafe, 0 for 31% up to (not including)
    67%, and -1 otherwise.  Returns -1 as well when the page has no anchors or
    the parsed page is unavailable.
    """
    try:
        total = 0
        unsafe = 0
        for anchor in self.soup.find_all('a', href=True):
            href = anchor['href']
            lowered = href.lower()
            # The original tested the undefined name ``url`` instead of
            # ``self.url``, which raised NameError for ordinary links.
            points_home = self.url in href or self.domain in href
            if "#" in href or "javascript" in lowered or "mailto" in lowered or not points_home:
                unsafe += 1
            total += 1
    except Exception:
        # Best-effort contract: never raise, report -1 instead.
        return -1

    if total == 0:
        return -1
    percentage = unsafe / total * 100
    if percentage < 31.0:
        return 1
    if percentage < 67.0:
        return 0
    return -1
# 15. LinksInScriptTags
# NOTE: the functions in this block are methods of the FeatureExtraction class
# (they take ``self``).  The collapsed source carries no indentation, so they
# are laid out from column 0 here.
def LinksInScriptTags(self):
    """Score how many <link href> / <script src> references stay on this site.

    A reference counts as internal when it contains the URL or the domain, or
    holds exactly one dot.

    Returns 1 when fewer than 17% are internal, 0 for 17% up to (not
    including) 81%, and -1 otherwise.  Returns 0 when the page has no such
    tags and -1 when the parsed page is unavailable.
    """
    # NOTE(review): a low share of internal references maps to 1 here -
    # confirm the intended polarity against the training data.
    try:
        total = 0
        internal = 0
        for tag_name, attribute in (('link', 'href'), ('script', 'src')):
            for tag in self.soup.find_all(tag_name, **{attribute: True}):
                target = tag[attribute]
                if self.url in target or self.domain in target or target.count('.') == 1:
                    internal += 1
                total += 1
    except Exception:
        # Best-effort contract: never raise, report -1 instead.
        return -1

    if total == 0:
        return 0
    percentage = internal / total * 100
    if percentage < 17.0:
        return 1
    if percentage < 81.0:
        return 0
    return -1


# 16. ServerFormHandler
def ServerFormHandler(self):
    """Score where the page's forms submit their data.

    Returns 1 when there is no <form action> or every action contains the URL
    or the domain, -1 when any action is empty or "about:blank", and 0 when
    any action points elsewhere.  Returns -1 when the parsed page is
    unavailable.
    """
    try:
        actions = [form['action'] for form in self.soup.find_all('form', action=True)]
        # The original returned inside the first loop iteration, so only the
        # first form was ever inspected; every form is checked now and the
        # most suspicious finding wins.
        if any(action in ("", "about:blank") for action in actions):
            return -1
        if any(self.url not in action and self.domain not in action for action in actions):
            return 0
        return 1
    except Exception:
        return -1


# 17. InfoEmail
def InfoEmail(self):
    """Return -1 when the page source contains ``mail()`` or ``mailto:``, else 1.

    Returns -1 as well when the page could not be fetched.
    """
    try:
        # The original searched the misspelled attribute ``self.soap`` (always
        # an AttributeError, hence always -1) with a character class that
        # would match almost any text; search the fetched HTML for the two
        # literal markers instead.
        if re.findall(r"mail\(\)|mailto:", self.response.text):
            return -1
        return 1
    except Exception:
        return -1


# 18. AbnormalURL
def AbnormalURL(self):
    """Return 1 when the page text equals the WHOIS response, otherwise -1."""
    # NOTE(review): this compares the fetched page text with the WHOIS result,
    # which looks unlikely to ever be equal - confirm the intended check.
    try:
        if self.response.text == self.whois_response:
            return 1
        return -1
    except Exception:
        return -1


# 19. WebsiteForwarding
def WebsiteForwarding(self):
    """Score the redirect chain: at most 1 hop -> 1, at most 4 -> 0, else -1.

    Returns -1 when the page could not be fetched.
    """
    try:
        hops = len(self.response.history)
    except Exception:
        return -1
    if hops <= 1:
        return 1
    if hops <= 4:
        return 0
    return -1


# 20. StatusBarCust
def StatusBarCust(self):
    """Return 1 when the pattern is found in the page source, otherwise -1."""
    # NOTE(review): the pattern is empty and an empty pattern matches any
    # string, so this returns 1 for every fetched page.  The real pattern
    # appears to be missing from the source - restore it once known.
    try:
        if re.findall("", self.response.text):
            return 1
        return -1
    except Exception:
        return -1


# 21. DisableRightClick
def DisableRightClick(self):
    """Return 1 when the page source contains an ``event.button == 2`` test, else -1."""
    # NOTE(review): finding the right-click check maps to 1 - confirm polarity.
    try:
        if re.findall(r"event.button ?== ?2", self.response.text):
            return 1
        return -1
    except Exception:
        return -1


# 22. UsingPopupWindow
def UsingPopupWindow(self):
    """Return 1 when the page source contains an ``alert(`` call, otherwise -1."""
    # NOTE(review): finding a popup call maps to 1 - confirm polarity.
    try:
        if re.findall(r"alert\(", self.response.text):
            return 1
        return -1
    except Exception:
        return -1

# 23.
IframeRedirection def IframeRedirection(self): try: if re.findall(r"[