Phishing-URL-Detection / featureExtractor.py
Hussain033's picture
Update featureExtractor.py
cf61aa1
raw
history blame
No virus
1.54 kB
import whois
from urllib.parse import urlparse
import httpx
import pickle as pk
import pandas as pd
import extractorFunctions as ef
#Function to extract features
def featureExtraction(url):
    """Build the single-row feature DataFrame for ``url``.

    The returned row has ten columns, in this order: ``URL_Length``,
    ``URL_Depth``, ``TinyURL``, ``Prefix/Suffix``, ``No_Of_Dots``,
    ``Sensitive_Words``, ``Domain_Age``, ``Domain_End``, ``Have_Symbol``
    and ``domain_att``.

    Side effects: performs a WHOIS lookup on the URL's network location,
    issues an HTTP GET for ``url``, and reads ``pca_model.pkl`` relative to
    the current working directory.

    Args:
        url: The URL to analyse.

    Returns:
        A ``pandas.DataFrame`` containing exactly one row of features.

    Raises:
        OSError: If ``pca_model.pkl`` cannot be opened.
    """
    # Address bar based features (6)
    features = [
        ef.getLength(url),
        ef.getDepth(url),
        ef.tinyURL(url),
        ef.prefixSuffix(url),
        ef.no_of_dots(url),
        ef.sensitive_word(url),
    ]

    # Domain based features (2)
    try:
        whois_record = whois.whois(urlparse(url).netloc)
    except Exception:
        # Best effort: when the WHOIS lookup fails, both domain features
        # fall back to 1. ``Exception`` (not a bare ``except``) so that
        # KeyboardInterrupt / SystemExit still propagate.
        features.extend([1, 1])
    else:
        features.append(ef.domainAge(whois_record))
        features.append(ef.domainEnd(whois_record))

    # HTML & Javascript based features (3, later reduced to one PCA value)
    try:
        response = httpx.get(url)
    except Exception:
        # Best effort: the extractors below receive an empty string when the
        # page could not be fetched.
        response = ""
    iframe_value = ef.iframe(response)
    mouse_over_value = ef.mouseOver(response)
    forwarding_value = ef.forwarding(response)

    # Combined symbol feature: sum of the three individual URL checks.
    features.append(ef.has_unicode(url) + ef.haveAtSign(url) + ef.havingIP(url))

    # NOTE: pickle can execute arbitrary code while loading; pca_model.pkl
    # must be a trusted file shipped with the application.
    with open('pca_model.pkl', 'rb') as file:
        pca = pk.load(file)

    # Each value is paired with the column named after its extractor. The
    # previous code passed [iframe, mouseOver, forwarding] under the labels
    # ['iFrame', 'Web_Forwards', 'Mouse_Over'], swapping the last two.
    # NOTE(review): assumes the PCA was fitted on correctly labelled
    # columns - confirm against the training pipeline.
    dom_pd = pd.DataFrame(
        [[iframe_value, forwarding_value, mouse_over_value]],
        columns=['iFrame', 'Web_Forwards', 'Mouse_Over'],
    )
    # Only the first principal component is kept as the 'domain_att' feature.
    features.append(pca.transform(dom_pd)[0][0])

    # Converting the list to a dataframe
    feature_names = [
        'URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots',
        'Sensitive_Words', 'Domain_Age', 'Domain_End', 'Have_Symbol',
        'domain_att',
    ]
    return pd.DataFrame([features], columns=feature_names)