import pickle as pk
from pathlib import Path
from urllib.parse import urlparse

import httpx
import pandas as pd
import whois

import phishing_url_detection.extractorFunctions as ef


# Function to extract features
def featureExtraction(url):
    """Build the single-row feature DataFrame the phishing classifier expects.

    Parameters
    ----------
    url : str
        The URL to characterise.

    Returns
    -------
    pandas.DataFrame
        One row with columns: URL_Length, URL_Depth, TinyURL, Prefix/Suffix,
        No_Of_Dots, Sensitive_Words, Domain_Age, Domain_End, Have_Symbol,
        domain_att.
    """
    features = []

    # Address-bar based features (6)
    features.append(ef.getLength(url))
    features.append(ef.getDepth(url))
    features.append(ef.tinyURL(url))
    features.append(ef.prefixSuffix(url))
    features.append(ef.no_of_dots(url))
    features.append(ef.sensitive_word(url))

    # Domain based features (2).
    # A failed WHOIS lookup is itself treated as a phishing signal: both
    # domain features default to 1 when the lookup raises.
    domain_name = ''
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:  # narrowed from a bare except; whois raises several error types
        dns = 1

    features.append(1 if dns == 1 else ef.domainAge(domain_name))
    features.append(1 if dns == 1 else ef.domainEnd(domain_name))

    # HTML & JavaScript based features (3, later PCA-compressed to 1).
    # On any fetch failure fall back to an empty string; the ef.* helpers
    # are expected to treat a non-response value as "suspicious" — TODO confirm.
    try:
        response = httpx.get(url)
    except Exception:  # narrowed from a bare except: network errors, bad scheme, timeouts
        response = ""

    dom = []
    dom.append(ef.iframe(response))
    dom.append(ef.mouseOver(response))
    dom.append(ef.forwarding(response))

    # Combined symbol-based feature (unicode chars + '@' sign + raw IP host).
    features.append(ef.has_unicode(url) + ef.haveAtSign(url) + ef.havingIP(url))

    # Load the pre-fitted PCA used to compress the three DOM features into one.
    # The pickle is a trusted project artifact shipped with the package, not
    # external input, so pk.load is acceptable here.
    root_folder = Path(__file__).parents[1]
    my_path = root_folder / "models/pca_model.pkl"
    with open(my_path, 'rb') as file:
        pca = pk.load(file)

    # converting the list to dataframe
    feature_names = ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix',
                     'No_Of_Dots', 'Sensitive_Words', 'Domain_Age', 'Domain_End',
                     'Have_Symbol', 'domain_att']

    # NOTE(review): dom is appended as [iframe, mouseOver, forwarding] but the
    # columns read ['iFrame', 'Web_Forwards', 'Mouse_Over'], so mouseOver lands
    # under 'Web_Forwards' and forwarding under 'Mouse_Over'. Left as-is because
    # the pickled PCA was presumably fitted with this same ordering — confirm
    # against the training pipeline before "fixing".
    dom_pd = pd.DataFrame([dom], columns=['iFrame', 'Web_Forwards', 'Mouse_Over'])
    features.append(pca.transform(dom_pd)[0][0])

    row = pd.DataFrame([features], columns=feature_names)
    return row