import whois
from urllib.parse import urlparse
import httpx
import pickle as pk
import pandas as pd
import extractorFunctions as ef
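
# URL feature extraction for a classifier (presumably phishing/malicious-URL
# detection, given extractors such as tinyURL and sensitive_word): builds a
# single-row dataframe of address-bar, WHOIS-domain and page-content features,
# compressing the three HTML/JS signals with a pre-fitted PCA model.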
# Function to extract features
def featureExtraction(url):

    features = []

    # Address bar based features (6)
    features.append(ef.getLength(url))
    features.append(ef.getDepth(url))
    features.append(ef.tinyURL(url))
    features.append(ef.prefixSuffix(url))
    features.append(ef.no_of_dots(url))
    features.append(ef.sensitive_word(url))
    # Domain based features (2)
    domain_name = ''
    dns = 0
    try:
        domain_name = whois.whois(urlparse(url).netloc)
    except Exception:
        # A failed WHOIS lookup is flagged so the domain features default to 1
        dns = 1

    features.append(1 if dns == 1 else ef.domainAge(domain_name))
    features.append(1 if dns == 1 else ef.domainEnd(domain_name))
    # HTML & Javascript based features (3)
    dom = []
    try:
        response = httpx.get(url)
    except Exception:
        # If the page cannot be fetched, fall back to an empty response
        response = ""

    dom.append(ef.iframe(response))
    dom.append(ef.mouseOver(response))
    dom.append(ef.forwarding(response))

    # Combined symbol-based flags (unicode characters, '@' sign, raw IP) as one feature
    features.append(ef.has_unicode(url) + ef.haveAtSign(url) + ef.havingIP(url))

    with open('pca_model.pkl', 'rb') as file:
        pca = pk.load(file)
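
    # Note: pca_model.pkl is assumed to be a pre-fitted scikit-learn PCA (or a
    # compatible transformer) trained on the same three DOM columns built above;
    # only its first component is kept and stored under 'domain_att'.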
    # Converting the feature list to a single-row dataframe
    feature_names = ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots', 'Sensitive_Words',
                     'Domain_Age', 'Domain_End', 'Have_Symbol', 'domain_att']

    dom_pd = pd.DataFrame([dom], columns=['iFrame', 'Web_Forwards', 'Mouse_Over'])
    features.append(pca.transform(dom_pd)[0][0])

    row = pd.DataFrame([features], columns=feature_names)

    return row
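

# Example usage (illustrative sketch; assumes pca_model.pkl and
# extractorFunctions.py are available in the working directory, and
# uses a placeholder URL).
if __name__ == '__main__':
    sample_url = 'https://example.com/login'
    feature_row = featureExtraction(sample_url)
    print(feature_row.to_string(index=False))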