File size: 1,566 Bytes
f923379
 
 
 
 
cf61aa1
f923379
 
 
 
 
 
cf61aa1
 
 
 
 
 
f923379
 
 
 
 
 
 
 
 
 
cf61aa1
 
f923379
 
 
 
 
 
 
 
cf61aa1
 
 
f923379
cf61aa1
f923379
911c3ec
f923379
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import whois
from urllib.parse import urlparse
import httpx
import pickle as pk
import pandas as pd
import extractorFunctions as ef

# Cached PCA model so the pickle is read from disk only once per process,
# not on every call to featureExtraction().
_PCA_MODEL = None


def _load_pca():
  """Load and memoize the pre-trained PCA model used for the DOM features.

  NOTE: pickle.load is only safe because this is a trusted, locally
  shipped model file — never point this path at untrusted data.
  """
  global _PCA_MODEL
  if _PCA_MODEL is None:
    with open('Phishing-URL-Detection/models/pca_model.pkl', 'rb') as file:
      _PCA_MODEL = pk.load(file)
  return _PCA_MODEL


#Function to extract features
def featureExtraction(url):
  """Extract phishing-detection features from *url*.

  Parameters
  ----------
  url : str
      The URL to analyse.

  Returns
  -------
  pandas.DataFrame
      A single-row DataFrame with columns
      ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots',
       'Sensitive_Words', 'Domain_Age', 'Domain_End', 'Have_Symbol',
       'domain_att'], suitable as direct input to the downstream classifier.

  Notes
  -----
  Performs network I/O: a WHOIS lookup and an HTTP GET of *url*.
  Failures of either are treated as features, not errors.
  """
  features = []

  # --- Address-bar based features (6) ---
  features.append(ef.getLength(url))
  features.append(ef.getDepth(url))
  features.append(ef.tinyURL(url))
  features.append(ef.prefixSuffix(url))
  features.append(ef.no_of_dots(url))
  features.append(ef.sensitive_word(url))

  # --- Domain based features (2) ---
  # A failed WHOIS lookup is itself a phishing signal: dns == 1 forces both
  # domain features to 1 instead of computing age/expiry.
  domain_name = ''
  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except Exception:  # was a bare except: keep the best-effort behavior but
    dns = 1          # don't swallow SystemExit/KeyboardInterrupt

  features.append(1 if dns == 1 else ef.domainAge(domain_name))
  features.append(1 if dns == 1 else ef.domainEnd(domain_name))

  # --- HTML & Javascript based features ---
  # An unreachable page yields response == ""; the ef.* extractors are
  # expected to treat that as "no page content".
  try:
    response = httpx.get(url)
  except Exception:  # was a bare except
    response = ""

  dom = []
  dom.append(ef.iframe(response))
  dom.append(ef.mouseOver(response))
  dom.append(ef.forwarding(response))

  features.append(ef.has_unicode(url) + ef.haveAtSign(url) + ef.havingIP(url))

  # Collapse the three DOM features into a single component via the
  # pre-trained PCA model.
  # NOTE(review): the value order above (iframe, mouseOver, forwarding) does
  # not match the column labels below (iFrame, Web_Forwards, Mouse_Over) —
  # the last two appear swapped. Left as-is to match whatever order the PCA
  # was trained on; verify against the training pipeline.
  dom_pd = pd.DataFrame([dom], columns=['iFrame', 'Web_Forwards', 'Mouse_Over'])
  features.append(_load_pca().transform(dom_pd)[0][0])

  #converting the list to dataframe
  feature_names = ['URL_Length', 'URL_Depth', 'TinyURL', 'Prefix/Suffix', 'No_Of_Dots', 'Sensitive_Words',
                       'Domain_Age', 'Domain_End', 'Have_Symbol', 'domain_att']
  row = pd.DataFrame([features], columns=feature_names)

  return row