Spaces:
Sleeping
Sleeping
Hussain033
committed on
Commit
•
ad60c0d
1
Parent(s):
f923379
Create extractorFunctions.py
Browse files- extractorFunctions.py +175 -0
extractorFunctions.py
ADDED
@@ -0,0 +1,175 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# importing required packages for Address Bar Based feature Extraction
|
2 |
+
from urllib.parse import urlparse,urlencode, unquote
|
3 |
+
import re
|
4 |
+
# importing required packages for Domain Based Feature Extraction
|
5 |
+
import whois
|
6 |
+
from datetime import datetime
|
7 |
+
|
8 |
+
|
9 |
+
# 2.Checks for IP address in URL (Have_IP)
|
10 |
+
def havingIP(url):
|
11 |
+
ip_pattern = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
|
12 |
+
match = re.search(ip_pattern, url)
|
13 |
+
if match:
|
14 |
+
return 1
|
15 |
+
return 0
|
16 |
+
|
17 |
+
# 3.Checks the presence of @ in URL (Have_At)
|
18 |
+
def haveAtSign(url):
    """Report whether the URL contains an ``@`` character.

    Args:
        url: URL string to inspect.

    Returns:
        1 if ``@`` occurs anywhere in ``url``, else 0.
    """
    return 1 if "@" in url else 0
|
24 |
+
|
25 |
+
# 4. Returns the length of the URL in characters (URL_Length)
|
26 |
+
def getLength(url):
    """Return the number of characters in ``url``.

    Args:
        url: URL string to measure.

    Returns:
        The raw length of ``url``; no thresholding is applied here.
    """
    url_length = len(url)
    return url_length
|
28 |
+
|
29 |
+
# 5. Counts the non-empty path segments of the URL (URL_Depth)
|
30 |
+
def getDepth(url):
    """Count the non-empty segments in the path component of ``url``.

    The path is taken from ``urlparse`` and split on ``/``; empty pieces
    (leading, trailing, or doubled slashes) are not counted.

    Args:
        url: URL string to inspect.

    Returns:
        Number of non-empty path segments.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
|
37 |
+
|
38 |
+
#listing shortening services
|
39 |
+
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

# The alternation is anchored to the end of the hostname and to a label
# boundary at its start. An unanchored search matched ordinary sites such as
# "reddit.com" (contains "t.co") or "dropbox.com" (contains "x.co").
_SHORTENER_HOST_RE = re.compile(
    r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE
)


# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Report whether the URL's host is a known URL-shortening service.

    Only the hostname is examined (exact host or a subdomain of it, matched
    case-insensitively); shortener names appearing in the path or query do
    not count. URLs written without a scheme are parsed as if they started
    with ``//`` so that their host is still recognised.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the host matches ``shortening_services``, else 0. Also 0 when
        no host can be extracted from ``url``.
    """
    try:
        parsed = urlparse(url)
        if not parsed.netloc:
            # No scheme: urlparse put everything in the path, so retry as a
            # network-location reference.
            parsed = urlparse("//" + url)
        host = parsed.hostname
    except ValueError:
        # urlparse rejects some malformed hosts (e.g. unbalanced brackets).
        return 0

    if not host:
        return 0
    # A trailing dot denotes the DNS root and is not part of the name.
    return 1 if _SHORTENER_HOST_RE.search(host.rstrip(".")) else 0
|
55 |
+
|
56 |
+
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
|
57 |
+
def prefixSuffix(url):
    """Report whether the network-location part of the URL contains ``-``.

    Args:
        url: URL string to inspect.

    Returns:
        1 (treated as phishing) if the ``netloc`` produced by ``urlparse``
        contains a hyphen, else 0 (treated as legitimate).
    """
    network_location = urlparse(url).netloc
    has_hyphen = '-' in network_location
    return 1 if has_hyphen else 0
|
62 |
+
|
63 |
+
def no_of_dots(url):
|
64 |
+
return url.count('.')
|
65 |
+
|
66 |
+
# Keywords whose presence in a hostname is treated as suspicious.
# Every entry is followed by a comma: the original list was missing commas
# after "notice" and "member", which silently fused them with the next
# entries into "noticemyaccount" and "memberpersonal". Duplicate entries
# ("verify", "transaction") were dropped, and "ebyisapi" was corrected to
# "ebayisapi".
sensitiveWords = [
    "account", "confirm", "banking", "secure", "ebayisapi", "webscr", "signin", "mail",
    "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
    "login", "support", "billing", "transaction", "security", "payment", "online",
    "customer", "service", "accountupdate", "verification", "important", "confidential",
    "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
    "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
    "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent",
]


def sensitive_word(url):
    """Report whether the URL's network location contains a sensitive keyword.

    Only the ``netloc`` produced by ``urlparse`` is searched (not the path or
    query), using substring matching. The comparison is case-insensitive
    because hostnames are case-insensitive while ``netloc`` keeps the
    caller's casing.

    Args:
        url: URL string to inspect.

    Returns:
        1 if any entry of ``sensitiveWords`` occurs in the netloc, else 0.
    """
    domain = urlparse(url).netloc.lower()
    return 1 if any(word in domain for word in sensitiveWords) else 0
|
80 |
+
|
81 |
+
|
82 |
+
def has_unicode(url):
    """Report whether the URL's host hides or contains non-plain characters.

    The network location is flagged when it contains raw non-ASCII
    characters, or when IDNA (punycode ``xn--``) decoding followed by
    percent-decoding yields something different from the text as written.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the netloc is non-ASCII or changes when decoded, else 0.
    """
    netloc = urlparse(url).netloc

    try:
        ascii_netloc = netloc.encode("ascii")
    except UnicodeEncodeError:
        # Raw Unicode in the host is exactly what this feature looks for.
        # The previous latin-1 encoding step raised on such input instead of
        # reporting it.
        return 1

    try:
        decoded_netloc = ascii_netloc.decode("idna")
    except UnicodeError:
        # Malformed punycode label: nothing could be decoded, so only the
        # percent-decoding comparison below can still flag the host.
        decoded_netloc = netloc

    return 1 if unquote(decoded_netloc) != netloc else 0
|
100 |
+
|
101 |
+
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
|
102 |
+
def domainAge(domain_name):
    """Classify a domain by the span between its creation and expiration dates.

    Args:
        domain_name: Object exposing ``creation_date`` and
            ``expiration_date`` attributes (presumably a WHOIS lookup
            result; verify against the caller). Each value may be a
            ``datetime``, a ``"%Y-%m-%d"`` string, a list, or ``None``.

    Returns:
        1 if either date is missing, is a list, or is an unparsable string,
        or if the span is shorter than 6 months (counted as 30-day months);
        otherwise 0.
    """
    normalized = []
    for value in (domain_name.creation_date, domain_name.expiration_date):
        if value is None or isinstance(value, list):
            return 1
        if isinstance(value, str):
            # Each string is parsed on its own. Previously both values were
            # passed to strptime whenever either was a string, so a record
            # mixing a string with a datetime was always reported as 1.
            try:
                value = datetime.strptime(value, "%Y-%m-%d")
            except ValueError:
                return 1
        # Convert timezone-aware values to naive UTC so that aware and naive
        # datetimes can be subtracted from one another.
        offset = value.utcoffset()
        if offset is not None:
            value = (value - offset).replace(tzinfo=None)
        normalized.append(value)

    creation_date, expiration_date = normalized
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0
|
122 |
+
|
123 |
+
# 14.End time of domain: The difference between termination time and current time (Domain_End)
|
124 |
+
def domainEnd(domain_name):
    """Classify a domain by the time between now and its expiration date.

    Args:
        domain_name: Object exposing an ``expiration_date`` attribute
            (presumably a WHOIS lookup result; verify against the caller).
            The value may be a ``datetime``, a ``"%Y-%m-%d"`` string, a
            list, or ``None``.

    Returns:
        1 if the date is missing, is a list, or is an unparsable string.
        Otherwise 0 when the absolute distance between the expiration date
        and now is under 6 months (counted as 30-day months), and 1 when it
        is 6 months or more.
    """
    expiration_date = domain_name.expiration_date
    if expiration_date is None or isinstance(expiration_date, list):
        return 1
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    # Take "now" in the expiration date's own timezone (naive local time when
    # it has none); subtracting a naive "now" from an aware datetime raises
    # TypeError.
    today = datetime.now(expiration_date.tzinfo)
    days_to_end = abs((expiration_date - today).days)
    return 0 if (days_to_end / 30) < 6 else 1
|
143 |
+
|
144 |
+
# 15. IFrame Redirection (iFrame)
|
145 |
+
def iframe(response):
    """Classify a fetched page by the presence of iframe markup.

    Args:
        response: Either the empty string (used by this file's response
            checks to mean "no response") or an object with a ``text``
            attribute holding the page source.

    Returns:
        1 when ``response`` is the empty string or the page contains neither
        an ``<iframe`` tag nor a ``frameborder`` attribute; 0 when either is
        present.
    """
    if response == "":
        return 1
    # The previous pattern, "[<iframe>|<frameBorder>]", was a character class:
    # it matched any single one of those letters or symbols, so practically
    # every non-empty page was reported as 0. Match the actual tokens instead.
    # NOTE(review): the 0/1 polarity is kept from the original (markup found
    # -> 0). It looks inverted relative to mouseOver in this file, where a
    # match returns 1; confirm the intended labelling.
    if re.search(r"<\s*iframe\b|\bframeborder\b", response.text, re.IGNORECASE):
        return 0
    return 1
|
153 |
+
|
154 |
+
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
|
155 |
+
def mouseOver(response):
    """Classify a fetched page by ``onmouseover`` usage inside a script tag.

    Args:
        response: Either the empty string (used by this file's response
            checks to mean "no response") or an object with a ``text``
            attribute holding the page source.

    Returns:
        1 when ``response`` is the empty string, when the page text cannot
        be read or searched, or when ``onmouseover`` appears between a
        literal ``<script>`` and ``</script>`` on one line; otherwise 0.
    """
    if response == "":
        return 1
    try:
        # NOTE(review): "." does not cross newlines and "<script>" must appear
        # without attributes, so multi-line or attributed script blocks are
        # not detected. Pattern left unchanged to keep the feature stable.
        found = re.search("<script>.+onmouseover.+</script>", response.text)
    except Exception:
        # Best-effort: an unreadable response is treated as suspicious.
        # Narrowed from a bare except so that KeyboardInterrupt and
        # SystemExit are no longer swallowed.
        return 1
    return 1 if found else 0
|
166 |
+
|
167 |
+
# 18.Checks the number of forwardings (Web_Forwards)
|
168 |
+
def forwarding(response):
    """Classify a fetched page by how many redirects preceded it.

    Args:
        response: Either the empty string (used by this file's response
            checks to mean "no response") or an object with a ``history``
            sequence (presumably one entry per redirect; verify against the
            HTTP client in use).

    Returns:
        1 when ``response`` is the empty string or ``history`` has more than
        two entries; otherwise 0.
    """
    if response == "":
        return 1
    redirect_count = len(response.history)
    return 1 if redirect_count > 2 else 0
|