Spaces:
Sleeping
Sleeping
File size: 5,582 Bytes
ad60c0d |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 |
# importing required packages for Address Bar Based feature Extraction
from urllib.parse import urlparse,urlencode, unquote
import re
# importing required packages for Domain Based Feature Extraction
import whois
from datetime import datetime
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
    """Report whether *url* contains text shaped like a dotted-quad IPv4 address.

    Returns 1 when four dot-separated groups of one to three digits occur
    anywhere in the string, otherwise 0. The groups are not range-checked,
    so "999.999.999.999" also counts.
    """
    dotted_quad = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    return 1 if re.search(dotted_quad, url) else 0
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
    """Return 1 if the URL string contains an '@' character, otherwise 0."""
    return 1 if "@" in url else 0
# 4. Returns the raw length of the URL; no categorization is applied here (URL_Length)
def getLength(url):
    """Return the number of characters in *url* (the raw length, not a category)."""
    url_length = len(url)
    return url_length
# 5. Counts the non-empty '/'-separated segments of the URL path (URL_Depth)
def getDepth(url):
    """Return how many non-empty segments the path component of *url* has.

    The path is split on '/' and empty pieces (leading, trailing or doubled
    slashes) are ignored.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
#listing shortening services
# Regular expression matching the host names of known URL-shortening services.
#
# The alternation is wrapped in look-around guards so that a service name only
# matches as a complete host-like token: it may not be preceded or followed by
# a word character or '-'. Without the guards "t\.co" matches inside
# "microsoft.com" and "x\.co" inside "x.com".
# Matching is case-insensitive (scoped to this pattern only) because host
# names are case-insensitive; "BudURL.com" and "budurl.com" are the same host.
# Each service is listed once.
shortening_services = (
    r"(?<![\w-])(?i:"
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|bitly\.com|cur\.lv|tinyurl\.com|ity\.im|q\.gs|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"link\.zip\.net"
    r")(?![\w-])"
)
# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 if the shortening-service pattern is found anywhere in *url*, else 0.

    The whole URL string is searched with the module-level
    ``shortening_services`` regular expression, not just the host part.
    """
    found = re.search(shortening_services, url)
    return 0 if found is None else 1
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    """Return 1 (phishing) if the network location of *url* contains '-', else 0 (legitimate).

    Only the netloc reported by ``urlparse`` is inspected; hyphens in the
    path or query do not count.
    """
    host = urlparse(url).netloc
    return 1 if '-' in host else 0
def no_of_dots(url):
    """Return the number of '.' characters in the whole URL string."""
    dot_count = url.count('.')
    return dot_count
# Keywords whose presence in a host name is treated as suspicious.
# Every entry is its own list element, each followed by a comma: two adjacent
# string literals with no comma between them are silently concatenated by
# Python into one bogus keyword. Each keyword is listed once.
# NOTE(review): "ebyisapi" may be a typo for "ebayisapi" - confirm before changing.
sensitiveWords = [
    "account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
    "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
    "login", "support", "billing", "transaction", "security", "payment", "online",
    "customer", "service", "accountupdate", "verification", "important", "confidential",
    "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
    "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
    "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent",
]
def sensitive_word(url):
    """Return 1 if any entry of ``sensitiveWords`` occurs in the URL's netloc, else 0.

    The check is a plain, case-sensitive substring test against the network
    location reported by ``urlparse``.
    """
    host = urlparse(url).netloc
    return 1 if any(keyword in host for keyword in sensitiveWords) else 0
def has_unicode(url):
    """Return 1 if the host part of *url* hides or contains non-plain-ASCII text, else 0.

    The netloc is flagged (1) when any of the following holds:
      * it contains non-ASCII characters;
      * it contains an IDNA/punycode label ("xn--...") that decodes to
        something different, or that cannot be decoded at all;
      * it contains percent-escapes that change the text when unquoted.

    A netloc that is empty or survives IDNA decoding and unquoting unchanged
    yields 0.
    """
    netloc = urlparse(url).netloc
    try:
        # A non-ASCII host fails here; a malformed punycode label fails in the
        # IDNA decode. Both are the condition this feature looks for, so they
        # are reported as 1 instead of escaping as an exception.
        decoded_netloc = netloc.encode('ascii').decode('idna')
    except UnicodeError:
        return 1
    # Any difference after decoding and unquoting means the visible host text
    # was an encoded form of something else.
    return 1 if unquote(decoded_netloc) != netloc else 0
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
def domainAge(domain_name):
    """Classify a domain by the span between its creation and expiration dates.

    Args:
        domain_name: object exposing ``creation_date`` and ``expiration_date``
            attributes. Each may be a ``datetime``, a ``'%Y-%m-%d'`` string,
            ``None`` or a list.

    Returns:
        1 when the span is shorter than six 30-day months, or when it cannot
        be determined (missing date, list of dates, unparsable string, dates
        that cannot be subtracted from each other); 0 otherwise.
    """
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date

    # Parse each date on its own so that one string plus one datetime is still
    # usable instead of being rejected because strptime was fed a non-string.
    try:
        if isinstance(creation_date, str):
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
        if isinstance(expiration_date, str):
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
    except ValueError:
        return 1

    if creation_date is None or expiration_date is None:
        return 1
    if isinstance(creation_date, list) or isinstance(expiration_date, list):
        return 1

    try:
        age_in_days = abs((expiration_date - creation_date).days)
    except TypeError:
        # e.g. one timezone-aware and one naive datetime: the span is unknown.
        return 1
    return 1 if age_in_days / 30 < 6 else 0
# 14.End time of domain: The difference between termination time and current time (Domain_End)
def domainEnd(domain_name):
    """Classify a domain by the distance between now and its expiration date.

    Args:
        domain_name: object exposing an ``expiration_date`` attribute that is
            a ``datetime``, a ``'%Y-%m-%d'`` string, ``None`` or a list.

    Returns:
        0 when the absolute distance to the expiration date is shorter than
        six 30-day months; 1 when it is longer, or when the date is missing,
        a list, or an unparsable string.

    NOTE(review): a short distance maps to 0 here, whereas ``domainAge`` maps
    a short span to 1 - confirm this polarity is intended.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            return 1

    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    # Take "now" in the expiration date's own timezone (None for naive values)
    # so that timezone-aware dates can be subtracted without a TypeError.
    today = datetime.now(getattr(expiration_date, "tzinfo", None))
    days_to_end = abs((expiration_date - today).days)
    return 0 if days_to_end / 30 < 6 else 1
# 15. IFrame Redirection (iFrame)
def iframe(response):
    """Return 0 if the pattern below matches anywhere in ``response.text``, else 1.

    An empty-string *response* (no page available) yields 1.

    NOTE(review): the pattern is a bracketed character class, so it matches
    any single one of the characters ``< > | i f r a m e B o d`` rather than
    the literal tags "<iframe>" / "<frameBorder>". Almost any HTML therefore
    yields 0. Left unchanged here because correcting it alters the feature
    values; confirm the intended pattern and 0/1 polarity before fixing.
    """
    if response == "":
        return 1
    hits = re.findall(r"[<iframe>|<frameBorder>]", response.text)
    return 0 if hits else 1
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response):
    """Return 1 if a <script> element mentioning ``onmouseover`` is found, else 0.

    An empty-string *response* (no page available) yields 1, as does any
    ordinary error while reading or scanning ``response.text``.

    The pattern requires a bare ``<script>`` tag and, because '.' does not
    match newlines, the whole element on a single line.
    """
    if response == "":
        return 1
    try:
        found = re.search(r"<script>.+onmouseover.+</script>", response.text)
    except Exception:
        # Best-effort feature: an unreadable response counts as suspicious.
        # KeyboardInterrupt / SystemExit are deliberately allowed to propagate.
        return 1
    return 1 if found else 0
# 18.Checks the number of forwardings (Web_Forwards)
def forwarding(response):
    """Return 1 if the response went through more than two redirects, else 0.

    An empty-string *response* (no page available) yields 1. Otherwise the
    number of entries in ``response.history`` is compared against 2.
    """
    if response == "":
        return 1
    return 1 if len(response.history) > 2 else 0
|