# spam-email-detection / URLFeatureExtraction.py
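"""Feature extraction for phishing-URL detection.

Given a URL, featureExtraction() fetches the page and computes 16 features:
address-bar based (IP address, '@' sign, length, depth, redirection,
'https' in domain, shortening service, prefix/suffix), domain based
(DNS record, web traffic, domain age, domain end), and HTML/JavaScript
based (iframe, mouse-over, right-click, forwarding).
"""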
import ipaddress
import re
import socket
import urllib.parse
import urllib.request
from datetime import datetime
from urllib.parse import urlparse

import requests
import whois
from bs4 import BeautifulSoup

# Checks whether the URL is a raw IP address instead of a domain name.
def havingIP(url):
    try:
        ipaddress.ip_address(url)
        ip = 1
    except ValueError:
        ip = 0
    return ip

# Checks for '@' in the URL; browsers ignore everything before it,
# so it can be used to hide the real destination.
def haveAtSign(url):
    return 1 if "@" in url else 0

# Flags URL length: 1 if shorter than 54 characters, else 0.
def getLength(url):
    return 1 if len(url) < 54 else 0

# Counts the number of non-empty segments in the URL path.
def getDepth(url):
    segments = urlparse(url).path.split('/')
    depth = 0
    for segment in segments:
        if len(segment) != 0:
            depth += 1
    return depth

# Checks for '//' appearing after the scheme: the scheme's own '//' starts
# at index 5 (http) or 6 (https), so a rightmost occurrence past index 7
# means the URL embeds a redirect.
def redirection(url):
    pos = url.rfind('//')
    return 1 if pos > 7 else 0

# Checks whether 'https' appears inside the domain part itself,
# a trick used to make a phishing URL look secure.
def httpDomain(url):
    domain = urlparse(url).netloc
    return 1 if 'https' in domain else 0

# Known URL-shortening services, matched anywhere in the URL
# (duplicate alternatives in the original list have been removed).
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|bitly\.com|cur\.lv|ity\.im|q\.gs|po\.st|bc\.vc|twitthis\.com|u\.to|" \
                      r"j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|prettylinkpro\.com|scrnch\.me|filoops\.info|" \
                      r"vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|link\.zip\.net"

# Checks the URL against the list of known shortening services above.
def tinyURL(url):
    match = re.search(shortening_services, url)
    return 1 if match else 0

# Checks for '-' in the domain; phishers often add dashed prefixes/suffixes
# to imitate legitimate names.
def prefixSuffix(url):
    return 1 if '-' in urlparse(url).netloc else 0

# Approximates web traffic by checking whether Google indexes the site:
# returns 1 if a 'site:' search has results, 0 otherwise (or on error).
def web_traffic(url):
    try:
        query = urllib.parse.quote(url)
        search_url = f"https://www.google.com/search?q=site:{query}"
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(search_url, headers=headers)
        response = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(response, "lxml")
        results = soup.find_all('div', class_='BNeawe')
        for result in results:
            if 'did not match' in result.get_text():
                return 0
        return 1
    except Exception as e:
        print(f"Error: {e}")
        return 0

# Measures the registered lifespan (creation to expiration) from WHOIS data:
# 1 (suspicious) if the dates are missing/unparseable or the span is under
# 6 months, else 0.
def domainAge(domain_name):
    creation_date = domain_name.creation_date
    expiration_date = domain_name.expiration_date
    if isinstance(creation_date, str) or isinstance(expiration_date, str):
        try:
            creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
        except (ValueError, TypeError):
            return 1
    if (expiration_date is None) or (creation_date is None):
        return 1
    if isinstance(expiration_date, list) or isinstance(creation_date, list):
        return 1
    ageofdomain = abs((expiration_date - creation_date).days)
    return 1 if (ageofdomain / 30) < 6 else 0

# Checks time remaining until domain expiration: 1 if the date is
# missing/unparseable; otherwise 0 if the domain expires within 6 months,
# else 1 (this keeps the original script's convention).
def domainEnd(domain_name):
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, '%Y-%m-%d')
        except (ValueError, TypeError):
            return 1
    if expiration_date is None:
        return 1
    if isinstance(expiration_date, list):
        return 1
    today = datetime.now()
    end = abs((expiration_date - today).days)
    return 0 if (end / 30) < 6 else 1

# Checks the page source for iframe/frameBorder usage: 0 if found, 1 if not
# found or if the page could not be fetched. The original pattern
# r"[<iframe>|<frameBorder>]" was a character class matching single
# characters; an alternation is what is intended here.
def iframe(response):
    if response == "":
        return 1
    if re.findall(r"<iframe>|<frameBorder>", response.text):
        return 0
    return 1

# Checks for onmouseover handlers in scripts (status-bar manipulation).
def mouseOver(response):
    if response == "":
        return 1
    if re.findall("<script>.+onmouseover.+</script>", response.text):
        return 1
    return 0

# Checks whether right-click is disabled (handlers testing event.button == 2):
# 0 if such a handler is found, else 1.
def rightClick(response):
    if response == "":
        return 1
    if re.findall(r"event\.button ?== ?2", response.text):
        return 0
    return 1

# Counts redirects in the response history: more than 2 is suspicious.
def forwarding(response):
    if response == "":
        return 1
    return 0 if len(response.history) <= 2 else 1

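# A minimal usage sketch for the four response-based checks above (not part
# of the original pipeline): each accepts a requests.Response, or the
# empty-string sentinel that featureExtraction() substitutes when a page
# cannot be fetched. "https://example.com" is a placeholder URL.
#
#   resp = requests.get("https://example.com")
#   print(iframe(resp), mouseOver(resp), rightClick(resp), forwarding(resp))
#   print(iframe(""))  # -> 1: an unreachable page is treated as suspicious
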
# Module-level flag recording whether the last WHOIS lookup returned a
# domain name (0) or not (1); set in featureExtraction() but not read here.
state = 0

# Fetches the URL (retrying with 'https://' and 'http://' prefixes) and
# extracts all 16 features as a list.
def featureExtraction(url):
    new_url = url
    try:
        response = requests.get(new_url)
    except Exception:
        try:
            new_url = 'https://' + url
            response = requests.get(new_url)
        except Exception:
            try:
                new_url = 'http://' + url
                response = requests.get(new_url)
            except Exception:
                response = ""
    url = new_url
    print("URL", url)

    features = []
    # Address-bar based features
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))

    # Domain based features (WHOIS + DNS)
    try:
        global state
        domain_name = whois.whois(urlparse(url).netloc)
        state = 0 if domain_name.get('domain_name') else 1
        # whois may return the domain name as a string or a list of strings
        dn = domain_name.domain_name
        host = dn[0] if isinstance(dn, list) else dn
        dns = 0 if socket.gethostbyname(host) else 1
    except Exception:
        dns = 1
    features.append(dns)
    features.append(web_traffic(url))
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    # HTML/JavaScript based features
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))
    return features

# Column names for an assembled dataset: 'Domain' (the URL itself) and
# 'Label' bracket the 16 values returned by featureExtraction().
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth', 'Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic',
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over', 'Right_Click', 'Web_Forwards', 'Label']
# Scratch notes: sample extracted feature vectors, one URL per row, columns
# following feature_names order and abbreviated by initial
# (I @ L D R D t P D T A E i M R F):
#   0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0
#   0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1
#   0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0
#   0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
#   0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0
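
# A minimal usage sketch, assuming only the functions defined above; the URL
# is a hypothetical placeholder, and a real pipeline would append a label
# before assembling a dataset with feature_names.
if __name__ == "__main__":
    sample_url = "https://example.com/path/page.html"  # placeholder URL
    features = featureExtraction(sample_url)
    # featureExtraction() returns the 16 extracted values; pair them with
    # the inner names (feature_names minus 'Domain' and 'Label').
    for name, value in zip(feature_names[1:-1], features):
        print(f"{name}: {value}")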