Spaces:

akshatsanghvi
/

spam-email-detection

Running

File size: 6,451 Bytes

837d4e1

import urllib
import ipaddress
import re
import socket
from bs4 import BeautifulSoup
import whois
import requests
import urllib.request
from urllib.parse import urlparse
from datetime import datetime

def havingIP(url):
  """Return 1 if the URL is, or is hosted on, a literal IP address, else 0.

  The raw value is tried first, so a bare address such as "10.0.0.1" still
  scores 1 exactly as before. If that fails, the host part is extracted and
  tested, so "http://10.0.0.1:8080/login" is recognised too; passing a full
  URL straight to ``ipaddress.ip_address`` always raised and scored 0.

  Args:
    url: URL (with or without scheme) or bare address to inspect.

  Returns:
    1 when the value or its host parses as an IPv4/IPv6 address, 0 otherwise.
  """
  try:
    ipaddress.ip_address(url)
    return 1
  except ValueError:
    pass

  text = str(url)
  try:
    # Without "//" urlparse treats the whole string as a path, so add an
    # empty scheme marker to get the host of scheme-less input.
    host = urlparse(text if "//" in text else "//" + text).hostname
  except ValueError:
    # Raised for malformed bracketed IPv6 hosts.
    return 0
  if not host:
    return 0

  try:
    ipaddress.ip_address(host)
  except ValueError:
    return 0
  return 1

def haveAtSign(url):
  """Return 1 if the URL contains an "@" character anywhere, else 0."""
  return int("@" in url)


def getLength(url):
  """Return 1 when the URL is shorter than 54 characters, otherwise 0."""
  is_short = len(url) < 54
  return 1 if is_short else 0

def getDepth(url):
  """Return the number of non-empty "/"-separated segments in the URL path."""
  segments = urlparse(url).path.split('/')
  # Empty strings come from leading, trailing or doubled slashes; skip them.
  return sum(1 for segment in segments if segment)

def redirection(url):
  """Return 1 if the last "//" in the URL starts after index 7, else 0.

  The "//" of "http://" sits at index 5 and that of "https://" at index 6,
  so a later occurrence means a second "//" appears further into the URL.
  """
  last_double_slash = url.rfind('//')
  return 1 if last_double_slash > 7 else 0
  
def httpDomain(url):
  """Return 1 if the text "https" occurs in the URL's network location, else 0.

  Only the netloc (host[:port]) is inspected, not the scheme, so
  "https://example.com" gives 0 while "http://https-login.example.com" gives 1.
  """
  netloc = urlparse(url).netloc
  return int('https' in netloc)
  
# Alternation of known URL-shortener host names (regex source, dots escaped).
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# The bare alternation matches inside longer names: "t\.co" hits
# "microsoft.com" and "x\.co" hits "dropbox.com". The lookarounds require the
# shortener name not to be glued to a neighbouring word character or hyphen.
# Host names are case-insensitive, hence IGNORECASE.
_SHORTENER_RE = re.compile(
    r"(?<![\w-])(?:" + shortening_services + r")(?![\w-])",
    re.IGNORECASE,
)


def tinyURL(url):
    """Return 1 if the URL mentions a known URL-shortening service, else 0.

    Args:
        url: URL string to inspect (scheme optional).

    Returns:
        1 when one of the names in ``shortening_services`` appears as a
        stand-alone token in the URL, 0 otherwise.
    """
    return 1 if _SHORTENER_RE.search(url) else 0


def prefixSuffix(url):
    """Return 1 if the URL's network location contains a hyphen, else 0."""
    netloc = urlparse(url).netloc
    return int('-' in netloc)

def web_traffic(url, timeout=10):
  """Return 1 if a Google ``site:`` search for the URL does not report "did not match".

  The function fetches the Google results page for ``site:<url>`` and scans
  every ``div.BNeawe`` element for the phrase "did not match".

  Args:
    url: URL to look up; it is percent-quoted before being placed in the query.
    timeout: Seconds to wait for the search request. Without a limit a
      stalled connection would block feature extraction indefinitely.

  Returns:
    0 if any scanned element contains "did not match", or if the request or
    parsing fails for any reason (the error is printed); 1 otherwise.
  """
  try:
    query = urllib.parse.quote(url)
    search_url = f"https://www.google.com/search?q=site:{query}"

    headers = {'User-Agent': 'Mozilla/5.0'}
    req = urllib.request.Request(search_url, headers=headers)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(req, timeout=timeout) as reply:
      html = reply.read()
    soup = BeautifulSoup(html, "lxml")

    for block in soup.find_all('div', class_='BNeawe'):
      if 'did not match' in block.get_text():
        return 0

    return 1

  except Exception as e:
    # Best-effort feature: any failure (network, timeout, parser) scores 0.
    print(f"Error: {e}")
    return 0

def domainAge(domain_name):
  """Score the registration span of a WHOIS record: 1 = short/unknown, 0 = long.

  The span is the absolute number of days between ``creation_date`` and
  ``expiration_date``; fewer than 6 thirty-day months scores 1.

  Args:
    domain_name: Object exposing ``creation_date`` and ``expiration_date``
      attributes (datetime values, or "YYYY-MM-DD" strings).

  Returns:
    1 if either date is missing, is a list, cannot be parsed, cannot be
    subtracted from the other, or the span is under 6 months; 0 otherwise.
  """
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date

  if isinstance(creation_date, str) or isinstance(expiration_date, str):
    try:
      creation_date = datetime.strptime(creation_date, '%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except (ValueError, TypeError):
      # ValueError: wrong format; TypeError: only one of the two was a string.
      return 1

  if expiration_date is None or creation_date is None:
    return 1
  if isinstance(expiration_date, list) or isinstance(creation_date, list):
    return 1

  try:
    span_days = abs((expiration_date - creation_date).days)
  except TypeError:
    # Values that cannot be subtracted (e.g. naive vs timezone-aware) are
    # treated like any other unusable record instead of crashing the caller.
    return 1

  return 1 if span_days / 30 < 6 else 0

def domainEnd(domain_name):
  """Score the time between now and a WHOIS record's expiration date.

  Args:
    domain_name: Object exposing an ``expiration_date`` attribute (a
      datetime, or a "YYYY-MM-DD" string).

  Returns:
    0 if the absolute distance between now and the expiration date is under
    6 thirty-day months; 1 if it is longer, or if the date is missing, is a
    list, cannot be parsed, or cannot be compared with the current time.
    NOTE(review): "close to expiry" scoring 0 is the existing convention and
    is preserved here; confirm it matches what the downstream model expects.
  """
  expiration_date = domain_name.expiration_date

  if isinstance(expiration_date, str):
    try:
      expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
    except ValueError:
      return 1

  if expiration_date is None:
    return 1
  if isinstance(expiration_date, list):
    return 1

  try:
    # Use "now" in the expiry's own timezone (if it has one) so aware and
    # naive datetimes are never mixed in the subtraction.
    now = datetime.now(getattr(expiration_date, "tzinfo", None))
    days_apart = abs((expiration_date - now).days)
  except TypeError:
    # Unsupported value type for date arithmetic: treat as unusable.
    return 1

  return 0 if days_apart / 30 < 6 else 1

def iframe(response):
  """Return 1 if the page could not be fetched or embeds an iframe, else 0.

  The earlier pattern ``[<iframe>|<frameBorder>]`` was a character class: it
  matched any single one of the characters ``< > | i f r a m e B o d``, so
  practically every fetched page counted as a match. The check now looks for
  an actual ``<iframe`` tag or a ``frameborder`` attribute. A match scores 1,
  in line with the empty-response case, so ordinary pages without iframes
  keep the 0 they received before.

  Args:
    response: "" when the page could not be fetched, otherwise an object
      with a ``text`` attribute holding the page source.

  Returns:
    1 for an empty response or when an iframe marker is found, 0 otherwise.
  """
  if response == "":
    return 1
  found = re.search(r"<\s*iframe\b|\bframeborder\b", response.text, re.IGNORECASE)
  return 1 if found else 0

def mouseOver(response):
  """Return 1 if the page is unavailable or a <script> mentions onmouseover.

  ``response`` is "" when the page could not be fetched; otherwise its
  ``text`` attribute is searched for a single-line
  ``<script>...onmouseover...</script>`` fragment.
  """
  if response == "":
    return 1
  hit = re.search("<script>.+onmouseover.+</script>", response.text)
  return 0 if hit is None else 1

def rightClick(response):
  """Return 0 if the page text matches ``event.button ?== ?2``, else 1.

  An empty-string ``response`` (page not fetched) also yields 1.
  """
  if response == "":
    return 1
  checks_right_button = re.search(r"event.button ?== ?2", response.text)
  return 0 if checks_right_button else 1
 
def forwarding(response):
  """Return 1 if the page is unavailable or its ``history`` has more than two entries.

  ``response`` is "" when the page could not be fetched; otherwise the length
  of ``response.history`` is compared against 2.
  """
  if response == "":
    return 1
  hops = len(response.history)
  return int(hops > 2)

# Set by featureExtraction: 0 when the last WHOIS lookup returned a domain
# name, 1 when it returned none. Left untouched if the lookup itself fails.
state = 0
def featureExtraction(url, timeout=10):
  """Fetch a URL and return its 16 numeric features as a list.

  Order of the returned values: havingIP, haveAtSign, getLength, getDepth,
  redirection, httpDomain, tinyURL, prefixSuffix, DNS flag, web_traffic,
  domainAge, domainEnd, iframe, mouseOver, rightClick, forwarding.

  Args:
    url: URL to analyse. If it does not start with "http://" or "https://",
      "https://" and then "http://" are tried as prefixes.
    timeout: Seconds allowed for each page fetch, so an unresponsive host
      cannot block extraction indefinitely.

  Returns:
    List of 16 ints. The DNS flag is 1 when the WHOIS or DNS lookup fails; in
    that case domainAge and domainEnd are reported as 1 without being called.

  Side effects:
    Prints the URL finally used and updates the module-level ``state``.
  """
  global state

  # Only add a scheme when one is missing. Retrying an already-schemed URL
  # with extra prefixes left unreachable sites as "http://http://...", which
  # then corrupted every URL-based feature.
  if url.lower().startswith(('http://', 'https://')):
    candidates = (url,)
  else:
    candidates = ('https://' + url, 'http://' + url)

  response = ""  # sentinel understood by the page-content feature functions
  for new_url in candidates:
    try:
      response = requests.get(new_url, timeout=timeout)
      break
    except Exception:
      # Best effort: any fetch failure just moves on to the next candidate.
      continue

  url = new_url
  print("URL", url)

  features = [
    havingIP(url),
    haveAtSign(url),
    getLength(url),
    getDepth(url),
    redirection(url),
    httpDomain(url),
    tinyURL(url),
    prefixSuffix(url),
  ]

  domain_name = None
  try:
    domain_name = whois.whois(urlparse(url).netloc)

    registered = domain_name.get('domain_name')
    state = 0 if registered else 1

    # The WHOIS value may be a single string or a list of names; indexing a
    # string with [0] would yield only its first character.
    host = registered if isinstance(registered, str) else registered[0]
    socket.gethostbyname(host)  # raises when the name does not resolve
    dns = 0
  except Exception:
    dns = 1

  features.append(dns)
  features.append(web_traffic(url))
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))

  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))

  return features

# Column names for a feature table. The list has 18 entries, while
# featureExtraction() returns 16 values; 'Domain' (first) and 'Label' (last)
# are not produced by featureExtraction().
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']

# I @ L D R D t P D T A E i M R F  L
#     . .         . . .         .

# 0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0  0
# 0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0  Y
# 0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0  -

#                         . .   
# 0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1  0
# 0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
# 0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0  -

# 0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0  1
# 0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
# 0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0  -

# Prints : site. history. array. pred.