from urllib.parse import urlparse, urlencode import ipaddress import re from bs4 import BeautifulSoup import whois import urllib import urllib.request from datetime import datetime import requests import pickle import gradio as gr loaded_model = pickle.load(open("XGBoostClassifier1.pickle.dat", "rb")) shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \ r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \ r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \ r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \ r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \ r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \ r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \ r"tr\.im|link\.zip\.net" def getDomain(url): domain = urlparse(url).netloc if re.match(r"^www.",domain): domain = domain.replace("www.","") return domain def havingIP(url): try: ipaddress.ip_address(url) ip = 1 except: ip = 0 return ip def haveAtSign(url): if "@" in url: at = 1 else: at = 0 return at def getLength(url): if len(url) < 54: length = 0 else: length = 1 return length def getDepth(url): s = urlparse(url).path.split('/') depth = 0 for j in range(len(s)): if len(s[j]) != 0: depth = depth+1 return depth def redirection(url): pos = url.rfind('//') if pos > 6: if pos > 7: return 1 else: return 0 else: return 0 def httpDomain(url): domain = urlparse(url).netloc if 'https' in domain: return 1 else: return 0 def tinyURL(url): match=re.search(shortening_services,url) if match: return 1 else: return 0 def prefixSuffix(url): if '-' in urlparse(url).netloc: return 1 # phishing else: return 0 # legitimate def web_traffic(url): # try: # url = urllib.parse.quote(url) # rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find( # "REACH")['RANK'] # rank = int(rank) # except TypeError: # return 1 # if rank <100000: # return 1 # else: return 0 def domainAge(domain_name): creation_date = domain_name.creation_date expiration_date = domain_name.expiration_date if (isinstance(creation_date,str) or isinstance(expiration_date,str)): try: creation_date = datetime.strptime(creation_date,'%Y-%m-%d') expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d") except: return 1 if ((expiration_date is None) or (creation_date is None)): return 1 elif ((type(expiration_date) is list) or (type(creation_date) is list)): return 1 else: ageofdomain = abs((expiration_date - creation_date).days) if ((ageofdomain/30) < 6): age = 1 else: age = 0 return age def domainEnd(domain_name): expiration_date = domain_name.expiration_date if isinstance(expiration_date,str): try: expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d") except: return 1 if (expiration_date is None): return 1 elif (type(expiration_date) is list): return 1 else: today = datetime.now() end = abs((expiration_date - today).days) if ((end/30) < 6): end = 0 else: end = 1 return end def iframe(response): if response == "": return 1 else: if re.findall(r"[