Spaces:

danushkhanna
/

Phishing_Domain_Detector

Runtime error

App Files Files Community

danushkhanna commited on May 28, 2023

Commit

0f5a965

1 Parent(s): d9bdf5f

Create extract_features.py

Browse files

Files changed (1) hide show

extract_features.py +266 -0

extract_features.py ADDED Viewed

	@@ -0,0 +1,266 @@

+import re
+import whois
+import tldextract
+import time
+from urllib.parse import urlparse, parse_qs
+import requests
+import ipwhois
+import socket
+class ExtractFeatures:
+    def parse_url(self, url):
+        """
+        Parses the given URL and extracts various components.
+        This method takes in URL input and parses it.
+        It extracts the domain, directories, files and parameters (if applicable) of the URL.
+        It also counts the number of top-level domains in the URL.
+        Args:
+            url (str): The URL to be parsed.
+        Returns:
+            tuple: A tuple containing the extracted components of the URL.
+                - domain (str): The domain name of the URL.
+                - directories (str): The directories in the URL's path.
+                - file (str): The file name in the URL's path.
+                - parameters (dict): A dictionary of query parameters.
+                - num_tlds (int): The number of top-level domains in the URL.
+        """
+        # Parse the URL into its components
+        if '//' not in url:
+            url = '//' + url
+        parsed_url = urlparse(url)
+        # Extract the domain name
+        domain = parsed_url.netloc
+        # Extract the path and split it into directories and file name
+        path = parsed_url.path
+        try:
+            directories, file = path.rsplit('/', 1)
+        except:
+            if '.' in path:
+                file = path
+                directories = ""
+            else:
+                directories = path
+                file = ""
+        # Extract the query parameters
+        parameters = parse_qs(parsed_url.query)
+        tld_info = tldextract.extract(url)
+        tld = tld_info.suffix
+        # Count the number of top-level domains
+        num_tlds = tld.count('.') + 1
+        return domain, directories, file, parameters, num_tlds
+    def get_domain_info(self, domain):
+        """
+        Retrieves information about a domain.
+        This method takes in the domain of a URL as input, and fetches its information.
+        It calculates the time elapsed since its creation and time remaining for its expiration.
+        Args:
+            domain (str): The domain to retrieve information for.
+        Returns:
+            tuple: A tuple containing the creation and expiration time of the domain in seconds.
+                - creation_time_seconds (float): Time elapsed since domain creation in seconds.
+                - expiration_time_seconds (float):  Time remaining for domain expiration in seconds.
+        """
+        try:
+            # Get the domain information using python-whois
+            domain_info = whois.whois(domain)
+            # Extract the creation and expiration time
+            creation_time = domain_info.creation_date
+            expiration_time = domain_info.expiration_date
+            # Convert the time to seconds
+            if creation_time != None and expiration_time != None:
+                creation_time_seconds = time.mktime(creation_time.timetuple())
+                expiration_time_seconds = time.mktime(expiration_time.timetuple())
+            else:
+                raise ValueError
+        except:
+            creation_time_seconds = -1
+            expiration_time_seconds = -1
+        return creation_time_seconds, expiration_time_seconds
+    def get_redirects(self, url):
+        """
+        Retrieves the number of redirects for a given URL.
+        This method takes in a URL as input and assesses the number of times it redirects traffic.
+        Args:
+            url (str): The URL to retrieve redirects for.
+        Returns:
+            int: The number of redirects encountered.
+        Note:
+            The maximum number of redirects is limited to 20 to prevent infinite loops.
+        """
+        max_redirects = 20
+        # Initialize the redirect count
+        redirect_count = 0
+        # Follow the redirects
+        while True:
+            response = requests.get(url, allow_redirects=False)
+            if response.status_code == 301 or response.status_code == 302:
+                url = response.headers['Location']
+                redirect_count += 1
+                if redirect_count >= max_redirects:
+                    break
+            else:
+                break
+        return redirect_count
+    def get_features(self):
+        """
+        Retrieves a list of features used for URL analysis.
+        This method returns the list of features that must be extracted from the URL to perform analysis.
+        Returns:
+            list: A list of features used for URL analysis.
+        Note:
+            The features include:
+            - length_url: Length of the URL.
+            - domain_length: Length of the domain name in the URL.
+            - domain_in_ip: Whether the domain is represented as an IP address.
+            - directory_length: Length of the directory path in the URL.
+            - file_length: Length of the file name in the URL.
+            - params_length: Length of the query parameters in the URL.
+            - email_in_url: Whether an email address is present in the URL.
+            - asn_ip: Autonomous System Number (ASN) associated with the IP address.
+            - time_domain_activation: Time of domain activation.
+            - time_domain_expiration: Time of domain expiration.
+            - tls_ssl_certificate: Availability of TLS/SSL certificate.
+            - qty_redirects: Number of redirects encountered.
+            - qty_char_domain: Number of characters in the domain name.
+        """
+        features_list = ['length_url',
+                            'domain_length',
+                            'domain_in_ip',
+                            'directory_length',
+                            'file_length',
+                            'params_length',
+                            'email_in_url',
+                            'asn_ip',
+                            'time_domain_activation',
+                            'time_domain_expiration',
+                            'tls_ssl_certificate',
+                            'qty_redirects',
+                            'qty_char_domain']
+        return features_list
+    def url_to_features(self, url):
+        """
+        Extracts features from a given URL.
+        This method takes in a URL as input and extracts all the relavant features for classification.
+        Also, it rearranges the features according to the training dataset of the classfier.
+        Args:
+            url (str): The URL to extract features from.
+        Returns:
+            dict: A dictionary containing the extracted features.
+        Note:
+            The extracted features are the same the the ones specified in the documentation of get_features.
+        See also:
+            get_features(): Retrieves a list of features used for URL analysis.
+            parse_url(): Parses the given URL and extracts its components.
+            get_domain_info(): Retrieves information about a domain.
+            get_redirects(): Retrieves the number of redirects for a given URL.
+        """
+        features_list = self.get_features()
+        new_dataset = {}
+        signs_dict = {"dot":".",
+                "hyphen":"-",
+                "underline": "_",
+                "slash":"/",
+                "questionmark": "?",
+                "equal":"=",
+                "at": "@",
+                "and": "&",
+                "exclamation": "!",
+                "space": " ",
+                "tilde": "~",
+                "comma": ",",
+                "plus": "+",
+                "asterisk": "∗",
+                "hashtag": "#",
+                "dollar": "$",
+                "percent": "%"}
+        return_val = self.parse_url(url)
+        if  return_val != None:
+            domain, directory, file, parameters, new_dataset['qty_tld_url'] = return_val
+        else:
+            return -1
+        new_dataset['length_url'] = len(url)
+        new_dataset['domain_length'] = len(domain)
+        new_dataset['directory_length'] = len(directory) if directory != [""] else -1
+        new_dataset['file_length'] = len(file) if file != [""] else -1
+        new_dataset['params_length'] = len(str(parameters.values())) if parameters != {} else -1
+        new_dataset['qty_params'] = len(parameters) if parameters != {} else -1
+        new_dataset['time_domain_activation'], new_dataset['time_domain_expiration'] = self.get_domain_info(str(domain))
+        # Check if IP is in domain
+        if re.match('\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', url) is not None:
+            new_dataset['domain_in_ip'] = int(True)
+        else:
+            new_dataset['domain_in_ip'] = int(False)
+        # Check for tls certificate
+        if url[:5] == 'https':
+                new_dataset["tls_ssl_certificate"] = int(True)
+        else:
+                new_dataset["tls_ssl_certificate"] = int(False)
+        # check for email in url
+        if re.search(r'[\w\-.]+@[\w\-.]+\.\w+', url):
+            new_dataset['email_in_url'] = int(True)
+        else:
+            new_dataset['email_in_url'] = int(False)
+        ip_addresses = socket.getaddrinfo(domain, None)
+        # Get the ASN of the IP address
+        try:
+            results = ipwhois.IPWhois.lookup_rdap(ip_addresses)
+            new_dataset['asn_ip'] = results['asn']
+        except:
+            new_dataset['asn_ip'] = -1
+        try:
+            new_dataset['qty_redirects'] = self.get_redirects(url)
+        except:
+            new_dataset['qty_redirects'] = -1
+        new_dataset['qty_char_domain'] = 0
+        for sign in signs_dict.values():
+            new_dataset['qty_char_domain'] += domain.count(sign)
+        reordered_dict = {k: new_dataset[k] for k in features_list}
+        return reordered_dict