danushkhanna committed on
Commit
0f5a965
1 Parent(s): d9bdf5f

Create extract_features.py

Browse files
Files changed (1) hide show
  1. extract_features.py +266 -0
extract_features.py ADDED
@@ -0,0 +1,266 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import whois
3
+ import tldextract
4
+ import time
5
+ from urllib.parse import urlparse, parse_qs
6
+ import requests
7
+ import ipwhois
8
+ import socket
9
+
10
class ExtractFeatures:
    """Extract lexical, WHOIS and network features from a URL.

    The features produced by :meth:`url_to_features` are the inputs
    expected by the URL classifier; :meth:`get_features` lists them in
    the order the classifier was trained on.
    """

    def parse_url(self, url):
        """Parse *url* into its components.

        Args:
            url (str): The URL to be parsed.

        Returns:
            tuple: ``(domain, directories, file, parameters, num_tlds)``
                - domain (str): network location (host) of the URL.
                - directories (str): directory part of the path.
                - file (str): file-name part of the path.
                - parameters (dict): query parameters from ``parse_qs``.
                - num_tlds (int): number of labels in the public suffix
                  (e.g. ``"co.uk"`` -> 2).
        """
        # urlparse only recognises a netloc after a '//' separator.
        if '//' not in url:
            url = '//' + url

        parsed_url = urlparse(url)
        domain = parsed_url.netloc

        # Split the path into directories and file name.
        path = parsed_url.path
        try:
            directories, file = path.rsplit('/', 1)
        except ValueError:
            # No '/' in the path: treat it as a file only if it looks
            # like one (contains a dot), otherwise as a directory.
            if '.' in path:
                file = path
                directories = ""
            else:
                directories = path
                file = ""

        parameters = parse_qs(parsed_url.query)

        # Count the labels of the registered public suffix.
        tld = tldextract.extract(url).suffix
        num_tlds = tld.count('.') + 1

        return domain, directories, file, parameters, num_tlds

    def get_domain_info(self, domain):
        """Return WHOIS creation and expiration times for *domain*.

        Args:
            domain (str): The domain to retrieve information for.

        Returns:
            tuple: ``(creation_time_seconds, expiration_time_seconds)``
            as Unix timestamps, or ``(-1, -1)`` when the WHOIS lookup
            fails or the registry does not publish the dates.
        """
        try:
            domain_info = whois.whois(domain)

            creation_time = domain_info.creation_date
            expiration_time = domain_info.expiration_date

            # python-whois returns a list of datetimes for some
            # registries; use the first entry in that case.
            if isinstance(creation_time, list):
                creation_time = creation_time[0]
            if isinstance(expiration_time, list):
                expiration_time = expiration_time[0]

            if creation_time is None or expiration_time is None:
                raise ValueError("WHOIS record is missing dates")

            creation_time_seconds = time.mktime(creation_time.timetuple())
            expiration_time_seconds = time.mktime(expiration_time.timetuple())
        except Exception:
            # WHOIS lookups fail routinely (rate limits, unknown TLDs,
            # network errors); -1 is the "unknown" sentinel the
            # classifier was trained with.
            creation_time_seconds = -1
            expiration_time_seconds = -1

        return creation_time_seconds, expiration_time_seconds

    def get_redirects(self, url):
        """Count how many times *url* redirects.

        Args:
            url (str): The URL to follow.

        Returns:
            int: Number of redirects encountered, capped at 20 to
            prevent infinite redirect loops.
        """
        max_redirects = 20
        redirect_count = 0

        while redirect_count < max_redirects:
            response = requests.get(url, allow_redirects=False)
            # 303/307/308 are redirects too, not just 301/302.
            if response.status_code in (301, 302, 303, 307, 308):
                location = response.headers.get('Location')
                if location is None:
                    # Malformed redirect without a Location header.
                    break
                url = location
                redirect_count += 1
            else:
                break
        return redirect_count

    def get_features(self):
        """Return the ordered list of feature names used for URL analysis.

        Returns:
            list: Feature names in the order expected by the classifier.
        """
        features_list = ['length_url',
                         'domain_length',
                         'domain_in_ip',
                         'directory_length',
                         'file_length',
                         'params_length',
                         'email_in_url',
                         'asn_ip',
                         'time_domain_activation',
                         'time_domain_expiration',
                         'tls_ssl_certificate',
                         'qty_redirects',
                         'qty_char_domain']
        return features_list

    def url_to_features(self, url):
        """Extract all relevant classification features from *url*.

        Args:
            url (str): The URL to extract features from.

        Returns:
            dict: The extracted features, reordered to match the
            training dataset of the classifier, or ``-1`` when the URL
            cannot be parsed.

        See also:
            get_features(), parse_url(), get_domain_info(), get_redirects()
        """
        features_list = self.get_features()
        new_dataset = {}

        # Special characters counted in the domain. NOTE: the original
        # used U+2217 ('∗') for the asterisk, which never occurs in a
        # URL; fixed to ASCII '*'.
        special_chars = ['.', '-', '_', '/', '?', '=', '@', '&', '!', ' ',
                         '~', ',', '+', '*', '#', '$', '%']

        return_val = self.parse_url(url)
        if return_val is None:
            return -1
        domain, directory, file, parameters, new_dataset['qty_tld_url'] = return_val

        new_dataset['length_url'] = len(url)
        new_dataset['domain_length'] = len(domain)
        # Empty components are encoded as -1. (The original compared a
        # str against [""], which is always unequal, so the sentinel was
        # never produced.)
        new_dataset['directory_length'] = len(directory) if directory else -1
        new_dataset['file_length'] = len(file) if file else -1
        new_dataset['params_length'] = len(str(parameters.values())) if parameters else -1
        new_dataset['qty_params'] = len(parameters) if parameters else -1
        new_dataset['time_domain_activation'], new_dataset['time_domain_expiration'] = \
            self.get_domain_info(str(domain))

        # Check whether the host is a literal IPv4 address. Match the
        # domain, not the full URL: a scheme prefix ("https://1.2.3.4")
        # made the original re.match against the URL always fail.
        new_dataset['domain_in_ip'] = int(
            re.match(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}', domain) is not None)

        # TLS is inferred from the scheme only (no certificate check).
        new_dataset['tls_ssl_certificate'] = int(url.startswith('https'))

        # Check for an email address anywhere in the URL.
        new_dataset['email_in_url'] = int(
            re.search(r'[\w\-.]+@[\w\-.]+\.\w+', url) is not None)

        # Resolve the domain and look up the ASN of its IP address.
        # Both steps are inside the try: the original called
        # socket.getaddrinfo unguarded (crash on unresolvable hosts) and
        # invoked IPWhois.lookup_rdap unbound with the getaddrinfo list
        # as `self`, which always raised and forced asn_ip to -1.
        try:
            ip_address = socket.gethostbyname(domain)
            results = ipwhois.IPWhois(ip_address).lookup_rdap()
            new_dataset['asn_ip'] = results['asn']
        except Exception:
            new_dataset['asn_ip'] = -1

        try:
            new_dataset['qty_redirects'] = self.get_redirects(url)
        except Exception:
            new_dataset['qty_redirects'] = -1

        new_dataset['qty_char_domain'] = sum(
            domain.count(char) for char in special_chars)

        # Reorder to match the classifier's training-feature order.
        return {k: new_dataset[k] for k in features_list}