akshatsanghvi commited on
Commit
837d4e1
1 Parent(s): feced93

Update file

Browse files
Files changed (1) hide show
  1. URLFeatureExtraction.py +260 -0
URLFeatureExtraction.py ADDED
@@ -0,0 +1,260 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import urllib
2
+ import ipaddress
3
+ import re
4
+ import socket
5
+ from bs4 import BeautifulSoup
6
+ import whois
7
+ import requests
8
+ import urllib.request
9
+ from urllib.parse import urlparse
10
+ from datetime import datetime
11
+
12
def havingIP(url):
    """Return 1 when *url* is a bare IP-address literal (IPv4 or IPv6), else 0.

    Phishing URLs sometimes use a raw IP address instead of a domain name.

    NOTE(review): this only recognises the string when it is *exactly* an IP
    literal — an IP embedded in a full URL (e.g. http://1.2.3.4/login) is NOT
    detected; confirm whether callers pre-extract the host first.
    """
    try:
        ipaddress.ip_address(url)
        return 1
    # Was a bare `except:`; ipaddress.ip_address raises only ValueError for
    # non-IP strings, and a bare except would also hide real bugs (e.g. a
    # None url raising TypeError would silently become "not an IP").
    except ValueError:
        return 0
19
+
20
def haveAtSign(url):
    """Return 1 if the URL contains an '@' character, else 0.

    Browsers ignore everything before '@' in a URL, a classic phishing trick.
    """
    return 1 if "@" in url else 0
26
+
27
+
28
def getLength(url):
    """Return 1 for URLs shorter than 54 characters, else 0.

    54 is the conventional threshold used in phishing-URL literature; longer
    URLs are treated as suspicious by this feature's consumers.
    """
    return 1 if len(url) < 54 else 0
33
+
34
def getDepth(url):
    """Return the number of non-empty '/'-separated segments in the URL path.

    e.g. 'http://a.com/x/y' -> 2; a bare domain (empty path) -> 0.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
41
+
42
def redirection(url):
    """Return 1 when '//' appears after the scheme separator, else 0.

    The legitimate '//' of 'http://' ends at index 6 and of 'https://' at
    index 7, so any rightmost '//' past index 7 indicates an embedded
    redirect target inside the URL.
    """
    return 1 if url.rfind('//') > 7 else 0
51
+
52
def httpDomain(url):
    """Return 1 when the literal text 'https' occurs inside the network
    location (host) part of the URL, else 0.

    Phishers embed 'https' in the domain itself (e.g. https-login.bank.com)
    to fake a secure look; note this deliberately inspects the netloc, not
    the scheme.
    """
    host = urlparse(url).netloc
    return 1 if 'https' in host else 0
58
+
59
# Alternation regex of known URL-shortening services; a hit on any of them
# marks the URL as shortened (and therefore hiding its real destination).
shortening_services = (
    r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|"
    r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|"
    r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|"
    r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|"
    r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|"
    r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|"
    r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|"
    r"tr\.im|link\.zip\.net"
)

def tinyURL(url):
    """Return 1 when the URL mentions a known shortening service, else 0."""
    return 1 if re.search(shortening_services, url) else 0
74
+
75
+
76
def prefixSuffix(url):
    """Return 1 when the host part of the URL contains a hyphen, else 0.

    Hyphenated domains (e.g. paypal-secure.com) are a common phishing
    pattern; hyphens elsewhere in the URL (path, query) are ignored.
    """
    return 1 if '-' in urlparse(url).netloc else 0
81
+
82
def web_traffic(url):
    """Heuristic popularity check via a Google `site:` search.

    Returns 1 when the search appears to have results, 0 when Google reports
    no match for the site or the lookup fails for any reason.

    NOTE(review): this scrapes Google's HTML result page and keys off the
    'BNeawe' CSS class and the 'did not match' snippet text — fragile and
    subject to rate limiting / markup changes; confirm this is acceptable.
    """
    try:
        query = urllib.parse.quote(url)
        search_url = f"https://www.google.com/search?q=site:{query}"

        # A browser-like User-Agent reduces the chance of Google serving a
        # bot-block / consent page instead of results.
        headers = {'User-Agent': 'Mozilla/5.0'}
        req = urllib.request.Request(search_url, headers=headers)
        response = urllib.request.urlopen(req).read()
        soup = BeautifulSoup(response, "lxml")

        # 'BNeawe' is the class Google currently uses for snippet text blocks.
        results = soup.find_all('div', class_='BNeawe')

        for result in results:
            if 'did not match' in result.get_text():
                # Google explicitly reported zero results for this site.
                return 0

        return 1

    except Exception as e:
        # Network error, parse error, etc. — treat as "no traffic evidence".
        print(f"Error: {e}")
        return 0
103
+
104
def domainAge(domain_name):
    """Return 1 (suspicious) when the WHOIS record spans < 6 months between
    creation and expiration, or when the record is unusable; else 0.

    *domain_name* is a WHOIS result object exposing `creation_date` and
    `expiration_date` (datetime, ISO 'YYYY-MM-DD' string, list, or None).
    """
    created = domain_name.creation_date
    expires = domain_name.expiration_date
    # Some registrars hand back ISO date strings; normalise to datetime.
    if isinstance(created, str) or isinstance(expires, str):
        try:
            created = datetime.strptime(created, '%Y-%m-%d')
            expires = datetime.strptime(expires, "%Y-%m-%d")
        except:
            # Unparseable (or mixed str/None) record -> flag as suspicious.
            return 1
    # Missing or ambiguous (multi-valued) dates are also flagged.
    if created is None or expires is None:
        return 1
    if type(created) is list or type(expires) is list:
        return 1
    lifetime_days = abs((expires - created).days)
    return 1 if (lifetime_days / 30) < 6 else 0
124
+
125
def domainEnd(domain_name):
    """Return 0 when the domain expires within ~6 months of now, 1 when it
    expires later or the expiration date is unusable.

    *domain_name* is a WHOIS result object whose `expiration_date` may be a
    datetime, an ISO 'YYYY-MM-DD' string, a list, or None.
    """
    expires = domain_name.expiration_date
    # Normalise string dates from some registrars to datetime.
    if isinstance(expires, str):
        try:
            expires = datetime.strptime(expires, "%Y-%m-%d")
        except:
            return 1
    # Missing or multi-valued expiration -> treat as the "long" bucket.
    if expires is None:
        return 1
    if type(expires) is list:
        return 1
    remaining_days = abs((expires - datetime.now()).days)
    return 0 if (remaining_days / 30) < 6 else 1
144
+
145
def iframe(response):
    """Return 0 when the fetched page contains an <iframe>/frameBorder tag
    (page renders frames), 1 when it does not or the page could not be
    fetched (response == "").

    *response* is either "" (fetch failed upstream) or an object with a
    `.text` attribute holding the page HTML.
    """
    if response == "":
        return 1
    # BUG FIX: the original pattern r"[<iframe>|<frameBorder>]" was a
    # character CLASS — it matched ANY single character among
    # '<', 'i', 'f', 'r', 'a', 'm', 'e', '>', '|', 'B', 'o', 'd' — so it
    # fired on essentially every page. Use a real alternation matching the
    # opening of the tags themselves.
    if re.findall(r"<iframe|<frameBorder", response.text):
        return 0
    return 1
153
+
154
def mouseOver(response):
    """Return 1 when the page's scripts use an onmouseover handler (a
    status-bar-spoofing trick) or when the page could not be fetched
    (response == ""); else 0.

    *response* is "" or an object with a `.text` attribute (page HTML).
    """
    if response == "":
        return 1
    hits = re.findall("<script>.+onmouseover.+</script>", response.text)
    return 1 if hits else 0
162
+
163
def rightClick(response):
    """Return 0 when the page's JavaScript checks for the right mouse button
    (`event.button == 2`, typically to disable right-click), 1 otherwise or
    when the page could not be fetched (response == "").

    *response* is "" or an object with a `.text` attribute (page HTML).
    """
    if response == "":
        return 1
    blocked = re.findall(r"event.button ?== ?2", response.text)
    return 0 if blocked else 1
171
+
172
def forwarding(response):
    """Return 1 when the request was redirected more than twice, 0 for at
    most two redirects, and 1 when the page could not be fetched
    (response == "").

    *response* is "" or an object with a `.history` redirect list (as on a
    requests.Response).
    """
    if response == "":
        return 1
    redirect_count = len(response.history)
    return 1 if redirect_count > 2 else 0
180
+
181
# Module-level flag written by featureExtraction(): 0 when WHOIS returned a
# domain record for the URL, 1 otherwise.
# NOTE(review): written but never read in this file — presumably consumed by
# an importing module; confirm before removing.
state = 0
def featureExtraction(url):
    """Fetch *url* and return its 16-element phishing feature vector.

    The vector order is: havingIP, haveAtSign, getLength, getDepth,
    redirection, httpDomain, tinyURL, prefixSuffix, dns, web_traffic,
    domainAge, domainEnd, iframe, mouseOver, rightClick, forwarding.

    Performs live network I/O: an HTTP GET (with https:// / http:// fallback
    prefixes), a WHOIS lookup, and a DNS resolution.
    """

    new_url = url
    # Try the URL as given; if the scheme is missing or the fetch fails,
    # retry with 'https://' then 'http://' prepended. A final failure leaves
    # response == "", which the response-based features treat as "no page".
    try:
        response = requests.get(new_url)

    except:
        try:
            new_url = 'https://' + url
            response = requests.get(new_url)

        except:
            try:
                new_url = 'http://' + url
                response = requests.get(new_url)

            except:
                response = ""

    # From here on, use whichever variant of the URL actually fetched.
    url = new_url
    print("URL", url)

    # URL-string features (no network access needed).
    features = []
    features.append(havingIP(url))
    features.append(haveAtSign(url))
    features.append(getLength(url))
    features.append(getDepth(url))
    features.append(redirection(url))
    features.append(httpDomain(url))
    features.append(tinyURL(url))
    features.append(prefixSuffix(url))

    # Domain features: WHOIS record + DNS resolution. Any failure in this
    # block (no WHOIS record, unresolvable host, network error) yields
    # dns = 1, which also forces the age/end features to 1 below.
    try:
        global state

        domain_name = whois.whois(urlparse(url).netloc)

        if domain_name.get('domain_name'):
            state = 0

        else:
            state = 1
        # dns == 0 when the registered name resolves.
        # NOTE(review): gethostbyname returns a non-empty string on success,
        # so the 0-branch is taken whenever resolution does not raise.
        dns = 0 if socket.gethostbyname(domain_name.domain_name[0]) else 1
    except:
        dns = 1

    features.append(dns)
    features.append(web_traffic(url))
    # domain_name is only defined when the try-block succeeded (dns == 0),
    # so guard the WHOIS-based features on dns.
    features.append(1 if dns == 1 else domainAge(domain_name))
    features.append(1 if dns == 1 else domainEnd(domain_name))

    # Page-content features; each treats response == "" as "unavailable".
    features.append(iframe(response))
    features.append(mouseOver(response))
    features.append(rightClick(response))
    features.append(forwarding(response))

    return features
239
+
240
# Dataset column names. NOTE(review): 'Domain' and 'Label' bracket the 16
# values produced by featureExtraction(); the function itself returns only
# the middle 16 entries, so callers are expected to prepend the domain and
# append the label themselves — confirm against the dataset-building code.
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection',
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic',
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards', 'Label']
243
+
244
+ # I @ L D R D t P D T A E i M R F L
245
+ # . . . . . .
246
+
247
+ # 0,0,1,3,0,0,0,0,0,1,0,1,0,0,1,0 0
248
+ # 0,0,1,1,0,0,0,0,0,1,0,1,0,0,1,0 Y
249
+ # 0,0,0,1,0,0,0,0,1,0,1,1,0,0,1,0 -
250
+
251
+ # . .
252
+ # 0,0,1,1,0,0,0,0,1,1,1,1,1,1,1,1 0
253
+ # 0,0,1,0,0,0,0,0,0,1,1,1,0,0,1,0
254
+ # 0,0,0,0,0,0,0,0,1,0,1,1,0,0,1,0 -
255
+
256
+ # 0,0,1,3,0,0,0,0,0,0,1,1,0,0,1,0 1
257
+ # 0,0,1,0,0,0,0,0,0,0,1,1,0,0,1,0
258
+ # 0,0,0,0,0,0,0,0,1,1,1,1,0,0,1,0 -
259
+
260
+ # Prints : site. history. array. pred.