Hussain033 committed on
Commit
ad60c0d
1 Parent(s): f923379

Create extractorFunctions.py

Browse files
Files changed (1) hide show
  1. extractorFunctions.py +175 -0
extractorFunctions.py ADDED
@@ -0,0 +1,175 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # importing required packages for Address Bar Based feature Extraction
2
+ from urllib.parse import urlparse,urlencode, unquote
3
+ import re
4
+ # importing required packages for Domain Based Feature Extraction
5
+ import whois
6
+ from datetime import datetime
7
+
8
+
9
+ # 2.Checks for IP address in URL (Have_IP)
10
def havingIP(url):
    """Report whether the URL text contains an IPv4-looking dotted quad.

    Returns 1 when four dot-separated groups of one to three digits occur
    anywhere in ``url`` (octet ranges are not validated), otherwise 0.
    """
    dotted_quad = r"\b(?:\d{1,3}\.){3}\d{1,3}\b"
    return 1 if re.search(dotted_quad, url) is not None else 0
16
+
17
+ # 3.Checks the presence of @ in URL (Have_At)
18
def haveAtSign(url):
    """Return 1 if the URL contains an '@' character anywhere, else 0."""
    return 1 if "@" in url else 0
24
+
25
+ # 4.Finding the length of URL (URL_Length)
26
def getLength(url):
    """Return the number of characters in ``url``."""
    url_length = len(url)
    return url_length
28
+
29
+ # 5.Gives the number of non-empty path segments in URL (URL_Depth)
30
def getDepth(url):
    """Count the non-empty '/'-separated segments of the URL's path component.

    Only the parsed path is considered; the query string and fragment do not
    contribute, and empty segments (from leading, trailing or doubled
    slashes) are skipped.
    """
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)
37
+
38
+ #listing shortening services
39
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

# Anchored form of the list above: a service name must be the whole host or a
# dot-separated suffix of it. Searching the raw URL instead makes "t\.co"
# match "microsoft.com" and "x\.co" match "netflix.com".
_shortener_host_re = re.compile(r"(?:^|\.)(?:" + shortening_services + r")$", re.IGNORECASE)


# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    """Return 1 if the URL's host is a known link-shortening service, else 0.

    Only the hostname is inspected, so a shortener name that merely appears
    inside another domain, the path, or the query string no longer counts.
    URLs given without a scheme (e.g. ``bit.ly/abc``) are still recognised.
    URLs that cannot be parsed yield 0.

    NOTE(review): this is stricter than the previous whole-URL substring
    search; any model fitted on the old feature values should be re-checked.
    """
    try:
        host = urlparse(url).hostname
        if not host:
            # No scheme: urlparse puts everything in the path, so re-parse
            # the text as a network-location reference.
            host = urlparse("//" + url).hostname
    except ValueError:
        # urlparse rejects some malformed authorities (e.g. bad IPv6 brackets).
        host = None
    if not host:
        return 0
    return 1 if _shortener_host_re.search(host) else 0
55
+
56
+ # 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
57
def prefixSuffix(url):
    """Return 1 (phishing) if the URL's network location contains '-', else 0 (legitimate)."""
    network_location = urlparse(url).netloc
    has_hyphen = '-' in network_location
    return 1 if has_hyphen else 0
62
+
63
def no_of_dots(url):
    """Return how many '.' characters appear in ``url``."""
    return sum(1 for ch in url if ch == '.')
65
+
66
# Keywords whose presence in a host name is treated as a phishing signal.
# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated, which previously fused "notice"+"myaccount" and
# "member"+"personal" into two words that never matched anything.
# NOTE(review): "ebyisapi" is kept as found; it may be a typo - confirm.
sensitiveWords = [
    "account", "confirm", "banking", "secure", "ebyisapi", "webscr", "signin", "mail",
    "install", "toolbar", "backup", "paypal", "password", "username", "verify", "update",
    "login", "support", "billing", "transaction", "security", "payment", "online",
    "customer", "service", "accountupdate", "verification", "important", "confidential",
    "limited", "access", "securitycheck", "verifyaccount", "information", "change", "notice",
    "myaccount", "updateinfo", "loginsecure", "protect", "identity", "member",
    "personal", "actionrequired", "loginverify", "validate", "paymentupdate", "urgent",
]


def sensitive_word(url):
    """Return 1 if the URL's network location contains any sensitive keyword, else 0.

    Only the netloc part is searched (not the path or query). The comparison
    is case-insensitive because host names are case-insensitive while
    ``urlparse`` preserves the case it was given.
    """
    domain = urlparse(url).netloc.lower()
    return 1 if any(word in domain for word in sensitiveWords) else 0
80
+
81
+
82
def has_unicode(url):
    """Return 1 if the URL's host is non-ASCII, punycode, or percent-encoded, else 0.

    The netloc is flagged when any of the following holds:
    * it contains a non-ASCII character;
    * IDNA-decoding it changes it (i.e. it carries ``xn--`` punycode labels)
      or fails because a label is malformed;
    * percent-unquoting it changes it.

    Previously a host containing non-ASCII characters raised a UnicodeError
    from the encode/decode round trip instead of being reported.
    """
    netloc = urlparse(url).netloc

    try:
        raw_netloc = netloc.encode('ascii')
    except UnicodeEncodeError:
        # Literal non-ASCII characters in the host.
        return 1

    try:
        decoded_netloc = raw_netloc.decode('idna')
    except UnicodeError:
        # Malformed punycode label: cannot be decoded, so treat it as suspicious.
        return 1

    # Any difference means punycode was expanded or %-escapes were present.
    return 1 if unquote(decoded_netloc) != netloc else 0
100
+
101
+ # 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)
102
def domainAge(domain_name):
    """Return 1 if the span between creation and expiration is under ~6 months, else 0.

    ``domain_name`` is an object exposing ``creation_date`` and
    ``expiration_date`` attributes. Each value may be a ``datetime`` or a
    ``'%Y-%m-%d'`` string; each is parsed independently, so one string and
    one ``datetime`` are accepted together. The result is 1 when either value
    is missing (None), is a list, or is an unparseable string.
    """
    parsed = []
    for value in (domain_name.creation_date, domain_name.expiration_date):
        if isinstance(value, str):
            try:
                value = datetime.strptime(value, "%Y-%m-%d")
            except ValueError:
                # Not in the expected YYYY-MM-DD form.
                return 1
        if value is None or isinstance(value, list):
            return 1
        parsed.append(value)

    creation_date, expiration_date = parsed
    lifetime_days = abs((expiration_date - creation_date).days)
    # A month is approximated as 30 days.
    return 1 if (lifetime_days / 30) < 6 else 0
122
+
123
+ # 14.End time of domain: The difference between termination time and current time (Domain_End)
124
def domainEnd(domain_name):
    """Return 0 if the expiration date is within ~6 months of now, else 1.

    ``domain_name`` is an object exposing an ``expiration_date`` attribute,
    either a ``datetime`` or a ``'%Y-%m-%d'`` string. The result is 1 when
    the value is missing (None), is a list, or is an unparseable string.

    The distance to the expiration date is taken as an absolute value, so a
    date in the past is measured the same way as one in the future.
    NOTE(review): confirm that treating already-expired domains this way is
    intended.
    """
    expiration_date = domain_name.expiration_date
    if isinstance(expiration_date, str):
        try:
            expiration_date = datetime.strptime(expiration_date, "%Y-%m-%d")
        except ValueError:
            # Not in the expected YYYY-MM-DD form.
            return 1
    if expiration_date is None or isinstance(expiration_date, list):
        return 1

    days_to_expiry = abs((expiration_date - datetime.now()).days)
    # A month is approximated as 30 days.
    return 0 if (days_to_expiry / 30) < 6 else 1
143
+
144
+ # 15. IFrame Redirection (iFrame)
145
def iframe(response):
    """Return 0 if the page markup mentions an iframe tag or frameborder attribute, else 1.

    ``response`` is either the empty string (no response available, giving 1)
    or an object with a ``text`` attribute holding the page source.

    The previous pattern was written inside square brackets, which made it a
    character class matching any single one of the characters
    ``<iframe>|Bod``; practically every page matched. It is now a real
    alternation, matched case-insensitively.

    NOTE(review): the 0/1 mapping is kept exactly as it was written (marker
    present gives 0); confirm that this is the polarity the consumer of this
    feature expects.
    """
    if response == "":
        return 1
    if re.search(r"<iframe\b|\bframeborder\b", response.text, re.IGNORECASE):
        return 0
    return 1
153
+
154
+ # 16.Checks the effect of mouse over on status bar (Mouse_Over)
155
def mouseOver(response):
    """Return 1 if a ``<script>`` block on one line contains ``onmouseover``, else 0.

    ``response`` is either the empty string (no response available, giving 1)
    or an object with a ``text`` attribute holding the page source. If the
    text cannot be read or searched, the result is 1.

    The pattern only recognises an attribute-less ``<script>`` tag, and ``.``
    does not cross line breaks, so the whole script must sit on a single line.
    """
    if response == "":
        return 1
    try:
        found = re.search("<script>.+onmouseover.+</script>", response.text)
    except Exception:
        # Best effort: an unreadable response is scored like a missing one,
        # without swallowing KeyboardInterrupt/SystemExit as a bare except did.
        return 1
    return 1 if found else 0
166
+
167
+ # 18.Checks the number of forwardings (Web_Forwards)
168
def forwarding(response):
    """Return 1 if the response went through more than two redirects, else 0.

    ``response`` is either the empty string (no response available, giving 1)
    or an object whose ``history`` attribute is a sized collection of the
    intermediate responses.
    """
    if response == "":
        return 1
    hop_count = len(response.history)
    return 0 if hop_count <= 2 else 1