{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "PH13wfswmyDv" }, "outputs": [], "source": [ "#importing required packages for this module\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 392 }, "colab_type": "code", "id": "FF5vM84YriWc", "outputId": "34b29509-57f2-48c9-a862-db8390b6af1c" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "--2024-02-25 16:01:08-- http://data.phishtank.com/data/online-valid.csv\n", "Resolving data.phishtank.com (data.phishtank.com)... 2606:4700:8392:2ee:d039:14a:6810:654b, 104.16.101.75, 104.17.177.85\n", "Connecting to data.phishtank.com (data.phishtank.com)|2606:4700:8392:2ee:d039:14a:6810:654b|:80... connected.\n", "HTTP request sent, awaiting response... 301 Moved Permanently\n", "Location: https://data.phishtank.com/data/online-valid.csv [following]\n", "--2024-02-25 16:01:08-- https://data.phishtank.com/data/online-valid.csv\n", "Connecting to data.phishtank.com (data.phishtank.com)|2606:4700:8392:2ee:d039:14a:6810:654b|:443... connected.\n", "HTTP request sent, awaiting response... 429 Too Many Requests\n", "2024-02-25 16:01:09 ERROR 429: Too Many Requests.\n", "\n" ] } ], "source": [ "#Downloading the phishing URLs file\n", "!wget http://data.phishtank.com/data/online-valid.csv" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 305 }, "colab_type": "code", "id": "GaGVL9gYKXma", "outputId": "fad0a947-4996-44bf-d46f-89abc4306e62" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
phish_idurlphish_detail_urlsubmission_timeverifiedverification_timeonlinetarget
06557033http://u1047531.cp.regruhosting.ru/acces-inges...http://www.phishtank.com/phish_detail.php?phis...2020-05-09T22:01:43+00:00yes2020-05-09T22:03:07+00:00yesOther
16557032http://hoysalacreations.com/wp-content/plugins...http://www.phishtank.com/phish_detail.php?phis...2020-05-09T22:01:37+00:00yes2020-05-09T22:03:07+00:00yesOther
26557011http://www.accsystemprblemhelp.site/checkpoint...http://www.phishtank.com/phish_detail.php?phis...2020-05-09T21:54:31+00:00yes2020-05-09T21:55:38+00:00yesFacebook
36557010http://www.accsystemprblemhelp.site/login_atte...http://www.phishtank.com/phish_detail.php?phis...2020-05-09T21:53:48+00:00yes2020-05-09T21:54:34+00:00yesFacebook
46557009https://firebasestorage.googleapis.com/v0/b/so...http://www.phishtank.com/phish_detail.php?phis...2020-05-09T21:49:27+00:00yes2020-05-09T21:51:24+00:00yesMicrosoft
\n", "
" ], "text/plain": [ " phish_id ... target\n", "0 6557033 ... Other\n", "1 6557032 ... Other\n", "2 6557011 ... Facebook\n", "3 6557010 ... Facebook\n", "4 6557009 ... Microsoft\n", "\n", "[5 rows x 8 columns]" ] }, "execution_count": 3, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "#loading the phishing URLs data to dataframe\n", "data0 = pd.read_csv(\"online-valid.csv\")\n", "data0.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "mAZAvSe2n1oT", "outputId": "da2fbbb6-871f-4070-df86-cc9a135ac37a" }, "outputs": [ { "data": { "text/plain": [ "(14858, 8)" ] }, "execution_count": 4, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "data0.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 305 }, "colab_type": "code", "id": "9CTCI_EgERPM", "outputId": "cb74e74c-5591-4523-e077-bbf13ef89245" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
phish_idurlphish_detail_urlsubmission_timeverifiedverification_timeonlinetarget
06485787https://eevee.tv/Bootstrap/assets/css/acceshttp://www.phishtank.com/phish_detail.php?phis...2020-04-04T03:01:00+00:00yes2020-04-04T03:03:56+00:00yesOther
16422543https://appleid.apple.com-sa.pm/appleid/?http://www.phishtank.com/phish_detail.php?phis...2020-02-27T17:01:01+00:00yes2020-03-17T01:50:51+00:00yesOther
26543602https://grandcup.xyz/http://www.phishtank.com/phish_detail.php?phis...2020-05-02T23:07:29+00:00yes2020-05-02T23:09:03+00:00yesSteam
36528783https://villa-azzurro.com/onedrive/http://www.phishtank.com/phish_detail.php?phis...2020-04-25T20:54:02+00:00yes2020-04-25T21:46:55+00:00yesOther
46498136http://mygpstrip.net/ii/u.phphttp://www.phishtank.com/phish_detail.php?phis...2020-04-10T15:01:56+00:00yes2020-04-10T16:01:37+00:00yesOther
\n", "
" ], "text/plain": [ " phish_id url ... online target\n", "0 6485787 https://eevee.tv/Bootstrap/assets/css/acces ... yes Other\n", "1 6422543 https://appleid.apple.com-sa.pm/appleid/? ... yes Other\n", "2 6543602 https://grandcup.xyz/ ... yes Steam\n", "3 6528783 https://villa-azzurro.com/onedrive/ ... yes Other\n", "4 6498136 http://mygpstrip.net/ii/u.php ... yes Other\n", "\n", "[5 rows x 8 columns]" ] }, "execution_count": 5, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "#Collecting 5,000 Phishing URLs randomly\n", "phishurl = data0.sample(n = 5000, random_state = 12).copy()\n", "phishurl = phishurl.reset_index(drop=True)\n", "phishurl.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "-FOfv0bspc8N", "outputId": "48e76e11-37d7-4ba1-e04a-c2fa661e9219" }, "outputs": [ { "data": { "text/plain": [ "(5000, 8)" ] }, "execution_count": 6, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "phishurl.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 200 }, "colab_type": "code", "id": "0wkw4wGAsIbT", "outputId": "4395a2bd-dd8b-49ea-fb1e-36cf0b67e75f" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLs
0http://1337x.to/torrent/1110018/Blackhat-2015-...
1http://1337x.to/torrent/1122940/Blackhat-2015-...
2http://1337x.to/torrent/1124395/Fast-and-Furio...
3http://1337x.to/torrent/1145504/Avengers-Age-o...
4http://1337x.to/torrent/1160078/Avengers-age-o...
\n", "
" ], "text/plain": [ " URLs\n", "0 http://1337x.to/torrent/1110018/Blackhat-2015-...\n", "1 http://1337x.to/torrent/1122940/Blackhat-2015-...\n", "2 http://1337x.to/torrent/1124395/Fast-and-Furio...\n", "3 http://1337x.to/torrent/1145504/Avengers-Age-o...\n", "4 http://1337x.to/torrent/1160078/Avengers-age-o..." ] }, "execution_count": 7, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "#Loading legitimate files \n", "data1 = pd.read_csv(\"Benign_list_big_final.csv\")\n", "data1.columns = ['URLs']\n", "data1.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 200 }, "colab_type": "code", "id": "EQRtf9Ybs5sv", "outputId": "227e262b-1483-4549-8bdf-49da2f321b06" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLs
0http://graphicriver.net/search?date=this-month...
1http://ecnavi.jp/redirect/?url=http://www.cros...
2https://hubpages.com/signin?explain=follow+Hub...
3http://extratorrent.cc/torrent/4190536/AOMEI+B...
4http://icicibank.com/Personal-Banking/offers/o...
\n", "
" ], "text/plain": [ " URLs\n", "0 http://graphicriver.net/search?date=this-month...\n", "1 http://ecnavi.jp/redirect/?url=http://www.cros...\n", "2 https://hubpages.com/signin?explain=follow+Hub...\n", "3 http://extratorrent.cc/torrent/4190536/AOMEI+B...\n", "4 http://icicibank.com/Personal-Banking/offers/o..." ] }, "execution_count": 8, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "#Collecting 5,000 Legitimate URLs randomly\n", "legiurl = data1.sample(n = 5000, random_state = 12).copy()\n", "legiurl = legiurl.reset_index(drop=True)\n", "legiurl.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "colab_type": "code", "id": "QrpSRXzDuKwW", "outputId": "8b8e5220-be59-4893-9dd5-3ffc381d2b1d" }, "outputs": [ { "data": { "text/plain": [ "(5000, 1)" ] }, "execution_count": 9, "metadata": { "tags": [] }, "output_type": "execute_result" } ], "source": [ "legiurl.shape" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "Rk4HFWsEKXpS" }, "outputs": [], "source": [ "# importing required packages for this section\n", "from urllib.parse import urlparse,urlencode\n", "import ipaddress\n", "import re" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "S0QorYenhaOD" }, "outputs": [], "source": [ "# 1.Domain of the URL (Domain) \n", "def getDomain(url): \n", " domain = urlparse(url).netloc\n", " if re.match(r\"^www.\",domain):\n", " domain = domain.replace(\"www.\",\"\")\n", " return domain" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "SX-4mbq27QBj" }, "outputs": [], "source": [ "# 2.Checks for IP address in URL (Have_IP)\n", "def havingIP(url):\n", " try:\n", " ipaddress.ip_address(url)\n", " ip = 1\n", " except:\n", " ip = 0\n", " return ip\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "XZQZi3K17TcR" }, "outputs": [], "source": [ "# 3.Checks the presence of @ in URL (Have_At)\n", "def haveAtSign(url):\n", " if \"@\" in url:\n", " at = 1 \n", " else:\n", " at = 0 \n", " return at" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "fnQazil39Kra" }, "outputs": [], "source": [ "# 4.Finding the length of URL and categorizing (URL_Length)\n", "def getLength(url):\n", " if len(url) < 54:\n", " length = 0 \n", " else:\n", " length = 1 \n", " return length" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "yILgNFf_9L3X" }, "outputs": [], "source": [ "# 5.Gives number of '/' in URL (URL_Depth)\n", "def getDepth(url):\n", " s = urlparse(url).path.split('/')\n", " depth = 0\n", " for j in range(len(s)):\n", " if len(s[j]) != 0:\n", " depth = depth+1\n", " return depth" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "RIJEiq51BSy0" }, "outputs": [], "source": [ "# 6.Checking for redirection '//' in the url (Redirection)\n", "def redirection(url):\n", " pos = url.rfind('//')\n", " if pos > 6:\n", " if pos > 7:\n", " return 1\n", " else:\n", " return 0\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "h2vW23O1BbWl" }, "outputs": [], "source": [ "# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)\n", "def httpDomain(url):\n", " domain = urlparse(url).netloc\n", " if 'https' in domain:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "UdC9pUdTAVRU" }, "outputs": [], "source": [ "#listing shortening services\n", "shortening_services = r\"bit\\.ly|goo\\.gl|shorte\\.st|go2l\\.ink|x\\.co|ow\\.ly|t\\.co|tinyurl|tr\\.im|is\\.gd|cli\\.gs|\" \\\n", " r\"yfrog\\.com|migre\\.me|ff\\.im|tiny\\.cc|url4\\.eu|twit\\.ac|su\\.pr|twurl\\.nl|snipurl\\.com|\" \\\n", " r\"short\\.to|BudURL\\.com|ping\\.fm|post\\.ly|Just\\.as|bkite\\.com|snipr\\.com|fic\\.kr|loopt\\.us|\" \\\n", " r\"doiop\\.com|short\\.ie|kl\\.am|wp\\.me|rubyurl\\.com|om\\.ly|to\\.ly|bit\\.do|t\\.co|lnkd\\.in|db\\.tt|\" \\\n", " r\"qr\\.ae|adf\\.ly|goo\\.gl|bitly\\.com|cur\\.lv|tinyurl\\.com|ow\\.ly|bit\\.ly|ity\\.im|q\\.gs|is\\.gd|\" \\\n", " r\"po\\.st|bc\\.vc|twitthis\\.com|u\\.to|j\\.mp|buzurl\\.com|cutt\\.us|u\\.bb|yourls\\.org|x\\.co|\" \\\n", " r\"prettylinkpro\\.com|scrnch\\.me|filoops\\.info|vzturl\\.com|qr\\.net|1url\\.com|tweez\\.me|v\\.gd|\" \\\n", " r\"tr\\.im|link\\.zip\\.net\"" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "IUkU9UbbnKpY" }, "outputs": [], "source": [ "# 8. Checking for Shortening Services in URL (Tiny_URL)\n", "def tinyURL(url):\n", " match=re.search(shortening_services,url)\n", " if match:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "vLyjiIUgPjuw" }, "outputs": [], "source": [ "# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)\n", "def prefixSuffix(url):\n", " if '-' in urlparse(url).netloc:\n", " return 1 # phishing\n", " else:\n", " return 0 # legitimate" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 232 }, "colab_type": "code", "id": "NbkEYJ_JOVa7", "outputId": "f08b25f8-3852-432c-e141-8eb57ff916d8" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Collecting python-whois\n", "\u001b[?25l Downloading https://files.pythonhosted.org/packages/f0/ab/11c2d01db2554bbaabb2c32b06b6a73f7277372533484c320c78a304dfd7/python-whois-0.7.2.tar.gz (90kB)\n", "\r\u001b[K |███▋ | 10kB 24.0MB/s eta 0:00:01\r\u001b[K |███████▎ | 20kB 6.5MB/s eta 0:00:01\r\u001b[K |███████████ | 30kB 6.8MB/s eta 0:00:01\r\u001b[K |██████████████▋ | 40kB 7.8MB/s eta 0:00:01\r\u001b[K |██████████████████▏ | 51kB 7.6MB/s eta 0:00:01\r\u001b[K |█████████████████████▉ | 61kB 8.6MB/s eta 0:00:01\r\u001b[K |█████████████████████████▌ | 71kB 8.4MB/s eta 0:00:01\r\u001b[K |█████████████████████████████▏ | 81kB 9.3MB/s eta 0:00:01\r\u001b[K |████████████████████████████████| 92kB 5.5MB/s \n", "\u001b[?25hRequirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from python-whois) (0.16.0)\n", "Building wheels for collected packages: python-whois\n", " Building wheel for python-whois (setup.py) ... \u001b[?25l\u001b[?25hdone\n", " Created wheel for python-whois: filename=python_whois-0.7.2-cp36-none-any.whl size=85245 sha256=900afbc18f144913762a57978778098dda65b687b3b5a1f14f7998e9631564e8\n", " Stored in directory: /root/.cache/pip/wheels/69/e6/62/1e6a746ca8e690f472611511b6948c325b232aaf693245ce46\n", "Successfully built python-whois\n", "Installing collected packages: python-whois\n", "Successfully installed python-whois-0.7.2\n" ] } ], "source": [ "%pip install python-whois" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "esZ7FcvlOMZu" }, "outputs": [], "source": [ "# importing required packages for this section\n", "import re\n", "from bs4 import BeautifulSoup\n", "import whois\n", "import urllib\n", "import urllib.request\n", "from datetime import datetime" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "8O5D1jH0IDgf" }, "outputs": [], "source": [ "# 11.DNS Record availability (DNS_Record)\n", "# obtained in the featureExtraction function itself" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "mtwQiRotZ2GD" }, "outputs": [], "source": [ "# 12.Web traffic (Web_Traffic)\n", "def web_traffic(url):\n", " try:\n", " #Filling the whitespaces in the URL if any\n", " url = urllib.parse.quote(url)\n", " rank = BeautifulSoup(urllib.request.urlopen(\"http://data.alexa.com/data?cli=10&dat=s&url=\" + url).read(), \"xml\").find(\n", " \"REACH\")['RANK']\n", " rank = int(rank)\n", " except TypeError:\n", " return 1\n", " if rank <100000:\n", " return 1\n", " else:\n", " return 0" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "li03hqJgH__j" }, "outputs": [], "source": [ "# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age) \n", "def domainAge(domain_name):\n", " creation_date = domain_name.creation_date\n", " expiration_date = domain_name.expiration_date\n", " if (isinstance(creation_date,str) or isinstance(expiration_date,str)):\n", " try:\n", " creation_date = datetime.strptime(creation_date,'%Y-%m-%d')\n", " expiration_date = datetime.strptime(expiration_date,\"%Y-%m-%d\")\n", " except:\n", " return 1\n", " if ((expiration_date is None) or (creation_date is None)):\n", " return 1\n", " elif ((type(expiration_date) is list) or (type(creation_date) is list)):\n", " return 1\n", " else:\n", " ageofdomain = abs((expiration_date - creation_date).days)\n", " if ((ageofdomain/30) < 6):\n", " age = 1\n", " else:\n", " age = 0\n", " return age" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "NueO81-ttKYd" }, "outputs": [], "source": [ "# 14.End time of domain: The difference between termination time and current time (Domain_End) \n", "def domainEnd(domain_name):\n", " expiration_date = domain_name.expiration_date\n", " if isinstance(expiration_date,str):\n", " try:\n", " expiration_date = datetime.strptime(expiration_date,\"%Y-%m-%d\")\n", " except:\n", " return 1\n", " if (expiration_date is None):\n", " return 1\n", " elif (type(expiration_date) is list):\n", " return 1\n", " else:\n", " today = datetime.now()\n", " end = abs((expiration_date - today).days)\n", " if ((end/30) < 6):\n", " end = 0\n", " else:\n", " end = 1\n", " return end" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "lw0JmOGEQPwb" }, "outputs": [], "source": [ "# importing required packages for this section\n", "import requests" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "F2gpZEMSQGpu" }, "outputs": [], "source": [ "# 15. IFrame Redirection (iFrame)\n", "def iframe(response):\n", " if response == \"\":\n", " return 1\n", " else:\n", " if re.findall(r\"[