Spaces:

Bloodlyghoul
/

Pythonbot

Running

App Files Files Community

Bloodlyghoul commited on 23 days ago

Commit

0ade82b

verified ·

1 Parent(s): ec981c6

Upload 8 files

Browse files

Files changed (8) hide show

WEB SCRAPING.jpg +0 -0
Web Scraping with BeautifulSoup.ipynb +400 -0
Web Scraping with BeautifulSoup.py +127 -0
readme.md +37 -0
requirement.txt +24 -3
scrap wikipedia.png +0 -0
scraped_data.json +0 -0
web_scraping_command_line_tool.py +152 -0

WEB SCRAPING.jpg ADDED Viewed

Web Scraping with BeautifulSoup.ipynb ADDED Viewed

	@@ -0,0 +1,400 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#Requirements\n",
+    "#pip3 install requests\n",
+    "#pip3 install bs4"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Basic fundamentals of web scraping"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 49,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "this is with html tags : <title>Easy Python – A programming language of revolution</title>\n",
+      "this is without html tags: Easy Python\n",
+      "<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n"
+     ]
+    }
+   ],
+   "source": [
+    "# import these two modules bs4 for selecting HTML tags easily\n",
+    "from bs4 import BeautifulSoup\n",
+    "# requests module is easy to operate some people use urllib but I prefer this one because it is easy to use.\n",
+    "import requests\n",
+    "\n",
+    "# I put here my own blog url ,you can change it.\n",
+    "url=\"https://getpython.wordpress.com/\"\n",
+    "\n",
+    "#Requests module use to data from given url\n",
+    "source=requests.get(url)\n",
+    "\n",
+    "# BeautifulSoup is used for getting HTML structure from requests response.(craete your soup)\n",
+    "soup=BeautifulSoup(source.text,'html')\n",
+    "\n",
+    "# Find function is used to find a single element if there are more than once it always returns the first element.\n",
+    "title=soup.find('title') # place your html tagg in parentheses that you want to find from html.\n",
+    "print(\"this is with html tags :\",title)\n",
+    "\n",
+    "qwery=soup.find('h1') # here i find first h1 tagg in my website using find operation.\n",
+    "\n",
+    "#use .text for extract only text without any html tags\n",
+    "print(\"this is without html tags:\",qwery.text) \n",
+    "\n",
+    "\n",
+    "links=soup.find('a') #i extarcted link using \"a\" tag\n",
+    "print(links)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extract data from inner HTML"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "#content\n"
+     ]
+    }
+   ],
+   "source": [
+    "# here i extarcted href data from anchor tag.\n",
+    "print(links['href']) "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 42,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['screen-reader-text', 'skip-link']\n"
+     ]
+    }
+   ],
+   "source": [
+    "# similarly i got class details from a anchor tag\n",
+    "print(links['class'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## find_all operation in bs4"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 51,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total links in my website : 37\n",
+      "\n",
+      "<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n",
+      "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
+      "<div class=\"cover\"></div>\n",
+      "</a>\n",
+      "<a class=\"screen-reader-text search-toggle\" href=\"#search-container\">Search</a>\n",
+      "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">Easy Python</a>\n",
+      "<a aria-current=\"page\" href=\"/\">Home</a>\n",
+      "<a href=\"https://getpython.wordpress.com/contact/\">Contact</a>\n"
+     ]
+    }
+   ],
+   "source": [
+    "# findall function is used to fetch all tags at a single time.\n",
+    "many_link=soup.find_all('a') # here i extracted all the anchor tags of my website\n",
+    "total_links=len(many_link) # len function is use to calculate length of your array\n",
+    "print(\"total links in my website :\",total_links)\n",
+    "print()\n",
+    "for i in many_link[:6]: # here i use slicing to fetch only first 6 links from rest of them.\n",
+    "    print(i)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
+      "<div class=\"cover\"></div>\n",
+      "</a>\n",
+      "\n",
+      "href is : https://getpython.wordpress.com/\n"
+     ]
+    }
+   ],
+   "source": [
+    "second_link=many_link[1] #here i fetch second link which place on 1 index number in many_links.\n",
+    "print(second_link)\n",
+    "print()\n",
+    "print(\"href is :\",second_link['href']) #only href link is extracted from ancor tag\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<div class=\"cover\"></div>\n",
+      "\n",
+      "['cover']\n",
+      "<class 'list'>\n",
+      "\n",
+      "class name of div is : cover\n"
+     ]
+    }
+   ],
+   "source": [
+    "# select div tag from second link\n",
+    "nested_div=second_link.find('div')\n",
+    "# As you can see div element extarcted , it also have inner elements\n",
+    "print(nested_div)\n",
+    "print()\n",
+    "#here i extracted class element from div but it give us in the form of list\n",
+    "z=(nested_div['class'])\n",
+    "print(z)\n",
+    "print(type(z))\n",
+    "print()\n",
+    "#  \" \" .join () method use to convert list type  into string type\n",
+    "print(\"class name of div is :\",\" \".join(nested_div['class'])) "
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Scrape data from Wikipedia"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 60,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<title>World War II - Wikipedia</title>\n"
+     ]
+    }
+   ],
+   "source": [
+    "wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
+    "soup=BeautifulSoup(wiki.text,'html')\n",
+    "print(soup.find('title'))\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### find html tags with classes"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Contents\n",
+      "\n",
+      "1 Chronology\n",
+      "2 Background\n",
+      "\n",
+      "2.1 Europe\n",
+      "2.2 Asia\n",
+      "\n",
+      "\n",
+      "3 Pre-war events\n",
+      "\n",
+      "3.1 Italian invasion of Ethiopia (1935)\n",
+      "3.2 Spanish Civil War (1936–1939)\n",
+      "3.3 Japanese invasion of China (1937)\n",
+      "3.4 Soviet–Japanese border conflicts\n",
+      "3.5 European occupations and agreements\n",
+      "\n",
+      "\n",
+      "4 Course of the war\n",
+      "\n",
+      "4.1 War breaks out in Europe (1939–40)\n",
+      "4.2 Western Europe (1940–41)\n",
+      "4.3 Mediterranean (1940–41)\n",
+      "4.4 Axis attack on the Soviet Union (1941)\n",
+      "4.5 War breaks out in the Pacific (1941)\n",
+      "4.6 Axis advance stalls (1942–43)\n",
+      "\n",
+      "4.6.1 Pacific (1942–43)\n",
+      "4.6.2 Eastern Front (1942–43)\n",
+      "4.6.3 Western Europe/Atlantic and Mediterranean (1942–43)\n",
+      "\n",
+      "\n",
+      "4.7 Allies gain momentum (1943–44)\n",
+      "4.8 Allies close in (1944)\n",
+      "4.9 Axis collapse, Allied victory (1944–45)\n",
+      "\n",
+      "\n",
+      "5 Aftermath\n",
+      "6 Impact\n",
+      "\n",
+      "6.1 Casualties and war crimes\n",
+      "6.2 Genocide, concentration camps, and slave labour\n",
+      "6.3 Occupation\n",
+      "6.4 Home fronts and production\n",
+      "6.5 Advances in technology and warfare\n",
+      "\n",
+      "\n",
+      "7 See also\n",
+      "8 Notes\n",
+      "9 Citations\n",
+      "10 References\n",
+      "11 External links\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "ww2_contents=soup.find_all(\"div\",class_='toc')\n",
+    "for i in ww2_contents:\n",
+    "    print(i.text)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 68,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "World War II(clockwise from top left)\n",
+      "Chinese forces in the Battle of Wanjialing\n",
+      "Australian 25-pounder guns during the First Battle of El Alamein\n",
+      "German Stuka dive bombers on the Eastern Front in December 1943\n",
+      "American naval force in the Lingayen Gulf\n",
+      "Wilhelm Keitel signing the German Instrument of Surrender\n",
+      "Soviet troops in the Battle of Stalingrad\n",
+      "Date1 September 1939 – 2 September 1945 (1939-09-01 – 1945-09-02)(6 years and 1 day)[a]LocationEurope, Pacific, Atlantic, South-East Asia, China, Middle East, Mediterranean, North Africa, Horn of Africa, Australia, briefly North and South AmericaResult\n",
+      "Allied victory\n",
+      "Collapse of Nazi Germany\n",
+      "Fall of the Japanese and Italian Empires\n",
+      "Beginning of the Nuclear Age\n",
+      "Dissolution of the League of Nations\n",
+      "Creation of the United Nations\n",
+      "Emergence of the United States and the Soviet Union as rival superpowers\n",
+      "Beginning of the Cold War (more...)Participants\n",
+      "Allies\n",
+      "AxisCommanders and leaders\n",
+      "Main Allied leaders\n",
+      " Joseph Stalin\n",
+      " Franklin D. Roosevelt\n",
+      " Winston Churchill\n",
+      " Chiang Kai-shek\n",
+      "\n",
+      "Main Axis leaders\n",
+      " Adolf Hitler\n",
+      " Hirohito\n",
+      " Benito Mussolini\n",
+      "Casualties and losses\n",
+      "\n",
+      "Military dead:\n",
+      "Over 16,000,000\n",
+      "Civilian dead:\n",
+      "Over 45,000,000\n",
+      "Total dead:\n",
+      "Over 61,000,000\n",
+      "(1937–1945)\n",
+      "...further details\n",
+      "\n",
+      "\n",
+      "Military dead:\n",
+      "Over 8,000,000\n",
+      "Civilian dead:\n",
+      "Over 4,000,000\n",
+      "Total dead:\n",
+      "Over 12,000,000\n",
+      "(1937–1945)\n",
+      "...further details\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "overview=soup.find_all('table',class_='infobox vevent')\n",
+    "for z in overview:\n",
+    "    print(z.text)\n",
+    "    "
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

Web Scraping with BeautifulSoup.py ADDED Viewed

	@@ -0,0 +1,127 @@

+#!/usr/bin/env python
+# coding: utf-8
+#Requirements
+#pip3 install requests
+#pip3 install bs4
+#run in the browser also what are you doing with the help of chrome driver
+# ## Basic fundamentals of web scraping
+# import these two modules bs4 for selecting HTML tags easily
+from bs4 import BeautifulSoup
+# requests module is easy to operate some people use urllib but I prefer this one because it is easy to use.
+import requests
+from selenium import webdriver
+# I put here my own blog url ,you can change it.
+url="https://getpython.wordpress.com/"
+BASE_URL = "https://getpython.wordpress.com/"
+#Requests module use to data from given url
+source=requests.get(url)
+def get_chrome_web_driver(options):
+    return webdriver.Chrome("./chromedriver", chrome_options=options)
+def get_web_driver_options():
+    return webdriver.ChromeOptions()
+def set_ignore_certificate_error(options):
+    options.add_argument('--ignore-certificate-errors')
+def set_browser_as_incognito(options):
+    options.add_argument('--incognito')
+# BeautifulSoup is used for getting HTML structure from requests response.(craete your soup)
+soup=BeautifulSoup(source.text,'html')
+# Find function is used to find a single element if there are more than once it always returns the first element.
+title=soup.find('title') # place your html tagg in parentheses that you want to find from html.
+print("this is with html tags :",title)
+qwery=soup.find('h1') # here i find first h1 tagg in my website using find operation.
+#use .text for extract only text without any html tags
+print("this is without html tags:",qwery.text)
+links=soup.find('a') #i extarcted link using "a" tag
+print(links)
+# ## extarct data from innerhtml
+# here i extarcted href data from anchor tag.
+print(links['href'])
+##  or another way
+##extracting href(links) attribute and anchor(<a>) tag from page
+for a in soup.find_all('a', href=True):
+    print ( a['href'])
+for i in links:
+    print(i.text)
+# similarly i got class details from a anchor tag
+print(links['class'])
+# ## findall operation in Bs4
+# findall function is used to fetch all tags at a single time.
+many_link=soup.find_all('a') # here i extracted all the anchor tags of my website
+total_links=len(many_link) # len function is use to calculate length of your array
+print("total links in my website :",total_links)
+print()
+for i in many_link[:6]: # here i use slicing to fetch only first 6 links from rest of them.
+    print(i)
+second_link=many_link[1] #here i fetch second link which place on 1 index number in many_links.
+print(second_link)
+print()
+print("href is :",second_link['href']) #only href link is extracted from ancor tag
+# select div tag from second link
+nested_div=second_link.find('div')
+# As you can see div element extarcted , it also have inner elements
+print(nested_div)
+print()
+#here i extracted class element from div but it give us in the form of list
+z=(nested_div['class'])
+print(z)
+print(type(z))
+print()
+#  " " .join () method use to convert list type  into string type
+print("class name of div is :"," ".join(nested_div['class']))
+# ## scrap data from wikipedia
+wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II")
+soup=BeautifulSoup(wiki.text,'html')
+print(soup.find('title'))
+# ### find html tags with classes
+ww2_contents=soup.find_all("div",class_='toc')
+for i in ww2_contents:
+    print(i.text)
+overview=soup.find_all('table',class_='infobox vevent')
+for z in overview:
+    print(z.text)
+images=soup.find_all('img')
+images
+##or
+print(images)

readme.md ADDED Viewed

	@@ -0,0 +1,37 @@

+![web scraping with python](https://github.com/rajat4665/web-scraping-with-python/blob/master/WEB%20SCRAPING.jpg)
+<br>
+<span style="text-decoration: underline;"><strong>Introduction:</strong></span>
+<b>Web scraping</b>, <b>web harvesting</b>, or <b>web data extraction</b> is data scraping used for extracting data from websites using their HTML structure. In this post, I will explain the basic fundamentals of web scraping using Python and explore it through a live demonstration with two Python libraries: BeautifulSoup and requests.
+<span style="text-decoration: underline;"><strong>What you will learn from this post:</strong></span>
+<ul>
+	<li>basic understanding of web scraping</li>
+	<li>how to extract data from a website using classes and HTML tags</li>
+	<li>how to use requests module to get data</li>
+	<li>how to use Beautifulsoup</li>
+</ul>
+<span style="text-decoration: underline;"><strong>Requirements:</strong></span>
+<ul>
+	<li>python3</li>
+	<li>requests</li>
+	<li>bs4</li>
+</ul>
+<h3>Install required dependencies :</h3>
+<ul>
+	<li>clone or download it from <a href="https://github.com/rajat4665/web-scraping-with-python" target="_blank" rel="noopener">here</a></li>
+	<li>install requirements.txt file</li>
+	<li><code>pip install -r requirements.txt</code></li>
+ </ul>
+<h2> How to run this code</h2>
+<ul>
+	<li>there are two source code files: one has the .py extension and the other has the .ipynb extension</li>
+	<li>one can run the Web Scraping with BeautifulSoup.py file with Python by running this command in a terminal: <code>python3 "Web Scraping with BeautifulSoup.py"</code></li>
+	<li>one can run the Web Scraping with BeautifulSoup.ipynb file in Jupyter Notebook</li>
+	<li>one can install Jupyter Notebook with this command: <code>pip3 install jupyter</code></li>
+	<li>the CLI scraping tool is under development; only a beta version is available now</li>
+</ul>
+----------------------------------------------------------------------------------------
+<h1>HAPPY CODING</h1>

requirement.txt CHANGED Viewed

@@ -1,3 +1,24 @@
-python3
-requests
-bs4

+async-generator==1.10
+attrs==21.4.0
+beautifulsoup4==4.10.0
+beautifultable==1.0.1
+certifi==2021.10.8
+cffi==1.15.0
+charset-normalizer==2.0.12
+cryptography==36.0.1
+h11==0.13.0
+idna==3.3
+outcome==1.1.0
+pycparser==2.21
+pyOpenSSL==22.0.0
+PySocks==1.7.1
+requests==2.27.1
+selenium==4.1.2
+sniffio==1.2.0
+sortedcontainers==2.4.0
+soupsieve==2.3.1
+trio==0.20.0
+trio-websocket==0.9.2
+urllib3==1.26.8
+wcwidth==0.2.5
+wsproto==1.1.0

scrap wikipedia.png ADDED Viewed

scraped_data.json ADDED Viewed

The diff for this file is too large to render. See raw diff

web_scraping_command_line_tool.py ADDED Viewed

	@@ -0,0 +1,152 @@

+# import required modules
+import json
+import requests
+from datetime import datetime
+from urllib.parse import urlparse
+from bs4 import BeautifulSoup
+from beautifultable import BeautifulTable
+def load_json(database_json_file="scraped_data.json"):
+    """
+    This function will load json data from scraped_data.json file if it exist else crean an empty array
+    """
+    try:
+        with open(database_json_file, "r") as read_it:
+            all_data_base = json.loads(read_it.read())
+            return all_data_base
+    except:
+        all_data_base = dict()
+        return all_data_base
+def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
+    """
+    This function Save the scraped data in json format. scraped_data.json file if it exist else create it.
+    if file already exist you can view previous scraped data
+    """
+    file_obj =  open(database_json_file, "w")
+    file_obj.write(json.dumps(data))
+    file_obj.close()
+def existing_scraped_data_init(json_db):
+    """
+    This function init data from json file if it exist have data else create an empty one
+    """
+    scraped_data = json_db.get("scraped_data")
+    if scraped_data is None:
+        json_db['scraped_data'] = dict()
+    return None
+def scraped_time_is():
+    """
+    This function create time stamp for keep our book issue record trackable
+    """
+    now = datetime.now()
+    dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
+    return dt_string
+def process_url_request(website_url):
+    """
+    This function process provided URL get its data using requets module
+    and contrunct soup data using BeautifulSoup for scarping
+    """
+    requets_data = requests.get(website_url)
+    if requets_data.status_code == 200:
+        soup = BeautifulSoup(requets_data.text,'html')
+        return soup
+    return None
+def proccess_beautiful_soup_data(soup):
+    return {
+        'title': soup.find('title').text,
+        'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
+        'all_anchors': [str(i) for i in soup.find_all('a')],
+        'all_images_data': [ str(i) for i in soup.find_all('img')],
+        'all_images_source_data': [ i['src'] for i in soup.find_all('img')],
+        'all_h1_data': [i.text for i in soup.find_all('h1')],
+        'all_h2_data': [i.text for i in soup.find_all('h2')],
+        'all_h3_data': [i.text for i in soup.find_all('h3')],
+        'all_p_data': [i.text for i in soup.find_all('p')]
+    }
+# Here I used infinite loop because i don't want to run it again and again.
+while True:
+    print("""  ================ Welcome to this scraping program =============
+    ==>> press 1 for checking existing scraped websites
+    ==>> press 2 for scrap a single website
+    ==>> press 3 for exit
+    """)
+    choice = int(input("==>> Please enter your choice :"))
+    # Load json function called for fetching/creating data from json file.
+    local_json_db = load_json()
+    existing_scraped_data_init(local_json_db)
+    if choice == 1:
+        # I used Beautiful table for presenting scraped data in a good way !!
+        # you guys can read more about from this link https://beautifultable.readthedocs.io/en/latest/index.html
+        scraped_websites_table = BeautifulTable()
+        scraped_websites_table.columns.header = ["Sr no.", "Allias name ", "Website domain", "title",   "Scraped at", "Status"]
+        scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
+        local_json_db = load_json()
+        for count,  data in enumerate(local_json_db['scraped_data']):
+           scraped_websites_table.rows.append([count + 1,
+                            local_json_db['scraped_data'][data]['alias'],
+                            local_json_db['scraped_data'][data]['domain'],
+                            local_json_db['scraped_data'][data]['title'],
+                            local_json_db['scraped_data'][data]['scraped_at'],
+                            local_json_db['scraped_data'][data]['status']])
+        # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
+        if not local_json_db['scraped_data']:
+            print('===> No existing data found !!!')
+        print(scraped_websites_table)
+    elif choice == 2:
+        print()
+        url_for_scrap = input("===> Please enter url you want to scrap:")
+        is_accessable = process_url_request(url_for_scrap)
+        if is_accessable:
+            scraped_data_packet = proccess_beautiful_soup_data(is_accessable)
+            print()
+            print(' =====> Data scraped successfully !!!')
+            key_for_storing_data = input("enter alias name for saving scraped data :")
+            scraped_data_packet['url'] = url_for_scrap
+            scraped_data_packet['name'] = key_for_storing_data
+            scraped_data_packet['scraped_at'] = scraped_time_is()
+            if key_for_storing_data in  local_json_db['scraped_data']:
+                key_for_storing_data = key_for_storing_data + str(scraped_time_is())
+                print("Provided key is already exist so data stored as : {}".format(key_for_storing_data))
+            scraped_data_packet['alias'] = key_for_storing_data
+            scraped_data_packet['status'] = True
+            scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
+            local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
+            print(
+                'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
+            )
+            save_scraped_data_in_json(local_json_db)
+            # load data
+            local_json_db = load_json()
+            print(' =====> Data saved successfully !!!')
+            print()
+    elif choice == 3:
+        print('Thank you for using !!!')
+        break
+    elif choice == 4:
+        print('Thank you for using !!!')
+        break
+    else:
+        print("enter a valid choice ")