Bloodlyghoul committed on
Commit 0ade82b · verified · 1 Parent(s): ec981c6

Upload 8 files

WEB SCRAPING.jpg ADDED
Web Scraping with BeautifulSoup.ipynb ADDED
@@ -0,0 +1,400 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Requirements\n",
+ "# pip3 install requests\n",
+ "# pip3 install bs4"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Basic fundamentals of web scraping"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "this is with html tags : <title>Easy Python – A programming language of revolution</title>\n",
+ "this is without html tags: Easy Python\n",
+ "<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Import these two modules; bs4 makes it easy to select HTML tags.\n",
+ "from bs4 import BeautifulSoup\n",
+ "# The requests module is simple to use; some people use urllib, but I prefer requests.\n",
+ "import requests\n",
+ "\n",
+ "# I used my own blog URL here; you can change it.\n",
+ "url=\"https://getpython.wordpress.com/\"\n",
+ "\n",
+ "# requests fetches the page at the given URL.\n",
+ "source=requests.get(url)\n",
+ "\n",
+ "# BeautifulSoup parses the HTML structure of the response (create your soup).\n",
+ "soup=BeautifulSoup(source.text,'html.parser')\n",
+ "\n",
+ "# find() returns a single element; if there is more than one match, it returns the first one.\n",
+ "title=soup.find('title') # pass the HTML tag you want to find.\n",
+ "print(\"this is with html tags :\",title)\n",
+ "\n",
+ "qwery=soup.find('h1') # find the first h1 tag on the page.\n",
+ "\n",
+ "# use .text to extract only the text, without any HTML tags\n",
+ "print(\"this is without html tags:\",qwery.text) \n",
+ "\n",
+ "\n",
+ "links=soup.find('a') # extract the first link using the \"a\" tag\n",
+ "print(links)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Extract data from inner HTML"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 41,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "#content\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Extract the href attribute from the anchor tag.\n",
+ "print(links['href']) "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 42,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['screen-reader-text', 'skip-link']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Similarly, get the class attribute from the anchor tag.\n",
+ "print(links['class'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## find_all operation in bs4"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 51,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "total links in my website : 37\n",
+ "\n",
+ "<a class=\"screen-reader-text skip-link\" href=\"#content\">Skip to content</a>\n",
+ "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
+ "<div class=\"cover\"></div>\n",
+ "</a>\n",
+ "<a class=\"screen-reader-text search-toggle\" href=\"#search-container\">Search</a>\n",
+ "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">Easy Python</a>\n",
+ "<a aria-current=\"page\" href=\"/\">Home</a>\n",
+ "<a href=\"https://getpython.wordpress.com/contact/\">Contact</a>\n"
+ ]
+ }
+ ],
+ "source": [
+ "# find_all() fetches every matching tag at once.\n",
+ "many_link=soup.find_all('a') # extract all the anchor tags on the page\n",
+ "total_links=len(many_link) # len() gives the number of results\n",
+ "print(\"total links in my website :\",total_links)\n",
+ "print()\n",
+ "for i in many_link[:6]: # use slicing to show only the first 6 links\n",
+ "    print(i)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<a href=\"https://getpython.wordpress.com/\" rel=\"home\">\n",
+ "<div class=\"cover\"></div>\n",
+ "</a>\n",
+ "\n",
+ "href is : https://getpython.wordpress.com/\n"
+ ]
+ }
+ ],
+ "source": [
+ "second_link=many_link[1] # the second link sits at index 1 of many_link\n",
+ "print(second_link)\n",
+ "print()\n",
+ "print(\"href is :\",second_link['href']) # only the href is extracted from the anchor tag\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<div class=\"cover\"></div>\n",
+ "\n",
+ "['cover']\n",
+ "<class 'list'>\n",
+ "\n",
+ "class name of div is : cover\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Select the div tag inside the second link.\n",
+ "nested_div=second_link.find('div')\n",
+ "# The extracted div element can itself contain inner elements.\n",
+ "print(nested_div)\n",
+ "print()\n",
+ "# The class attribute of the div comes back as a list.\n",
+ "z=(nested_div['class'])\n",
+ "print(z)\n",
+ "print(type(z))\n",
+ "print()\n",
+ "# \" \".join() converts the list into a string.\n",
+ "print(\"class name of div is :\",\" \".join(nested_div['class'])) "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Scrape data from Wikipedia"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "<title>World War II - Wikipedia</title>\n"
+ ]
+ }
+ ],
+ "source": [
+ "wiki=requests.get(\"https://en.wikipedia.org/wiki/World_War_II\")\n",
+ "soup=BeautifulSoup(wiki.text,'html.parser')\n",
+ "print(soup.find('title'))\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Find HTML tags with classes"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Contents\n",
+ "\n",
+ "1 Chronology\n",
+ "2 Background\n",
+ "\n",
+ "2.1 Europe\n",
+ "2.2 Asia\n",
+ "\n",
+ "\n",
+ "3 Pre-war events\n",
+ "\n",
+ "3.1 Italian invasion of Ethiopia (1935)\n",
+ "3.2 Spanish Civil War (1936–1939)\n",
+ "3.3 Japanese invasion of China (1937)\n",
+ "3.4 Soviet–Japanese border conflicts\n",
+ "3.5 European occupations and agreements\n",
+ "\n",
+ "\n",
+ "4 Course of the war\n",
+ "\n",
+ "4.1 War breaks out in Europe (1939–40)\n",
+ "4.2 Western Europe (1940–41)\n",
+ "4.3 Mediterranean (1940–41)\n",
+ "4.4 Axis attack on the Soviet Union (1941)\n",
+ "4.5 War breaks out in the Pacific (1941)\n",
+ "4.6 Axis advance stalls (1942–43)\n",
+ "\n",
+ "4.6.1 Pacific (1942–43)\n",
+ "4.6.2 Eastern Front (1942–43)\n",
+ "4.6.3 Western Europe/Atlantic and Mediterranean (1942–43)\n",
+ "\n",
+ "\n",
+ "4.7 Allies gain momentum (1943–44)\n",
+ "4.8 Allies close in (1944)\n",
+ "4.9 Axis collapse, Allied victory (1944–45)\n",
+ "\n",
+ "\n",
+ "5 Aftermath\n",
+ "6 Impact\n",
+ "\n",
+ "6.1 Casualties and war crimes\n",
+ "6.2 Genocide, concentration camps, and slave labour\n",
+ "6.3 Occupation\n",
+ "6.4 Home fronts and production\n",
+ "6.5 Advances in technology and warfare\n",
+ "\n",
+ "\n",
+ "7 See also\n",
+ "8 Notes\n",
+ "9 Citations\n",
+ "10 References\n",
+ "11 External links\n",
+ "\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "ww2_contents=soup.find_all(\"div\",class_='toc')\n",
+ "for i in ww2_contents:\n",
+ "    print(i.text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "World War II(clockwise from top left)\n",
+ "Chinese forces in the Battle of Wanjialing\n",
+ "Australian 25-pounder guns during the First Battle of El Alamein\n",
+ "German Stuka dive bombers on the Eastern Front in December 1943\n",
+ "American naval force in the Lingayen Gulf\n",
+ "Wilhelm Keitel signing the German Instrument of Surrender\n",
+ "Soviet troops in the Battle of Stalingrad\n",
+ "Date1 September 1939 – 2 September 1945 (1939-09-01 – 1945-09-02)(6 years and 1 day)[a]LocationEurope, Pacific, Atlantic, South-East Asia, China, Middle East, Mediterranean, North Africa, Horn of Africa, Australia, briefly North and South AmericaResult\n",
+ "Allied victory\n",
+ "Collapse of Nazi Germany\n",
+ "Fall of the Japanese and Italian Empires\n",
+ "Beginning of the Nuclear Age\n",
+ "Dissolution of the League of Nations\n",
+ "Creation of the United Nations\n",
+ "Emergence of the United States and the Soviet Union as rival superpowers\n",
+ "Beginning of the Cold War (more...)Participants\n",
+ "Allies\n",
+ "AxisCommanders and leaders\n",
+ "Main Allied leaders\n",
+ " Joseph Stalin\n",
+ " Franklin D. Roosevelt\n",
+ " Winston Churchill\n",
+ " Chiang Kai-shek\n",
+ "\n",
+ "Main Axis leaders\n",
+ " Adolf Hitler\n",
+ " Hirohito\n",
+ " Benito Mussolini\n",
+ "Casualties and losses\n",
+ "\n",
+ "Military dead:\n",
+ "Over 16,000,000\n",
+ "Civilian dead:\n",
+ "Over 45,000,000\n",
+ "Total dead:\n",
+ "Over 61,000,000\n",
+ "(1937–1945)\n",
+ "...further details\n",
+ "\n",
+ "\n",
+ "Military dead:\n",
+ "Over 8,000,000\n",
+ "Civilian dead:\n",
+ "Over 4,000,000\n",
+ "Total dead:\n",
+ "Over 12,000,000\n",
+ "(1937–1945)\n",
+ "...further details\n",
+ "\n"
+ ]
+ }
+ ],
+ "source": [
+ "overview=soup.find_all('table',class_='infobox vevent')\n",
+ "for z in overview:\n",
+ "    print(z.text)\n",
+ "    "
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.5.2"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
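The notebook above reduces to one pattern: request a page, build a soup, then combine find/find_all with dictionary-style attribute access. The standalone sketch below restates that pattern; it is illustrative only — the helper name collect_links, the timeout, and the explicit 'html.parser' choice are my additions, not part of the committed notebook.

```python
# A minimal sketch of the notebook's request -> soup -> find_all pattern (assumes the URL is reachable).
import requests
from bs4 import BeautifulSoup


def collect_links(url):
    """Return (text, href) pairs for every anchor tag that carries an href."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()                       # fail loudly on HTTP errors
    soup = BeautifulSoup(response.text, "html.parser")
    return [(a.get_text(strip=True), a["href"])       # get_text() drops nested tags
            for a in soup.find_all("a", href=True)]   # href=True skips anchors without links


if __name__ == "__main__":
    for text, href in collect_links("https://getpython.wordpress.com/")[:6]:
        print(f"{text!r} -> {href}")
```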
Web Scraping with BeautifulSoup.py ADDED
@@ -0,0 +1,127 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ # Requirements
+ # pip3 install requests
+ # pip3 install bs4
+
+ # Optionally, the same steps can be run in a real browser via Selenium and ChromeDriver (helpers below).
+
+ # ## Basic fundamentals of web scraping
+
+ # Import these two modules; bs4 makes it easy to select HTML tags.
+ from bs4 import BeautifulSoup
+ # The requests module is simple to use; some people use urllib, but I prefer requests.
+ import requests
+ from selenium import webdriver
+
+ # I used my own blog URL here; you can change it.
+ url="https://getpython.wordpress.com/"
+ BASE_URL = "https://getpython.wordpress.com/"
+ # requests fetches the page at the given URL.
+ source=requests.get(url)
+
+
+ def get_chrome_web_driver(options):
+     return webdriver.Chrome("./chromedriver", options=options)
+
+
+ def get_web_driver_options():
+     return webdriver.ChromeOptions()
+
+
+ def set_ignore_certificate_error(options):
+     options.add_argument('--ignore-certificate-errors')
+
+
+ def set_browser_as_incognito(options):
+     options.add_argument('--incognito')
+
+ # BeautifulSoup parses the HTML structure of the response (create your soup).
+ soup=BeautifulSoup(source.text,'html.parser')
+
+ # find() returns a single element; if there is more than one match, it returns the first one.
+ title=soup.find('title') # pass the HTML tag you want to find.
+ print("this is with html tags :",title)
+
+ qwery=soup.find('h1') # find the first h1 tag on the page.
+
+ # use .text to extract only the text, without any HTML tags
+ print("this is without html tags:",qwery.text)
+
+
+ links=soup.find('a') # extract the first link using the "a" tag
+ print(links)
+
+
+ # ## Extract data from inner HTML
+
+ # Extract the href attribute from the anchor tag.
+ print(links['href'])
+
+ # Or, another way:
+ # extract the href attribute from every anchor (<a>) tag on the page
+ for a in soup.find_all('a', href=True):
+     print(a['href'])
+
+ for i in links:  # iterating a Tag yields its children (here, the link's text)
+     print(i.text)
+
+ # Similarly, get the class attribute from the anchor tag.
+ print(links['class'])
+
+
+ # ## find_all operation in bs4
+
+ # find_all() fetches every matching tag at once.
+ many_link=soup.find_all('a') # extract all the anchor tags on the page
+ total_links=len(many_link) # len() gives the number of results
+ print("total links in my website :",total_links)
+ print()
+ for i in many_link[:6]: # use slicing to show only the first 6 links
+     print(i)
+
+ second_link=many_link[1] # the second link sits at index 1 of many_link
+ print(second_link)
+ print()
+ print("href is :",second_link['href']) # only the href is extracted from the anchor tag
+
+
+ # Select the div tag inside the second link.
+ nested_div=second_link.find('div')
+ # The extracted div element can itself contain inner elements.
+ print(nested_div)
+ print()
+ # The class attribute of the div comes back as a list.
+ z=(nested_div['class'])
+ print(z)
+ print(type(z))
+ print()
+ # " ".join() converts the list into a string.
+ print("class name of div is :"," ".join(nested_div['class']))
+
+
+ # ## Scrape data from Wikipedia
+
+ wiki=requests.get("https://en.wikipedia.org/wiki/World_War_II")
+ soup=BeautifulSoup(wiki.text,'html.parser')
+ print(soup.find('title'))
+
+
+ # ### Find HTML tags with classes
+
+ ww2_contents=soup.find_all("div",class_='toc')
+ for i in ww2_contents:
+     print(i.text)
+
+
+ overview=soup.find_all('table',class_='infobox vevent')
+ for z in overview:
+     print(z.text)
+
+ images=soup.find_all('img')
+
+ images  # a bare expression only displays output in a notebook
+ # or, in a script:
+ print(images)
+
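The script defines Selenium helpers (get_web_driver_options, set_browser_as_incognito, set_ignore_certificate_error, get_chrome_web_driver) but never calls them. Below is a hedged sketch of how they could be wired together; it assumes a chromedriver binary at ./chromedriver, as the script does, and uses the Selenium 4 Service API rather than the positional path.

```python
# Illustrative only: drive a real browser and hand the rendered HTML to BeautifulSoup.
# Assumes a chromedriver binary at ./chromedriver, matching the script's own assumption.
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

options = webdriver.ChromeOptions()
options.add_argument("--incognito")                  # what set_browser_as_incognito() adds
options.add_argument("--ignore-certificate-errors")  # what set_ignore_certificate_error() adds

driver = webdriver.Chrome(service=Service("./chromedriver"), options=options)
try:
    driver.get("https://getpython.wordpress.com/")
    soup = BeautifulSoup(driver.page_source, "html.parser")
    print(soup.find("title"))
finally:
    driver.quit()  # always release the browser process
```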
readme.md ADDED
@@ -0,0 +1,37 @@
+ ![web scraping with python](https://github.com/rajat4665/web-scraping-with-python/blob/master/WEB%20SCRAPING.jpg)
+ <br>
+ <span style="text-decoration: underline;"><strong>Introduction:</strong></span>
+
+ <b>Web scraping</b>, <b>web harvesting</b>, or <b>web data extraction</b> is data scraping used to extract data from websites through their HTML structure. In this post, I will explain the basic fundamentals of web scraping with Python and demonstrate them live with two Python libraries, BeautifulSoup and requests.
+
+ <span style="text-decoration: underline;"><strong>What you will learn from this post:</strong></span>
+ <ul>
+ <li>A basic understanding of web scraping</li>
+ <li>How to extract data from a website using classes and HTML tags</li>
+ <li>How to use the requests module to get data</li>
+ <li>How to use BeautifulSoup</li>
+ </ul>
+ <span style="text-decoration: underline;"><strong>Requirements:</strong></span>
+ <ul>
+ <li>python3</li>
+ <li>requests</li>
+ <li>bs4</li>
+ </ul>
+ <h3>Install required dependencies:</h3>
+ <ul>
+ <li>Clone or download the repository from <a href="https://github.com/rajat4665/web-scraping-with-python" target="_blank" rel="noopener">here</a></li>
+ <li>Install the packages listed in requirements.txt</li>
+ <li><code>pip install -r requirements.txt</code></li>
+
+ </ul>
+
+ <h2>How to run this code</h2>
+ <ul>
+ <li>There are two source files: one with a .py extension and one with an .ipynb extension</li>
+ <li>Run the Python script from a terminal with <code>python3 "Web Scraping with BeautifulSoup.py"</code></li>
+ <li>Run the Web Scraping with BeautifulSoup.ipynb file in Jupyter Notebook</li>
+ <li>Install Jupyter Notebook with <code>pip3 install jupyter</code></li>
+ <li>A CLI scraping tool is under development; only a beta version is available for now</li>
+ </ul>
+ ----------------------------------------------------------------------------------------
+ <h1>HAPPY CODING</h1>
requirement.txt CHANGED
@@ -1,3 +1,24 @@
- python3
- requests
- bs4
+ async-generator==1.10
+ attrs==21.4.0
+ beautifulsoup4==4.10.0
+ beautifultable==1.0.1
+ certifi==2021.10.8
+ cffi==1.15.0
+ charset-normalizer==2.0.12
+ cryptography==36.0.1
+ h11==0.13.0
+ idna==3.3
+ outcome==1.1.0
+ pycparser==2.21
+ pyOpenSSL==22.0.0
+ PySocks==1.7.1
+ requests==2.27.1
+ selenium==4.1.2
+ sniffio==1.2.0
+ sortedcontainers==2.4.0
+ soupsieve==2.3.1
+ trio==0.20.0
+ trio-websocket==0.9.2
+ urllib3==1.26.8
+ wcwidth==0.2.5
+ wsproto==1.1.0
scrap wikipedia.png ADDED
scraped_data.json ADDED
The diff for this file is too large to render. See raw diff
 
web_scraping_command_line_tool.py ADDED
@@ -0,0 +1,152 @@
+ # import required modules
+ import json
+ import requests
+ from datetime import datetime
+ from urllib.parse import urlparse
+ from bs4 import BeautifulSoup
+ from beautifultable import BeautifulTable
+
+
+
+ def load_json(database_json_file="scraped_data.json"):
+     """
+     Load JSON data from scraped_data.json if the file exists; otherwise return an empty dict.
+     """
+     try:
+         with open(database_json_file, "r") as read_it:
+             all_data_base = json.loads(read_it.read())
+             return all_data_base
+     except (FileNotFoundError, json.JSONDecodeError):
+         all_data_base = dict()
+         return all_data_base
+
+
+ def save_scraped_data_in_json(data, database_json_file="scraped_data.json"):
+     """
+     Save the scraped data as JSON in scraped_data.json, creating the file if it does not exist.
+     The caller passes the full database, so previously scraped data is preserved.
+     """
+     # A context manager closes the file handle even if the write fails.
+     with open(database_json_file, "w") as file_obj:
+         file_obj.write(json.dumps(data))
+
+
+ def existing_scraped_data_init(json_db):
+     """
+     Initialise the 'scraped_data' key in the JSON database if it is not there yet.
+     """
+     scraped_data = json_db.get("scraped_data")
+     if scraped_data is None:
+         json_db['scraped_data'] = dict()
+
+     return None
+
+
+ def scraped_time_is():
+     """
+     Create a timestamp so every scrape record stays traceable.
+     """
+     now = datetime.now()
+     dt_string = now.strftime("%d/%m/%Y %H:%M:%S")
+     return dt_string
+
+ def process_url_request(website_url):
+     """
+     Request the provided URL with the requests module and
+     construct a BeautifulSoup object for scraping.
+     """
+     requets_data = requests.get(website_url)
+     if requets_data.status_code == 200:
+         soup = BeautifulSoup(requets_data.text,'html.parser')
+         return soup
+     return None
+
+ def proccess_beautiful_soup_data(soup):
+     return {
+         'title': soup.find('title').text,
+         'all_anchor_href': [i['href'] for i in soup.find_all('a', href=True)],
+         'all_anchors': [str(i) for i in soup.find_all('a')],
+         'all_images_data': [str(i) for i in soup.find_all('img')],
+         'all_images_source_data': [i['src'] for i in soup.find_all('img')],
+         'all_h1_data': [i.text for i in soup.find_all('h1')],
+         'all_h2_data': [i.text for i in soup.find_all('h2')],
+         'all_h3_data': [i.text for i in soup.find_all('h3')],
+         'all_p_data': [i.text for i in soup.find_all('p')]
+     }
+
+
+
+ # An infinite loop keeps the menu running until the user chooses to exit.
+ while True:
+
+     print(""" ================ Welcome to this scraping program =============
+     ==>> press 1 to list the already scraped websites
+     ==>> press 2 to scrape a single website
+     ==>> press 3 to exit
+     """)
+
+     choice = int(input("==>> Please enter your choice: "))
+
+     # load_json() fetches (or creates) the data from the JSON file.
+     local_json_db = load_json()
+     existing_scraped_data_init(local_json_db)
+
+     if choice == 1:
+         # BeautifulTable presents the scraped data as a readable table.
+         # Read more at https://beautifultable.readthedocs.io/en/latest/index.html
+         scraped_websites_table = BeautifulTable()
+         scraped_websites_table.columns.header = ["Sr no.", "Alias name", "Website domain", "Title", "Scraped at", "Status"]
+         scraped_websites_table.set_style(BeautifulTable.STYLE_BOX_DOUBLED)
+
+
+         local_json_db = load_json()
+         for count, data in enumerate(local_json_db['scraped_data']):
+             scraped_websites_table.rows.append([count + 1,
+                 local_json_db['scraped_data'][data]['alias'],
+                 local_json_db['scraped_data'][data]['domain'],
+                 local_json_db['scraped_data'][data]['title'],
+                 local_json_db['scraped_data'][data]['scraped_at'],
+                 local_json_db['scraped_data'][data]['status']])
+         # all_scraped_websites = [websites['name'] for websites in local_json_db['scraped_data']]
+         if not local_json_db['scraped_data']:
+             print('===> No existing data found !!!')
+         print(scraped_websites_table)
+
+     elif choice == 2:
+         print()
+         url_for_scrap = input("===> Please enter the URL you want to scrape: ")
+         is_accessable = process_url_request(url_for_scrap)
+         if is_accessable:
+             scraped_data_packet = proccess_beautiful_soup_data(is_accessable)
+             print()
+             print(' =====> Data scraped successfully !!!')
+             key_for_storing_data = input("Enter an alias name for saving the scraped data: ")
+             scraped_data_packet['url'] = url_for_scrap
+             scraped_data_packet['name'] = key_for_storing_data
+             scraped_data_packet['scraped_at'] = scraped_time_is()
+             if key_for_storing_data in local_json_db['scraped_data']:
+                 key_for_storing_data = key_for_storing_data + str(scraped_time_is())
+                 print("The provided key already exists, so the data is stored as: {}".format(key_for_storing_data))
+             scraped_data_packet['alias'] = key_for_storing_data
+             scraped_data_packet['status'] = True
+             scraped_data_packet['domain'] = urlparse(url_for_scrap).netloc
+
+             local_json_db['scraped_data'][key_for_storing_data] = scraped_data_packet
+             print(
+                 'scraped data is:', local_json_db['scraped_data'][key_for_storing_data]
+             )
+             save_scraped_data_in_json(local_json_db)
+             # reload the data so the in-memory copy matches the file
+             local_json_db = load_json()
+             print(' =====> Data saved successfully !!!')
+             print()
+     elif choice == 3:
+         print('Thank you for using this tool !!!')
+         break
+
+     elif choice == 4:  # also exits (option not shown in the menu)
+         print('Thank you for using this tool !!!')
+         break
+
+     else:
+         print("Please enter a valid choice.")
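Each entry the tool stores in scraped_data.json is the dict returned by proccess_beautiful_soup_data plus the url, name, scraped_at, alias, status, and domain keys added in the choice-2 branch. Here is a small sketch of reading that file back outside the interactive menu; the layout is inferred from the code above, not an official schema.

```python
# Illustrative only: inspect scraped_data.json without the interactive menu.
import json

with open("scraped_data.json") as fh:
    db = json.load(fh)

for alias, entry in db.get("scraped_data", {}).items():
    # keys mirror what the tool stores: domain, title, scraped_at, all_anchor_href, ...
    print(alias, "|", entry["domain"], "|", entry["title"], "|", entry["scraped_at"])
    print("  links found:", len(entry["all_anchor_href"]))
```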