ravi6389 committed
Commit
dfbc8dd
1 Parent(s): 0d53f3d

Upload folder using huggingface_hub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the complete change set.
Files changed (50)
  1. .gitattributes +1 -0
  2. Hands-on-WebScraping-master.zip +3 -0
  3. Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore +241 -0
  4. Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE +21 -0
  5. Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md +2 -0
  6. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md +118 -0
  7. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py +0 -0
  8. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py +14 -0
  9. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py +103 -0
  10. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py +11 -0
  11. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py +90 -0
  12. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py +4 -0
  13. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py +112 -0
  14. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv +2 -0
  15. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt +2 -0
  16. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv +12 -0
  17. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg +11 -0
  18. Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py +67 -0
  19. Hands-on-WebScraping/.gitignore +241 -0
  20. Hands-on-WebScraping/LICENSE +21 -0
  21. Hands-on-WebScraping/README.md +2 -0
  22. Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md +118 -0
  23. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py +0 -0
  24. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc +0 -0
  25. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc +0 -0
  26. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py +14 -0
  27. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py +103 -0
  28. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py +11 -0
  29. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py +90 -0
  30. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py +4 -0
  31. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc +0 -0
  32. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc +0 -0
  33. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc +0 -0
  34. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc +0 -0
  35. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py +118 -0
  36. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py +121 -0
  37. Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py +142 -0
  38. Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc +0 -0
  39. Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv +0 -0
  40. Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv +0 -0
  41. Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv +1 -0
  42. Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt +2 -0
  43. Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv +12 -0
  44. Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg +11 -0
  45. Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py +67 -0
  46. README.md +2 -8
  47. __pycache__/test.cpython-39.pyc +0 -0
  48. __pycache__/twitter_crawl.cpython-310.pyc +0 -0
  49. requirements.txt +4 -0
  50. scrapper.ipynb +168 -0
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe filter=lfs diff=lfs merge=lfs -text
Hands-on-WebScraping-master.zip ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:8634d80c9a416f3346c02ce5b6f25f96fda953cc24212a45f461a510d145f04c
+ size 15838
Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore ADDED
@@ -0,0 +1,241 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+
132
+
133
+
134
+ # Logs
135
+ logs
136
+ *.log
137
+ npm-debug.log*
138
+ yarn-debug.log*
139
+ yarn-error.log*
140
+ lerna-debug.log*
141
+
142
+ # Diagnostic reports (https://nodejs.org/api/report.html)
143
+ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
144
+
145
+ # Runtime data
146
+ pids
147
+ *.pid
148
+ *.seed
149
+ *.pid.lock
150
+
151
+ # Directory for instrumented libs generated by jscoverage/JSCover
152
+ lib-cov
153
+
154
+ # Coverage directory used by tools like istanbul
155
+ coverage
156
+ *.lcov
157
+
158
+ # nyc test coverage
159
+ .nyc_output
160
+
161
+ # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
162
+ .grunt
163
+
164
+ # Bower dependency directory (https://bower.io/)
165
+ bower_components
166
+
167
+ # node-waf configuration
168
+ .lock-wscript
169
+
170
+ # Compiled binary addons (https://nodejs.org/api/addons.html)
171
+ build/Release
172
+
173
+ # Dependency directories
174
+ node_modules/
175
+ jspm_packages/
176
+
177
+ # TypeScript v1 declaration files
178
+ typings/
179
+
180
+ # TypeScript cache
181
+ *.tsbuildinfo
182
+
183
+ # Optional npm cache directory
184
+ .npm
185
+
186
+ # Optional eslint cache
187
+ .eslintcache
188
+
189
+ # Microbundle cache
190
+ .rpt2_cache/
191
+ .rts2_cache_cjs/
192
+ .rts2_cache_es/
193
+ .rts2_cache_umd/
194
+
195
+ # Optional REPL history
196
+ .node_repl_history
197
+
198
+ # Output of 'npm pack'
199
+ *.tgz
200
+
201
+ # Yarn Integrity file
202
+ .yarn-integrity
203
+
204
+ # dotenv environment variables file
205
+ .env
206
+ .env.test
207
+
208
+ # parcel-bundler cache (https://parceljs.org/)
209
+ .cache
210
+
211
+ # Next.js build output
212
+ .next
213
+
214
+ # Nuxt.js build / generate output
215
+ .nuxt
216
+ dist
217
+
218
+ # Gatsby files
219
+ .cache/
220
+ # Comment in the public line in if your project uses Gatsby and not Next.js
221
+ # https://nextjs.org/blog/next-9-1#public-directory-support
222
+ # public
223
+
224
+ # vuepress build output
225
+ .vuepress/dist
226
+
227
+ # Serverless directories
228
+ .serverless/
229
+
230
+ # FuseBox cache
231
+ .fusebox/
232
+
233
+ # DynamoDB Local files
234
+ .dynamodb/
235
+
236
+ # TernJS port file
237
+ .tern-port
238
+
239
+ # Stores VSCode versions used for testing VSCode extensions
240
+ .vscode-test
241
+
Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Amit Upreti
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # Hands-on-WebScraping (NO LONGER MAINTAINED)
2
+ This repo is part of a blog series on several web scraping projects, where we explore scraping techniques for crawling data from simple websites through to websites using advanced protection.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md ADDED
@@ -0,0 +1,118 @@
1
+ #### Deprecated. No longer maintained.
2
+
3
+ # Twitter Hashtag crawler
4
+ > A fast, unofficial Twitter crawler that collects tweets via hashtag search.
5
+
6
+ > Notice: This crawler is meant for collecting data purely for academic and research purposes. I am not responsible for any legal issues that might arise from unintended use of this crawler.
7
+
8
+ [![Python 3](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
9
+ [![twitter crawler](https://img.shields.io/badge/twittercrawler-1.0-green)](https://github.com/amitupreti/Hands-on-WebScraping/tree/master/project1_twitter_hashtag_crawler)
10
+
11
+ This crawler is written in Python using Scrapy. The logic is straightforward: we send GET requests to the mobile version of Twitter (mobile.twitter.com) to collect the list of tweets, then send GET requests to the web version to parse each tweet's details.
12
+ ![](header.png)
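The two-step flow described above can be sketched as a minimal Scrapy spider. This is only a simplified illustration, not the project's spider (the real implementation is `TwitterHashTagCrawler/spiders/hashtag.py`, shown further down); the hashtag and tweet id are placeholders.

```py
import scrapy

class TwoStepSketch(scrapy.Spider):
    name = 'twostep_sketch'  # illustrative name only

    def start_requests(self):
        # Step 1: the mobile site lists tweets for a hashtag
        yield scrapy.Request('https://mobile.twitter.com/hashtag/cats',
                             callback=self.find_tweets)

    def find_tweets(self, response):
        # Step 2: each tweet's permalink on the desktop site carries the details
        for tweet_id in ['1212269072328491008']:  # placeholder id
            yield scrapy.Request('https://twitter.com/anyuser/status/' + tweet_id,
                                 callback=self.parse_tweet)

    def parse_tweet(self, response):
        yield {'twitter_url': response.url}
```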
13
+
14
+ ## Installation
15
+
16
+ OS X & Linux:
17
+
18
+ 1. Download the project
19
+
20
+ ```sh
21
+ git clone https://github.com/amitupreti/Hands-on-WebScraping
22
+
23
+ cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
24
+ ```
25
+ 2. Install the dependencies
26
+
27
+ ```sh
28
+ pip install -r requirements.txt --user
29
+ ```
30
+
31
+ 3. Verify the crawler spider exists
32
+
33
+ ```sh
34
+ scrapy list
35
+ ```
36
+ If you see `twittercrawler`, then you are all set.
37
+
38
+
39
+ Windows:
40
+ 1. Install [python3](https://www.python.org/downloads/) if you haven't already
41
+ 2. Download the project. https://github.com/amitupreti/Hands-on-WebScraping/archive/master.zip
42
+ 3. Extract the project
43
+ 4. Open cmd and navigate inside the project directory
44
+ ```sh
45
+ cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
46
+ ```
47
+ 5. Follow steps 2 and 3 from the OS X & Linux installation
48
+
49
+
50
+
51
+ ## Usage example
52
+
53
+ 1. Put the hashtags in a CSV file, one per line. For example, I have included `myhashtags.csv` as a sample.
54
+
55
+ ![Hashtags file](https://i.paste.pics/225079df0d3dc27d66430b1553b2ac39.png)
56
+
57
+ 2. Run the crawler with your hashtag file and the desired [output format](https://docs.scrapy.org/en/latest/topics/feed-exports.html) (JSON, JSON Lines, CSV, or XML)
58
+
59
+ * For CSV
60
+ ```sh
61
+ scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv
62
+
63
+ ```
64
+
65
+ * For JSON
66
+ ```sh
67
+ scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json
68
+
69
+ ```
70
+ ![sample images](https://i.paste.pics/4a5826a6a090522e5326bb11838258df.png)
71
+ ![sample images](https://i.paste.pics/68a64bab743150e00af4cd9eea9af8dc.png)
72
+
73
+
74
+ ### Speeding up the crawls
75
+ If the crawler feels a little slow, find the hashtag.py file in the project and edit the custom settings.
76
+ ```py
77
+ custom_settings = {
78
+ 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
79
+ 'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 1, 'LOG_LEVEL': 'INFO'}
80
+ ```
81
+ > Here CONCURRENT_REQUESTS is the number of URLs processed in parallel and DOWNLOAD_DELAY is the wait between requests. So, increase CONCURRENT_REQUESTS and decrease DOWNLOAD_DELAY (the minimum download delay is 0).
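For example, a more aggressive configuration might look like the following; the numbers are illustrative, not recommendations from the project:

```py
custom_settings = {
    'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
    'CONCURRENT_REQUESTS': 16,  # more URLs in flight at once
    'DOWNLOAD_DELAY': 0,        # no pause between requests
    'LOG_LEVEL': 'INFO',
}
```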
82
+
83
+
84
+ ## Data Columns
85
+ * username
86
+ * full_name
87
+ * twitter_url
88
+ * tweet_text
89
+ * tweet_time
90
+ * number_of_likes
91
+ * no_of_retweets
92
+ * no_of_replies
93
+ * mentions
94
+ * no_of_mentions
95
+ * hashtags
96
+ * no_of_hashtags
97
+ * call_to_action
98
+ * image_url
99
+
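For reference, a single record yielded by the spider carries exactly these fields; the values below are invented placeholders:

```py
# One scraped item as yielded by parse_tweet() (placeholder values)
{
    'username': 'someuser',
    'full_name': 'Some User',
    'twitter_url': 'https://twitter.com/anyuser/status/1212269072328491008',
    'tweet_text': 'Example tweet text #cats',
    'tweet_time': '2019-12-31 23:07:00',
    'number_of_likes': '804',
    'no_of_retweets': '171',
    'no_of_replies': '35',
    'mentions': '',
    'no_of_mentions': '0',
    'hashtags': 'cats',
    'no_of_hashtags': '1',
    'call_to_action': '',
    'image_url': 'https://pbs.twimg.com/media/example.jpg',
}
```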
100
+ ## Release History
101
+
102
+ * 1.0.0
103
+ * First release: crawl by hashtags
104
+
105
+ ## Meta
106
+
107
+ Amit Upreti – [@amitupreti](https://www.linkedin.com/in/amitupreti/)
108
+
109
+ Distributed under the MIT license. See ``LICENSE`` for more information.
110
+
111
+
112
+ ## Contributing
113
+
114
+ 1. Fork it (<https://github.com/amitupreti/Hands-on-WebScraping/fork>)
115
+ 2. Create your feature branch (`git checkout -b feature/fooBar`)
116
+ 3. Commit your changes (`git commit -am 'Add some fooBar'`)
117
+ 4. Push to the branch (`git push origin feature/fooBar`)
118
+ 5. Create a new Pull Request
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py ADDED
File without changes
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py ADDED
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define here the models for your scraped items
4
+ #
5
+ # See documentation in:
6
+ # https://doc.scrapy.org/en/latest/topics/items.html
7
+
8
+ import scrapy
9
+
10
+
11
+ class TwitterhashtagcrawlerItem(scrapy.Item):
12
+ # define the fields for your item here like:
13
+ # name = scrapy.Field()
14
+ pass
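The project currently yields plain dicts from the spider rather than declared items. If typed items were preferred, the same fields could be declared here; this is a sketch only, not part of this commit:

```py
import scrapy

class TweetItem(scrapy.Item):
    # Mirrors the keys yielded by the spider's parse_tweet()
    username = scrapy.Field()
    full_name = scrapy.Field()
    twitter_url = scrapy.Field()
    tweet_text = scrapy.Field()
    tweet_time = scrapy.Field()
    number_of_likes = scrapy.Field()
    no_of_retweets = scrapy.Field()
    no_of_replies = scrapy.Field()
    mentions = scrapy.Field()
    no_of_mentions = scrapy.Field()
    hashtags = scrapy.Field()
    no_of_hashtags = scrapy.Field()
    call_to_action = scrapy.Field()
    image_url = scrapy.Field()
```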
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py ADDED
@@ -0,0 +1,103 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define here the models for your spider middleware
4
+ #
5
+ # See documentation in:
6
+ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7
+
8
+ from scrapy import signals
9
+
10
+
11
+ class TwitterhashtagcrawlerSpiderMiddleware(object):
12
+ # Not all methods need to be defined. If a method is not defined,
13
+ # scrapy acts as if the spider middleware does not modify the
14
+ # passed objects.
15
+
16
+ @classmethod
17
+ def from_crawler(cls, crawler):
18
+ # This method is used by Scrapy to create your spiders.
19
+ s = cls()
20
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21
+ return s
22
+
23
+ def process_spider_input(self, response, spider):
24
+ # Called for each response that goes through the spider
25
+ # middleware and into the spider.
26
+
27
+ # Should return None or raise an exception.
28
+ return None
29
+
30
+ def process_spider_output(self, response, result, spider):
31
+ # Called with the results returned from the Spider, after
32
+ # it has processed the response.
33
+
34
+ # Must return an iterable of Request, dict or Item objects.
35
+ for i in result:
36
+ yield i
37
+
38
+ def process_spider_exception(self, response, exception, spider):
39
+ # Called when a spider or process_spider_input() method
40
+ # (from other spider middleware) raises an exception.
41
+
42
+ # Should return either None or an iterable of Response, dict
43
+ # or Item objects.
44
+ pass
45
+
46
+ def process_start_requests(self, start_requests, spider):
47
+ # Called with the start requests of the spider, and works
48
+ # similarly to the process_spider_output() method, except
49
+ # that it doesn’t have a response associated.
50
+
51
+ # Must return only requests (not items).
52
+ for r in start_requests:
53
+ yield r
54
+
55
+ def spider_opened(self, spider):
56
+ spider.logger.info('Spider opened: %s' % spider.name)
57
+
58
+
59
+ class TwitterhashtagcrawlerDownloaderMiddleware(object):
60
+ # Not all methods need to be defined. If a method is not defined,
61
+ # scrapy acts as if the downloader middleware does not modify the
62
+ # passed objects.
63
+
64
+ @classmethod
65
+ def from_crawler(cls, crawler):
66
+ # This method is used by Scrapy to create your spiders.
67
+ s = cls()
68
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69
+ return s
70
+
71
+ def process_request(self, request, spider):
72
+ # Called for each request that goes through the downloader
73
+ # middleware.
74
+
75
+ # Must either:
76
+ # - return None: continue processing this request
77
+ # - or return a Response object
78
+ # - or return a Request object
79
+ # - or raise IgnoreRequest: process_exception() methods of
80
+ # installed downloader middleware will be called
81
+ return None
82
+
83
+ def process_response(self, request, response, spider):
84
+ # Called with the response returned from the downloader.
85
+
86
+ # Must either;
87
+ # - return a Response object
88
+ # - return a Request object
89
+ # - or raise IgnoreRequest
90
+ return response
91
+
92
+ def process_exception(self, request, exception, spider):
93
+ # Called when a download handler or a process_request()
94
+ # (from other downloader middleware) raises an exception.
95
+
96
+ # Must either:
97
+ # - return None: continue processing this exception
98
+ # - return a Response object: stops process_exception() chain
99
+ # - return a Request object: stops process_exception() chain
100
+ pass
101
+
102
+ def spider_opened(self, spider):
103
+ spider.logger.info('Spider opened: %s' % spider.name)
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py ADDED
@@ -0,0 +1,11 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define your item pipelines here
4
+ #
5
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6
+ # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7
+
8
+
9
+ class TwitterhashtagcrawlerPipeline(object):
10
+ def process_item(self, item, spider):
11
+ return item
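The pipeline above is the unmodified Scrapy template and is not enabled in settings.py. Purely as an illustration (not part of this commit), a pipeline that discards records without tweet text could look like the following, activated via ITEM_PIPELINES:

```py
from scrapy.exceptions import DropItem

class RequireTweetTextPipeline(object):
    def process_item(self, item, spider):
        # Discard records where the spider could not extract any tweet text
        if not item.get('tweet_text'):
            raise DropItem('missing tweet_text')
        return item
```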
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py ADDED
@@ -0,0 +1,90 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Scrapy settings for TwitterHashTagCrawler project
4
+ #
5
+ # For simplicity, this file contains only settings considered important or
6
+ # commonly used. You can find more settings consulting the documentation:
7
+ #
8
+ # https://doc.scrapy.org/en/latest/topics/settings.html
9
+ # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10
+ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11
+
12
+ BOT_NAME = 'TwitterHashTagCrawler'
13
+
14
+ SPIDER_MODULES = ['TwitterHashTagCrawler.spiders']
15
+ NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders'
16
+
17
+
18
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
19
+ #USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)'
20
+
21
+ # Obey robots.txt rules
22
+ ROBOTSTXT_OBEY = True
23
+
24
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
25
+ #CONCURRENT_REQUESTS = 32
26
+
27
+ # Configure a delay for requests for the same website (default: 0)
28
+ # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29
+ # See also autothrottle settings and docs
30
+ #DOWNLOAD_DELAY = 3
31
+ # The download delay setting will honor only one of:
32
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33
+ #CONCURRENT_REQUESTS_PER_IP = 16
34
+
35
+ # Disable cookies (enabled by default)
36
+ #COOKIES_ENABLED = False
37
+
38
+ # Disable Telnet Console (enabled by default)
39
+ #TELNETCONSOLE_ENABLED = False
40
+
41
+ # Override the default request headers:
42
+ #DEFAULT_REQUEST_HEADERS = {
43
+ # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44
+ # 'Accept-Language': 'en',
45
+ #}
46
+
47
+ # Enable or disable spider middlewares
48
+ # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49
+ #SPIDER_MIDDLEWARES = {
50
+ # 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543,
51
+ #}
52
+
53
+ # Enable or disable downloader middlewares
54
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55
+ #DOWNLOADER_MIDDLEWARES = {
56
+ # 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543,
57
+ #}
58
+
59
+ # Enable or disable extensions
60
+ # See https://doc.scrapy.org/en/latest/topics/extensions.html
61
+ #EXTENSIONS = {
62
+ # 'scrapy.extensions.telnet.TelnetConsole': None,
63
+ #}
64
+
65
+ # Configure item pipelines
66
+ # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67
+ #ITEM_PIPELINES = {
68
+ # 'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300,
69
+ #}
70
+
71
+ # Enable and configure the AutoThrottle extension (disabled by default)
72
+ # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73
+ #AUTOTHROTTLE_ENABLED = True
74
+ # The initial download delay
75
+ #AUTOTHROTTLE_START_DELAY = 5
76
+ # The maximum download delay to be set in case of high latencies
77
+ #AUTOTHROTTLE_MAX_DELAY = 60
78
+ # The average number of requests Scrapy should be sending in parallel to
79
+ # each remote server
80
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81
+ # Enable showing throttling stats for every response received:
82
+ #AUTOTHROTTLE_DEBUG = False
83
+
84
+ # Enable and configure HTTP caching (disabled by default)
85
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86
+ #HTTPCACHE_ENABLED = True
87
+ #HTTPCACHE_EXPIRATION_SECS = 0
88
+ #HTTPCACHE_DIR = 'httpcache'
89
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
90
+ #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # This package will contain the spiders of your Scrapy project
2
+ #
3
+ # Please refer to the documentation for information on how to create and manage
4
+ # your spiders.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py ADDED
@@ -0,0 +1,112 @@
1
+ # -*- coding: utf-8 -*-
2
+ import scrapy
3
+ import ipdb
4
+ import re
5
+ from dateutil import parser
6
+ import sys
7
+ from scrapy.crawler import CrawlerProcess
8
+ from utils import get_links, get_hashtags, get_mentions
9
+ import logging
10
+
11
+ class HashtagSpider(scrapy.Spider):
12
+ name = 'twittercrawler'
13
+ allowed_domains = ["twitter.com"]
14
+
15
+ # Custom settings for the user agent, concurrency, download delay and log level.
16
+ # Override here
17
+ custom_settings = {
18
+ 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
19
+ 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}
20
+
21
+ def __init__(self, filename=''):
22
+ if not filename:
23
+ sys.exit('Please provide an input filename. Example:\n\n$ scrapy crawl twittercrawler -a filename=myhashtags.csv')
24
+ self.filename = filename
25
+
26
+ # the crawler will execute start_requests function at first.
27
+ def start_requests(self):
28
+ with open(self.filename, 'r') as f:
29
+ hashtags = f.read().splitlines()
30
+ if len(hashtags) == 0:
31
+ sys.exit('Empty file detected. Please provide hashtags separated by newlines')
32
+ else:
33
+ logging.info(f'{len(hashtags)} hashtags found')
34
+ for hashtag in hashtags:
35
+ if hashtag:
36
+ search_url = "https://mobile.twitter.com/hashtag/" + hashtag.lower()
37
+
38
+ yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True)
39
+
40
+ def find_tweets(self, response):
41
+ tweets = response.xpath('//table[@class="tweet "]/@href').getall()
42
+ logging.info(f'{len(tweets)} tweets found')
43
+ for tweet_id in tweets:
44
+ tweet_id = re.findall("\d+", tweet_id)[-1]
45
+ tweet_url = 'https://twitter.com/anyuser/status/' + \
46
+ str(tweet_id)
47
+ yield scrapy.Request(tweet_url, callback=self.parse_tweet)
48
+
49
+ # finding and visiting next page
50
+ next_page = response.xpath(
51
+ '//*[@class="w-button-more"]/a/@href').get(default='')
52
+ logging.info('Next page found:')
53
+ if next_page != '':
54
+ next_page = 'https://mobile.twitter.com' + next_page
55
+ yield scrapy.Request(next_page, callback=self.find_tweets)
56
+
57
+ def parse_tweet(self, response):
58
+ logging.info('Processing --> ' + response.url)
59
+ username = response.xpath(
60
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
61
+ default='')
62
+ full_name = response.xpath(
63
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
64
+ default='')
65
+
66
+ try:
67
+ tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
68
+
69
+ except:
70
+ tweet_text = ' '.join(response.xpath(
71
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
72
+ image_list = response.xpath(
73
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
74
+ date_time = response.xpath(
75
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
76
+ default='')
77
+
78
+ date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
79
+ retweets = response.xpath(
80
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
81
+ default='')
82
+
83
+ likes = response.xpath(
84
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
85
+ default='')
86
+ replies = response.xpath(
87
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
88
+ default='')
89
+
90
+ mentions = get_mentions(tweet_text)
91
+ hashtags = get_hashtags(tweet_text)
92
+ cta = get_links(tweet_text)
93
+
94
+ result = {
95
+ 'username': username.lower(),
96
+ 'full_name': full_name,
97
+ 'twitter_url': response.url,
98
+ 'tweet_text': tweet_text,
99
+ 'tweet_time': str(date_time),
100
+ 'number_of_likes': str(likes),
101
+ 'no_of_retweets': str(retweets),
102
+ 'no_of_replies': str(replies),
103
+ 'mentions': ' | '.join(mentions),
104
+ 'no_of_mentions': str(len(mentions)),
105
+ 'hashtags': ' | '.join(hashtags),
106
+ 'no_of_hashtags': str(len(hashtags)),
107
+ 'call_to_action': ' | '.join(cta),
108
+ 'image_url': ' | '.join(image_list),
109
+
110
+ }
111
+ yield result
112
+
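Since the spider already imports CrawlerProcess, it could also be launched programmatically instead of through the `scrapy crawl` command line. A minimal runner sketch, assuming it is executed from the project root so the Scrapy project settings are found (this script is not part of the commit):

```py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('twittercrawler', filename='myhashtags.csv')  # spider name plus its filename argument
process.start()  # blocks until the crawl finishes
```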
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv ADDED
@@ -0,0 +1,2 @@
1
+ cats
2
+ dogs
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ scrapy
2
+ python-dateutil
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv ADDED
@@ -0,0 +1,12 @@
1
+ username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url
2
+ cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou.
3
+ २०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg
4
+ ,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg
5
+ kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg
6
+ mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg
7
+ prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination
8
+
9
+ We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg
10
+ kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today.
11
+
12
+ Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,,
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
1
+ # Automatically created by: scrapy startproject
2
+ #
3
+ # For more information about the [deploy] section see:
4
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
5
+
6
+ [settings]
7
+ default = TwitterHashTagCrawler.settings
8
+
9
+ [deploy]
10
+ #url = http://localhost:6800/
11
+ project = TwitterHashTagCrawler
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py ADDED
@@ -0,0 +1,67 @@
1
+ import re
2
+
3
+
4
+ def find_emails(text):
5
+ """
6
+ It will parse the given string and return a list of emails if found
7
+
8
+ Example:
9
+ >>find_emails('hello\n find me here\nemail@gmail.com')
10
+ ['email@gmail.com']
11
+
12
+ :param text: string
13
+ :return: list
14
+ """
15
+ return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text)
16
+
17
+
18
+ def get_mentions(text):
19
+ """
20
+ It will return mentions from the text i.e @someone
21
+
22
+ :param text: string
23
+ :return: list
24
+
25
+ example
26
+ >>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. BTW say hi to @heroine for me')
27
+ ['hero','hero2','heroine']
28
+ """
29
+ result = re.findall("(^|[^@\w])@(\w{1,15})", text)
30
+ if len(result) != 0:
31
+ result = [i[1] for i in result]
32
+ return result
33
+
34
+
35
+ def get_hashtags(text):
36
+ """
37
+ It will return hashtags from the text i.e #something
38
+
39
+ :param text: string
40
+ :return: list
41
+
42
+ example
43
+ >>> get_hashtags('my first code #programmer #python #awesome #grepsr')
44
+ ['programmer','python','awesome','grepsr']
45
+ """
46
+
47
+ result = re.findall("(^|[^@\w])#(\w{1,15})", text)
48
+ if len(result) != 0:
49
+ result = [i[1] for i in result]
50
+ return result
51
+
52
+
53
+ def get_links(text):
54
+ """
55
+ It will return website links from the text
56
+
57
+ :param text: string
58
+ :return: list
59
+
60
+ example
61
+ >>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454'
62
+ >>> get_links(message)
63
+ ['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454']
64
+
65
+ """
66
+ result = re.findall(r"(?P<url>https?://[^\s]+)", text)
67
+ return result
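A quick illustration of the helpers above on a made-up string (run from the project directory so `utils` is importable):

```py
from utils import find_emails, get_mentions, get_hashtags, get_links

text = 'Thanks @hero! Details at https://example.com #python, write to me@example.com'
print(get_mentions(text))   # ['hero']
print(get_hashtags(text))   # ['python']
print(get_links(text))      # ['https://example.com']
print(find_emails(text))    # ['me@example.com']
```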
Hands-on-WebScraping/.gitignore ADDED
@@ -0,0 +1,241 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+
132
+
133
+
134
+ # Logs
135
+ logs
136
+ *.log
137
+ npm-debug.log*
138
+ yarn-debug.log*
139
+ yarn-error.log*
140
+ lerna-debug.log*
141
+
142
+ # Diagnostic reports (https://nodejs.org/api/report.html)
143
+ report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json
144
+
145
+ # Runtime data
146
+ pids
147
+ *.pid
148
+ *.seed
149
+ *.pid.lock
150
+
151
+ # Directory for instrumented libs generated by jscoverage/JSCover
152
+ lib-cov
153
+
154
+ # Coverage directory used by tools like istanbul
155
+ coverage
156
+ *.lcov
157
+
158
+ # nyc test coverage
159
+ .nyc_output
160
+
161
+ # Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
162
+ .grunt
163
+
164
+ # Bower dependency directory (https://bower.io/)
165
+ bower_components
166
+
167
+ # node-waf configuration
168
+ .lock-wscript
169
+
170
+ # Compiled binary addons (https://nodejs.org/api/addons.html)
171
+ build/Release
172
+
173
+ # Dependency directories
174
+ node_modules/
175
+ jspm_packages/
176
+
177
+ # TypeScript v1 declaration files
178
+ typings/
179
+
180
+ # TypeScript cache
181
+ *.tsbuildinfo
182
+
183
+ # Optional npm cache directory
184
+ .npm
185
+
186
+ # Optional eslint cache
187
+ .eslintcache
188
+
189
+ # Microbundle cache
190
+ .rpt2_cache/
191
+ .rts2_cache_cjs/
192
+ .rts2_cache_es/
193
+ .rts2_cache_umd/
194
+
195
+ # Optional REPL history
196
+ .node_repl_history
197
+
198
+ # Output of 'npm pack'
199
+ *.tgz
200
+
201
+ # Yarn Integrity file
202
+ .yarn-integrity
203
+
204
+ # dotenv environment variables file
205
+ .env
206
+ .env.test
207
+
208
+ # parcel-bundler cache (https://parceljs.org/)
209
+ .cache
210
+
211
+ # Next.js build output
212
+ .next
213
+
214
+ # Nuxt.js build / generate output
215
+ .nuxt
216
+ dist
217
+
218
+ # Gatsby files
219
+ .cache/
220
+ # Comment in the public line in if your project uses Gatsby and not Next.js
221
+ # https://nextjs.org/blog/next-9-1#public-directory-support
222
+ # public
223
+
224
+ # vuepress build output
225
+ .vuepress/dist
226
+
227
+ # Serverless directories
228
+ .serverless/
229
+
230
+ # FuseBox cache
231
+ .fusebox/
232
+
233
+ # DynamoDB Local files
234
+ .dynamodb/
235
+
236
+ # TernJS port file
237
+ .tern-port
238
+
239
+ # Stores VSCode versions used for testing VSCode extensions
240
+ .vscode-test
241
+
Hands-on-WebScraping/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Amit Upreti
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
Hands-on-WebScraping/README.md ADDED
@@ -0,0 +1,2 @@
1
+ # Hands-on-WebScraping (NO LONGER MAINTAINED)
2
+ This repo is part of a blog series on several web scraping projects, where we explore scraping techniques for crawling data from simple websites through to websites using advanced protection.
Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md ADDED
@@ -0,0 +1,118 @@
1
+ #### Deprecated. No longer maintained.
2
+
3
+ # Twitter Hashtag crawler
4
+ > A fast, unofficial Twitter crawler that collects tweets via hashtag search.
5
+
6
+ > Notice: This crawler is meant for collecting data purely for academic and research purposes. I am not responsible for any legal issues that might arise from unintended use of this crawler.
7
+
8
+ [![Python 3](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
9
+ [![twitter crawler](https://img.shields.io/badge/twittercrawler-1.0-green)](https://github.com/amitupreti/Hands-on-WebScraping/tree/master/project1_twitter_hashtag_crawler)
10
+
11
+ This crawler is written in Python using Scrapy. The logic is straightforward: we send GET requests to the mobile version of Twitter (mobile.twitter.com) to collect the list of tweets, then send GET requests to the web version to parse each tweet's details.
12
+ ![](header.png)
13
+
14
+ ## Installation
15
+
16
+ OS X & Linux:
17
+
18
+ 1. Download the project
19
+
20
+ ```sh
21
+ git clone https://github.com/amitupreti/Hands-on-WebScraping
22
+
23
+ cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
24
+ ```
25
+ 2. Install the dependencies
26
+
27
+ ```sh
28
+ pip install -r requirements.txt --user
29
+ ```
30
+
31
+ 3. Verify the crawler spider exists
32
+
33
+ ```sh
34
+ scrapy list
35
+ ```
36
+ If you see `twittercrawler`, then you are all set.
37
+
38
+
39
+ Windows:
40
+ 1. Install [python3](https://www.python.org/downloads/) if you haven't already
41
+ 2. Download the project. https://github.com/amitupreti/Hands-on-WebScraping/archive/master.zip
42
+ 3. Extract the project
43
+ 4. Open cmd and navigate inside the project directory
44
+ ```sh
45
+ cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
46
+ ```
47
+ 5. Follow steps 2 and 3 from the OS X & Linux installation
48
+
49
+
50
+
51
+ ## Usage example
52
+
53
+ 1. Put the hashtags in a CSV file, one per line. For example, I have included `myhashtags.csv` as a sample.
54
+
55
+ ![Hashtags file](https://i.paste.pics/225079df0d3dc27d66430b1553b2ac39.png)
56
+
57
+ 2. Run the crawler with your hashtag file and the desired [output format](https://docs.scrapy.org/en/latest/topics/feed-exports.html) (JSON, JSON Lines, CSV, or XML)
58
+
59
+ * For CSV
60
+ ```sh
61
+ scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv
62
+
63
+ ```
64
+
65
+ * For JSON
66
+ ```sh
67
+ scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json
68
+
69
+ ```
70
+ ![sample images](https://i.paste.pics/4a5826a6a090522e5326bb11838258df.png)
71
+ ![sample images](https://i.paste.pics/68a64bab743150e00af4cd9eea9af8dc.png)
72
+
73
+
74
+ ### Speeding up the crawls
75
+ If the crawler feels a little slow, find the hashtag.py file in the project and edit the custom settings.
76
+ ```py
77
+ custom_settings = {
78
+ 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
79
+ 'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 1, 'LOG_LEVEL': 'INFO'}
80
+ ```
81
+ > Here CONCURRENT_REQUESTS is the number of URLs processed in parallel and DOWNLOAD_DELAY is the wait between requests. So, increase CONCURRENT_REQUESTS and decrease DOWNLOAD_DELAY (the minimum download delay is 0).
82
+
83
+
84
+ ## Data Columns
85
+ * username
86
+ * full_name
87
+ * twitter_url
88
+ * tweet_text
89
+ * tweet_time
90
+ * number_of_likes
91
+ * no_of_retweets
92
+ * no_of_replies
93
+ * mentions
94
+ * no_of_mentions
95
+ * hashtags
96
+ * no_of_hashtags
97
+ * call_to_action
98
+ * image_url
99
+
100
+ ## Release History
101
+
102
+ * 1.0.0
103
+ * First release: crawl by hashtags
104
+
105
+ ## Meta
106
+
107
+ Amit Upreti – [@amitupreti](https://www.linkedin.com/in/amitupreti/)
108
+
109
+ Distributed under the MIT license. See ``LICENSE`` for more information.
110
+
111
+
112
+ ## Contributing
113
+
114
+ 1. Fork it (<https://github.com/amitupreti/Hands-on-WebScraping/fork>)
115
+ 2. Create your feature branch (`git checkout -b feature/fooBar`)
116
+ 3. Commit your changes (`git commit -am 'Add some fooBar'`)
117
+ 4. Push to the branch (`git push origin feature/fooBar`)
118
+ 5. Create a new Pull Request
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py ADDED
File without changes
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (234 Bytes).
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc ADDED
Binary file (370 Bytes).
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py ADDED
@@ -0,0 +1,14 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define here the models for your scraped items
4
+ #
5
+ # See documentation in:
6
+ # https://doc.scrapy.org/en/latest/topics/items.html
7
+
8
+ import scrapy
9
+
10
+
11
+ class TwitterhashtagcrawlerItem(scrapy.Item):
12
+ # define the fields for your item here like:
13
+ # name = scrapy.Field()
14
+ pass
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py ADDED
@@ -0,0 +1,103 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define here the models for your spider middleware
4
+ #
5
+ # See documentation in:
6
+ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7
+
8
+ from scrapy import signals
9
+
10
+
11
+ class TwitterhashtagcrawlerSpiderMiddleware(object):
12
+ # Not all methods need to be defined. If a method is not defined,
13
+ # scrapy acts as if the spider middleware does not modify the
14
+ # passed objects.
15
+
16
+ @classmethod
17
+ def from_crawler(cls, crawler):
18
+ # This method is used by Scrapy to create your spiders.
19
+ s = cls()
20
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
21
+ return s
22
+
23
+ def process_spider_input(self, response, spider):
24
+ # Called for each response that goes through the spider
25
+ # middleware and into the spider.
26
+
27
+ # Should return None or raise an exception.
28
+ return None
29
+
30
+ def process_spider_output(self, response, result, spider):
31
+ # Called with the results returned from the Spider, after
32
+ # it has processed the response.
33
+
34
+ # Must return an iterable of Request, dict or Item objects.
35
+ for i in result:
36
+ yield i
37
+
38
+ def process_spider_exception(self, response, exception, spider):
39
+ # Called when a spider or process_spider_input() method
40
+ # (from other spider middleware) raises an exception.
41
+
42
+ # Should return either None or an iterable of Response, dict
43
+ # or Item objects.
44
+ pass
45
+
46
+ def process_start_requests(self, start_requests, spider):
47
+ # Called with the start requests of the spider, and works
48
+ # similarly to the process_spider_output() method, except
49
+ # that it doesn’t have a response associated.
50
+
51
+ # Must return only requests (not items).
52
+ for r in start_requests:
53
+ yield r
54
+
55
+ def spider_opened(self, spider):
56
+ spider.logger.info('Spider opened: %s' % spider.name)
57
+
58
+
59
+ class TwitterhashtagcrawlerDownloaderMiddleware(object):
60
+ # Not all methods need to be defined. If a method is not defined,
61
+ # scrapy acts as if the downloader middleware does not modify the
62
+ # passed objects.
63
+
64
+ @classmethod
65
+ def from_crawler(cls, crawler):
66
+ # This method is used by Scrapy to create your spiders.
67
+ s = cls()
68
+ crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
69
+ return s
70
+
71
+ def process_request(self, request, spider):
72
+ # Called for each request that goes through the downloader
73
+ # middleware.
74
+
75
+ # Must either:
76
+ # - return None: continue processing this request
77
+ # - or return a Response object
78
+ # - or return a Request object
79
+ # - or raise IgnoreRequest: process_exception() methods of
80
+ # installed downloader middleware will be called
81
+ return None
82
+
83
+ def process_response(self, request, response, spider):
84
+ # Called with the response returned from the downloader.
85
+
86
+ # Must either;
87
+ # - return a Response object
88
+ # - return a Request object
89
+ # - or raise IgnoreRequest
90
+ return response
91
+
92
+ def process_exception(self, request, exception, spider):
93
+ # Called when a download handler or a process_request()
94
+ # (from other downloader middleware) raises an exception.
95
+
96
+ # Must either:
97
+ # - return None: continue processing this exception
98
+ # - return a Response object: stops process_exception() chain
99
+ # - return a Request object: stops process_exception() chain
100
+ pass
101
+
102
+ def spider_opened(self, spider):
103
+ spider.logger.info('Spider opened: %s' % spider.name)
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py ADDED
@@ -0,0 +1,11 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Define your item pipelines here
4
+ #
5
+ # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6
+ # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7
+
8
+
9
+ class TwitterhashtagcrawlerPipeline(object):
10
+ def process_item(self, item, spider):
11
+ return item
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py ADDED
@@ -0,0 +1,90 @@
1
+ # -*- coding: utf-8 -*-
2
+
3
+ # Scrapy settings for TwitterHashTagCrawler project
4
+ #
5
+ # For simplicity, this file contains only settings considered important or
6
+ # commonly used. You can find more settings consulting the documentation:
7
+ #
8
+ # https://doc.scrapy.org/en/latest/topics/settings.html
9
+ # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
10
+ # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
11
+
12
+ BOT_NAME = 'TwitterHashTagCrawler'
13
+
14
+ SPIDER_MODULES = ['TwitterHashTagCrawler.spiders']
15
+ NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders'
16
+
17
+
18
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
19
+ #USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)'
20
+
21
+ # Obey robots.txt rules
22
+ ROBOTSTXT_OBEY = False
23
+
24
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
25
+ #CONCURRENT_REQUESTS = 32
26
+
27
+ # Configure a delay for requests for the same website (default: 0)
28
+ # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
29
+ # See also autothrottle settings and docs
30
+ #DOWNLOAD_DELAY = 3
31
+ # The download delay setting will honor only one of:
32
+ #CONCURRENT_REQUESTS_PER_DOMAIN = 16
33
+ #CONCURRENT_REQUESTS_PER_IP = 16
34
+
35
+ # Disable cookies (enabled by default)
36
+ #COOKIES_ENABLED = False
37
+
38
+ # Disable Telnet Console (enabled by default)
39
+ #TELNETCONSOLE_ENABLED = False
40
+
41
+ # Override the default request headers:
42
+ #DEFAULT_REQUEST_HEADERS = {
43
+ # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
44
+ # 'Accept-Language': 'en',
45
+ #}
46
+
47
+ # Enable or disable spider middlewares
48
+ # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
49
+ #SPIDER_MIDDLEWARES = {
50
+ # 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543,
51
+ #}
52
+
53
+ # Enable or disable downloader middlewares
54
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
55
+ #DOWNLOADER_MIDDLEWARES = {
56
+ # 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543,
57
+ #}
58
+
59
+ # Enable or disable extensions
60
+ # See https://doc.scrapy.org/en/latest/topics/extensions.html
61
+ #EXTENSIONS = {
62
+ # 'scrapy.extensions.telnet.TelnetConsole': None,
63
+ #}
64
+
65
+ # Configure item pipelines
66
+ # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
67
+ #ITEM_PIPELINES = {
68
+ # 'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300,
69
+ #}
70
+
71
+ # Enable and configure the AutoThrottle extension (disabled by default)
72
+ # See https://doc.scrapy.org/en/latest/topics/autothrottle.html
73
+ #AUTOTHROTTLE_ENABLED = True
74
+ # The initial download delay
75
+ #AUTOTHROTTLE_START_DELAY = 5
76
+ # The maximum download delay to be set in case of high latencies
77
+ #AUTOTHROTTLE_MAX_DELAY = 60
78
+ # The average number of requests Scrapy should be sending in parallel to
79
+ # each remote server
80
+ #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
81
+ # Enable showing throttling stats for every response received:
82
+ #AUTOTHROTTLE_DEBUG = False
83
+
84
+ # Enable and configure HTTP caching (disabled by default)
85
+ # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
86
+ #HTTPCACHE_ENABLED = True
87
+ #HTTPCACHE_EXPIRATION_SECS = 0
88
+ #HTTPCACHE_DIR = 'httpcache'
89
+ #HTTPCACHE_IGNORE_HTTP_CODES = []
90
+ #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
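Everything after ROBOTSTXT_OBEY is left at its commented-out default. If gentler crawling were wanted, these settings might be uncommented roughly as below (a sketch with illustrative values, not what this commit ships):

# Illustrative values only.
DOWNLOAD_DELAY = 1                      # pause between requests to the same site
CONCURRENT_REQUESTS_PER_DOMAIN = 8
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
HTTPCACHE_ENABLED = True                # cache responses while iterating on XPath selectors
HTTPCACHE_EXPIRATION_SECS = 3600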
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ # This package will contain the spiders of your Scrapy project
2
+ #
3
+ # Please refer to the documentation for information on how to create and manage
4
+ # your spiders.
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (242 Bytes). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc ADDED
Binary file (4.72 kB). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc ADDED
Binary file (4.8 kB). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc ADDED
Binary file (2.65 kB). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py ADDED
@@ -0,0 +1,118 @@
1
+ # -*- coding: utf-8 -*-
2
+ import scrapy
3
+ # import ipdb  # debugging aid; unused and not listed in requirements.txt
4
+ import re
5
+ from dateutil import parser
6
+ import sys
7
+ from scrapy.crawler import CrawlerProcess
8
+ from utils import get_links, get_hashtags, get_mentions
9
+ import logging
10
+
11
+ class HashtagSpider(scrapy.Spider):
12
+ name = 'hashtag'
13
+ allowed_domains = ["twitter.com"]
14
+
15
+ # Custom settings for the user agent and proxy. By default this would use Chrome as the user agent and a pool of 50 proxies.
16
+ # Override here
17
+ custom_settings = {
18
+ 'USER_AGENT': 'my-cool-project (http://example.com)',
19
+ 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}
20
+
21
+ def __init__(self, filename=''):
22
+ if not filename:
23
+ sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv')
24
+ self.filename = filename
25
+
26
+ # the crawler will execute start_requests function at first.
27
+ def start_requests(self):
28
+ #with open(self.filename, 'r') as f:
29
+ #hashtags = ['danaher']
30
+ hashtags= ['danaher']
31
+ if len(hashtags) == 0:
32
+ sys.exit('Empty file detected. Please provide hashtags separated by newlines')
33
+ else:
34
+ logging.info(f'{len(hashtags)} hashtags found')
35
+ print('hashtag is..',hashtags)
36
+ for hashtag in hashtags:
37
+ if hashtag:
38
+ search_url = "https://twitter.com/hashtag/" + hashtag.lower()
39
+ print('search_url is...', search_url)
40
+
41
+ yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True)
42
+
43
+ def find_tweets(self, response):
44
+ print("I am in find_tweets")
45
+ tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
46
+ print("Tweets is...", tweets)
47
+ print(f'{len(tweets)} tweets found')
48
+ for tweet in tweets:
49
+ # tweet_id = re.findall("\d+", tweet_id)[-1]
50
+ # tweet_url = 'https://twitter.com/anyuser/status/' + \
51
+ # str(tweet_id)
52
+ print(tweet)
53
+ # tweet_url is never built here (the id/url lines above are commented out),
+ # so issuing this request would raise a NameError at runtime.
+ # yield scrapy.Request(tweet_url, callback=self.parse_tweet)
54
+
55
+ # finding and visiting next page
56
+ next_page = response.xpath(
57
+ '//*[@class="w-button-more"]/a/@href').get(default='')
58
+ logging.info('Next page found:')
59
+ if next_page != '':
60
+ next_page = 'https://mobile.twitter.com' + next_page
61
+ yield scrapy.Request(next_page, callback=self.find_tweets)
62
+
63
+ def parse_tweet(self, response):
64
+ logging.info('Processing --> ' + response.url)
65
+ username = response.xpath(
66
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
67
+ default='')
68
+ full_name = response.xpath(
69
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
70
+ default='')
71
+
72
+ try:
73
+ tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
74
+
75
+ except:
76
+ tweet_text = ' '.join(response.xpath(
77
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
78
+ image_list = response.xpath(
79
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
80
+ date_time = response.xpath(
81
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
82
+ default='')
83
+
84
+ date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
85
+ retweets = response.xpath(
86
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
87
+ default='')
88
+
89
+ likes = response.xpath(
90
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
91
+ default='')
92
+ replies = response.xpath(
93
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
94
+ default='')
95
+
96
+ mentions = get_mentions(tweet_text)
97
+ hashtags = get_hashtags(tweet_text)
98
+ cta = get_links(tweet_text)
99
+
100
+ result = {
101
+ 'username': username.lower(),
102
+ 'full_name': full_name,
103
+ 'twitter_url': response.url,
104
+ 'tweet_text': tweet_text,
105
+ 'tweet_time': str(date_time),
106
+ 'number_of_likes': str(likes),
107
+ 'no_of_retweets': str(retweets),
108
+ 'no_of_replies': str(replies),
109
+ 'mentions': ' | '.join(mentions),
110
+ 'no_of_mentions': str(len(mentions)),
111
+ 'hashtags': ' | '.join(hashtags),
112
+ 'no_of_hashtags': str(len(hashtags)),
113
+ 'call_to_action': ' | '.join(cta),
114
+ 'image_url': ' | '.join(image_list),
115
+
116
+ }
117
+ yield result
118
+
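hashtag.py already imports CrawlerProcess but never uses it. A hedged sketch of how the spider could be launched programmatically, much like running scrapy crawl hashtag -a filename=myhashtags.csv from the project root (add -o <file.csv> for feed export); the working-directory assumption is noted in the comments:

# Minimal runner sketch; assumes it is executed from
# Hands-on-WebScraping/project1_twitter_hashtag_crawler so the package, scrapy.cfg and utils.py resolve.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings
from TwitterHashTagCrawler.spiders.hashtag import HashtagSpider

process = CrawlerProcess(get_project_settings())
process.crawl(HashtagSpider, filename='myhashtags.csv')
process.start()  # blocks until the crawl finishes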
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py ADDED
@@ -0,0 +1,121 @@
1
+ # -*- coding: utf-8 -*-
2
+ import scrapy
3
+ # import ipdb  # debugging aid; unused and not listed in requirements.txt
4
+ import re
5
+ from dateutil import parser
6
+ import sys
7
+ from scrapy.crawler import CrawlerProcess
8
+ from utils import get_links, get_hashtags, get_mentions
9
+ from scrapy.http.request import Request
10
+ import logging
11
+
12
+ class HashtagSpider(scrapy.Spider):
13
+ name = 'hashtag2'
14
+ allowed_domains = ["twitter.com"]
15
+
16
+ # Custom settings for the user agent and proxy. By default this would use Chrome as the user agent and a pool of 50 proxies.
17
+ # Override here
18
+ custom_settings = {
19
+ 'USER_AGENT': 'my-cool-project (http://example.com)',
20
+ 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}
21
+
22
+ def __init__(self, filename=''):
23
+ if not filename:
24
+ sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv')
25
+ self.filename = filename
26
+
27
+ # the crawler will execute start_requests function at first.
28
+ def start_requests(self):
29
+ #with open(self.filename, 'r') as f:
30
+ #hashtags = ['danaher']
31
+ hashtags= ['danaher']
32
+ if len(hashtags) == 0:
33
+ sys.exit('Empty file detected. Please provide hashtags separated by newlines')
34
+ else:
35
+ logging.info(f'{len(hashtags)} hashtags found')
36
+ print('hashtag is..',hashtags)
37
+ for hashtag in hashtags:
38
+ if hashtag:
39
+ search_url = "https://twitter.com/hashtag/" + hashtag.lower()
40
+ print('search_url is...', search_url)
41
+ try:
42
+ yield Request(search_url, callback=self.find_tweets, dont_filter=True)
43
+ except Exception as e:
44
+ print(e)
45
+
46
+ def find_tweets(self, response):
47
+ print("I am in find_tweets")
48
+ tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
49
+ print("Tweets is...", tweets)
50
+ print(f'{len(tweets)} tweets found')
51
+ for tweet in tweets:
52
+ # tweet_id = re.findall("\d+", tweet_id)[-1]
53
+ # tweet_url = 'https://twitter.com/anyuser/status/' + \
54
+ # str(tweet_id)
55
+ print(tweet)
56
+ #yield scrapy.Request(tweet_url, callback=self.parse_tweet)
57
+
58
+ # finding and visiting next page
59
+ next_page = response.xpath(
60
+ '//*[@class="w-button-more"]/a/@href').get(default='')
61
+ logging.info('Next page found:')
62
+ if next_page != '':
63
+ next_page = 'https://mobile.twitter.com' + next_page
64
+ yield scrapy.Request(next_page, callback=self.find_tweets)
65
+
66
+ def parse_tweet(self, response):
67
+ logging.info('Processing --> ' + response.url)
68
+ username = response.xpath(
69
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
70
+ default='')
71
+ full_name = response.xpath(
72
+ '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
73
+ default='')
74
+
75
+ try:
76
+ tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
77
+
78
+ except:
79
+ tweet_text = ' '.join(response.xpath(
80
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
81
+ image_list = response.xpath(
82
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
83
+ date_time = response.xpath(
84
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
85
+ default='')
86
+
87
+ date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
88
+ retweets = response.xpath(
89
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
90
+ default='')
91
+
92
+ likes = response.xpath(
93
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
94
+ default='')
95
+ replies = response.xpath(
96
+ '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
97
+ default='')
98
+
99
+ mentions = get_mentions(tweet_text)
100
+ hashtags = get_hashtags(tweet_text)
101
+ cta = get_links(tweet_text)
102
+
103
+ result = {
104
+ 'username': username.lower(),
105
+ 'full_name': full_name,
106
+ 'twitter_url': response.url,
107
+ 'tweet_text': tweet_text,
108
+ 'tweet_time': str(date_time),
109
+ 'number_of_likes': str(likes),
110
+ 'no_of_retweets': str(retweets),
111
+ 'no_of_replies': str(replies),
112
+ 'mentions': ' | '.join(mentions),
113
+ 'no_of_mentions': str(len(mentions)),
114
+ 'hashtags': ' | '.join(hashtags),
115
+ 'no_of_hashtags': str(len(hashtags)),
116
+ 'call_to_action': ' | '.join(cta),
117
+ 'image_url': ' | '.join(image_list),
118
+
119
+ }
120
+ yield result
121
+
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py ADDED
@@ -0,0 +1,142 @@
1
+
2
+ from operator import concat
3
+ import scrapy
4
+ import time
5
+ import pandas as pd
6
+ from scrapy.http.request import Request
7
+ from scrapy import signals
8
+ #from sentence_transformers import SentenceTransformer, util
9
+ #import numpy as np
10
+ #import yake
11
+ ##import nltk.data
12
+ #import nltk
13
+ #nltk.download('punkt')
14
+ #from nltk.tokenize import sent_tokenize
15
+ #from keybert import KeyBERT
16
+ #from statistics import mean
17
+ #from urllib.parse import urlparse
18
+ #import random
19
+
20
+
21
+ # from transformers import AutoTokenizer, AutoModel  # unused here and not in requirements.txt
22
+
23
+ # import torch  # unused here and not in requirements.txt
24
+
25
+ # from sklearn.metrics.pairwise import cosine_similarity  # unused here and not in requirements.txt
26
+
27
+
28
+ #text =[]
29
+ text2 =''
30
+ res =[]
31
+ len_res = 0
32
+ len_res2 = 0
33
+
34
+ list1 = []
35
+ df = pd.DataFrame()
36
+ df_sim = pd.DataFrame()
37
+ allowed_domains = []
38
+ list_start_urls = []
39
+ list_companies = []
40
+ index = 0
41
+ len_df = 0
42
+ mean_embedding = []
43
+ list_df_sim = []
44
+
45
+ class BioSpider(scrapy.Spider):
46
+ name = "hashtag3"
47
+
48
+ custom_settings = {'CONCURRENT_REQUESTS': '1', 'CONCURRENT_REQUESTS_PER_DOMAIN':'1', 'ROBOTSTXT_OBEY' : False \
49
+ , "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", 'USER_AGENT':'my-cool-project (http://example.com)'}
50
+
51
+
52
+ global allowed_domains
53
+ allowed_domains = ["twitter.com"]
54
+ global list_start_urls
55
+ global list_companies
56
+ global res
57
+ global index
58
+ list_start_urls2 = []
59
+ global len_df
60
+ global df
61
+
62
+ #data=pd.read_excel("C:\\Users\\RSPRASAD\OneDrive - Danaher\\Bec_LS\\2023\\D_and_B_Project\\Segmentation\\Customer focus list 2023 NGW.xlsx", sheet_name='Sheet1')
63
+
64
+ #df= data[['Company', 'Website']]
65
+ #df.drop_duplicates(inplace = True)
66
+ #df['Content'] = ''
67
+
68
+ i = 0
69
+ len_df = 1
70
+ # for i in range(0, len(df)):
71
+ # #df.loc[i, 'company_name']= df.loc[i, 'Company']
72
+ # #df.loc[i, 'company_website']= df.loc[i, 'Website']
73
+ # list_start_urls.append(df.loc[i, 'Website'])
74
+ # list_companies.append(df.loc[i, 'Company'])
75
+ # domain = urlparse(df.loc[i, 'Website']).netloc
76
+ # allowed_domains.append(domain)
77
+ # print(allowed_domains)
78
+ # upper_len_websites = 5
79
+ start_index = 0
80
+ # if(len_df >upper_len_websites):
81
+ # list_start_urls= list_start_urls[start_index:upper_len_websites]
82
+ # df = df.iloc[start_index:upper_len_websites,:]
83
+
84
+
85
+ list_start_urls = ['https://twitter.com/hashtag/danaher/']
86
+ # df = df.iloc[start_index:upper_len_websites,:]
87
+ # df = df.reset_index()
88
+ # df = df.drop('index', axis = 1)
89
+ # len_df = len(df)
90
+
91
+ # print("Dataframe for crawling website is ..")
92
+ # print(df)
93
+
94
+
95
+
96
+ print(list_start_urls)
97
+
98
+
99
+ @classmethod
100
+ def from_crawler(cls, crawler, *args, **kwargs):
101
+ spider = super(BioSpider, cls).from_crawler(crawler, *args, **kwargs)
102
+ #crawler.signals.connect(spider.spider_opened, signals.spider_opened)
103
+ crawler.signals.connect(spider.spider_closed, signals.spider_closed)
104
+ return spider
105
+
106
+ def start_requests(self):
107
+
108
+ global list_start_urls
109
+ global index
110
+ global res
111
+
112
+
113
+ index =0
114
+ index2 = len(list_start_urls)
115
+ print(" i am in start_requests")
116
+
117
+
118
+
119
+ try:
120
+
121
+ yield Request(list_start_urls[0].strip(), callback = self.parse)#, meta={'priority': index2})
122
+
123
+ except Exception as e:
124
+ print("There is exception and exception is..",e)
125
+
126
+
127
+
128
+
129
+ def parse(self, response):
130
+ print("I am in parse..")
131
+ print("I am in find_tweets")
132
+ tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
133
+ print("Tweets is...", tweets)
134
+ print(f'{len(tweets)} tweets found')
135
+ for tweet in tweets:
136
+ print(tweet)
137
+ count += 1
138
+ if (count >5):
139
+ break
140
+
141
+ def spider_closed(self, spider):
142
+ print("I am in spider closed...")
Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc ADDED
Binary file (2.21 kB). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv ADDED
File without changes
Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv ADDED
Binary file (9.3 kB). View file
 
Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv ADDED
@@ -0,0 +1 @@
1
+ danaher
Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ scrapy
2
+ python-dateutil
Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv ADDED
@@ -0,0 +1,12 @@
1
+ username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url
2
+ cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou.
3
+ २०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg
4
+ ,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg
5
+ kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg
6
+ mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg
7
+ prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination
8
+
9
+ We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg
10
+ kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today.
11
+
12
+ Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,,
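sampledata.csv shows the flat schema the hashtag spider yields; several tweet_text values contain embedded newlines, which stay inside quoted CSV fields. A small sketch of loading it for inspection (column names are taken from the header row above):

import pandas as pd

sample = pd.read_csv('sampledata.csv')
print(sample.columns.tolist())
print(sample[['username', 'tweet_time', 'number_of_likes']].head())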
Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
1
+ # Automatically created by: scrapy startproject
2
+ #
3
+ # For more information about the [deploy] section see:
4
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
5
+
6
+ [settings]
7
+ default = TwitterHashTagCrawler.settings
8
+
9
+ [deploy]
10
+ #url = http://localhost:6800/
11
+ project = TwitterHashTagCrawler
Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py ADDED
@@ -0,0 +1,67 @@
1
+ import re
2
+
3
+
4
+ def find_emails(text):
5
+ """
6
+ It will parse the given string and return a list of emails if found
7
+
8
+ Example:
9
+ >>find_emails('hello\n find me here\nemail@gmail.com')
10
+ ['email@gmail.com']
11
+
12
+ :param text: string
13
+ :return: list
14
+ """
15
+ return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text)
16
+
17
+
18
+ def get_mentions(text):
19
+ """
20
+ It will return mentions from the text i.e @someone
21
+
22
+ :param text: string
23
+ :return: list
24
+
25
+ example
26
+ >>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. BTW say hi to @heroine for me')
27
+ ['hero','hero2','heroine']
28
+ """
29
+ result = re.findall("(^|[^@\w])@(\w{1,15})", text)
30
+ if len(result) != 0:
31
+ result = [i[1] for i in result]
32
+ return result
33
+
34
+
35
+ def get_hashtags(text):
36
+ """
37
+ It will return hashtags from the text i.e #something
38
+
39
+ :param text: string
40
+ :return: list
41
+
42
+ example
43
+ >>> get_hashtags('my first code #programmer #python #awesome #grepsr')
44
+ ['programmer','python','awesome','grepsr']
45
+ """
46
+
47
+ result = re.findall("(^|[^@\w])#(\w{1,15})", text)
48
+ if len(result) != 0:
49
+ result = [i[1] for i in result]
50
+ return result
51
+
52
+
53
+ def get_links(text):
54
+ """
55
+ It will return website links from the text
56
+
57
+ :param text: string
58
+ :return: list
59
+
60
+ example
61
+ >>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454'
62
+ >>> get_links(message)
63
+ ['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454']
64
+
65
+ """
66
+ result = re.findall(r"(?P<url>https?://[^\s]+)", text)
67
+ return result
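A quick usage sketch of the helpers above on a made-up tweet (the sample string is illustrative only):

from utils import find_emails, get_mentions, get_hashtags, get_links

sample = 'Thanks @grepsr for the #python #scraping tips! Details: https://example.com or mail contact@example.com'
print(get_mentions(sample))   # ['grepsr']
print(get_hashtags(sample))   # ['python', 'scraping']
print(get_links(sample))      # ['https://example.com']
print(find_emails(sample))    # ['contact@example.com']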
README.md CHANGED
@@ -1,12 +1,6 @@
1
  ---
2
- title: Twitter Sentiment
3
- emoji: 📉
4
- colorFrom: pink
5
- colorTo: red
6
  sdk: gradio
7
  sdk_version: 4.31.1
8
- app_file: app.py
9
- pinned: false
10
  ---
11
-
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: twitter_sentiment
3
+ app_file: test.py
 
 
4
  sdk: gradio
5
  sdk_version: 4.31.1
 
 
6
  ---
 
 
__pycache__/test.cpython-39.pyc ADDED
Binary file (2.57 kB). View file
 
__pycache__/twitter_crawl.cpython-310.pyc ADDED
Binary file (1.12 kB). View file
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
1
+ streamlit
2
+ transformers==4.40.2
3
+ tensorflow==2.16.1
4
+ tweetnlp
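requirements.txt pins transformers 4.40.2 for the Space's sentiment step. A hedged sketch of what a minimal pass over scraped tweet text could look like with the default transformers sentiment pipeline (the actual model choice and app wiring live in test.py, which is not part of this diff):

from transformers import pipeline

# Default sentiment-analysis pipeline; the Space's real model choice may differ.
sentiment = pipeline('sentiment-analysis')
tweets = ['Loving the new #danaher updates!', 'Support was painfully slow today.']
for tweet, pred in zip(tweets, sentiment(tweets)):
    print(tweet, '->', pred['label'], round(pred['score'], 3))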
scrapper.ipynb ADDED
@@ -0,0 +1,168 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import re\n",
10
+ "import csv\n",
11
+ "from getpass import getpass\n",
12
+ "from time import sleep\n",
13
+ "from selenium.webdriver.common.keys import Keys\n",
14
+ "from selenium.common.exceptions import NoSuchElementException\n",
15
+ "from msedge.selenium_tools import Edge, EdgeOptions "
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": null,
21
+ "metadata": {},
22
+ "outputs": [],
23
+ "source": [
24
+ "def get_tweet_data(card):\n",
25
+ " \"\"\"Extract data from tweet card\"\"\"\n",
26
+ " username = card.find_element_by_xpath('.//span').text\n",
27
+ " try:\n",
28
+ " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n",
29
+ " except NoSuchElementException:\n",
30
+ " return\n",
31
+ " \n",
32
+ " try:\n",
33
+ " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n",
34
+ " except NoSuchElementException:\n",
35
+ " return\n",
36
+ " \n",
37
+ " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n",
38
+ " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n",
39
+ " text = comment + responding\n",
40
+ " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n",
41
+ " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n",
42
+ " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n",
43
+ "\n",
44
+ " \n",
45
+ " tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n",
46
+ " return tweet "
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": null,
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "search_term = input('search term: ')\n",
56
+ "\n",
57
+ "# create instance of web driver\n",
58
+ "options = EdgeOptions()\n",
59
+ "options.use_chromium = True\n",
60
+ "driver = Edge(options=options)\n",
61
+ "\n",
62
+ "# navigate to login screen\n",
63
+ "driver.get('https://twitter.com/search')\n",
64
+ "driver.maximize_window()\n",
65
+ "sleep(5)\n",
66
+ "\n",
67
+ "# find search input and search for term\n",
68
+ "search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n",
69
+ "search_input.send_keys(search_term)\n",
70
+ "search_input.send_keys(Keys.RETURN)\n",
71
+ "sleep(1)\n",
72
+ "\n",
73
+ "# navigate to historical 'latest' tab\n",
74
+ "driver.find_element_by_link_text('Latest').click()"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": null,
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# get all tweets on the page\n",
84
+ "data = []\n",
85
+ "tweet_ids = set()\n",
86
+ "last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
87
+ "scrolling = True\n",
88
+ "\n",
89
+ "while scrolling:\n",
90
+ " page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n",
91
+ " for card in page_cards[-15:]:\n",
92
+ " tweet = get_tweet_data(card)\n",
93
+ " if tweet:\n",
94
+ " tweet_id = ''.join(tweet)\n",
95
+ " if tweet_id not in tweet_ids:\n",
96
+ " tweet_ids.add(tweet_id)\n",
97
+ " data.append(tweet)\n",
98
+ " \n",
99
+ " scroll_attempt = 0\n",
100
+ " while True:\n",
101
+ " # check scroll position\n",
102
+ " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n",
103
+ " sleep(2)\n",
104
+ " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n",
105
+ " if last_position == curr_position:\n",
106
+ " scroll_attempt += 1\n",
107
+ " \n",
108
+ " # end of scroll region\n",
109
+ " if scroll_attempt >= 3:\n",
110
+ " scrolling = False\n",
111
+ " break\n",
112
+ " else:\n",
113
+ " sleep(2) # attempt another scroll\n",
114
+ " else:\n",
115
+ " last_position = curr_position\n",
116
+ " break\n",
117
+ "\n",
118
+ "# close the web driver\n",
119
+ "driver.close()"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "markdown",
124
+ "metadata": {},
125
+ "source": [
126
+ "### Saving the tweet data"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": null,
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": [
135
+ "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n",
136
+ " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Likes', 'Retweets']\n",
137
+ " writer = csv.writer(f)\n",
138
+ " writer.writerow(header)\n",
139
+ " writer.writerows(data)"
140
+ ]
141
+ }
142
+ ],
143
+ "metadata": {
144
+ "interpreter": {
145
+ "hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138"
146
+ },
147
+ "kernelspec": {
148
+ "display_name": "Python 3.7.0 64-bit",
149
+ "language": "python",
150
+ "name": "python3"
151
+ },
152
+ "language_info": {
153
+ "codemirror_mode": {
154
+ "name": "ipython",
155
+ "version": 3
156
+ },
157
+ "file_extension": ".py",
158
+ "mimetype": "text/x-python",
159
+ "name": "python",
160
+ "nbconvert_exporter": "python",
161
+ "pygments_lexer": "ipython3",
162
+ "version": "3.7.0"
163
+ },
164
+ "orig_nbformat": 4
165
+ },
166
+ "nbformat": 4,
167
+ "nbformat_minor": 2
168
+ }
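Once the notebook has written turkcell_tweets.csv, a downstream step could reload it for analysis; a hedged sketch with pandas (pandas is already a dependency of hashtag3.py):

import pandas as pd

df = pd.read_csv('turkcell_tweets.csv')
df['Timestamp'] = pd.to_datetime(df['Timestamp'])   # the <time> element's datetime attribute
print(df[['Handle', 'Timestamp', 'Text']].head())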