Spaces:
Build error
Upload folder using huggingface_hub
This view is limited to 50 files because it contains too many changes. See raw diff.
- .gitattributes +1 -0
- Hands-on-WebScraping-master.zip +3 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore +241 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE +21 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md +2 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md +118 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py +0 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py +14 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py +103 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py +11 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py +90 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py +4 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py +112 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv +2 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt +2 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv +12 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg +11 -0
- Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py +67 -0
- Hands-on-WebScraping/.gitignore +241 -0
- Hands-on-WebScraping/LICENSE +21 -0
- Hands-on-WebScraping/README.md +2 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md +118 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py +14 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py +103 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py +11 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py +90 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py +4 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py +118 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py +121 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py +142 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv +0 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv +1 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt +2 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv +12 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg +11 -0
- Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py +67 -0
- README.md +2 -8
- __pycache__/test.cpython-39.pyc +0 -0
- __pycache__/twitter_crawl.cpython-310.pyc +0 -0
- requirements.txt +4 -0
- scrapper.ipynb +168 -0
.gitattributes
CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe filter=lfs diff=lfs merge=lfs -text
Hands-on-WebScraping-master.zip
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8634d80c9a416f3346c02ce5b6f25f96fda953cc24212a45f461a510d145f04c
size 15838
Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore
ADDED
@@ -0,0 +1,241 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
pip-wheel-metadata/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
.python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# PEP 582; used by e.g. github.com/David-OConnor/pyflow
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/




# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# TypeScript v1 declaration files
typings/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Microbundle cache
.rpt2_cache/
.rts2_cache_cjs/
.rts2_cache_es/
.rts2_cache_umd/

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variables file
.env
.env.test

# parcel-bundler cache (https://parceljs.org/)
.cache

# Next.js build output
.next

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test
Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2020 Amit Upreti

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md
ADDED
@@ -0,0 +1,2 @@
# Hands-on-WebScraping (NO LONGER MAINTAINED)
This repo is part of a blog series on several web scraping projects, where we explore scraping techniques to crawl data from simple websites up to websites using advanced protection.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md
ADDED
@@ -0,0 +1,118 @@
#### Deprecated. No longer maintained.

# Twitter Hashtag crawler
> A fast and unofficial Twitter crawler to collect tweets using hashtag search.

> Notice: The crawler is meant to be used for collecting data purely for academic and research purposes only. I am not responsible for any legal issue that might arise from any unintended use of this crawler.

[![Python 3](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/)
[![twitter crawler](https://img.shields.io/badge/twittercrawler-1.0-green)](https://github.com/amitupreti/Hands-on-WebScraping/tree/master/project1_twitter_hashtag_crawler)

This crawler is written in Python using Scrapy. The logic is straightforward: we send GET requests to the mobile version of Twitter (mobile.twitter.com) to collect the list of tweets, and GET requests to the web version to parse each tweet's details.
![](header.png)

## Installation

OS X & Linux:

1. Download the project

```sh
git clone https://github.com/amitupreti/Hands-on-WebScraping
cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
```
2. Install the dependencies

```sh
pip install -r requirements.txt --user
```

3. Verify the crawler spider exists

```sh
scrapy list
```
If you see `twittercrawler` then you are all set.


Windows:
1. Install [python3](https://www.python.org/downloads/) if you haven't already
2. Download the project: https://github.com/amitupreti/Hands-on-WebScraping/archive/master.zip
3. Extract the project
4. Open cmd and navigate inside the project directory
```sh
cd Hands-on-WebScraping/project1_twitter_hashtag_crawler
```
5. Follow steps 2 and 3 from the Mac/Linux installation


## Usage example

1. Put the hashtags in a CSV file, separated by newlines. For example, `myhashtags.csv` is included as a sample.

![Hashtags file](https://i.paste.pics/225079df0d3dc27d66430b1553b2ac39.png)

2. Run the crawler with your hashtag file and the desired [output format](https://docs.scrapy.org/en/latest/topics/feed-exports.html) (JSON, JSON Lines, CSV, XML).

* For CSV
```sh
scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv
```

* For JSON
```sh
scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json
```
![sample images](https://i.paste.pics/4a5826a6a090522e5326bb11838258df.png)
![sample images](https://i.paste.pics/68a64bab743150e00af4cd9eea9af8dc.png)


### Speeding up the crawls
If the crawler feels a little slow, find the hashtag.py file in the project and edit the custom settings.
```py
custom_settings = {
    'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
    'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 1, 'LOG_LEVEL': 'INFO'}
```
> Here CONCURRENT_REQUESTS is the number of URLs that will be processed in parallel and DOWNLOAD_DELAY is the wait between requests. So increase CONCURRENT_REQUESTS and decrease DOWNLOAD_DELAY (the minimum value for the download delay is 0).


## Data Columns
* username
* full_name
* twitter_url
* tweet_text
* tweet_time
* number_of_likes
* no_of_retweets
* no_of_replies
* mentions
* no_of_mentions
* hashtags
* no_of_hashtags
* call_to_action
* image_url

## Release History

* 1.0.0
    * First release: crawl by hashtags

## Meta

Amit Upreti – [@amitupreti](https://www.linkedin.com/in/amitupreti/)

Distributed under the MIT license. See ``LICENSE`` for more information.


## Contributing

1. Fork it (<https://github.com/amitupreti/Hands-on-WebScraping/fork>)
2. Create your feature branch (`git checkout -b feature/fooBar`)
3. Commit your changes (`git commit -am 'Add some fooBar'`)
4. Push to the branch (`git push origin feature/fooBar`)
5. Create a new Pull Request
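The Data Columns list above describes the fields in the exported feed. As a quick post-crawl check, the output can be inspected with the standard library alone; this is a minimal sketch that assumes a crawl has already produced `mydata.csv` in the project directory.

```py
import csv

# Print a few of the documented columns from the crawl output.
# Assumes: scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv
with open('mydata.csv', newline='', encoding='utf-8') as f:
    for row in csv.DictReader(f):
        print(row['username'], row['tweet_time'], row['no_of_retweets'])
```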
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py
ADDED
File without changes
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py
ADDED
@@ -0,0 +1,14 @@
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class TwitterhashtagcrawlerItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
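items.py is left as the stub generated by `scrapy startproject`; the spider yields plain dicts instead of typed items. If typed items were wanted, the stub could be filled in roughly like this (a sketch only, reusing a few of the column names from the Readme; these fields are not defined anywhere in the repository):

```py
import scrapy


class TweetItem(scrapy.Item):
    # Hypothetical typed item mirroring part of the exported columns.
    username = scrapy.Field()
    tweet_text = scrapy.Field()
    tweet_time = scrapy.Field()
    number_of_likes = scrapy.Field()
```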
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py
ADDED
@@ -0,0 +1,103 @@
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TwitterhashtagcrawlerSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn’t have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class TwitterhashtagcrawlerDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either;
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
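Both middleware classes above are the unmodified `scrapy startproject` stubs, and nothing in the project enables them. If they were ever needed, activating them would only take the usual settings entries, mirroring the commented-out blocks in settings.py (a sketch, not something the project actually does):

```py
# In TwitterHashTagCrawler/settings.py
SPIDER_MIDDLEWARES = {
    'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543,
}
DOWNLOADER_MIDDLEWARES = {
    'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543,
}
```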
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py
ADDED
@@ -0,0 +1,11 @@
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TwitterhashtagcrawlerPipeline(object):
    def process_item(self, item, spider):
        return item
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py
ADDED
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for TwitterHashTagCrawler project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'TwitterHashTagCrawler'

SPIDER_MODULES = ['TwitterHashTagCrawler.spiders']
NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
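Apart from ROBOTSTXT_OBEY, everything here is left commented out; the spider overrides what it needs through its own `custom_settings`. When driving the crawl from a script rather than the `scrapy` command, the same settings can be loaded programmatically; a minimal sketch, assuming it is run from the project directory (where scrapy.cfg lives):

```py
from scrapy.utils.project import get_project_settings

# Reads settings.py via the [settings] entry in scrapy.cfg.
settings = get_project_settings()
print(settings.get('BOT_NAME'))            # 'TwitterHashTagCrawler'
print(settings.getbool('ROBOTSTXT_OBEY'))  # True
```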
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py
ADDED
@@ -0,0 +1,4 @@
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py
ADDED
@@ -0,0 +1,112 @@
# -*- coding: utf-8 -*-
import scrapy
import re
from dateutil import parser
import sys
from scrapy.crawler import CrawlerProcess
from utils import get_links, get_hashtags, get_mentions
import logging


class HashtagSpider(scrapy.Spider):
    name = 'twittercrawler'
    allowed_domains = ["twitter.com"]

    # custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50.
    # Override here
    custom_settings = {
        'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36',
        'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}

    def __init__(self, filename=''):
        if not filename:
            sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv')
        self.filename = filename

    # the crawler will execute the start_requests function first.
    def start_requests(self):
        with open(self.filename, 'r') as f:
            hashtags = f.read().splitlines()
        if len(hashtags) == 0:
            sys.exit('Empty file detected. Please provide hashtags separated by newlines')
        else:
            logging.info(f'{len(hashtags)} hashtags found')
            for hashtag in hashtags:
                if hashtag:
                    search_url = "https://mobile.twitter.com/hashtag/" + hashtag.lower()
                    yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True)

    def find_tweets(self, response):
        tweets = response.xpath('//table[@class="tweet "]/@href').getall()
        logging.info(f'{len(tweets)} tweets found')
        for tweet_id in tweets:
            tweet_id = re.findall(r"\d+", tweet_id)[-1]
            tweet_url = 'https://twitter.com/anyuser/status/' + \
                        str(tweet_id)
            yield scrapy.Request(tweet_url, callback=self.parse_tweet)

        # finding and visiting next page
        next_page = response.xpath(
            '//*[@class="w-button-more"]/a/@href').get(default='')
        logging.info('Next page found:')
        if next_page != '':
            next_page = 'https://mobile.twitter.com' + next_page
            yield scrapy.Request(next_page, callback=self.find_tweets)

    def parse_tweet(self, response):
        logging.info('Processing --> ' + response.url)
        username = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
            default='')
        full_name = response.xpath(
            '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
            default='')

        try:
            tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
        except:
            tweet_text = ' '.join(response.xpath(
                '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
        image_list = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
        date_time = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
            default='')

        date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
        retweets = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
            default='')

        likes = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
            default='')
        replies = response.xpath(
            '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
            default='')

        mentions = get_mentions(tweet_text)
        hashtags = get_hashtags(tweet_text)
        cta = get_links(tweet_text)

        result = {
            'username': username.lower(),
            'full_name': full_name,
            'twitter_url': response.url,
            'tweet_text': tweet_text,
            'tweet_time': str(date_time),
            'number_of_likes': str(likes),
            'no_of_retweets': str(retweets),
            'no_of_replies': str(replies),
            'mentions': ' | '.join(mentions),
            'no_of_mentions': str(len(mentions)),
            'hashtags': ' | '.join(hashtags),
            'no_of_hashtags': str(len(hashtags)),
            'call_to_action': ' | '.join(cta),
            'image_url': ' | '.join(image_list),
        }
        yield result
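The spider imports `CrawlerProcess` but is only ever started through `scrapy crawl twittercrawler`. If a programmatic entry point were wanted, it could look roughly like the sketch below; this is not part of the repository, and the `FEEDS` setting assumes a reasonably recent Scrapy release (it replaces the older `-o`-style feed settings).

```py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from TwitterHashTagCrawler.spiders.hashtag import HashtagSpider

# Roughly equivalent to:
#   scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json
settings = get_project_settings()
settings.set('FEEDS', {'mydata.json': {'format': 'json'}})

process = CrawlerProcess(settings)
process.crawl(HashtagSpider, filename='myhashtags.csv')
process.start()  # blocks until the crawl finishes
```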
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv
ADDED
@@ -0,0 +1,2 @@
cats
dogs
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt
ADDED
@@ -0,0 +1,2 @@
scrapy
dateutil
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv
ADDED
@@ -0,0 +1,12 @@
username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url
cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou.
२०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg
,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg
kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg
mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg
prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination

We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg
kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today.

Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,,
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg
ADDED
@@ -0,0 +1,11 @@
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = TwitterHashTagCrawler.settings

[deploy]
#url = http://localhost:6800/
project = TwitterHashTagCrawler
Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py
ADDED
@@ -0,0 +1,67 @@
import re


def find_emails(text):
    """
    It will parse the given string and return a list of emails if found

    Example:
    >>> find_emails('hello\n find me here\nemail@gmail.com')
    ['email@gmail.com']

    :param text: string
    :return: list
    """
    return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text)


def get_mentions(text):
    """
    It will return mentions from the text i.e @someone

    :param text: string
    :return: list

    example
    >>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. BTW say hi to @heroine for me')
    ['hero','hero2','heroine']
    """
    result = re.findall(r"(^|[^@\w])@(\w{1,15})", text)
    if len(result) != 0:
        result = [i[1] for i in result]
    return result


def get_hashtags(text):
    """
    It will return hashtags from the text i.e #something

    :param text: string
    :return: list

    example
    >>> get_hashtags('my first code #programmer #python #awesome #grepsr')
    ['programmer','python','awesome','grepsr']
    """

    result = re.findall(r"(^|[^@\w])#(\w{1,15})", text)
    if len(result) != 0:
        result = [i[1] for i in result]
    return result


def get_links(text):
    """
    It will return website links from the text

    :param text: string
    :return: list

    example
    >>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454'
    >>> get_links(message)
    ['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454']

    """
    result = re.findall(r"(?P<url>https?://[^\s]+)", text)
    return result
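The helpers above are plain regex utilities with no Scrapy dependency, so they are easy to exercise on their own. A short usage sketch, run from the project directory so that `utils.py` is importable:

```py
from utils import get_hashtags, get_links, get_mentions

text = 'Visiting #Nepal with @amitupreti - details at https://example.com/trip'
print(get_hashtags(text))  # ['Nepal']
print(get_mentions(text))  # ['amitupreti']
print(get_links(text))     # ['https://example.com/trip']
```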
Hands-on-WebScraping/.gitignore
ADDED
@@ -0,0 +1,241 @@
(Identical in content to Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore above.)
Hands-on-WebScraping/LICENSE
ADDED
@@ -0,0 +1,21 @@
(Identical in content to Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE above.)
Hands-on-WebScraping/README.md
ADDED
@@ -0,0 +1,2 @@
(Identical in content to Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md above.)
Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md
ADDED
@@ -0,0 +1,118 @@
(Identical in content to Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md above.)
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py
ADDED
File without changes
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (234 Bytes).
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc
ADDED
Binary file (370 Bytes).
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py
ADDED
@@ -0,0 +1,14 @@
(Identical in content to Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py above.)
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
# Define here the models for your spider middleware
|
4 |
+
#
|
5 |
+
# See documentation in:
|
6 |
+
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
7 |
+
|
8 |
+
from scrapy import signals
|
9 |
+
|
10 |
+
|
11 |
+
class TwitterhashtagcrawlerSpiderMiddleware(object):
|
12 |
+
# Not all methods need to be defined. If a method is not defined,
|
13 |
+
# scrapy acts as if the spider middleware does not modify the
|
14 |
+
# passed objects.
|
15 |
+
|
16 |
+
@classmethod
|
17 |
+
def from_crawler(cls, crawler):
|
18 |
+
# This method is used by Scrapy to create your spiders.
|
19 |
+
s = cls()
|
20 |
+
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
21 |
+
return s
|
22 |
+
|
23 |
+
def process_spider_input(self, response, spider):
|
24 |
+
# Called for each response that goes through the spider
|
25 |
+
# middleware and into the spider.
|
26 |
+
|
27 |
+
# Should return None or raise an exception.
|
28 |
+
return None
|
29 |
+
|
30 |
+
def process_spider_output(self, response, result, spider):
|
31 |
+
# Called with the results returned from the Spider, after
|
32 |
+
# it has processed the response.
|
33 |
+
|
34 |
+
# Must return an iterable of Request, dict or Item objects.
|
35 |
+
for i in result:
|
36 |
+
yield i
|
37 |
+
|
38 |
+
def process_spider_exception(self, response, exception, spider):
|
39 |
+
# Called when a spider or process_spider_input() method
|
40 |
+
# (from other spider middleware) raises an exception.
|
41 |
+
|
42 |
+
# Should return either None or an iterable of Response, dict
|
43 |
+
# or Item objects.
|
44 |
+
pass
|
45 |
+
|
46 |
+
def process_start_requests(self, start_requests, spider):
|
47 |
+
# Called with the start requests of the spider, and works
|
48 |
+
# similarly to the process_spider_output() method, except
|
49 |
+
# that it doesn’t have a response associated.
|
50 |
+
|
51 |
+
# Must return only requests (not items).
|
52 |
+
for r in start_requests:
|
53 |
+
yield r
|
54 |
+
|
55 |
+
def spider_opened(self, spider):
|
56 |
+
spider.logger.info('Spider opened: %s' % spider.name)
|
57 |
+
|
58 |
+
|
59 |
+
class TwitterhashtagcrawlerDownloaderMiddleware(object):
|
60 |
+
# Not all methods need to be defined. If a method is not defined,
|
61 |
+
# scrapy acts as if the downloader middleware does not modify the
|
62 |
+
# passed objects.
|
63 |
+
|
64 |
+
@classmethod
|
65 |
+
def from_crawler(cls, crawler):
|
66 |
+
# This method is used by Scrapy to create your spiders.
|
67 |
+
s = cls()
|
68 |
+
crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
|
69 |
+
return s
|
70 |
+
|
71 |
+
def process_request(self, request, spider):
|
72 |
+
# Called for each request that goes through the downloader
|
73 |
+
# middleware.
|
74 |
+
|
75 |
+
# Must either:
|
76 |
+
# - return None: continue processing this request
|
77 |
+
# - or return a Response object
|
78 |
+
# - or return a Request object
|
79 |
+
# - or raise IgnoreRequest: process_exception() methods of
|
80 |
+
# installed downloader middleware will be called
|
81 |
+
return None
|
82 |
+
|
83 |
+
def process_response(self, request, response, spider):
|
84 |
+
# Called with the response returned from the downloader.
|
85 |
+
|
86 |
+
# Must either;
|
87 |
+
# - return a Response object
|
88 |
+
# - return a Request object
|
89 |
+
# - or raise IgnoreRequest
|
90 |
+
return response
|
91 |
+
|
92 |
+
def process_exception(self, request, exception, spider):
|
93 |
+
# Called when a download handler or a process_request()
|
94 |
+
# (from other downloader middleware) raises an exception.
|
95 |
+
|
96 |
+
# Must either:
|
97 |
+
# - return None: continue processing this exception
|
98 |
+
# - return a Response object: stops process_exception() chain
|
99 |
+
# - return a Request object: stops process_exception() chain
|
100 |
+
pass
|
101 |
+
|
102 |
+
def spider_opened(self, spider):
|
103 |
+
spider.logger.info('Spider opened: %s' % spider.name)
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
# Define your item pipelines here
|
4 |
+
#
|
5 |
+
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
6 |
+
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
7 |
+
|
8 |
+
|
9 |
+
class TwitterhashtagcrawlerPipeline(object):
|
10 |
+
def process_item(self, item, spider):
|
11 |
+
return item
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py
ADDED
@@ -0,0 +1,90 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
# Scrapy settings for TwitterHashTagCrawler project
|
4 |
+
#
|
5 |
+
# For simplicity, this file contains only settings considered important or
|
6 |
+
# commonly used. You can find more settings consulting the documentation:
|
7 |
+
#
|
8 |
+
# https://doc.scrapy.org/en/latest/topics/settings.html
|
9 |
+
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
10 |
+
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
11 |
+
|
12 |
+
BOT_NAME = 'TwitterHashTagCrawler'
|
13 |
+
|
14 |
+
SPIDER_MODULES = ['TwitterHashTagCrawler.spiders']
|
15 |
+
NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders'
|
16 |
+
|
17 |
+
|
18 |
+
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
19 |
+
#USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)'
|
20 |
+
|
21 |
+
# Obey robots.txt rules
|
22 |
+
ROBOTSTXT_OBEY = False
|
23 |
+
|
24 |
+
# Configure maximum concurrent requests performed by Scrapy (default: 16)
|
25 |
+
#CONCURRENT_REQUESTS = 32
|
26 |
+
|
27 |
+
# Configure a delay for requests for the same website (default: 0)
|
28 |
+
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
|
29 |
+
# See also autothrottle settings and docs
|
30 |
+
#DOWNLOAD_DELAY = 3
|
31 |
+
# The download delay setting will honor only one of:
|
32 |
+
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
|
33 |
+
#CONCURRENT_REQUESTS_PER_IP = 16
|
34 |
+
|
35 |
+
# Disable cookies (enabled by default)
|
36 |
+
#COOKIES_ENABLED = False
|
37 |
+
|
38 |
+
# Disable Telnet Console (enabled by default)
|
39 |
+
#TELNETCONSOLE_ENABLED = False
|
40 |
+
|
41 |
+
# Override the default request headers:
|
42 |
+
#DEFAULT_REQUEST_HEADERS = {
|
43 |
+
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
|
44 |
+
# 'Accept-Language': 'en',
|
45 |
+
#}
|
46 |
+
|
47 |
+
# Enable or disable spider middlewares
|
48 |
+
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
|
49 |
+
#SPIDER_MIDDLEWARES = {
|
50 |
+
# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543,
|
51 |
+
#}
|
52 |
+
|
53 |
+
# Enable or disable downloader middlewares
|
54 |
+
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
|
55 |
+
#DOWNLOADER_MIDDLEWARES = {
|
56 |
+
# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543,
|
57 |
+
#}
|
58 |
+
|
59 |
+
# Enable or disable extensions
|
60 |
+
# See https://doc.scrapy.org/en/latest/topics/extensions.html
|
61 |
+
#EXTENSIONS = {
|
62 |
+
# 'scrapy.extensions.telnet.TelnetConsole': None,
|
63 |
+
#}
|
64 |
+
|
65 |
+
# Configure item pipelines
|
66 |
+
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
|
67 |
+
#ITEM_PIPELINES = {
|
68 |
+
# 'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300,
|
69 |
+
#}
|
70 |
+
|
71 |
+
# Enable and configure the AutoThrottle extension (disabled by default)
|
72 |
+
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
|
73 |
+
#AUTOTHROTTLE_ENABLED = True
|
74 |
+
# The initial download delay
|
75 |
+
#AUTOTHROTTLE_START_DELAY = 5
|
76 |
+
# The maximum download delay to be set in case of high latencies
|
77 |
+
#AUTOTHROTTLE_MAX_DELAY = 60
|
78 |
+
# The average number of requests Scrapy should be sending in parallel to
|
79 |
+
# each remote server
|
80 |
+
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
|
81 |
+
# Enable showing throttling stats for every response received:
|
82 |
+
#AUTOTHROTTLE_DEBUG = False
|
83 |
+
|
84 |
+
# Enable and configure HTTP caching (disabled by default)
|
85 |
+
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
86 |
+
#HTTPCACHE_ENABLED = True
|
87 |
+
#HTTPCACHE_EXPIRATION_SECS = 0
|
88 |
+
#HTTPCACHE_DIR = 'httpcache'
|
89 |
+
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
90 |
+
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# This package will contain the spiders of your Scrapy project
|
2 |
+
#
|
3 |
+
# Please refer to the documentation for information on how to create and manage
|
4 |
+
# your spiders.
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (242 Bytes). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc
ADDED
Binary file (4.72 kB). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc
ADDED
Binary file (4.8 kB). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc
ADDED
Binary file (2.65 kB). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py
ADDED
@@ -0,0 +1,118 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import scrapy
|
3 |
+
import ipdb
|
4 |
+
import re
|
5 |
+
from dateutil import parser
|
6 |
+
import sys
|
7 |
+
from scrapy.crawler import CrawlerProcess
|
8 |
+
from utils import get_links, get_hashtags, get_mentions
|
9 |
+
import logging
|
10 |
+
|
11 |
+
class HashtagSpider(scrapy.Spider):
|
12 |
+
name = 'hashtag'
|
13 |
+
allowed_domains = ["twitter.com"]
|
14 |
+
|
15 |
+
# custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50 .
|
16 |
+
# Override here
|
17 |
+
custom_settings = {
|
18 |
+
'USER_AGENT': 'my-cool-project (http://example.com)',
|
19 |
+
'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}
|
20 |
+
|
21 |
+
def __init__(self, filename=''):
|
22 |
+
if not filename:
|
23 |
+
sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv')
|
24 |
+
self.filename = filename
|
25 |
+
|
26 |
+
# the crawler will execute start_requests function at first.
|
27 |
+
def start_requests(self):
|
28 |
+
#with open(self.filename, 'r') as f:
|
29 |
+
#hashtags = ['danaher']
|
30 |
+
hashtags= ['danaher']
|
31 |
+
if len(hashtags) == 0:
|
32 |
+
sys.exit('Emplty File detected.Please provide hashtags separated by newlines')
|
33 |
+
else:
|
34 |
+
logging.info(f'{len(hashtags)} hashtags found')
|
35 |
+
print('hashtag is..',hashtags)
|
36 |
+
for hashtag in hashtags:
|
37 |
+
if hashtag:
|
38 |
+
search_url = "https://twitter.com/hashtag/" + hashtag.lower()
|
39 |
+
print('search_url is...', search_url)
|
40 |
+
|
41 |
+
yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True)
|
42 |
+
|
43 |
+
def find_tweets(self, response):
|
44 |
+
print("I am in find_tweets")
|
45 |
+
tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
|
46 |
+
print("Tweets is...", tweets)
|
47 |
+
print(f'{len(tweets)} tweets found')
|
48 |
+
for tweet in tweets:
|
49 |
+
# tweet_id = re.findall("\d+", tweet_id)[-1]
|
50 |
+
# tweet_url = 'https://twitter.com/anyuser/status/' + \
|
51 |
+
# str(tweet_id)
|
52 |
+
print(tweet)
|
53 |
+
yield scrapy.Request(tweet_url, callback=self.parse_tweet)
|
54 |
+
|
55 |
+
# finding and visiting next page
|
56 |
+
next_page = response.xpath(
|
57 |
+
'//*[@class="w-button-more"]/a/@href').get(default='')
|
58 |
+
logging.info('Next page found:')
|
59 |
+
if next_page != '':
|
60 |
+
next_page = 'https://mobile.twitter.com' + next_page
|
61 |
+
yield scrapy.Request(next_page, callback=self.find_tweets)
|
62 |
+
|
63 |
+
def parse_tweet(self, response):
|
64 |
+
logging.info('Processing --> ' + response.url)
|
65 |
+
username = response.xpath(
|
66 |
+
'//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
|
67 |
+
default='')
|
68 |
+
full_name = response.xpath(
|
69 |
+
'//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
|
70 |
+
default='')
|
71 |
+
|
72 |
+
try:
|
73 |
+
tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
|
74 |
+
|
75 |
+
except:
|
76 |
+
tweet_text = ' '.join(response.xpath(
|
77 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
|
78 |
+
image_list = response.xpath(
|
79 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
|
80 |
+
date_time = response.xpath(
|
81 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
|
82 |
+
default='')
|
83 |
+
|
84 |
+
date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
|
85 |
+
retweets = response.xpath(
|
86 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
|
87 |
+
default='')
|
88 |
+
|
89 |
+
likes = response.xpath(
|
90 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
|
91 |
+
default='')
|
92 |
+
replies = response.xpath(
|
93 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
|
94 |
+
default='')
|
95 |
+
|
96 |
+
mentions = get_mentions(tweet_text)
|
97 |
+
hashtags = get_hashtags(tweet_text)
|
98 |
+
cta = get_links(tweet_text)
|
99 |
+
|
100 |
+
result = {
|
101 |
+
'username': username.lower(),
|
102 |
+
'full_name': full_name,
|
103 |
+
'twitter_url': response.url,
|
104 |
+
'tweet_text': tweet_text,
|
105 |
+
'tweet_time': str(date_time),
|
106 |
+
'number_of_likes': str(likes),
|
107 |
+
'no_of_retweets': str(retweets),
|
108 |
+
'no_of_replies': str(replies),
|
109 |
+
'mentions': ' | '.join(mentions),
|
110 |
+
'no_of_mentions': str(len(mentions)),
|
111 |
+
'hashtags': ' | '.join(hashtags),
|
112 |
+
'no_of_hashtags': str(len(hashtags)),
|
113 |
+
'call_to_action': ' | '.join(cta),
|
114 |
+
'image_url': ' | '.join(image_list),
|
115 |
+
|
116 |
+
}
|
117 |
+
yield result
|
118 |
+
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py
ADDED
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
import scrapy
|
3 |
+
import ipdb
|
4 |
+
import re
|
5 |
+
from dateutil import parser
|
6 |
+
import sys
|
7 |
+
from scrapy.crawler import CrawlerProcess
|
8 |
+
from utils import get_links, get_hashtags, get_mentions
|
9 |
+
from scrapy.http.request import Request
|
10 |
+
import logging
|
11 |
+
|
12 |
+
class HashtagSpider(scrapy.Spider):
|
13 |
+
name = 'hashtag2'
|
14 |
+
allowed_domains = ["twitter.com"]
|
15 |
+
|
16 |
+
# custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50 .
|
17 |
+
# Override here
|
18 |
+
custom_settings = {
|
19 |
+
'USER_AGENT': 'my-cool-project (http://example.com)',
|
20 |
+
'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'}
|
21 |
+
|
22 |
+
def __init__(self, filename=''):
|
23 |
+
if not filename:
|
24 |
+
sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv')
|
25 |
+
self.filename = filename
|
26 |
+
|
27 |
+
# the crawler will execute start_requests function at first.
|
28 |
+
def start_requests(self):
|
29 |
+
#with open(self.filename, 'r') as f:
|
30 |
+
#hashtags = ['danaher']
|
31 |
+
hashtags= ['danaher']
|
32 |
+
if len(hashtags) == 0:
|
33 |
+
sys.exit('Emplty File detected.Please provide hashtags separated by newlines')
|
34 |
+
else:
|
35 |
+
logging.info(f'{len(hashtags)} hashtags found')
|
36 |
+
print('hashtag is..',hashtags)
|
37 |
+
for hashtag in hashtags:
|
38 |
+
if hashtag:
|
39 |
+
search_url = "https://twitter.com/hashtag/" + hashtag.lower()
|
40 |
+
print('search_url is...', search_url)
|
41 |
+
try:
|
42 |
+
yield Request(search_url, callback=self.find_tweets, dont_filter=True)
|
43 |
+
except Exception as e:
|
44 |
+
print(e)
|
45 |
+
|
46 |
+
def find_tweets(self, response):
|
47 |
+
print("I am in find_tweets")
|
48 |
+
tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
|
49 |
+
print("Tweets is...", tweets)
|
50 |
+
print(f'{len(tweets)} tweets found')
|
51 |
+
for tweet in tweets:
|
52 |
+
# tweet_id = re.findall("\d+", tweet_id)[-1]
|
53 |
+
# tweet_url = 'https://twitter.com/anyuser/status/' + \
|
54 |
+
# str(tweet_id)
|
55 |
+
print(tweet)
|
56 |
+
#yield scrapy.Request(tweet_url, callback=self.parse_tweet)
|
57 |
+
|
58 |
+
# finding and visiting next page
|
59 |
+
next_page = response.xpath(
|
60 |
+
'//*[@class="w-button-more"]/a/@href').get(default='')
|
61 |
+
logging.info('Next page found:')
|
62 |
+
if next_page != '':
|
63 |
+
next_page = 'https://mobile.twitter.com' + next_page
|
64 |
+
yield scrapy.Request(next_page, callback=self.find_tweets)
|
65 |
+
|
66 |
+
def parse_tweet(self, response):
|
67 |
+
logging.info('Processing --> ' + response.url)
|
68 |
+
username = response.xpath(
|
69 |
+
'//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get(
|
70 |
+
default='')
|
71 |
+
full_name = response.xpath(
|
72 |
+
'//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get(
|
73 |
+
default='')
|
74 |
+
|
75 |
+
try:
|
76 |
+
tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip()
|
77 |
+
|
78 |
+
except:
|
79 |
+
tweet_text = ' '.join(response.xpath(
|
80 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip()
|
81 |
+
image_list = response.xpath(
|
82 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall()
|
83 |
+
date_time = response.xpath(
|
84 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get(
|
85 |
+
default='')
|
86 |
+
|
87 |
+
date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S')
|
88 |
+
retweets = response.xpath(
|
89 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get(
|
90 |
+
default='')
|
91 |
+
|
92 |
+
likes = response.xpath(
|
93 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get(
|
94 |
+
default='')
|
95 |
+
replies = response.xpath(
|
96 |
+
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get(
|
97 |
+
default='')
|
98 |
+
|
99 |
+
mentions = get_mentions(tweet_text)
|
100 |
+
hashtags = get_hashtags(tweet_text)
|
101 |
+
cta = get_links(tweet_text)
|
102 |
+
|
103 |
+
result = {
|
104 |
+
'username': username.lower(),
|
105 |
+
'full_name': full_name,
|
106 |
+
'twitter_url': response.url,
|
107 |
+
'tweet_text': tweet_text,
|
108 |
+
'tweet_time': str(date_time),
|
109 |
+
'number_of_likes': str(likes),
|
110 |
+
'no_of_retweets': str(retweets),
|
111 |
+
'no_of_replies': str(replies),
|
112 |
+
'mentions': ' | '.join(mentions),
|
113 |
+
'no_of_mentions': str(len(mentions)),
|
114 |
+
'hashtags': ' | '.join(hashtags),
|
115 |
+
'no_of_hashtags': str(len(hashtags)),
|
116 |
+
'call_to_action': ' | '.join(cta),
|
117 |
+
'image_url': ' | '.join(image_list),
|
118 |
+
|
119 |
+
}
|
120 |
+
yield result
|
121 |
+
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from operator import concat
|
3 |
+
import scrapy
|
4 |
+
import time
|
5 |
+
import pandas as pd
|
6 |
+
from scrapy.http.request import Request
|
7 |
+
from scrapy import signals
|
8 |
+
#from sentence_transformers import SentenceTransformer, util
|
9 |
+
#import numpy as np
|
10 |
+
#import yake
|
11 |
+
##import nltk.data
|
12 |
+
#import nltk
|
13 |
+
#nltk.download('punkt')
|
14 |
+
#from nltk.tokenize import sent_tokenize
|
15 |
+
#from keybert import KeyBERT
|
16 |
+
#from statistics import mean
|
17 |
+
#from urllib.parse import urlparse
|
18 |
+
#import random
|
19 |
+
|
20 |
+
|
21 |
+
from transformers import AutoTokenizer, AutoModel
|
22 |
+
|
23 |
+
import torch
|
24 |
+
|
25 |
+
from sklearn.metrics.pairwise import cosine_similarity
|
26 |
+
|
27 |
+
|
28 |
+
#text =[]
|
29 |
+
text2 =''
|
30 |
+
res =[]
|
31 |
+
len_res = 0
|
32 |
+
len_res2 = 0
|
33 |
+
|
34 |
+
list1 = []
|
35 |
+
df = pd.DataFrame()
|
36 |
+
df_sim = pd.DataFrame()
|
37 |
+
allowed_domains = []
|
38 |
+
list_start_urls = []
|
39 |
+
list_companies = []
|
40 |
+
index = 0
|
41 |
+
len_df = 0
|
42 |
+
mean_embedding = []
|
43 |
+
list_df_sim = []
|
44 |
+
|
45 |
+
class BioSpider(scrapy.Spider):
|
46 |
+
name = "hashtag3"
|
47 |
+
|
48 |
+
custom_settings = {'CONCURRENT_REQUESTS': '1', 'CONCURRENT_REQUESTS_PER_DOMAIN':'1', 'ROBOTSTXT_OBEY' : False \
|
49 |
+
, "REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", 'USER_AGENT':'my-cool-project (http://example.com)'}
|
50 |
+
|
51 |
+
|
52 |
+
global allowed_domains
|
53 |
+
allowed_domains = ["twitter.com"]
|
54 |
+
global list_start_urls
|
55 |
+
global list_companies
|
56 |
+
global res
|
57 |
+
global index
|
58 |
+
list_start_urls2 = []
|
59 |
+
global len_df
|
60 |
+
global df
|
61 |
+
|
62 |
+
#data=pd.read_excel("C:\\Users\\RSPRASAD\OneDrive - Danaher\\Bec_LS\\2023\\D_and_B_Project\\Segmentation\\Customer focus list 2023 NGW.xlsx", sheet_name='Sheet1')
|
63 |
+
|
64 |
+
#df= data[['Company', 'Website']]
|
65 |
+
#df.drop_duplicates(inplace = True)
|
66 |
+
#df['Content'] = ''
|
67 |
+
|
68 |
+
i = 0
|
69 |
+
len_df = 1
|
70 |
+
# for i in range(0, len(df)):
|
71 |
+
# #df.loc[i, 'company_name']= df.loc[i, 'Company']
|
72 |
+
# #df.loc[i, 'company_website']= df.loc[i, 'Website']
|
73 |
+
# list_start_urls.append(df.loc[i, 'Website'])
|
74 |
+
# list_companies.append(df.loc[i, 'Company'])
|
75 |
+
# domain = urlparse(df.loc[i, 'Website']).netloc
|
76 |
+
# allowed_domains.append(domain)
|
77 |
+
# print(allowed_domains)
|
78 |
+
# upper_len_websites = 5
|
79 |
+
start_index = 0
|
80 |
+
# if(len_df >upper_len_websites):
|
81 |
+
# list_start_urls= list_start_urls[start_index:upper_len_websites]
|
82 |
+
# df = df.iloc[start_index:upper_len_websites,:]
|
83 |
+
|
84 |
+
|
85 |
+
list_start_urls = ['https://twitter.com/hashtag/danaher/']
|
86 |
+
# df = df.iloc[start_index:upper_len_websites,:]
|
87 |
+
# df = df.reset_index()
|
88 |
+
# df = df.drop('index', axis = 1)
|
89 |
+
# len_df = len(df)
|
90 |
+
|
91 |
+
# print("Dataframe for crawling website is ..")
|
92 |
+
# print(df)
|
93 |
+
|
94 |
+
|
95 |
+
|
96 |
+
print(list_start_urls)
|
97 |
+
|
98 |
+
|
99 |
+
@classmethod
|
100 |
+
def from_crawler(cls, crawler, *args, **kwargs):
|
101 |
+
spider = super(BioSpider, cls).from_crawler(crawler, *args, **kwargs)
|
102 |
+
#crawler.signals.connect(spider.spider_opened, signals.spider_opened)
|
103 |
+
crawler.signals.connect(spider.spider_closed, signals.spider_closed)
|
104 |
+
return spider
|
105 |
+
|
106 |
+
def start_requests(self):
|
107 |
+
|
108 |
+
global list_start_urls
|
109 |
+
global index
|
110 |
+
global res
|
111 |
+
|
112 |
+
|
113 |
+
index =0
|
114 |
+
index2 = len(list_start_urls)
|
115 |
+
print(" i am in start_requests")
|
116 |
+
|
117 |
+
|
118 |
+
|
119 |
+
try:
|
120 |
+
|
121 |
+
yield Request(list_start_urls[0].strip(), callback = self.parse)#, meta={'priority': index2})
|
122 |
+
|
123 |
+
except Exception as e:
|
124 |
+
print("There is exception and exception is..",e)
|
125 |
+
|
126 |
+
|
127 |
+
|
128 |
+
|
129 |
+
def parse(self, response):
|
130 |
+
print("I am in parse..")
|
131 |
+
print("I am in find_tweets")
|
132 |
+
tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall()
|
133 |
+
print("Tweets is...", tweets)
|
134 |
+
print(f'{len(tweets)} tweets found')
|
135 |
+
for tweet in tweets:
|
136 |
+
print(tweet)
|
137 |
+
count += 1
|
138 |
+
if (count >5):
|
139 |
+
break
|
140 |
+
|
141 |
+
def spider_closed(self, spider):
|
142 |
+
print("I am in spider closed...")
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc
ADDED
Binary file (2.21 kB). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv
ADDED
File without changes
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv
ADDED
Binary file (9.3 kB). View file
|
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
danaher
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt
ADDED
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
1 |
+
scrapy
|
2 |
+
dateutil
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url
|
2 |
+
cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou.
|
3 |
+
२०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg
|
4 |
+
,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg
|
5 |
+
kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg
|
6 |
+
mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg
|
7 |
+
prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination
|
8 |
+
|
9 |
+
We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg
|
10 |
+
kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today.
|
11 |
+
|
12 |
+
Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,,
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Automatically created by: scrapy startproject
|
2 |
+
#
|
3 |
+
# For more information about the [deploy] section see:
|
4 |
+
# https://scrapyd.readthedocs.io/en/latest/deploy.html
|
5 |
+
|
6 |
+
[settings]
|
7 |
+
default = TwitterHashTagCrawler.settings
|
8 |
+
|
9 |
+
[deploy]
|
10 |
+
#url = http://localhost:6800/
|
11 |
+
project = TwitterHashTagCrawler
|
Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py
ADDED
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
|
4 |
+
def find_emails(text):
|
5 |
+
"""
|
6 |
+
It will parse the given string and return a list of emails if found
|
7 |
+
|
8 |
+
Example:
|
9 |
+
>>find_emails('hello\n find me here\nemail@gmail.com')
|
10 |
+
['email@gmail.com']
|
11 |
+
|
12 |
+
:param text: string
|
13 |
+
:return: list
|
14 |
+
"""
|
15 |
+
return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text)
|
16 |
+
|
17 |
+
|
18 |
+
def get_mentions(text):
|
19 |
+
"""
|
20 |
+
It will return mentions from the text i.e @someone
|
21 |
+
|
22 |
+
:param text: string
|
23 |
+
:return: list
|
24 |
+
|
25 |
+
example
|
26 |
+
>>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. BTW say hi to @heroine for me')
|
27 |
+
['hero','hero2','heroine']
|
28 |
+
"""
|
29 |
+
result = re.findall("(^|[^@\w])@(\w{1,15})", text)
|
30 |
+
if len(result) != 0:
|
31 |
+
result = [i[1] for i in result]
|
32 |
+
return result
|
33 |
+
|
34 |
+
|
35 |
+
def get_hashtags(text):
|
36 |
+
"""
|
37 |
+
It will return hashtags from the text i.e #something
|
38 |
+
|
39 |
+
:param text: string
|
40 |
+
:return: list
|
41 |
+
|
42 |
+
example
|
43 |
+
>>> get_hashtags('my first code #programmer #python #awesome #grepsr')
|
44 |
+
['programmer','python','awesome','grepsr']
|
45 |
+
"""
|
46 |
+
|
47 |
+
result = re.findall("(^|[^@\w])#(\w{1,15})", text)
|
48 |
+
if len(result) != 0:
|
49 |
+
result = [i[1] for i in result]
|
50 |
+
return result
|
51 |
+
|
52 |
+
|
53 |
+
def get_links(text):
|
54 |
+
"""
|
55 |
+
It will return website links from the text
|
56 |
+
|
57 |
+
:param text: string
|
58 |
+
:return: list
|
59 |
+
|
60 |
+
example
|
61 |
+
>>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454'
|
62 |
+
>>> get_links(message)
|
63 |
+
['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454']
|
64 |
+
|
65 |
+
"""
|
66 |
+
result = re.findall(r"(?P<url>https?://[^\s]+)", text)
|
67 |
+
return result
|
README.md
CHANGED
@@ -1,12 +1,6 @@
|
|
1 |
---
|
2 |
-
title:
|
3 |
-
|
4 |
-
colorFrom: pink
|
5 |
-
colorTo: red
|
6 |
sdk: gradio
|
7 |
sdk_version: 4.31.1
|
8 |
-
app_file: app.py
|
9 |
-
pinned: false
|
10 |
---
|
11 |
-
|
12 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
1 |
---
|
2 |
+
title: twitter_sentiment
|
3 |
+
app_file: test.py
|
|
|
|
|
4 |
sdk: gradio
|
5 |
sdk_version: 4.31.1
|
|
|
|
|
6 |
---
|
|
|
|
__pycache__/test.cpython-39.pyc
ADDED
Binary file (2.57 kB). View file
|
|
__pycache__/twitter_crawl.cpython-310.pyc
ADDED
Binary file (1.12 kB). View file
|
|
requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
transformers==4.40.2
|
3 |
+
tensorflow==2.16.1
|
4 |
+
tweetnlp
|
scrapper.ipynb
ADDED
@@ -0,0 +1,168 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": null,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"import re\n",
|
10 |
+
"import csv\n",
|
11 |
+
"from getpass import getpass\n",
|
12 |
+
"from time import sleep\n",
|
13 |
+
"from selenium.webdriver.common.keys import Keys\n",
|
14 |
+
"from selenium.common.exceptions import NoSuchElementException\n",
|
15 |
+
"from msedge.selenium_tools import Edge, EdgeOptions "
|
16 |
+
]
|
17 |
+
},
|
18 |
+
{
|
19 |
+
"cell_type": "code",
|
20 |
+
"execution_count": null,
|
21 |
+
"metadata": {},
|
22 |
+
"outputs": [],
|
23 |
+
"source": [
|
24 |
+
"def get_tweet_data(card):\n",
|
25 |
+
" \"\"\"Extract data from tweet card\"\"\"\n",
|
26 |
+
" username = card.find_element_by_xpath('.//span').text\n",
|
27 |
+
" try:\n",
|
28 |
+
" handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n",
|
29 |
+
" except NoSuchElementException:\n",
|
30 |
+
" return\n",
|
31 |
+
" \n",
|
32 |
+
" try:\n",
|
33 |
+
" postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n",
|
34 |
+
" except NoSuchElementException:\n",
|
35 |
+
" return\n",
|
36 |
+
" \n",
|
37 |
+
" comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n",
|
38 |
+
" responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n",
|
39 |
+
" text = comment + responding\n",
|
40 |
+
" reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n",
|
41 |
+
" retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n",
|
42 |
+
" like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n",
|
43 |
+
"\n",
|
44 |
+
" \n",
|
45 |
+
" tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n",
|
46 |
+
" return tweet "
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": null,
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"search_term = input('search term: ')\n",
|
56 |
+
"\n",
|
57 |
+
"# create instance of web driver\n",
|
58 |
+
"options = EdgeOptions()\n",
|
59 |
+
"options.use_chromium = True\n",
|
60 |
+
"driver = Edge(options=options)\n",
|
61 |
+
"\n",
|
62 |
+
"# navigate to login screen\n",
|
63 |
+
"driver.get('https://twitter.com/search')\n",
|
64 |
+
"driver.maximize_window()\n",
|
65 |
+
"sleep(5)\n",
|
66 |
+
"\n",
|
67 |
+
"# find search input and search for term\n",
|
68 |
+
"search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n",
|
69 |
+
"search_input.send_keys(search_term)\n",
|
70 |
+
"search_input.send_keys(Keys.RETURN)\n",
|
71 |
+
"sleep(1)\n",
|
72 |
+
"\n",
|
73 |
+
"# navigate to historical 'latest' tab\n",
|
74 |
+
"driver.find_element_by_link_text('Latest').click()"
|
75 |
+
]
|
76 |
+
},
|
77 |
+
{
|
78 |
+
"cell_type": "code",
|
79 |
+
"execution_count": null,
|
80 |
+
"metadata": {},
|
81 |
+
"outputs": [],
|
82 |
+
"source": [
|
83 |
+
"# get all tweets on the page\n",
|
84 |
+
"data = []\n",
|
85 |
+
"tweet_ids = set()\n",
|
86 |
+
"last_position = driver.execute_script(\"return window.pageYOffset;\")\n",
|
87 |
+
"scrolling = True\n",
|
88 |
+
"\n",
|
89 |
+
"while scrolling:\n",
|
90 |
+
" page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n",
|
91 |
+
" for card in page_cards[-15:]:\n",
|
92 |
+
" tweet = get_tweet_data(card)\n",
|
93 |
+
" if tweet:\n",
|
94 |
+
" tweet_id = ''.join(tweet)\n",
|
95 |
+
" if tweet_id not in tweet_ids:\n",
|
96 |
+
" tweet_ids.add(tweet_id)\n",
|
97 |
+
" data.append(tweet)\n",
|
98 |
+
" \n",
|
99 |
+
" scroll_attempt = 0\n",
|
100 |
+
" while True:\n",
|
101 |
+
" # check scroll position\n",
|
102 |
+
" driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n",
|
103 |
+
" sleep(2)\n",
|
104 |
+
" curr_position = driver.execute_script(\"return window.pageYOffset;\")\n",
|
105 |
+
" if last_position == curr_position:\n",
|
106 |
+
" scroll_attempt += 1\n",
|
107 |
+
" \n",
|
108 |
+
" # end of scroll region\n",
|
109 |
+
" if scroll_attempt >= 3:\n",
|
110 |
+
" scrolling = False\n",
|
111 |
+
" break\n",
|
112 |
+
" else:\n",
|
113 |
+
" sleep(2) # attempt another scroll\n",
|
114 |
+
" else:\n",
|
115 |
+
" last_position = curr_position\n",
|
116 |
+
" break\n",
|
117 |
+
"\n",
|
118 |
+
"# close the web driver\n",
|
119 |
+
"driver.close()"
|
120 |
+
]
|
121 |
+
},
|
122 |
+
{
|
123 |
+
"cell_type": "markdown",
|
124 |
+
"metadata": {},
|
125 |
+
"source": [
|
126 |
+
"### Saving the tweet data"
|
127 |
+
]
|
128 |
+
},
|
129 |
+
{
|
130 |
+
"cell_type": "code",
|
131 |
+
"execution_count": null,
|
132 |
+
"metadata": {},
|
133 |
+
"outputs": [],
|
134 |
+
"source": [
|
135 |
+
"with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n",
|
136 |
+
" header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Likes', 'Retweets']\n",
|
137 |
+
" writer = csv.writer(f)\n",
|
138 |
+
" writer.writerow(header)\n",
|
139 |
+
" writer.writerows(data)"
|
140 |
+
]
|
141 |
+
}
|
142 |
+
],
|
143 |
+
"metadata": {
|
144 |
+
"interpreter": {
|
145 |
+
"hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138"
|
146 |
+
},
|
147 |
+
"kernelspec": {
|
148 |
+
"display_name": "Python 3.7.0 64-bit",
|
149 |
+
"language": "python",
|
150 |
+
"name": "python3"
|
151 |
+
},
|
152 |
+
"language_info": {
|
153 |
+
"codemirror_mode": {
|
154 |
+
"name": "ipython",
|
155 |
+
"version": 3
|
156 |
+
},
|
157 |
+
"file_extension": ".py",
|
158 |
+
"mimetype": "text/x-python",
|
159 |
+
"name": "python",
|
160 |
+
"nbconvert_exporter": "python",
|
161 |
+
"pygments_lexer": "ipython3",
|
162 |
+
"version": "3.7.0"
|
163 |
+
},
|
164 |
+
"orig_nbformat": 4
|
165 |
+
},
|
166 |
+
"nbformat": 4,
|
167 |
+
"nbformat_minor": 2
|
168 |
+
}
|