Spaces:
Runtime error
Runtime error
Merge pull request #17 from Demea9000/5-create-twitterscraper-class
Browse files- twitter-scraper/TwitterScraper.py +0 -3
- twitter-scraper/twint-master/.github/FUNDING.yml +3 -0
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md +20 -0
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +17 -0
- twitter-scraper/twint-master/.gitignore +115 -0
- twitter-scraper/twint-master/.travis.yml +23 -0
- twitter-scraper/twint-master/Dockerfile +10 -0
- twitter-scraper/twint-master/LICENSE +21 -0
- twitter-scraper/twint-master/MANIFEST.in +1 -0
- twitter-scraper/twint-master/README.md +272 -0
- twitter-scraper/twint-master/Untitled.ipynb +282 -0
- twitter-scraper/twint-master/automate.py +65 -0
- twitter-scraper/twint-master/elasticsearch/README.md +5 -0
- twitter-scraper/twint-master/scrape.py +102 -0
- twitter-scraper/twint-master/scrape__init__.py +14 -0
- twitter-scraper/twint-master/setup.py +65 -0
- twitter-scraper/twint-master/test.py +92 -0
- twitter-scraper/twint-master/twint/__init__.py +32 -0
- twitter-scraper/twint-master/twint/__version__.py +3 -0
- twitter-scraper/twint-master/twint/cli.py +342 -0
- twitter-scraper/twint-master/twint/config.py +87 -0
- twitter-scraper/twint-master/twint/datelock.py +44 -0
- twitter-scraper/twint-master/twint/feed.py +145 -0
- twitter-scraper/twint-master/twint/format.py +91 -0
- twitter-scraper/twint-master/twint/get.py +298 -0
- twitter-scraper/twint-master/twint/output.py +241 -0
- twitter-scraper/twint-master/twint/run.py +412 -0
- twitter-scraper/twint-master/twint/storage/__init__.py +0 -0
- twitter-scraper/twint-master/twint/storage/db.py +297 -0
- twitter-scraper/twint-master/twint/storage/elasticsearch.py +364 -0
- twitter-scraper/twint-master/twint/storage/panda.py +196 -0
- twitter-scraper/twint-master/twint/storage/write.py +77 -0
- twitter-scraper/twint-master/twint/storage/write_meta.py +151 -0
- twitter-scraper/twint-master/twint/token.py +94 -0
- twitter-scraper/twint-master/twint/tweet.py +166 -0
- twitter-scraper/twint-master/twint/url.py +195 -0
- twitter-scraper/twint-master/twint/user.py +52 -0
- twitter-scraper/twint-master/twint/verbose.py +18 -0
- twitter-scraper/twint-master/twitter_scraper.ipynb +265 -0
twitter-scraper/TwitterScraper.py
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
print("My name is Nils and this is my Test branch")
|
2 |
-
|
3 |
-
print("This is my testBranch, you wont see this on main")
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.github/FUNDING.yml
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# These are supported funding model platforms
|
2 |
+
patreon: twintproject
|
3 |
+
custom: paypal.me/noneprivacy
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md
ADDED
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Issue Template
|
2 |
+
Please use this template!
|
3 |
+
|
4 |
+
## Initial Check
|
5 |
+
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
6 |
+
|
7 |
+
>Make sure you've checked the following:
|
8 |
+
|
9 |
+
- [] Python version is 3.6 or later;
|
10 |
+
- [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
|
11 |
+
- [] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
|
12 |
+
|
13 |
+
## Command Ran
|
14 |
+
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
15 |
+
|
16 |
+
## Description of Issue
|
17 |
+
>Please use **as much detail as possible.**
|
18 |
+
|
19 |
+
## Environment Details
|
20 |
+
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
### Initial Check
|
2 |
+
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
3 |
+
|
4 |
+
>Make sure you've checked the following:
|
5 |
+
|
6 |
+
- [] Python version is 3.6;
|
7 |
+
- [] Using the latest version of Twint;
|
8 |
+
- [] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
|
9 |
+
|
10 |
+
### Command Ran
|
11 |
+
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
12 |
+
|
13 |
+
### Description of Issue
|
14 |
+
>Please use **as much detail as possible.**
|
15 |
+
|
16 |
+
### Environment Details
|
17 |
+
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
twitter-scraper/twint-master/.gitignore
ADDED
@@ -0,0 +1,115 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Byte-compiled / optimized / DLL files
|
2 |
+
__pycache__/
|
3 |
+
*.py[cod]
|
4 |
+
*$py.class
|
5 |
+
tweets.db
|
6 |
+
# C extensions
|
7 |
+
*.so
|
8 |
+
|
9 |
+
config.ini
|
10 |
+
twint/storage/mysql.py
|
11 |
+
|
12 |
+
# Node Dependency directories
|
13 |
+
node_modules/
|
14 |
+
jspm_packages/
|
15 |
+
tests/
|
16 |
+
# Distribution / packaging
|
17 |
+
.Python
|
18 |
+
env/
|
19 |
+
build/
|
20 |
+
develop-eggs/
|
21 |
+
dist/
|
22 |
+
downloads/
|
23 |
+
eggs/
|
24 |
+
.eggs/
|
25 |
+
lib/
|
26 |
+
lib64/
|
27 |
+
parts/
|
28 |
+
sdist/
|
29 |
+
var/
|
30 |
+
wheels/
|
31 |
+
*.egg-info/
|
32 |
+
.installed.cfg
|
33 |
+
*.egg
|
34 |
+
|
35 |
+
# PyInstaller
|
36 |
+
# Usually these files are written by a python script from a template
|
37 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
+
*.manifest
|
39 |
+
*.spec
|
40 |
+
|
41 |
+
# Installer logs
|
42 |
+
pip-log.txt
|
43 |
+
pip-delete-this-directory.txt
|
44 |
+
|
45 |
+
# Unit test / coverage reports
|
46 |
+
htmlcov/
|
47 |
+
.tox/
|
48 |
+
.coverage
|
49 |
+
.coverage.*
|
50 |
+
.cache
|
51 |
+
nosetests.xml
|
52 |
+
coverage.xml
|
53 |
+
*.cover
|
54 |
+
.hypothesis/
|
55 |
+
|
56 |
+
# Translations
|
57 |
+
*.mo
|
58 |
+
*.pot
|
59 |
+
|
60 |
+
# Django stuff:
|
61 |
+
*.log
|
62 |
+
local_settings.py
|
63 |
+
|
64 |
+
# Flask stuff:
|
65 |
+
instance/
|
66 |
+
.webassets-cache
|
67 |
+
|
68 |
+
# Scrapy stuff:
|
69 |
+
.scrapy
|
70 |
+
|
71 |
+
# Sphinx documentation
|
72 |
+
docs/_build/
|
73 |
+
|
74 |
+
# PyBuilder
|
75 |
+
target/
|
76 |
+
|
77 |
+
# Jupyter Notebook
|
78 |
+
.ipynb_checkpoints
|
79 |
+
|
80 |
+
# pyenv
|
81 |
+
.python-version
|
82 |
+
|
83 |
+
# celery beat schedule file
|
84 |
+
celerybeat-schedule
|
85 |
+
|
86 |
+
# SageMath parsed files
|
87 |
+
*.sage.py
|
88 |
+
|
89 |
+
# dotenv
|
90 |
+
.env
|
91 |
+
|
92 |
+
# virtualenv
|
93 |
+
.venv
|
94 |
+
venv/
|
95 |
+
ENV/
|
96 |
+
|
97 |
+
# Spyder project settings
|
98 |
+
.spyderproject
|
99 |
+
.spyproject
|
100 |
+
|
101 |
+
# Rope project settings
|
102 |
+
.ropeproject
|
103 |
+
|
104 |
+
# mkdocs documentation
|
105 |
+
/site
|
106 |
+
|
107 |
+
# mypy
|
108 |
+
.mypy_cache/
|
109 |
+
|
110 |
+
# output
|
111 |
+
*.csv
|
112 |
+
*.json
|
113 |
+
*.txt
|
114 |
+
|
115 |
+
test_twint.py
|
twitter-scraper/twint-master/.travis.yml
ADDED
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
dist: bionic
|
2 |
+
language: python
|
3 |
+
python:
|
4 |
+
- "3.6"
|
5 |
+
- "3.7"
|
6 |
+
- "3.8"
|
7 |
+
- "nightly"
|
8 |
+
matrix:
|
9 |
+
allow_failures:
|
10 |
+
- python: "nightly"
|
11 |
+
- python: "3.8"
|
12 |
+
install:
|
13 |
+
- pip install -r requirements.txt
|
14 |
+
script:
|
15 |
+
- python test.py
|
16 |
+
deploy:
|
17 |
+
provider: pypi
|
18 |
+
user: "codyzacharias"
|
19 |
+
password:
|
20 |
+
secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
|
21 |
+
on:
|
22 |
+
tags: true
|
23 |
+
python: "3.7"
|
twitter-scraper/twint-master/Dockerfile
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
FROM python:3.6-buster
|
2 |
+
LABEL maintainer="codyzacharias@pm.me"
|
3 |
+
|
4 |
+
WORKDIR /root
|
5 |
+
|
6 |
+
RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
|
7 |
+
cd /root/twint && \
|
8 |
+
pip3 install . -r requirements.txt
|
9 |
+
|
10 |
+
CMD /bin/bash
|
twitter-scraper/twint-master/LICENSE
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
MIT License
|
2 |
+
|
3 |
+
Copyright (c) 2018 Cody Zacharias
|
4 |
+
|
5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
+
of this software and associated documentation files (the "Software"), to deal
|
7 |
+
in the Software without restriction, including without limitation the rights
|
8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
+
copies of the Software, and to permit persons to whom the Software is
|
10 |
+
furnished to do so, subject to the following conditions:
|
11 |
+
|
12 |
+
The above copyright notice and this permission notice shall be included in all
|
13 |
+
copies or substantial portions of the Software.
|
14 |
+
|
15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
+
SOFTWARE.
|
twitter-scraper/twint-master/MANIFEST.in
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
include README.md LICENSE
|
twitter-scraper/twint-master/README.md
ADDED
@@ -0,0 +1,272 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
20220207.0
|
2 |
+
|
3 |
+
# About this fork
|
4 |
+
|
5 |
+
[This repository](https://github.com/minamotorin/twint) is the fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint) and for myself.
|
6 |
+
|
7 |
+
Modified by [minamotorin](https://github.com/minamotorin).
|
8 |
+
|
9 |
+
## Updates from twintproject/twint
|
10 |
+
|
11 |
+
### twint.token.RefreshTokenException: Could not find the Guest token in HTML
|
12 |
+
|
13 |
+
This problem doesn't happen recently.
|
14 |
+
|
15 |
+
#### Related
|
16 |
+
|
17 |
+
- [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
|
18 |
+
- [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
|
19 |
+
- [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
|
20 |
+
- [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
|
21 |
+
- [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
|
22 |
+
|
23 |
+
### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
|
24 |
+
|
25 |
+
The fix is **not complete**.
|
26 |
+
`twint.run.Profile` will work but `twint.run.db` will not.
|
27 |
+
This means [`test.py`](./test.py) causes an error.
|
28 |
+
|
29 |
+
I think this is because the fields of the result table are not exactly the same as the traditional ones.
|
30 |
+
|
31 |
+
#### Related
|
32 |
+
|
33 |
+
- [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
|
34 |
+
|
35 |
+
### [-] TWINT requires Python version 3.6+.
|
36 |
+
|
37 |
+
#### Related
|
38 |
+
|
39 |
+
- [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
|
40 |
+
- [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
|
41 |
+
- [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1346)
|
42 |
+
- [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
|
43 |
+
- [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
|
44 |
+
|
45 |
+
## References
|
46 |
+
|
47 |
+
- [snscrape](https://github.com/JustAnotherArchivist/snscrape)
|
48 |
+
- [gallery-dl](https://github.com/mikf/gallery-dl)
|
49 |
+
|
50 |
+
## License
|
51 |
+
|
52 |
+
This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
|
53 |
+
|
54 |
+
---
|
55 |
+
|
56 |
+
# TWINT - Twitter Intelligence Tool
|
57 |
+
![2](https://i.imgur.com/iaH3s7z.png)
|
58 |
+
![3](https://i.imgur.com/hVeCrqL.png)
|
59 |
+
|
60 |
+
[![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social)
|
61 |
+
|
62 |
+
>No authentication. No API. No limits.
|
63 |
+
|
64 |
+
Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
|
65 |
+
|
66 |
+
Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too.
|
67 |
+
|
68 |
+
Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
|
69 |
+
|
70 |
+
## tl;dr Benefits
|
71 |
+
Some of the benefits of using Twint vs Twitter API:
|
72 |
+
- Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only);
|
73 |
+
- Fast initial setup;
|
74 |
+
- Can be used anonymously and without Twitter sign up;
|
75 |
+
- **No rate limitations**.
|
76 |
+
|
77 |
+
## Limits imposed by Twitter
|
78 |
+
Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
|
79 |
+
|
80 |
+
## Requirements
|
81 |
+
- Python 3.6;
|
82 |
+
- aiohttp;
|
83 |
+
- aiodns;
|
84 |
+
- beautifulsoup4;
|
85 |
+
- cchardet;
|
86 |
+
- dataclasses
|
87 |
+
- elasticsearch;
|
88 |
+
- pysocks;
|
89 |
+
- pandas (>=0.23.0);
|
90 |
+
- aiohttp_socks;
|
91 |
+
- schedule;
|
92 |
+
- geopy;
|
93 |
+
- fake-useragent;
|
94 |
+
- py-googletransx.
|
95 |
+
|
96 |
+
## Installing
|
97 |
+
|
98 |
+
**Git:**
|
99 |
+
```bash
|
100 |
+
git clone --depth=1 https://github.com/twintproject/twint.git
|
101 |
+
cd twint
|
102 |
+
pip3 install . -r requirements.txt
|
103 |
+
```
|
104 |
+
|
105 |
+
**Pip:**
|
106 |
+
```bash
|
107 |
+
pip3 install twint
|
108 |
+
```
|
109 |
+
|
110 |
+
or
|
111 |
+
|
112 |
+
```bash
|
113 |
+
pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
|
114 |
+
```
|
115 |
+
|
116 |
+
**Pipenv**:
|
117 |
+
```bash
|
118 |
+
pipenv install git+https://github.com/twintproject/twint.git#egg=twint
|
119 |
+
```
|
120 |
+
|
121 |
+
### March 2, 2021 Update
|
122 |
+
|
123 |
+
**Added**: Dockerfile
|
124 |
+
|
125 |
+
Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
|
126 |
+
|
127 |
+
## CLI Basic Examples and Combos
|
128 |
+
A few simple examples to help you understand the basics:
|
129 |
+
|
130 |
+
- `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
|
131 |
+
- `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
|
132 |
+
- `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
|
133 |
+
- `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
|
134 |
+
- `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
|
135 |
+
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
|
136 |
+
- `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
|
137 |
+
- `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
|
138 |
+
- `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
|
139 |
+
- `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
|
140 |
+
- `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
|
141 |
+
- `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch
|
142 |
+
- `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
|
143 |
+
- `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
|
144 |
+
- `twint -u username --followers` - Scrape a Twitter user's followers.
|
145 |
+
- `twint -u username --following` - Scrape who a Twitter user follows.
|
146 |
+
- `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweet).
|
147 |
+
- `twint -u username --following --user-full` - Collect full user information a person follows
|
148 |
+
- `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**).
|
149 |
+
- `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
|
150 |
+
- `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
|
151 |
+
|
152 |
+
More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands)
|
153 |
+
|
154 |
+
## Module Example
|
155 |
+
|
156 |
+
Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
|
157 |
+
|
158 |
+
```python
|
159 |
+
import twint
|
160 |
+
|
161 |
+
# Configure
|
162 |
+
c = twint.Config()
|
163 |
+
c.Username = "realDonaldTrump"
|
164 |
+
c.Search = "great"
|
165 |
+
|
166 |
+
# Run
|
167 |
+
twint.run.Search(c)
|
168 |
+
```
|
169 |
+
> Output
|
170 |
+
|
171 |
+
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
|
172 |
+
|
173 |
+
```python
|
174 |
+
import twint
|
175 |
+
|
176 |
+
c = twint.Config()
|
177 |
+
|
178 |
+
c.Username = "noneprivacy"
|
179 |
+
c.Custom["tweet"] = ["id"]
|
180 |
+
c.Custom["user"] = ["bio"]
|
181 |
+
c.Limit = 10
|
182 |
+
c.Store_csv = True
|
183 |
+
c.Output = "none"
|
184 |
+
|
185 |
+
twint.run.Search(c)
|
186 |
+
```
|
187 |
+
|
188 |
+
## Storing Options
|
189 |
+
- Write to file;
|
190 |
+
- CSV;
|
191 |
+
- JSON;
|
192 |
+
- SQLite;
|
193 |
+
- Elasticsearch.
|
194 |
+
|
195 |
+
## Elasticsearch Setup
|
196 |
+
|
197 |
+
Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
|
198 |
+
|
199 |
+
## Graph Visualization
|
200 |
+
![graph](https://i.imgur.com/EEJqB8n.png)
|
201 |
+
|
202 |
+
[Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
|
203 |
+
|
204 |
+
We are developing a Twint Desktop App.
|
205 |
+
|
206 |
+
![4](https://i.imgur.com/DzcfIgL.png)
|
207 |
+
|
208 |
+
## FAQ
|
209 |
+
> I tried scraping tweets from a user, I know that they exist but I'm not getting them
|
210 |
+
|
211 |
+
Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow.
|
212 |
+
## More Examples
|
213 |
+
|
214 |
+
#### Followers/Following
|
215 |
+
|
216 |
+
> To get only follower usernames/following usernames
|
217 |
+
|
218 |
+
`twint -u username --followers`
|
219 |
+
|
220 |
+
`twint -u username --following`
|
221 |
+
|
222 |
+
> To get user info of followers/following users
|
223 |
+
|
224 |
+
`twint -u username --followers --user-full`
|
225 |
+
|
226 |
+
`twint -u username --following --user-full`
|
227 |
+
|
228 |
+
#### userlist
|
229 |
+
|
230 |
+
> To get only user info of user
|
231 |
+
|
232 |
+
`twint -u username --user-full`
|
233 |
+
|
234 |
+
> To get user info of users from a userlist
|
235 |
+
|
236 |
+
`twint --userlist inputlist --user-full`
|
237 |
+
|
238 |
+
|
239 |
+
#### tweet translation (experimental)
|
240 |
+
|
241 |
+
> To get 100 english tweets and translate them to italian
|
242 |
+
|
243 |
+
`twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
|
244 |
+
|
245 |
+
or
|
246 |
+
|
247 |
+
```python
|
248 |
+
import twint
|
249 |
+
|
250 |
+
c = twint.Config()
|
251 |
+
c.Username = "noneprivacy"
|
252 |
+
c.Limit = 100
|
253 |
+
c.Store_csv = True
|
254 |
+
c.Output = "none.csv"
|
255 |
+
c.Lang = "en"
|
256 |
+
c.Translate = True
|
257 |
+
c.TranslateDest = "it"
|
258 |
+
twint.run.Search(c)
|
259 |
+
```
|
260 |
+
|
261 |
+
Notes:
|
262 |
+
- [Google translate has some quotas](https://cloud.google.com/translate/quotas)
|
263 |
+
|
264 |
+
## Featured Blog Posts:
|
265 |
+
- [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
|
266 |
+
- [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
|
267 |
+
- [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
|
268 |
+
- [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
|
269 |
+
|
270 |
+
## Contact
|
271 |
+
|
272 |
+
If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
|
twitter-scraper/twint-master/Untitled.ipynb
ADDED
@@ -0,0 +1,282 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "code",
|
5 |
+
"execution_count": 67,
|
6 |
+
"metadata": {},
|
7 |
+
"outputs": [],
|
8 |
+
"source": [
|
9 |
+
"text= \"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär, 5. Invandring, 6. Integration \""
|
10 |
+
]
|
11 |
+
},
|
12 |
+
{
|
13 |
+
"cell_type": "code",
|
14 |
+
"execution_count": 17,
|
15 |
+
"metadata": {},
|
16 |
+
"outputs": [
|
17 |
+
{
|
18 |
+
"name": "stdout",
|
19 |
+
"output_type": "stream",
|
20 |
+
"text": [
|
21 |
+
"WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n",
|
22 |
+
"Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n",
|
23 |
+
"To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n",
|
24 |
+
"Requirement already satisfied: regex in /home/oxygen/snap/jupyter/common/lib/python3.7/site-packages (2022.6.2)\n"
|
25 |
+
]
|
26 |
+
}
|
27 |
+
],
|
28 |
+
"source": [
|
29 |
+
"!pip install regex\n"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 15,
|
35 |
+
"metadata": {},
|
36 |
+
"outputs": [
|
37 |
+
{
|
38 |
+
"data": {
|
39 |
+
"text/plain": [
|
40 |
+
"['0']"
|
41 |
+
]
|
42 |
+
},
|
43 |
+
"execution_count": 15,
|
44 |
+
"metadata": {},
|
45 |
+
"output_type": "execute_result"
|
46 |
+
}
|
47 |
+
],
|
48 |
+
"source": [
|
49 |
+
"re.findall(\"[0-9]+\", tl[0])"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 48,
|
55 |
+
"metadata": {},
|
56 |
+
"outputs": [
|
57 |
+
{
|
58 |
+
"data": {
|
59 |
+
"text/plain": [
|
60 |
+
"'0. Äldrefrågor'"
|
61 |
+
]
|
62 |
+
},
|
63 |
+
"execution_count": 48,
|
64 |
+
"metadata": {},
|
65 |
+
"output_type": "execute_result"
|
66 |
+
}
|
67 |
+
],
|
68 |
+
"source": [
|
69 |
+
"tl[0]"
|
70 |
+
]
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"cell_type": "code",
|
74 |
+
"execution_count": 49,
|
75 |
+
"metadata": {},
|
76 |
+
"outputs": [
|
77 |
+
{
|
78 |
+
"data": {
|
79 |
+
"text/plain": [
|
80 |
+
"['0', ' Äldrefrågor']"
|
81 |
+
]
|
82 |
+
},
|
83 |
+
"execution_count": 49,
|
84 |
+
"metadata": {},
|
85 |
+
"output_type": "execute_result"
|
86 |
+
}
|
87 |
+
],
|
88 |
+
"source": [
|
89 |
+
"f=tl[0].split('.')\n",
|
90 |
+
"\n",
|
91 |
+
"f#int(f[0])"
|
92 |
+
]
|
93 |
+
},
|
94 |
+
{
|
95 |
+
"cell_type": "code",
|
96 |
+
"execution_count": 29,
|
97 |
+
"metadata": {},
|
98 |
+
"outputs": [
|
99 |
+
{
|
100 |
+
"ename": "NameError",
|
101 |
+
"evalue": "name 'str_topics_to_dict' is not defined",
|
102 |
+
"output_type": "error",
|
103 |
+
"traceback": [
|
104 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
105 |
+
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
106 |
+
"\u001b[0;32m<ipython-input-29-b05d9860dbcf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstr_topics_to_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
107 |
+
"\u001b[0;31mNameError\u001b[0m: name 'str_topics_to_dict' is not defined"
|
108 |
+
]
|
109 |
+
}
|
110 |
+
],
|
111 |
+
"source": []
|
112 |
+
},
|
113 |
+
{
|
114 |
+
"cell_type": "code",
|
115 |
+
"execution_count": 65,
|
116 |
+
"metadata": {},
|
117 |
+
"outputs": [],
|
118 |
+
"source": [
|
119 |
+
"\n",
|
120 |
+
"def str_topics_to_dict(topics):\n",
|
121 |
+
" topic_list=topics.split(\",\")\n",
|
122 |
+
" ind_topic_dict={}\n",
|
123 |
+
" for i inrange(len(topic_list)): \n",
|
124 |
+
" index_topic_list=\n",
|
125 |
+
" ind=index_topic_list[0]\n",
|
126 |
+
" just_topic=index_topic_list[1][1:]\n",
|
127 |
+
" ind_topic_dict[int(ind)]=just_topic\n",
|
128 |
+
" return ind_topic_dict"
|
129 |
+
]
|
130 |
+
},
|
131 |
+
{
|
132 |
+
"cell_type": "code",
|
133 |
+
"execution_count": 68,
|
134 |
+
"metadata": {},
|
135 |
+
"outputs": [
|
136 |
+
{
|
137 |
+
"data": {
|
138 |
+
"text/plain": [
|
139 |
+
"{0: 'Brottslighet',\n",
|
140 |
+
" 1: 'Miljö',\n",
|
141 |
+
" 2: 'Skola',\n",
|
142 |
+
" 3: 'Sjukvård',\n",
|
143 |
+
" 4: 'Militär',\n",
|
144 |
+
" 5: 'Invandring',\n",
|
145 |
+
" 6: 'Integration '}"
|
146 |
+
]
|
147 |
+
},
|
148 |
+
"execution_count": 68,
|
149 |
+
"metadata": {},
|
150 |
+
"output_type": "execute_result"
|
151 |
+
}
|
152 |
+
],
|
153 |
+
"source": [
|
154 |
+
"str_topics_to_dict(text)"
|
155 |
+
]
|
156 |
+
},
|
157 |
+
{
|
158 |
+
"cell_type": "code",
|
159 |
+
"execution_count": 109,
|
160 |
+
"metadata": {},
|
161 |
+
"outputs": [
|
162 |
+
{
|
163 |
+
"data": {
|
164 |
+
"text/plain": [
|
165 |
+
"' Brottslighet, Miljö, Skola, Sjukvård, Militär stöd, Invandring, Integration '"
|
166 |
+
]
|
167 |
+
},
|
168 |
+
"execution_count": 109,
|
169 |
+
"metadata": {},
|
170 |
+
"output_type": "execute_result"
|
171 |
+
}
|
172 |
+
],
|
173 |
+
"source": [
|
174 |
+
"\n",
|
175 |
+
"text=\"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär stöd, 5. Invandring, 6. Integration \"\n",
|
176 |
+
"text=re.sub(r\"(\\n+)\",\" \",text)\n",
|
177 |
+
"text=re.sub(\"(\\.)|\\d+\",\"\",text )\n",
|
178 |
+
"text"
|
179 |
+
]
|
180 |
+
},
|
181 |
+
{
|
182 |
+
"cell_type": "code",
|
183 |
+
"execution_count": 100,
|
184 |
+
"metadata": {},
|
185 |
+
"outputs": [
|
186 |
+
{
|
187 |
+
"data": {
|
188 |
+
"text/plain": [
|
189 |
+
"[' Brottslighet',\n",
|
190 |
+
" ' Miljö',\n",
|
191 |
+
" ' Skola',\n",
|
192 |
+
" ' Sjukvård',\n",
|
193 |
+
" ' Militär stöd',\n",
|
194 |
+
" ' Invandring',\n",
|
195 |
+
" ' Integration ']"
|
196 |
+
]
|
197 |
+
},
|
198 |
+
"execution_count": 100,
|
199 |
+
"metadata": {},
|
200 |
+
"output_type": "execute_result"
|
201 |
+
}
|
202 |
+
],
|
203 |
+
"source": [
|
204 |
+
"text.split(\",\")"
|
205 |
+
]
|
206 |
+
},
|
207 |
+
{
|
208 |
+
"cell_type": "code",
|
209 |
+
"execution_count": 116,
|
210 |
+
"metadata": {},
|
211 |
+
"outputs": [],
|
212 |
+
"source": [
|
213 |
+
"import regex as re \n",
|
214 |
+
"def str_topics_to_dict(topics):\n",
|
215 |
+
" text=re.sub(r\"(\\n+)\",\" \",topics)\n",
|
216 |
+
" text=re.sub(\"(\\.)|\\d+\",\"\",topics )\n",
|
217 |
+
" topics=re.sub(r\"(\\n+)|(\\.)|\\d+\",\"\",topics)\n",
|
218 |
+
" topic_list=topics.split(\",\")\n",
|
219 |
+
" ind_topic_dict={}\n",
|
220 |
+
" for i in range(len(topic_list)): \n",
|
221 |
+
" ind=i\n",
|
222 |
+
" just_topic=topic_list[i]\n",
|
223 |
+
" ind_topic_dict[ind]=just_topic\n",
|
224 |
+
" return ind_topic_dict"
|
225 |
+
]
|
226 |
+
},
|
227 |
+
{
|
228 |
+
"cell_type": "code",
|
229 |
+
"execution_count": 117,
|
230 |
+
"metadata": {},
|
231 |
+
"outputs": [
|
232 |
+
{
|
233 |
+
"data": {
|
234 |
+
"text/plain": [
|
235 |
+
"{0: ' Brottslighet',\n",
|
236 |
+
" 1: ' Miljö',\n",
|
237 |
+
" 2: ' Skola',\n",
|
238 |
+
" 3: ' Sjukvård',\n",
|
239 |
+
" 4: ' Militär stöd',\n",
|
240 |
+
" 5: ' Invandring',\n",
|
241 |
+
" 6: ' Integration '}"
|
242 |
+
]
|
243 |
+
},
|
244 |
+
"execution_count": 117,
|
245 |
+
"metadata": {},
|
246 |
+
"output_type": "execute_result"
|
247 |
+
}
|
248 |
+
],
|
249 |
+
"source": [
|
250 |
+
"str_topics_to_dict(text)"
|
251 |
+
]
|
252 |
+
},
|
253 |
+
{
|
254 |
+
"cell_type": "code",
|
255 |
+
"execution_count": null,
|
256 |
+
"metadata": {},
|
257 |
+
"outputs": [],
|
258 |
+
"source": []
|
259 |
+
}
|
260 |
+
],
|
261 |
+
"metadata": {
|
262 |
+
"kernelspec": {
|
263 |
+
"display_name": "Python 3",
|
264 |
+
"language": "python",
|
265 |
+
"name": "python3"
|
266 |
+
},
|
267 |
+
"language_info": {
|
268 |
+
"codemirror_mode": {
|
269 |
+
"name": "ipython",
|
270 |
+
"version": 3
|
271 |
+
},
|
272 |
+
"file_extension": ".py",
|
273 |
+
"mimetype": "text/x-python",
|
274 |
+
"name": "python",
|
275 |
+
"nbconvert_exporter": "python",
|
276 |
+
"pygments_lexer": "ipython3",
|
277 |
+
"version": "3.7.3"
|
278 |
+
}
|
279 |
+
},
|
280 |
+
"nbformat": 4,
|
281 |
+
"nbformat_minor": 2
|
282 |
+
}
|
twitter-scraper/twint-master/automate.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import twint
|
2 |
+
import schedule
|
3 |
+
import time
|
4 |
+
|
5 |
+
# you can change the name of each "job" after "def" if you'd like.
|
6 |
+
def jobone():
    """Scheduled scrape #1: fetch up to 1000 tweets since 2018-01-01 for the
    configured username/search term and store them in filename.csv."""
    print("Fetching Tweets")
    cfg = twint.Config()
    # Account and/or search phrase to scrape (both optional placeholders).
    cfg.Username = "insert username here"
    cfg.Search = "insert search term here"
    # Earliest tweet date and total-tweet cap.
    cfg.Since = "2018-01-01"
    cfg.Limit = 1000
    # CSV output: enable the writer, pick the columns, name the file.
    cfg.Store_csv = True
    cfg.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
    cfg.Output = "filename.csv"
    twint.run.Search(cfg)
|
24 |
+
|
25 |
+
def jobtwo():
    """Scheduled scrape #2: identical to jobone() but writes filename2.csv."""
    print("Fetching Tweets")
    cfg = twint.Config()
    # Account and/or search phrase to scrape (both optional placeholders).
    cfg.Username = "insert username here"
    cfg.Search = "insert search term here"
    # Earliest tweet date and total-tweet cap.
    cfg.Since = "2018-01-01"
    cfg.Limit = 1000
    # CSV output: enable the writer, pick the columns, name the file.
    cfg.Store_csv = True
    cfg.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
    cfg.Output = "filename2.csv"
    twint.run.Search(cfg)
|
43 |
+
|
44 |
+
# Run each job once immediately at startup.

jobone()
jobtwo()

# Scheduling options: uncomment exactly the cadence you want for each job,
# and replace "jobone"/"jobtwo" with your own function names if you renamed them.

# schedule.every(1).minutes.do(jobone)
schedule.every().hour.do(jobone)
# schedule.every().day.at("10:30").do(jobone)
# schedule.every().monday.do(jobone)
# schedule.every().wednesday.at("13:15").do(jobone)

# schedule.every(1).minutes.do(jobtwo)
schedule.every().hour.do(jobtwo)
# schedule.every().day.at("10:30").do(jobtwo)
# schedule.every().monday.do(jobtwo)
# schedule.every().wednesday.at("13:15").do(jobtwo)

# Main loop: poll the scheduler once per second forever (blocks the process).
while True:
    schedule.run_pending()
    time.sleep(1)
|
twitter-scraper/twint-master/elasticsearch/README.md
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Elasticsearch How-To
|
2 |
+
|
3 |
+
![dashboard](https://i.imgur.com/BEbtdo5.png)
|
4 |
+
|
5 |
+
Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
|
twitter-scraper/twint-master/scrape.py
ADDED
@@ -0,0 +1,102 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import io
|
3 |
+
import time
|
4 |
+
import asyncio
|
5 |
+
import os
|
6 |
+
from tkinter import EXCEPTION
|
7 |
+
from numpy import not_equal
|
8 |
+
loop = asyncio.get_event_loop()
|
9 |
+
loop.is_running()
|
10 |
+
import twint
|
11 |
+
import nest_asyncio
|
12 |
+
nest_asyncio.apply()
|
13 |
+
from datetime import date
|
14 |
+
class scraper:
    """Static helpers built on twint for collecting tweets.

    NOTE(review): the methods take no ``self`` and appear intended to be called
    on the class itself (``scraper.get_tweets(...)``). They are now marked
    ``@staticmethod`` so instance calls also work; class-level call sites are
    unaffected.
    """

    @staticmethod
    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s', acceptable_range=10):
        """Collect up to ``num_tweets`` non-reply tweets matching a search.

        Parameters:
            search_str: topic text, or a username when ``u_or_s == 'u'``.
            from_date / to_date: "yyyy-mm-dd" strings bounding the search.
            num_tweets: desired number of distinct non-reply tweets.
            u_or_s: 'u' to search a user's tweets, anything else for a topic.
            acceptable_range: stop early once within this many of the target.

        Returns:
            dict keyed by int tweet id -> {"tweet", "date", "nlikes",
            "nreplies", "nretweets", "topic"}.

        Raises:
            TypeError: when either date is not a string.
        """
        # Fix: the original check `(type(from_date) or type("str")) is not type("str")`
        # had a dead `or` arm and never validated to_date; validate both dates.
        if not (isinstance(from_date, str) and isinstance(to_date, str)):
            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
            # Fix: `raise EXCEPTION(...)` called a tkinter string constant and
            # only raised TypeError by accident; raise TypeError explicitly.
            raise TypeError("Incorrect date type Exception!")

        time_out = time.time() + 2 * 60  # hard 2-minute wall-clock budget
        _dict = {}
        c = twint.Config()
        if u_or_s.lower() == "u":
            c.Search = "from:@" + search_str  # user mode
        else:
            c.Search = search_str  # plain topic search
        c.Pandas = True
        num_tweets_and_replies = num_tweets
        c.Count = True
        # Retry with an exponentially growing limit until enough tweets are found.
        for j in range(1, 5):
            c.Limit = num_tweets_and_replies
            c.Since = from_date
            c.Until = to_date
            c.Hide_output = True
            # Capture twint's stdout so its progress line can be inspected below.
            old_stdout = sys.stdout
            new_stdout = io.StringIO()
            sys.stdout = new_stdout
            try:
                twint.run.Search(c)
            finally:
                # Fix: restore stdout even if twint raises, so the caller's
                # stdout is never left hijacked.
                output = new_stdout.getvalue()
                sys.stdout = old_stdout
            print(output[0:-2])
            tweet_info = twint.output.panda.Tweets_df

            try:
                _keys = tweet_info["id"]
                # tweet_info is a DataFrame whose columns include: id,
                # conversation_id, created_at, date, tweet, nlikes, nreplies,
                # nretweets, username, link, ...
                for i in range(len(_keys)):
                    # Skip duplicates and @-reply tweets.
                    if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
                        pass
                    else:
                        _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
                                                "date": tweet_info["date"][i],
                                                "nlikes": tweet_info["nlikes"][i],
                                                "nreplies": tweet_info["nreplies"][i],
                                                "nretweets": tweet_info["nretweets"][i],
                                                "topic": ""}
                        if len(list(_dict.keys())) == num_tweets:
                            break
            except Exception:
                # Fix: narrowed from a bare `except:` (which also swallowed
                # KeyboardInterrupt/SystemExit); still best-effort when twint
                # returned no dataframe.
                pass
            print(len(list(_dict.keys())), " of them are Tweets")
            if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
                return _dict
            if len(list(_dict.keys())) < num_tweets:
                # Grow the request size: +300, +900, +2700 on retries.
                num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
            else:
                break
            if time_out < time.time():
                break
            if output.startswith("[!] No more data!"):
                break
        return _dict

    @staticmethod
    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10):
        """Search a single user's tweets for ``search_str``.

        Returns twint's pandas DataFrame of matches (twint.output.panda.Tweets_df).
        """
        c = twint.Config()
        c.Username = user_name
        c.Search = search_str  # topic
        c.Pandas = True
        num_tweets_and_replies = num_tweets
        c.Count = True
        c.Limit = num_tweets_and_replies
        c.Since = from_date
        c.Until = to_date
        c.Hide_output = True
        twint.run.Search(c)
        return twint.output.panda.Tweets_df
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
|
101 |
+
|
102 |
+
|
twitter-scraper/twint-master/scrape__init__.py
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def scraper_libs():
    """Import the scraper's dependencies and prime the asyncio loop.

    NOTE(review): every import here is local to this function body, so nothing
    becomes visible to callers or to module scope. Calling this only verifies
    the imports succeed and applies nest_asyncio as a side effect — confirm
    that is the intended behavior.
    """
    import sys
    import io
    import time
    import asyncio
    import os
    from tkinter import EXCEPTION
    from numpy import not_equal
    # Side effect: creates/fetches the event loop so nest_asyncio can patch it.
    loop = asyncio.get_event_loop()
    loop.is_running()
    import twint
    import nest_asyncio
    nest_asyncio.apply()
    from datetime import date
|
twitter-scraper/twint-master/setup.py
ADDED
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python3
from setuptools import setup
import io
import os

# Package meta-data
NAME = 'twint'
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
URL = 'https://github.com/twintproject/twint'
EMAIL = 'codyzacharias@pm.me'
AUTHOR = 'Cody Zacharias'
REQUIRES_PYTHON = '>=3.6.0'
# None means "read the version from twint/__version__.py" below.
VERSION = None

# Packages required
REQUIRED = [
    'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
    'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
    'schedule', 'geopy', 'fake-useragent', 'googletransx'
]

here = os.path.abspath(os.path.dirname(__file__))

# Use the README as the long description shown on PyPI.
with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
    long_description = '\n' + f.read()

# Load the package's __version__.py (executed into `about`) unless VERSION
# was pinned explicitly above.
about = {}
if not VERSION:
    with open(os.path.join(here, NAME, '__version__.py')) as f:
        exec(f.read(), about)
else:
    about['__version__'] = VERSION

setup(
    name=NAME,
    version=about['__version__'],
    description=DESCRIPTION,
    long_description=long_description,
    long_description_content_type="text/markdown",
    author=AUTHOR,
    author_email=EMAIL,
    python_requires=REQUIRES_PYTHON,
    url=URL,
    packages=['twint', 'twint.storage'],
    entry_points={
        'console_scripts': [
            'twint = twint.cli:run_as_command',
        ],
    },
    install_requires=REQUIRED,
    dependency_links=[
        'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
    ],
    license='MIT',
    classifiers=[
        'License :: OSI Approved :: MIT License',
        'Programming Language :: Python',
        'Programming Language :: Python :: 3',
        'Programming Language :: Python :: 3.6',
        'Programming Language :: Python :: 3.7',
        'Programming Language :: Python :: 3.8',
        'Programming Language :: Python :: Implementation :: CPython',
    ],
)
|
twitter-scraper/twint-master/test.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import twint
|
2 |
+
import os
|
3 |
+
|
4 |
+
'''
|
5 |
+
Test.py - Testing TWINT to make sure everything works.
|
6 |
+
'''
|
7 |
+
|
8 |
+
|
9 |
+
def test_reg(c, run):
|
10 |
+
print("[+] Beginning vanilla test in {}".format(str(run)))
|
11 |
+
run(c)
|
12 |
+
|
13 |
+
|
14 |
+
def test_db(c, run):
|
15 |
+
print("[+] Beginning DB test in {}".format(str(run)))
|
16 |
+
c.Database = "test_twint.db"
|
17 |
+
run(c)
|
18 |
+
|
19 |
+
|
20 |
+
def custom(c, run, _type):
    """Restrict *c*'s custom tweet/user output columns, then invoke *run*."""
    print(f"[+] Beginning custom {_type} test in {run}")
    for section in ("tweet", "user"):
        c.Custom[section] = ["id", "username"]
    run(c)
|
25 |
+
|
26 |
+
|
27 |
+
def test_json(c, run):
    # Exercise the JSON output backend.
    # NOTE(review): custom() already invokes run(c) once, so this test runs the
    # scraper twice per call — confirm the double run is intentional.
    c.Store_json = True
    c.Output = "test_twint.json"
    custom(c, run, "JSON")
    print("[+] Beginning JSON test in {}".format(str(run)))
    run(c)
|
33 |
+
|
34 |
+
|
35 |
+
def test_csv(c, run):
    # Exercise the CSV output backend.
    # NOTE(review): custom() already invokes run(c) once, so this test runs the
    # scraper twice per call — confirm the double run is intentional.
    c.Store_csv = True
    c.Output = "test_twint.csv"
    custom(c, run, "CSV")
    print("[+] Beginning CSV test in {}".format(str(run)))
    run(c)
|
41 |
+
|
42 |
+
|
43 |
+
def main():
    """Exercise the twint runners against the @verified account with each
    output backend (plain, JSON, CSV, sqlite), then remove the artifacts.
    """
    c = twint.Config()
    c.Username = "verified"
    c.Limit = 20
    c.Store_object = True

    # Separate config objects are necessary (shared state breaks the runs).

    f = twint.Config()
    f.Username = "verified"
    f.Limit = 20
    f.Store_object = True
    f.User_full = True

    runs = [
        twint.run.Profile,  # this doesn't
        twint.run.Search,  # this works
        twint.run.Following,
        twint.run.Followers,
        twint.run.Favorites,
    ]

    tests = [test_reg, test_json, test_csv, test_db]

    # Something breaks if we don't split these up:
    # the first three runners use `c`, the rest need `f` (User_full).

    for run in runs[:3]:
        if run == twint.run.Search:
            c.Since = "2012-1-1 20:30:22"
            c.Until = "2017-1-1"
        else:
            c.Since = ""
            c.Until = ""

        for test in tests:
            test(c, run)

    for run in runs[3:]:
        for test in tests:
            test(f, run)

    files = ["test_twint.db", "test_twint.json", "test_twint.csv"]
    for _file in files:
        # Fix: only remove artifacts that actually exist, so cleanup does not
        # raise FileNotFoundError when an earlier run produced no output file.
        if os.path.exists(_file):
            os.remove(_file)

    print("[+] Testing complete!")
|
89 |
+
|
90 |
+
|
91 |
+
# Allow running this module directly as a script.
if __name__ == '__main__':
    main()
|
twitter-scraper/twint-master/twint/__init__.py
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'''
|
2 |
+
TWINT - Twitter Intelligence Tool (formerly known as Tweep).
|
3 |
+
|
4 |
+
See wiki on Github for in-depth details.
|
5 |
+
https://github.com/twintproject/twint/wiki
|
6 |
+
|
7 |
+
Licensed under MIT License
|
8 |
+
Copyright (c) 2018 Cody Zacharias
|
9 |
+
'''
|
10 |
+
import logging, os
|
11 |
+
|
12 |
+
from .config import Config
|
13 |
+
from .__version__ import __version__
|
14 |
+
from . import run
|
15 |
+
|
16 |
+
# Map the TWINT_DEBUG environment value to a logging level.
_levels = {
    'info': logging.INFO,
    'debug': logging.DEBUG
}

# Verbosity comes from the TWINT_DEBUG env var; defaults to 'info'.
_level = os.getenv('TWINT_DEBUG', 'info')
# NOTE(review): any TWINT_DEBUG value other than 'info'/'debug' raises
# KeyError at import time — confirm this hard failure is intended.
_logLevel = _levels[_level]

# In debug mode, attach a file handler on the root logger writing twint.log.
if _level == "debug":
    logger = logging.getLogger()
    _output_fn = 'twint.log'
    logger.setLevel(_logLevel)
    formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
    fileHandler = logging.FileHandler(_output_fn)
    fileHandler.setLevel(_logLevel)
    fileHandler.setFormatter(formatter)
    logger.addHandler(fileHandler)
|
twitter-scraper/twint-master/twint/__version__.py
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
# Package version as a tuple, and its dotted-string form derived from it.
VERSION = (2, 1, 21)

__version__ = ".".join(str(part) for part in VERSION)
|
twitter-scraper/twint-master/twint/cli.py
ADDED
@@ -0,0 +1,342 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/env python3
|
2 |
+
'''
|
3 |
+
Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
|
4 |
+
|
5 |
+
See wiki on Github for in-depth details.
|
6 |
+
https://github.com/twintproject/twint/wiki
|
7 |
+
|
8 |
+
Licensed under MIT License
|
9 |
+
Copyright (c) 2018 The Twint Project
|
10 |
+
'''
|
11 |
+
import sys
|
12 |
+
import os
|
13 |
+
import argparse
|
14 |
+
|
15 |
+
from . import run
|
16 |
+
from . import config
|
17 |
+
from . import storage
|
18 |
+
|
19 |
+
|
20 |
+
def error(_error, message):
    """Report a fatal CLI problem on stdout and terminate the process.

    NOTE(review): exits with status 0 even though this is an error path —
    confirm callers/scripts rely on the zero exit code before changing it.
    """
    print(f"[-] {_error}: {message}")
    sys.exit(0)
|
25 |
+
|
26 |
+
|
27 |
+
def check(args):
    """Validate mutually-exclusive / dependent CLI arguments.

    Terminates the process via error() on the first inconsistency found;
    returns None when the argument combination is acceptable.
    """
    # User-targeted modes (-u / --userlist / --members-list) exclude these flags.
    if args.username is not None or args.userlist or args.members_list:
        if args.verified:
            error("Contradicting Args",
                  "Please use --verified in combination with -s.")
        if args.userid:
            error("Contradicting Args",
                  "--userid and -u cannot be used together.")
        if args.all:
            error("Contradicting Args",
                  "--all and -u cannot be used together.")
    elif args.search and args.timeline:
        error("Contradicting Args",
              "--s and --tl cannot be used together.")
    elif args.timeline and not args.username:
        error("Error", "-tl cannot be used without -u.")
    elif args.search is None:
        if args.custom_query is not None:
            pass
        elif (args.geo or args.near) is None and not (args.all or args.userid):
            error("Error", "Please use at least -u, -s, -g or --near.")
    elif args.all and args.userid:
        error("Contradicting Args",
              "--all and --userid cannot be used together")
    # File-output formats require an explicit output path.
    if args.output is None:
        if args.csv:
            error("Error", "Please specify an output file (Example: -o file.csv).")
        elif args.json:
            error("Error", "Please specify an output file (Example: -o file.json).")
    # Fix: corrected the user-facing typo "specifiy" -> "specify" below.
    if args.backoff_exponent <= 0:
        error("Error", "Please specify a positive value for backoff_exponent")
    if args.min_wait_time < 0:
        error("Error", "Please specify a non negative value for min_wait_time")
|
62 |
+
|
63 |
+
|
64 |
+
def loadUserList(ul, _type):
    """Expand a user-list argument into usernames.

    `ul` is either a path to a file with one username per line, or a
    comma-separated string of usernames. For _type == "search" the users are
    joined into a URL-encoded "from:a OR from:b" query fragment; otherwise
    the plain list of usernames is returned.
    """
    if os.path.exists(os.path.abspath(ul)):
        # Fix: close the file deterministically instead of leaking the handle.
        with open(os.path.abspath(ul), "r") as f:
            userlist = f.read().splitlines()
    else:
        userlist = ul.split(",")
    if _type == "search":
        un = ""
        for user in userlist:
            un += "%20OR%20from%3A" + user
        # NOTE(review): [15:] strips the whole first "%20OR%20from%3A" token,
        # so the first username loses its "from%3A" prefix — confirm this is
        # the query format twint expects (behavior preserved as-is).
        return un[15:]
    return userlist
|
77 |
+
|
78 |
+
|
79 |
+
def initialize(args):
    """ Set default values for config from args

    Mechanically copies each parsed CLI argument onto a fresh config.Config.
    NOTE(review): --store-pandas (args.store_pandas) is never copied onto the
    config here — confirm whether that is an intentional omission.
    """
    c = config.Config()
    c.Username = args.username
    c.User_id = args.userid
    c.Search = args.search
    c.Geo = args.geo
    c.Location = args.location
    c.Near = args.near
    c.Lang = args.lang
    c.Output = args.output
    c.Elasticsearch = args.elasticsearch
    c.Year = args.year
    c.Since = args.since
    c.Until = args.until
    c.Email = args.email
    c.Phone = args.phone
    c.Verified = args.verified
    c.Store_csv = args.csv
    c.Tabs = args.tabs
    c.Store_json = args.json
    c.Show_hashtags = args.hashtags
    c.Show_cashtags = args.cashtags
    c.Limit = args.limit
    c.Count = args.count
    c.Stats = args.stats
    c.Database = args.database
    c.To = args.to
    c.All = args.all
    c.Essid = args.essid
    c.Format = args.format
    c.User_full = args.user_full
    # c.Profile_full = args.profile_full
    c.Pandas_type = args.pandas_type
    c.Index_tweets = args.index_tweets
    c.Index_follow = args.index_follow
    c.Index_users = args.index_users
    c.Debug = args.debug
    c.Resume = args.resume
    c.Images = args.images
    c.Videos = args.videos
    c.Media = args.media
    c.Replies = args.replies
    c.Pandas_clean = args.pandas_clean
    c.Proxy_host = args.proxy_host
    c.Proxy_port = args.proxy_port
    c.Proxy_type = args.proxy_type
    c.Tor_control_port = args.tor_control_port
    c.Tor_control_password = args.tor_control_password
    c.Retweets = args.retweets
    c.Custom_query = args.custom_query
    c.Popular_tweets = args.popular_tweets
    c.Skip_certs = args.skip_certs
    c.Hide_output = args.hide_output
    c.Native_retweets = args.native_retweets
    c.Min_likes = args.min_likes
    c.Min_retweets = args.min_retweets
    c.Min_replies = args.min_replies
    c.Links = args.links
    c.Source = args.source
    c.Members_list = args.members_list
    c.Filter_retweets = args.filter_retweets
    c.Translate = args.translate
    c.TranslateDest = args.translate_dest
    c.Backoff_exponent = args.backoff_exponent
    c.Min_wait_time = args.min_wait_time
    return c
|
147 |
+
|
148 |
+
|
149 |
+
def options():
    """ Parse arguments

    Builds the twint argparse parser and returns the parsed Namespace.
    (Help strings are user-facing runtime text and are left untouched.)
    """
    ap = argparse.ArgumentParser(prog="twint",
                                 usage="python3 %(prog)s [options]",
                                 description="TWINT - An Advanced Twitter Scraping Tool.")
    # Target / query selection.
    ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
    ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
    ap.add_argument("--near", help="Near a specified city.")
    ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
    ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
    # Output destinations and formats.
    ap.add_argument("-o", "--output", help="Save output to a file.")
    ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
    # Date-range filters.
    ap.add_argument("--year", help="Filter Tweets before specified year.")
    ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
                    metavar="DATE")
    ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
                    metavar="DATE")
    ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
    ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
    ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
                    action="store_true")
    ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
    ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
    ap.add_argument("--json", help="Write as .json file", action="store_true")
    ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
    ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true")
    ap.add_argument("--userid", help="Twitter user id.")
    ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
    ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
                    action="store_true")
    ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
                    action="store_true")
    ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
    ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
    ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
    # Follower-graph / favorites modes.
    ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
    ap.add_argument("--following", help="Scrape a person's follows", action="store_true")
    ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
    # Proxy / Tor configuration.
    ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
    ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
    ap.add_argument("--proxy-port", help="The port of the proxy server.")
    ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
    ap.add_argument("--tor-control-password",
                    help="If proxy-host is set to tor, this is the password for the control port",
                    default="my_password")
    ap.add_argument("--essid",
                    help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
                    nargs="?", default="")
    ap.add_argument("--userlist", help="Userlist from list or file.")
    ap.add_argument("--retweets",
                    help="Include user's Retweets (Warning: limited).",
                    action="store_true")
    ap.add_argument("--format", help="Custom output format (See wiki for details).")
    ap.add_argument("--user-full",
                    help="Collect all user information (Use with followers or following only).",
                    action="store_true")
    # I am removing this this feature for the time being, because it is no longer required, default method will do this
    # ap.add_argument("--profile-full",
    #                 help="Slow, but effective method of collecting a user's Tweets and RT.",
    #                 action="store_true")
    ap.add_argument(
        "-tl",
        "--timeline",
        help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
        action="store_true",
    )
    ap.add_argument("--translate",
                    help="Get tweets translated by Google Translate.",
                    action="store_true")
    ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
                    default="en")
    # Pandas / Elasticsearch storage tuning.
    ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
    ap.add_argument("--pandas-type",
                    help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
    ap.add_argument("-it", "--index-tweets",
                    help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
    ap.add_argument("-if", "--index-follow",
                    help="Custom Elasticsearch Index name for Follows.",
                    nargs="?", default="twintgraph")
    ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
                    nargs="?", default="twintuser")
    ap.add_argument("--debug",
                    help="Store information in debug logs", action="store_true")
    ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
    # Media filters.
    ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
    ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
    ap.add_argument("--media",
                    help="Display Tweets with only images or videos.", action="store_true")
    ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
    ap.add_argument("-pc", "--pandas-clean",
                    help="Automatically clean Pandas dataframe at every scrape.")
    ap.add_argument("-cq", "--custom-query", help="Custom search query.")
    ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
                    action="store_true")
    # NOTE(review): --skip-certs uses store_false, so passing the flag DISABLES
    # cert verification skipping is inverted relative to the flag name — confirm.
    ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false")
    ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true")
    ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
    ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
    ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
    ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
    ap.add_argument("--links", help="Include or exclude tweets containing one o more links. If not specified" +
                    " you will get both tweets that might contain links or not.")
    ap.add_argument("--source", help="Filter the tweets for specific source client.")
    ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
    ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
    ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.",
                    type=float, default=3.0)
    ap.add_argument("--min-wait-time", type=float, default=15,
                    help="specifiy a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints")
    args = ap.parse_args()

    return args
|
263 |
+
|
264 |
+
|
265 |
+
def main():
    """ Main

    CLI dispatch: parse and validate arguments, run optional pandas cleaning,
    then route to the requested twint runner (favorites / following /
    followers / profile / lookup / timeline / search).
    """
    args = options()
    check(args)

    if args.pandas_clean:
        storage.panda.clean()

    c = initialize(args)

    if args.userlist:
        c.Query = loadUserList(args.userlist, "search")

    if args.pandas_clean:
        storage.panda.clean()

    # Decomposition: the five modes below shared an identical copy-pasted
    # "loop over --userlist or run once" block; factored into _run_per_user.
    if args.favorites:
        _run_per_user(args, c, run.Favorites, "favorites")
    elif args.following:
        _run_per_user(args, c, run.Following, "following")
    elif args.followers:
        _run_per_user(args, c, run.Followers, "followers")
    elif args.retweets:  # or args.profile_full:
        _run_per_user(args, c, run.Profile, "profile")
    elif args.user_full:
        _run_per_user(args, c, run.Lookup, "userlist")
    elif args.timeline:
        run.Profile(c)
    else:
        run.Search(c)


def _run_per_user(args, c, runner, list_type):
    """Invoke *runner* once per user from --userlist (re-initializing the
    config for each), or exactly once with *c* when no userlist was given."""
    if args.userlist:
        for _user in loadUserList(args.userlist, list_type):
            args.username = _user
            runner(initialize(args))
    else:
        runner(c)
|
331 |
+
|
332 |
+
|
333 |
+
def run_as_command():
    """Console-script entry point: enforce the minimum supported Python
    version, then hand off to main()."""
    if sys.version_info < (3, 6):
        print("[-] TWINT requires Python version 3.6+.")
        sys.exit(0)

    main()
|
339 |
+
|
340 |
+
|
341 |
+
# Allow running this module directly as a script (bypasses the version check
# performed by run_as_command).
if __name__ == '__main__':
    main()
|
twitter-scraper/twint-master/twint/config.py
ADDED
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from dataclasses import dataclass, field
from typing import Optional


@dataclass
class Config:
    """Runtime configuration for a twint scraping run.

    Every option has a usable default, so callers typically create an empty
    ``Config()`` and assign attributes before handing it to the ``run``
    module.  Field names are capitalized (historic twint convention) and must
    not be renamed: they are referenced throughout the code base and by user
    scripts.

    Fields whose default is ``None`` are annotated ``Optional[...]``; the
    original annotated several of them plainly as ``str``/``list``/``type``,
    which contradicted their defaults.
    """
    Username: Optional[str] = None
    User_id: Optional[str] = None
    Search: Optional[str] = None
    Lookup: bool = False
    Geo: str = ""
    Location: bool = False
    Near: Optional[str] = None
    Lang: Optional[str] = None
    Output: Optional[str] = None
    Elasticsearch: object = None
    Year: Optional[int] = None
    Since: Optional[str] = None
    Until: Optional[str] = None
    Email: Optional[str] = None
    Phone: Optional[str] = None
    Verified: bool = False
    Store_csv: bool = False
    Store_json: bool = False
    # Originally a mutable CLASS attribute shared by every Config instance,
    # so one run's custom field selection leaked into all other configs.
    # default_factory gives each instance its own dict with the same content.
    Custom: dict = field(default_factory=lambda: {"tweet": None, "user": None, "username": None})
    Show_hashtags: bool = False
    Show_cashtags: bool = False
    Limit: Optional[int] = None
    Count: Optional[int] = None
    Stats: bool = False
    Database: object = None
    To: Optional[str] = None
    All = None  # untyped class attribute in the original; kept as-is for compatibility
    Debug: bool = False
    Format = None  # untyped class attribute in the original; kept as-is for compatibility
    Essid: str = ""
    Profile: bool = False
    Followers: bool = False
    Following: bool = False
    Favorites: bool = False
    TwitterSearch: bool = False
    User_full: bool = False
    # Profile_full: bool = False
    Store_object: bool = False
    Store_object_tweets_list: Optional[list] = None
    Store_object_users_list: Optional[list] = None
    Store_object_follow_list: Optional[list] = None
    Pandas_type: Optional[type] = None
    Pandas: bool = False
    Index_tweets: str = "twinttweets"
    Index_follow: str = "twintgraph"
    Index_users: str = "twintuser"
    Retries_count: int = 10
    Resume: object = None
    Images: bool = False
    Videos: bool = False
    Media: bool = False
    Replies: bool = False
    Pandas_clean: bool = True
    Lowercase: bool = True
    Pandas_au: bool = True
    Proxy_host: str = ""
    Proxy_port: int = 0
    Proxy_type: object = None
    Tor_control_port: int = 9051
    Tor_control_password: Optional[str] = None
    Retweets: bool = False
    Query: Optional[str] = None
    Hide_output: bool = False
    Custom_query: str = ""
    Popular_tweets: bool = False
    Skip_certs: bool = False
    Native_retweets: bool = False
    Min_likes: int = 0
    Min_retweets: int = 0
    Min_replies: int = 0
    Links: Optional[str] = None
    Source: Optional[str] = None
    Members_list: Optional[str] = None
    Filter_retweets: bool = False
    Translate: bool = False
    TranslateSrc: str = "en"
    TranslateDest: str = "en"
    Backoff_exponent: float = 3.0
    Min_wait_time: int = 0
    Bearer_token: Optional[str] = None
    Guest_token: Optional[str] = None
    # ids of deleted tweets encountered while parsing; must be set to a list
    # by the caller before feed parsing appends to it
    deleted: Optional[list] = None
|
twitter-scraper/twint-master/twint/datelock.py
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
|
3 |
+
import logging as logme
|
4 |
+
|
5 |
+
from .tweet import utc_to_local
|
6 |
+
|
7 |
+
|
8 |
+
class Datelock:
    """Value object holding the active date window for a search.

    Instances are produced by Set().  `until`/`since` are datetimes
    (localized via utc_to_local when parsed from user input), and
    `_since_def_user` records whether `since` was supplied by the user.
    """
    until = None  # upper bound of the window (datetime)
    since = None  # lower bound of the window (datetime)
    _since_def_user = None  # True if `since` came from user input, False if defaulted
|
12 |
+
|
13 |
+
|
14 |
+
def convertToDateTime(string):
    """Normalize a user-supplied date string to "YYYY-MM-DD HH:MM:SS" form.

    A bare date gets midnight appended; a full date-time passes through
    unchanged; anything else (empty, or more than two tokens) collapses to
    the empty string.
    """
    parts = string.split()
    if len(parts) == 2:
        return string
    if len(parts) == 1:
        return string + " 00:00:00"
    return ""
|
23 |
+
|
24 |
+
|
25 |
+
def Set(Until, Since):
    """Build a Datelock from the optional Until/Since strings.

    Missing bounds fall back to "now" (until) and Twitter's launch date
    (since); user-supplied bounds are normalized by convertToDateTime and
    localized with utc_to_local.
    """
    logme.debug(__name__+':Set')
    fmt = "%Y-%m-%d %H:%M:%S"
    lock = Datelock()

    if Until:
        lock.until = utc_to_local(datetime.datetime.strptime(convertToDateTime(Until), fmt))
    else:
        lock.until = datetime.datetime.today()

    if Since:
        lock.since = utc_to_local(datetime.datetime.strptime(convertToDateTime(Since), fmt))
        lock._since_def_user = True
    else:
        # 2006-03-21 is the earliest meaningful lower bound (first tweet era).
        lock.since = utc_to_local(datetime.datetime.strptime("2006-03-21 00:00:00", fmt))
        lock._since_def_user = False

    return lock
|
twitter-scraper/twint-master/twint/feed.py
ADDED
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import time
|
2 |
+
from datetime import datetime
|
3 |
+
|
4 |
+
from bs4 import BeautifulSoup
|
5 |
+
from re import findall
|
6 |
+
from json import loads
|
7 |
+
|
8 |
+
import logging as logme
|
9 |
+
|
10 |
+
from .tweet import utc_to_local, Tweet_formats
|
11 |
+
|
12 |
+
|
13 |
+
class NoMoreTweetsException(Exception):
    """Raised when Twitter's response contains no further tweets to page through."""
    # The original defined an __init__ that only forwarded its message to
    # Exception.__init__ — that is exactly what Exception already does, so
    # the override was removed.  Behavior is unchanged.
    pass
|
16 |
+
|
17 |
+
|
18 |
+
def Follow(response):
    """Parse a followers/following HTML page into entries plus a paging cursor."""
    logme.debug(__name__ + ':Follow')
    parsed = BeautifulSoup(response, "html.parser")
    follow = parsed.find_all("td", "info fifty screenname")
    cursor = parsed.find_all("div", "w-button-more")
    try:
        cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
    except IndexError:
        # No "more" button found — last page.  NOTE(review): `cursor` is then
        # left as the raw find_all() result rather than None; confirm callers
        # handle that.
        logme.critical(__name__ + ':Follow:IndexError')

    return follow, cursor
|
29 |
+
|
30 |
+
|
31 |
+
# TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future
def Mobile(response):
    """Parse a mobile-Twitter profile page into tweet metadata spans and the next max_id."""
    logme.debug(__name__ + ':Mobile')
    page = BeautifulSoup(response, "html.parser")
    tweets = page.find_all("span", "metadata")
    max_id = page.find_all("div", "w-button-more")
    try:
        max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
    except Exception as err:
        # Missing "more" button or regex miss — log and fall through with the
        # raw find_all() result, as the original did.
        logme.critical(__name__ + ':Mobile:' + str(err))

    return tweets, max_id
|
43 |
+
|
44 |
+
|
45 |
+
def MobileFav(response):
    """Parse a mobile-Twitter favorites page into tweet tables and the next max_id."""
    logme.debug(__name__ + ':MobileFav')
    soup = BeautifulSoup(response, "html.parser")
    tweets = soup.find_all("table", "tweet")
    max_id = soup.find_all("div", "w-button-more")
    try:
        max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
    except Exception as e:
        # Consistency fix: report through the module logger like the sibling
        # parsers (Follow, Mobile) instead of the original bare print() to
        # stdout.
        logme.critical(__name__ + ':MobileFav:' + str(e))

    return tweets, max_id
|
55 |
+
|
56 |
+
|
57 |
+
def _get_cursor(response):
    """Extract the pagination cursor from a Twitter timeline response.

    Case 1: `response` is the full decoded JSON dict (legacy search layout).
    Case 2: `response` is the already-extracted entry list from the GraphQL
    layout, whose last element carries the cursor.
    """
    if isinstance(response, dict):  # case 1
        try:
            next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
                'operation']['cursor']['value']
        except KeyError:
            # this is needed because after the first request location of cursor is changed
            next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
                'cursor']['value']
    else:  # case 2
        next_cursor = response[-1]['content']['value']
    return next_cursor
|
69 |
+
|
70 |
+
|
71 |
+
def Json(response):
    """Decode a legacy search JSON payload; return the tweet divs and the min_position cursor."""
    logme.debug(__name__ + ':Json')
    payload = loads(response)
    markup = BeautifulSoup(payload["items_html"], "html.parser")
    return markup.find_all("div", "tweet"), payload["min_position"]
|
78 |
+
|
79 |
+
|
80 |
+
def parse_tweets(config, response):
    """Parse a Twitter JSON response into a list of raw tweet dicts plus the next cursor.

    Two payload layouts are handled:
      * legacy search/profile responses carrying a top-level 'globalObjects'
        (tweets keyed by tweet id, users keyed by user id), and
      * the newer GraphQL user-timeline layout under data.user.result.timeline.

    Each returned tweet dict is augmented with 'user_data' (the author's user
    object) and, for retweets, a 'retweet_data' dict.  Raises
    NoMoreTweetsException when the payload contains no further tweets.
    `config.deleted` must already be a list; ids of tombstoned/deleted tweets
    are appended to it.
    """
    logme.debug(__name__ + ':parse_tweets')
    response = loads(response)
    feed = []
    if 'globalObjects' in response:
        if len(response['globalObjects']['tweets']) == 0:
            msg = 'No more data!'
            raise NoMoreTweetsException(msg)
        for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
            # this will handle the cases when the timeline entry is a tweet
            if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
                                                             timeline_entry['entryId'].startswith('tweet-')):
                if 'tweet' in timeline_entry['content']['item']['content']:
                    _id = timeline_entry['content']['item']['content']['tweet']['id']
                    # skip the ads
                    if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
                        continue
                elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
                        timeline_entry['content']['item']['content']['tombstone']:
                    _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
                else:
                    _id = None
                if _id is None:
                    raise ValueError('Unable to find ID of tweet in timeline.')
                try:
                    temp_obj = response['globalObjects']['tweets'][_id]
                except KeyError:
                    # tweet referenced by the timeline but absent from
                    # globalObjects — treated as deleted
                    logme.info('encountered a deleted tweet with id {}'.format(_id))

                    config.deleted.append(_id)
                    continue
                temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
                if 'retweeted_status_id_str' in temp_obj:
                    # attach the retweeted tweet's author/text/date so callers
                    # don't need a second lookup
                    rt_id = temp_obj['retweeted_status_id_str']
                    _dt = response['globalObjects']['tweets'][rt_id]['created_at']
                    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
                    _dt = utc_to_local(_dt)
                    _dt = str(_dt.strftime(Tweet_formats['datetime']))
                    temp_obj['retweet_data'] = {
                        'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
                        'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
                        'retweet_id': rt_id,
                        'retweet_date': _dt,
                    }
                feed.append(temp_obj)
        next_cursor = _get_cursor(response)  # case 1
    else:
        response = response['data']['user']['result']['timeline']
        entries = response['timeline']['instructions']
        for e in entries:
            if e.get('entries'):
                entries = e['entries']
                break
        # two remaining entries are just the top/bottom cursor pair —
        # presumably nothing more to fetch (TODO confirm against live payloads)
        if len(entries) == 2:
            msg = 'No more data!'
            raise NoMoreTweetsException(msg)
        for timeline_entry in entries:
            if timeline_entry['content'].get('itemContent'):
                try:
                    temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
                    temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result']['core']['user_results']['result']['legacy']
                    feed.append(temp_obj)
                except KeyError:  # doubtful
                    # NOTE(review): bare `next` is a no-op expression here;
                    # `continue` was almost certainly intended
                    next
        next_cursor = _get_cursor(entries)  # case 2
    return feed, next_cursor
|
twitter-scraper/twint-master/twint/format.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging as logme
|
2 |
+
|
3 |
+
def Tweet(config, t):
    """Render tweet `t` as a single output line.

    With config.Format set, every "{placeholder}" in the template is replaced
    by the matching tweet attribute.  The substitutions run in the same fixed
    order as the original chained .replace() calls, so a placeholder occurring
    inside an already-substituted value is expanded identically.  Without a
    template, a compact default line is built and optionally extended with
    hashtags, cashtags, stats and translation info.
    """
    if config.Format:
        logme.debug(__name__+':Tweet:Format')
        substitutions = [
            ("{id}", t.id_str),
            ("{conversation_id}", t.conversation_id),
            ("{date}", t.datestamp),
            ("{time}", t.timestamp),
            ("{user_id}", t.user_id_str),
            ("{username}", t.username),
            ("{name}", t.name),
            ("{place}", t.place),
            ("{timezone}", t.timezone),
            ("{urls}", ",".join(t.urls)),
            ("{photos}", ",".join(t.photos)),
            ("{video}", str(t.video)),
            ("{thumbnail}", t.thumbnail),
            ("{tweet}", t.tweet),
            ("{language}", t.lang),
            ("{hashtags}", ",".join(t.hashtags)),
            ("{cashtags}", ",".join(t.cashtags)),
            ("{replies}", t.replies_count),
            ("{retweets}", t.retweets_count),
            ("{likes}", t.likes_count),
            ("{link}", t.link),
            ("{is_retweet}", str(t.retweet)),
            ("{user_rt_id}", str(t.user_rt_id)),
            ("{quote_url}", t.quote_url),
            ("{near}", t.near),
            ("{geo}", t.geo),
            ("{mentions}", ",".join(t.mentions)),
            ("{translate}", t.translate),
            ("{trans_src}", t.trans_src),
            ("{trans_dest}", t.trans_dest),
        ]
        output = config.Format
        for placeholder, value in substitutions:
            output = output.replace(placeholder, value)
    else:
        logme.debug(__name__+':Tweet:notFormat')
        output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "

        # TODO: someone who is familiar with this code, needs to take a look at what this is <also see tweet.py>
        # if t.retweet:
        #     output += "RT "

        output += f"<{t.username}> {t.tweet}"

        if config.Show_hashtags:
            output += " " + ",".join(t.hashtags)
        if config.Show_cashtags:
            output += " " + ",".join(t.cashtags)
        if config.Stats:
            output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
        if config.Translate:
            output += f" {t.translate} {t.trans_src} {t.trans_dest}"
    return output
|
57 |
+
|
58 |
+
def User(_format, u):
    """Render user `u` either through the `_format` template or as the default summary line."""
    if _format:
        logme.debug(__name__+':User:Format')
        substitutions = [
            ("{id}", str(u.id)),
            ("{name}", u.name),
            ("{username}", u.username),
            ("{bio}", u.bio),
            ("{location}", u.location),
            ("{url}", u.url),
            ("{join_date}", u.join_date),
            ("{join_time}", u.join_time),
            ("{tweets}", str(u.tweets)),
            ("{following}", str(u.following)),
            ("{followers}", str(u.followers)),
            ("{likes}", str(u.likes)),
            ("{media}", str(u.media_count)),
            ("{private}", str(u.is_private)),
            ("{verified}", str(u.is_verified)),
            ("{avatar}", u.avatar),
            # falsy background_image becomes the empty string, as before
            ("{background_image}", u.background_image if u.background_image else ""),
        ]
        output = _format
        for placeholder, value in substitutions:
            output = output.replace(placeholder, value)
    else:
        logme.debug(__name__+':User:notFormat')
        output = (
            f"{u.id} | {u.name} | @{u.username} | Private: "
            f"{u.is_private} | Verified: {u.is_verified} |"
            f" Bio: {u.bio} | Location: {u.location} | Url: "
            f"{u.url} | Joined: {u.join_date} {u.join_time} "
            f"| Tweets: {u.tweets} | Following: {u.following}"
            f" | Followers: {u.followers} | Likes: {u.likes} "
            f"| Media: {u.media_count} | Avatar: {u.avatar}"
        )

    return output
|
twitter-scraper/twint-master/twint/get.py
ADDED
@@ -0,0 +1,298 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from async_timeout import timeout
|
2 |
+
from datetime import datetime
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
import sys
|
5 |
+
import socket
|
6 |
+
import aiohttp
|
7 |
+
from fake_useragent import UserAgent
|
8 |
+
import asyncio
|
9 |
+
import concurrent.futures
|
10 |
+
import random
|
11 |
+
from json import loads, dumps
|
12 |
+
from aiohttp_socks import ProxyConnector, ProxyType
|
13 |
+
from urllib.parse import quote
|
14 |
+
import time
|
15 |
+
|
16 |
+
from . import url
|
17 |
+
from .output import Tweets, Users
|
18 |
+
from .token import TokenExpiryException
|
19 |
+
|
20 |
+
import logging as logme
|
21 |
+
|
22 |
+
# Set by get_connector() when an HTTP proxy is configured; passed as the
# `proxy` argument of every aiohttp request (None means a direct connection).
httpproxy = None
|
23 |
+
|
24 |
+
# Fallback pool of desktop User-Agent strings used by RandomUserAgent() when
# the fake_useragent lookup fails.  The Chrome entries were disabled upstream
# and are kept commented out as found.
user_agent_list = [
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.113 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.90 Safari/537.36',
    # 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/44.0.2403.157 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/60.0.3112.113 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/57.0.2987.133 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/57.0.2987.133 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/55.0.2883.87 Safari/537.36',
    # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
    # ' Chrome/55.0.2883.87 Safari/537.36',

    'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
    'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
    'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
    'CLR 3.5.30729)',
]
|
61 |
+
|
62 |
+
|
63 |
+
def dict_to_url(dct):
    """URL-encode `dct` as a JSON document.

    Some Twitter endpoints expect their `variables` query parameter in this
    percent-encoded JSON form.
    """
    return quote(dumps(dct))
|
67 |
+
|
68 |
+
|
69 |
+
def get_connector(config):
    """Build the aiohttp connector implied by the proxy settings in `config`.

    Returns a ProxyConnector for "tor" (local SOCKS on 127.0.0.1:9050) or for
    an explicit socks4/socks5 proxy.  For an "http" proxy the proxy URL is
    recorded in the module-level `httpproxy` instead and None is returned
    (aiohttp takes HTTP proxies per-request).  Exits the process with status 1
    on inconsistent or unsupported proxy arguments.
    """
    logme.debug(__name__ + ':get_connector')
    _connector = None
    if config.Proxy_host:
        if config.Proxy_host.lower() == "tor":
            _connector = ProxyConnector(
                host='127.0.0.1',
                port=9050,
                rdns=True)
        elif config.Proxy_port and config.Proxy_type:
            if config.Proxy_type.lower() == "socks5":
                _type = ProxyType.SOCKS5
            elif config.Proxy_type.lower() == "socks4":
                _type = ProxyType.SOCKS4
            elif config.Proxy_type.lower() == "http":
                global httpproxy
                httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
                # HTTP proxying is handled via `httpproxy`, not a connector
                return _connector
            else:
                logme.critical("get_connector:proxy-type-error")
                print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
                sys.exit(1)
            _connector = ProxyConnector(
                proxy_type=_type,
                host=config.Proxy_host,
                port=config.Proxy_port,
                rdns=True)
        else:
            logme.critical(__name__ + ':get_connector:proxy-port-type-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)
    else:
        # host missing but port/type given: partial proxy config is an error
        if config.Proxy_port or config.Proxy_type:
            logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
            print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
            sys.exit(1)

    return _connector
|
107 |
+
|
108 |
+
|
109 |
+
async def RequestUrl(config, init):
    """Build the URL/params for the current scrape mode and fetch it.

    `init` is the pagination token for the request.  Returns the raw response
    body text.  When config.Debug is set, the serialized query is appended to
    twint-request_urls.log.
    """
    logme.debug(__name__ + ':RequestUrl')
    _connector = get_connector(config)
    _serialQuery = ""
    params = []
    _url = ""
    _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]

    # TODO : do this later
    if config.Profile:
        logme.debug(__name__ + ':RequestUrl:Profile')
        _url, params, _serialQuery = url.SearchProfile(config, init)
    elif config.TwitterSearch:
        logme.debug(__name__ + ':RequestUrl:TwitterSearch')
        _url, params, _serialQuery = await url.Search(config, init)
    else:
        # follow/favorites endpoints take only a URL, no params tuple
        if config.Following:
            logme.debug(__name__ + ':RequestUrl:Following')
            _url = await url.Following(config.Username, init)
        elif config.Followers:
            logme.debug(__name__ + ':RequestUrl:Followers')
            _url = await url.Followers(config.Username, init)
        else:
            logme.debug(__name__ + ':RequestUrl:Favorites')
            _url = await url.Favorites(config.Username, init)
        _serialQuery = _url

    response = await Request(_url, params=params, connector=_connector, headers=_headers)

    if config.Debug:
        # NOTE(review): the log file handle opened here is never explicitly
        # closed; relies on interpreter cleanup
        print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))

    return response
|
142 |
+
|
143 |
+
|
144 |
+
def ForceNewTorIdentity(config):
    """Ask the local Tor control port for a fresh circuit (SIGNAL NEWNYM).

    Failures are reported on stderr (and logged) but never raised, so the
    scrape simply continues over the current identity.
    """
    logme.debug(__name__ + ':ForceNewTorIdentity')
    try:
        ctrl = socket.create_connection(('127.0.0.1', config.Tor_control_port))
        command = 'AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password)
        ctrl.send(command.encode())
        reply = ctrl.recv(1024)
        if reply != b'250 OK\r\n250 OK\r\n':
            sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(reply))
            logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
    except Exception as e:
        logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
        sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
        sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
|
157 |
+
|
158 |
+
|
159 |
+
async def Request(_url, connector=None, params=None, headers=None):
    """Open a throwaway aiohttp session and fetch `_url` through Response()."""
    logme.debug(__name__ + ':Request:Connector')
    async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
        return await Response(session, _url, params)
|
163 |
+
|
164 |
+
|
165 |
+
async def Response(session, _url, params=None):
    """GET `_url` on `session` with retries; return the response body text.

    Transient connection errors are retried up to `retries` times with a
    fixed `wait`-second pause.  Raises TokenExpiryException on HTTP 429
    (rate limit) or once every attempt has failed.
    """
    logme.debug(__name__ + ':Response')
    retries = 5
    wait = 10  # No basis, maybe work with 0
    for attempt in range(retries + 1):
        try:
            with timeout(120):
                async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
                    resp = await response.text()
                    if response.status == 429:  # 429 implies Too many requests i.e. Rate Limit Exceeded
                        raise TokenExpiryException(loads(resp)['errors'][0]['message'])
                    return resp
        except aiohttp.client_exceptions.ClientConnectorError as exc:
            if attempt < retries:
                retrying = ', retrying'
                level = logme.WARNING
            else:
                retrying = ''
                level = logme.ERROR
            logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
            if attempt < retries:
                # Bug fix: the original called the blocking time.sleep() here,
                # freezing the whole event loop for `wait` seconds; the async
                # sleep lets other tasks keep running while we back off.
                await asyncio.sleep(wait)
            else:
                logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
                raise TokenExpiryException(f'{exc!r}')
|
190 |
+
|
191 |
+
|
192 |
+
async def RandomUserAgent(wa=None):
    """Return a User-Agent string.

    With `wa` truthy a fixed Chrome 41 desktop UA is returned (callers pass
    this for endpoints that want a specific UA — purpose of the flag is not
    evident here, TODO confirm).  Otherwise fake_useragent supplies a random
    UA, falling back to `user_agent_list` on any failure.
    """
    logme.debug(__name__ + ':RandomUserAgent')
    try:
        if wa:
            return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
        return UserAgent(verify_ssl=False, use_cache_server=False).random
    except Exception:
        # Narrowed from a bare `except:`, which also swallowed
        # KeyboardInterrupt/SystemExit.
        return random.choice(user_agent_list)
|
200 |
+
|
201 |
+
|
202 |
+
async def Username(_id, bearer_token, guest_token):
    """Resolve a numeric user id to its screen name via the UserByRestId GraphQL endpoint."""
    logme.debug(__name__ + ':Username')
    _dct = {'userId': _id, 'withHighlightedLabel': False}
    _url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
    _headers = {
        'authorization': bearer_token,
        'x-guest-token': guest_token,
    }
    r = await Request(_url, headers=_headers)
    j_r = loads(r)
    username = j_r['data']['user']['legacy']['screen_name']
    return username
|
214 |
+
|
215 |
+
|
216 |
+
async def Tweet(url, config, conn):
    """Fetch a single tweet page and hand the parsed tweet divs to output.Tweets.

    Any error is logged as critical and swallowed, so one bad tweet does not
    abort the scrape.
    """
    logme.debug(__name__ + ':Tweet')
    try:
        response = await Request(url)
        soup = BeautifulSoup(response, "html.parser")
        tweets = soup.find_all("div", "tweet")
        await Tweets(tweets, config, conn, url)
    except Exception as e:
        logme.critical(__name__ + ':Tweet:' + str(e))
|
225 |
+
|
226 |
+
|
227 |
+
async def User(username, config, conn, user_id=False):
    """Fetch a user profile via the UserByScreenName GraphQL endpoint.

    With user_id=True, returns the user's numeric rest_id (or None if the key
    is missing from the response); otherwise forwards the JSON to
    output.Users and returns None.  Other errors are logged and re-raised.
    """
    logme.debug(__name__ + ':User')
    _dct = {'screen_name': username, 'withHighlightedLabel': False}
    _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
        .format(dict_to_url(_dct))
    _headers = {
        'authorization': config.Bearer_token,
        'x-guest-token': config.Guest_token,
    }
    try:
        response = await Request(_url, headers=_headers)
        j_r = loads(response)
        if user_id:
            try:
                _id = j_r['data']['user']['rest_id']
                return _id
            except KeyError as e:
                # user not found / suspended: the payload has no rest_id
                logme.critical(__name__ + ':User:' + str(e))
                return
        await Users(j_r, config, conn)
    except Exception as e:
        logme.critical(__name__ + ':User:' + str(e))
        raise
|
250 |
+
|
251 |
+
|
252 |
+
def Limit(Limit, count):
    """Return True when `count` has reached the configured result limit.

    `Limit` is the maximum number of results (any int-convertible value);
    None means unlimited.  Now always returns a bool — the original returned
    True or None, which behave identically in boolean context, so callers
    are unaffected.  (The parameter shadowing the function name is kept for
    interface compatibility.)
    """
    logme.debug(__name__ + ':Limit')
    return Limit is not None and count >= int(Limit)
|
256 |
+
|
257 |
+
|
258 |
+
async def Multi(feed, config, conn):
    """Dispatch per-tweet (or per-user) detail fetches for every entry in `feed`.

    Builds the detail URL for each feed entry depending on the scrape mode,
    then runs get.User/get.Tweet for it.  Returns the number of entries
    processed.

    NOTE(review): `await User(...)`/`await Tweet(...)` runs the coroutine
    immediately and passes its return value (None) to run_in_executor, so the
    executor adds no concurrency here and gather() raises the swallowed
    "'NoneType' object is not callable" mentioned below — confirm before
    relying on parallelism.
    """
    logme.debug(__name__ + ':Multi')
    count = 0
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
            loop = asyncio.get_event_loop()
            futures = []
            for tweet in feed:
                count += 1
                if config.Favorites or config.Profile_full:
                    logme.debug(__name__ + ':Multi:Favorites-profileFull')
                    link = tweet.find("a")["href"]
                    url = f"https://twitter.com{link}&lang=en"
                elif config.User_full:
                    logme.debug(__name__ + ':Multi:userFull')
                    username = tweet.find("a")["name"]
                    url = f"http://twitter.com/{username}?lang=en"
                else:
                    logme.debug(__name__ + ':Multi:else-url')
                    link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
                    url = f"https://twitter.com{link}?lang=en"

                if config.User_full:
                    logme.debug(__name__ + ':Multi:user-full-Run')
                    futures.append(loop.run_in_executor(executor, await User(url,
                                                                             config, conn)))
                else:
                    logme.debug(__name__ + ':Multi:notUser-full-Run')
                    futures.append(loop.run_in_executor(executor, await Tweet(url,
                                                                              config, conn)))
            logme.debug(__name__ + ':Multi:asyncioGather')
            await asyncio.gather(*futures)
    except Exception as e:
        # TODO: fix error not error
        # print(str(e) + " [x] get.Multi")
        # will return "'NoneType' object is not callable"
        # but still works
        # logme.critical(__name__+':Multi:' + str(e))
        pass

    return count
|
twitter-scraper/twint-master/twint/output.py
ADDED
@@ -0,0 +1,241 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from datetime import datetime
|
2 |
+
|
3 |
+
from . import format, get
|
4 |
+
from .tweet import Tweet
|
5 |
+
from .user import User
|
6 |
+
from .storage import db, elasticsearch, write, panda
|
7 |
+
|
8 |
+
import logging as logme
|
9 |
+
|
10 |
+
follows_list = []
|
11 |
+
tweets_list = []
|
12 |
+
users_list = []
|
13 |
+
|
14 |
+
author_list = {''}
|
15 |
+
author_list.pop()
|
16 |
+
|
17 |
+
# used by Pandas
|
18 |
+
_follows_object = {}
|
19 |
+
|
20 |
+
|
21 |
+
def _formatDateTime(datetimestamp):
|
22 |
+
try:
|
23 |
+
return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
+
except ValueError:
|
25 |
+
return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
|
26 |
+
|
27 |
+
|
28 |
+
def _clean_follow_list():
|
29 |
+
logme.debug(__name__ + ':clean_follow_list')
|
30 |
+
global _follows_object
|
31 |
+
_follows_object = {}
|
32 |
+
|
33 |
+
|
34 |
+
def clean_lists():
    """Reset all three module-level result accumulators to empty lists."""
    logme.debug(__name__ + ':clean_lists')
    global follows_list, tweets_list, users_list
    follows_list, tweets_list, users_list = [], [], []
|
42 |
+
|
43 |
+
|
44 |
+
def datecheck(datetimestamp, config):
    """Return True when *datetimestamp* falls inside the configured
    Since/Until window.  Either bound may be unset (falsy), in which case
    that side of the range is not enforced."""
    logme.debug(__name__ + ':datecheck')
    if config.Since:
        logme.debug(__name__ + ':datecheck:SinceTrue')
        # Parse lazily: the stamp is only parsed when a bound is configured.
        if _formatDateTime(datetimestamp) < _formatDateTime(config.Since):
            return False
    if config.Until:
        logme.debug(__name__ + ':datecheck:UntilTrue')
        if _formatDateTime(datetimestamp) > _formatDateTime(config.Until):
            return False
    logme.debug(__name__ + ':datecheck:dateRangeFalse')
    return True
|
64 |
+
|
65 |
+
|
66 |
+
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
|
67 |
+
# `tweets` list along with the other tweets
|
68 |
+
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
# `tweets` list along with the other tweets
def is_tweet(tw):
    """Return True if *tw* carries a "data-item-id" key (i.e. looks like a
    real tweet element), False otherwise.

    Fix: the original used a bare ``except:`` which also swallowed
    KeyboardInterrupt/SystemExit; only the lookup failures we expect
    (missing key, or an object that does not support indexing) are caught.
    """
    try:
        tw["data-item-id"]
        logme.debug(__name__ + ':is_tweet:True')
        return True
    except (KeyError, TypeError):
        logme.critical(__name__ + ':is_tweet:False')
        return False
|
76 |
+
|
77 |
+
|
78 |
+
def _output(obj, output, config, **extra):
    """Emit one result (*obj*, a username string, user or tweet object)
    through every configured sink: file (CSV/JSON/text), Elasticsearch
    progress dot, and/or the console string *output*.

    When config.Lowercase is set, string and tweet objects are lowercased
    in place before writing; an object of any other class (except "user")
    is treated as a hidden/suspended-account placeholder and dropped.
    """
    logme.debug(__name__ + ':_output')
    if config.Lowercase:
        if isinstance(obj, str):
            # Plain username from a follower/following scrape.
            logme.debug(__name__ + ':_output:Lowercase:username')
            obj = obj.lower()
        elif obj.__class__.__name__ == "user":
            # User objects are written as-is; nothing to lowercase here.
            logme.debug(__name__ + ':_output:Lowercase:user')
            pass
        elif obj.__class__.__name__ == "tweet":
            # Lowercase the author plus every mention/hashtag/cashtag in place.
            logme.debug(__name__ + ':_output:Lowercase:tweet')
            obj.username = obj.username.lower()
            author_list.update({obj.username})
            for dct in obj.mentions:
                for key, val in dct.items():
                    dct[key] = val.lower()
            for i in range(len(obj.hashtags)):
                obj.hashtags[i] = obj.hashtags[i].lower()
            for i in range(len(obj.cashtags)):
                obj.cashtags[i] = obj.cashtags[i].lower()
        else:
            # Unknown object class: treated as a hidden tweet and skipped.
            logme.info('_output:Lowercase:hiddenTweetFound')
            print("[x] Hidden tweet found, account suspended due to violation of TOS")
            return
    if config.Output != None:
        # File output: CSV takes precedence over JSON, then plain text.
        if config.Store_csv:
            try:
                write.Csv(obj, config)
                logme.debug(__name__ + ':_output:CSV')
            except Exception as e:
                logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
                print(str(e) + " [x] output._output")
        elif config.Store_json:
            write.Json(obj, config)
            logme.debug(__name__ + ':_output:JSON')
        else:
            write.Text(output, config.Output)
            logme.debug(__name__ + ':_output:Text')

    if config.Elasticsearch:
        # The ES writer prints its own info; just show a progress dot here.
        logme.debug(__name__ + ':_output:Elasticsearch')
        print("", end=".", flush=True)
    else:
        if not config.Hide_output:
            try:
                print(output.replace('\n', ' '))
            except UnicodeEncodeError:
                # Terminal encoding cannot represent the text; skip the line.
                logme.critical(__name__ + ':_output:UnicodeEncodeError')
                print("unicode error [x] output._output")
|
127 |
+
|
128 |
+
|
129 |
+
async def checkData(tweet, config, conn):
    """Parse one raw tweet element into a Tweet object, filter it against
    the configured Since/Until window, and dispatch it to every enabled
    sink (sqlite via *conn*, Pandas, Store_object list, Elasticsearch,
    and finally _output for file/console output)."""
    logme.debug(__name__ + ':checkData')
    tweet = Tweet(tweet, config)
    if not tweet.datestamp:
        # A tweet without a datestamp is assumed to come from a suspended
        # account and is dropped.
        logme.critical(__name__ + ':checkData:hiddenTweetFound')
        print("[x] Hidden tweet found, account suspended due to violation of TOS")
        return
    if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
        output = format.Tweet(config, tweet)
        if config.Database:
            logme.debug(__name__ + ':checkData:Database')
            db.tweets(conn, tweet, config)
        if config.Pandas:
            logme.debug(__name__ + ':checkData:Pandas')
            panda.update(tweet, config)
        if config.Store_object:
            logme.debug(__name__ + ':checkData:Store_object')
            # Prefer a caller-supplied list; fall back to the module-level one.
            if hasattr(config.Store_object_tweets_list, 'append'):
                config.Store_object_tweets_list.append(tweet)
            else:
                tweets_list.append(tweet)
        if config.Elasticsearch:
            logme.debug(__name__ + ':checkData:Elasticsearch')
            elasticsearch.Tweet(tweet, config)
        _output(tweet, output, config)
    # else:
    #     logme.critical(__name__+':checkData:copyrightedTweet')
|
156 |
+
|
157 |
+
|
158 |
+
async def Tweets(tweets, config, conn):
    """Route scraped tweet data to checkData().

    *tweets* is a collection when scraping favorites/location results and a
    single element otherwise; profile/search results go straight through,
    while timeline results are filtered to the target user unless retweets
    are requested.
    """
    logme.debug(__name__ + ':Tweets')
    if config.Favorites or config.Location:
        logme.debug(__name__ + ':Tweets:fav+full+loc')
        for single_tweet in tweets:
            await checkData(single_tweet, config, conn)
        return
    if config.TwitterSearch or config.Profile:
        logme.debug(__name__ + ':Tweets:TwitterSearch')
        await checkData(tweets, config, conn)
        return
    logme.debug(__name__ + ':Tweets:else')
    if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
        await checkData(tweets, config, conn)
|
171 |
+
|
172 |
+
|
173 |
+
async def Users(u, config, conn):
    """Wrap raw user data *u* in a User object and push it to every enabled
    sink (sqlite, Elasticsearch, Store_object list, Pandas, _output)."""
    logme.debug(__name__ + ':User')
    global users_list

    user = User(u)
    output = format.User(config.Format, user)

    if config.Database:
        logme.debug(__name__ + ':User:Database')
        db.user(conn, config, user)

    if config.Elasticsearch:
        logme.debug(__name__ + ':User:Elasticsearch')
        # ES expects ISO-style date/time; convert, index, then restore the
        # original "12 Jan 2020" / "1:30 PM" strings so later sinks see the
        # unmodified object.
        _save_date = user.join_date
        _save_time = user.join_time
        user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
        user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
        elasticsearch.UserProfile(user, config)
        user.join_date = _save_date
        user.join_time = _save_time

    if config.Store_object:
        logme.debug(__name__ + ':User:Store_object')

        # Caller-supplied lists win over the module-level accumulator.
        if hasattr(config.Store_object_follow_list, 'append'):
            config.Store_object_follow_list.append(user)
        elif hasattr(config.Store_object_users_list, 'append'):
            config.Store_object_users_list.append(user)
        else:
            users_list.append(user)  # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':User:Pandas+user')
        panda.update(user, config)

    _output(user, output, config)
|
209 |
+
|
210 |
+
|
211 |
+
async def Username(username, config, conn):
    """Record one follower/following *username* for config.Username in every
    enabled sink (sqlite, Elasticsearch, Store_object list, Pandas)."""
    logme.debug(__name__ + ':Username')
    global _follows_object
    global follows_list
    # Exactly one of config.Following / config.Followers is expected truthy,
    # so this yields either "following" or "followers".
    follow_var = config.Following * "following" + config.Followers * "followers"

    if config.Database:
        logme.debug(__name__ + ':Username:Database')
        db.follow(conn, config.Username, config.Followers, username)

    if config.Elasticsearch:
        logme.debug(__name__ + ':Username:Elasticsearch')
        elasticsearch.Follow(username, config)

    if config.Store_object:
        if hasattr(config.Store_object_follow_list, 'append'):
            config.Store_object_follow_list.append(username)
        else:
            follows_list.append(username)  # twint.user.user

    if config.Pandas:
        logme.debug(__name__ + ':Username:object+pandas')
        # Lazily create the {target: {"followers"/"following": [...]}} entry.
        try:
            _ = _follows_object[config.Username][follow_var]
        except KeyError:
            _follows_object.update({config.Username: {follow_var: []}})
        _follows_object[config.Username][follow_var].append(username)
        if config.Pandas_au:
            logme.debug(__name__ + ':Username:object+pandas+au')
            panda.update(_follows_object[config.Username], config)
    _output(username, username, config)
|
twitter-scraper/twint-master/twint/run.py
ADDED
@@ -0,0 +1,412 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys, os, datetime
|
2 |
+
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
|
3 |
+
|
4 |
+
from . import datelock, feed, get, output, verbose, storage
|
5 |
+
from .token import TokenExpiryException
|
6 |
+
from . import token
|
7 |
+
from .storage import db
|
8 |
+
from .feed import NoMoreTweetsException
|
9 |
+
|
10 |
+
import logging as logme
|
11 |
+
|
12 |
+
import time
|
13 |
+
|
14 |
+
# Static bearer token sent with guest requests; the Guest_token obtained via
# token.Token is refreshed separately.
bearer = (
    'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs'
    '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
)
|
16 |
+
|
17 |
+
|
18 |
+
class Twint:
    """One scraping session: fetches pages via get.RequestUrl, parses them
    with the feed module, and forwards results to the output module.

    The entry point is main()/run(); Feed() fetches and parses a single
    page and advances the pagination cursor ``self.init``.
    """

    def __init__(self, config):
        logme.debug(__name__ + ':Twint:__init__')
        # Resume only applies to search/follower/following scrapes; the
        # resume file stores the last pagination cursor, one per line.
        if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
            logme.debug(__name__ + ':Twint:__init__:Resume')
            self.init = self.get_resume(config.Resume)
        else:
            self.init = -1

        config.deleted = []
        self.feed: list = [-1]
        self.count = 0
        self.user_agent = ""
        self.config = config
        self.config.Bearer_token = bearer
        # TODO might have to make some adjustments for it to work with multi-treading
        # USAGE : to get a new guest token simply do `self.token.refresh()`
        self.token = token.Token(config)
        self.token.refresh()
        self.conn = db.Conn(config.Database)
        self.d = datelock.Set(self.config.Until, self.config.Since)
        verbose.Elastic(config.Elasticsearch)

        if self.config.Store_object:
            logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
            output._clean_follow_list()

        if self.config.Pandas_clean:
            logme.debug(__name__ + ':Twint:__init__:pandas_clean')
            storage.panda.clean()

    def get_resume(self, resumeFile):
        """Return the last cursor stored in *resumeFile*, or '-1' when the
        file does not exist."""
        if not os.path.exists(resumeFile):
            return '-1'
        with open(resumeFile, 'r') as rFile:
            _init = rFile.readlines()[-1].strip('\n')
            return _init

    async def Feed(self):
        """Fetch one page, parse it into self.feed, and advance self.init.

        Retries with exponential backoff (Backoff_exponent/Min_wait_time)
        up to Retries_count times; rotates the Tor identity on timeouts
        when configured.
        """
        logme.debug(__name__ + ':Twint:Feed')
        consecutive_errors_count = 0
        while True:
            # this will receive a JSON string, parse it into a `dict` and do the required stuff
            try:
                response = await get.RequestUrl(self.config, self.init)
            except TokenExpiryException as e:
                # Guest token expired mid-scrape: refresh it and retry once.
                logme.debug(__name__ + 'Twint:Feed:' + str(e))
                self.token.refresh()
                response = await get.RequestUrl(self.config, self.init)

            if self.config.Debug:
                print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))

            self.feed = []
            try:
                if self.config.Favorites:
                    # Favorites are scraped from the mobile site and may need
                    # several attempts with fresh user agents.
                    self.feed, self.init = feed.MobileFav(response)
                    favorite_err_cnt = 0
                    if len(self.feed) == 0 and len(self.init) == 0:
                        while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
                            self.user_agent = await get.RandomUserAgent(wa=False)
                            response = await get.RequestUrl(self.config, self.init,
                                                            headers=[("User-Agent", self.user_agent)])
                            self.feed, self.init = feed.MobileFav(response)
                            favorite_err_cnt += 1
                            time.sleep(1)
                        if favorite_err_cnt == 5:
                            print("Favorite page could not be fetched")
                    # Throttle every 40 results to reduce rate limiting.
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Followers or self.config.Following:
                    self.feed, self.init = feed.Follow(response)
                    if not self.count % 40:
                        time.sleep(5)
                elif self.config.Profile or self.config.TwitterSearch:
                    try:
                        self.feed, self.init = feed.parse_tweets(self.config, response)
                    except NoMoreTweetsException as e:
                        logme.debug(__name__ + ':Twint:Feed:' + str(e))
                        print('[!] ' + str(e) + ' Scraping will stop now.')
                        print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
                        break
                break
            except TimeoutError as e:
                if self.config.Proxy_host.lower() == "tor":
                    print("[?] Timed out, changing Tor identity...")
                    if self.config.Tor_control_password is None:
                        logme.critical(__name__ + ':Twint:Feed:tor-password')
                        sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
                        sys.stderr.write(
                            "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
                            "-controller-interface-directly\r\n")
                        break
                    else:
                        get.ForceNewTorIdentity(self.config)
                        continue
                else:
                    logme.critical(__name__ + ':Twint:Feed:' + str(e))
                    print(str(e))
                    break
            except Exception as e:
                if self.config.Profile or self.config.Favorites:
                    print("[!] Twitter does not return more data, scrape stops here.")
                    break

                logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
                # Sometimes Twitter says there is no data. But it's a lie.
                # raise
                consecutive_errors_count += 1
                if consecutive_errors_count < self.config.Retries_count:
                    # skip to the next iteration if wait time does not satisfy limit constraints
                    delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)

                    # if the delay is less than users set min wait time then replace delay
                    if self.config.Min_wait_time > delay:
                        delay = self.config.Min_wait_time

                    sys.stderr.write('sleeping for {} secs\n'.format(delay))
                    time.sleep(delay)
                    self.user_agent = await get.RandomUserAgent(wa=True)
                    continue
                logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
                sys.stderr.write(str(e) + " [x] run.Feed")
                sys.stderr.write(
                    "[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
                    "we will investigate it!")
                break
        if self.config.Resume:
            # Persist the new cursor so a later run can pick up here.
            print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))

    async def follow(self):
        """Fetch one page of followers/following and emit each username."""
        await self.Feed()
        if self.config.User_full:
            logme.debug(__name__ + ':Twint:follow:userFull')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:follow:notUserFull')
            for user in self.feed:
                self.count += 1
                username = user.find("a")["name"]
                await output.Username(username, self.config, self.conn)

    async def favorite(self):
        """Fetch one page of liked tweets, scrape each into a dict and
        append the batch to config.favorited_tweets_list.

        The feed entries are parsed-HTML elements (presumably BeautifulSoup
        tags — the .find() calls rely on that interface).  Relative dates
        ("25m", "1h") are mapped to today; the three absolute formats
        Twitter uses are normalized to "YYYY-MM-DD".
        """
        logme.debug(__name__ + ':Twint:favorite')
        await self.Feed()
        favorited_tweets_list = []
        for tweet in self.feed:
            tweet_dict = {}
            self.count += 1
            try:
                tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
                t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
                tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
                tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
                                                                                                                '')
                tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
                date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
                # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
                # date_str = test_dates[3]
                if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"):  # 25m 1h
                    dateu = str(datetime.date.today())
                    tweet_dict['date'] = dateu
                elif ',' in date_str:  # Aug 21, 2019
                    sp = date_str.replace(',', '').split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu
                elif len(date_str.split(' ')) == 3:  # 28 Jun 19
                    sp = date_str.split(' ')
                    if len(sp[2]) == 2:
                        sp[2] = '20' + sp[2]
                    date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu
                else:  # Aug 21
                    sp = date_str.split(' ')
                    date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
                    dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
                    tweet_dict['date'] = dateu

                favorited_tweets_list.append(tweet_dict)

            except Exception as e:
                logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
                print("shit: ", date_str, " ", str(e))

        # First call creates the attribute; later calls extend it.
        try:
            self.config.favorited_tweets_list += favorited_tweets_list
        except AttributeError:
            self.config.favorited_tweets_list = favorited_tweets_list

    async def profile(self):
        """Fetch one page of a user's timeline and emit each tweet."""
        await self.Feed()
        logme.debug(__name__ + ':Twint:profile')
        for tweet in self.feed:
            self.count += 1
            await output.Tweets(tweet, self.config, self.conn)

    async def tweets(self):
        """Fetch one page of search results and emit each tweet."""
        await self.Feed()
        # TODO : need to take care of this later
        if self.config.Location:
            logme.debug(__name__ + ':Twint:tweets:location')
            self.count += await get.Multi(self.feed, self.config, self.conn)
        else:
            logme.debug(__name__ + ':Twint:tweets:notLocation')
            for tweet in self.feed:
                self.count += 1
                await output.Tweets(tweet, self.config, self.conn)

    async def main(self, callback=None):
        """Run the session as a task; *callback* (if given) fires on
        completion."""

        task = ensure_future(self.run())  # Might be changed to create_task in 3.7+.

        if callback:
            task.add_done_callback(callback)

        await task

    async def run(self):
        """Main scraping loop: resolve username<->user_id, then page through
        results until the feed is exhausted or config.Limit is reached."""
        if self.config.TwitterSearch:
            self.user_agent = await get.RandomUserAgent(wa=True)
        else:
            self.user_agent = await get.RandomUserAgent()

        if self.config.User_id is not None and self.config.Username is None:
            logme.debug(__name__ + ':Twint:main:user_id')
            self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                      self.config.Guest_token)

        if self.config.Username is not None and self.config.User_id is None:
            logme.debug(__name__ + ':Twint:main:username')

            self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
            if self.config.User_id is None:
                raise ValueError("Cannot find twitter account with name = " + self.config.Username)

        # TODO : will need to modify it to work with the new endpoints
        if self.config.TwitterSearch and self.config.Since and self.config.Until:
            logme.debug(__name__ + ':Twint:main:search+since+until')
            while self.d.since < self.d.until:
                self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
                self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
                if len(self.feed) > 0:
                    await self.tweets()
                else:
                    logme.debug(__name__ + ':Twint:main:gettingNewTweets')
                    break

                if get.Limit(self.config.Limit, self.count):
                    break
        elif self.config.Lookup:
            await self.Lookup()
        else:
            logme.debug(__name__ + ':Twint:main:not-search+since+until')
            while True:
                if len(self.feed) > 0:
                    if self.config.Followers or self.config.Following:
                        logme.debug(__name__ + ':Twint:main:follow')
                        await self.follow()
                    elif self.config.Favorites:
                        logme.debug(__name__ + ':Twint:main:favorites')
                        await self.favorite()
                    elif self.config.Profile:
                        logme.debug(__name__ + ':Twint:main:profile')
                        await self.profile()
                    elif self.config.TwitterSearch:
                        logme.debug(__name__ + ':Twint:main:twitter-search')
                        await self.tweets()
                else:
                    logme.debug(__name__ + ':Twint:main:no-more-tweets')
                    break

                # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
                if get.Limit(self.config.Limit, self.count):
                    logme.debug(__name__ + ':Twint:main:reachedLimit')
                    break

        if self.config.Count:
            verbose.Count(self.count, self.config)

    async def Lookup(self):
        """Fetch and store profile information for the configured user."""
        logme.debug(__name__ + ':Twint:Lookup')

        try:
            if self.config.User_id is not None and self.config.Username is None:
                logme.debug(__name__ + ':Twint:Lookup:user_id')
                self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                          self.config.Guest_token)
            await get.User(self.config.Username, self.config, db.Conn(self.config.Database))

        except Exception as e:
            logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
            raise
|
312 |
+
|
313 |
+
|
314 |
+
def run(config, callback=None):
    """Ensure an asyncio event loop exists for this thread, then drive a
    Twint session to completion; *callback* is invoked when the task ends."""
    logme.debug(__name__ + ':run')
    try:
        get_event_loop()
    except RuntimeError as e:
        # asyncio only auto-creates a loop on the main thread; create one
        # ourselves when it reports that none is current.
        if "no current event loop" not in str(e):
            logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
            raise
        set_event_loop(new_event_loop())
    except Exception:
        logme.exception(
            __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
        raise

    get_event_loop().run_until_complete(Twint(config).main(callback))
|
330 |
+
|
331 |
+
|
332 |
+
def Favorites(config):
    """Scrape the tweets liked by config.Username."""
    logme.debug(__name__ + ':Favorites')
    # Select favorites mode exclusively: clear every other mode flag.
    for mode in ('Following', 'Followers', 'Profile', 'TwitterSearch'):
        setattr(config, mode, False)
    config.Favorites = True
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
342 |
+
|
343 |
+
|
344 |
+
def Followers(config):
    """Scrape the accounts that follow config.Username."""
    logme.debug(__name__ + ':Followers')
    # Select followers mode exclusively: clear every other mode flag.
    for mode in ('Following', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    config.Followers = True
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("followers")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
|
359 |
+
|
360 |
+
|
361 |
+
def Following(config):
    """Scrape the accounts that config.Username follows."""
    logme.debug(__name__ + ':Following')
    # Select following mode exclusively: clear every other mode flag.
    for mode in ('Followers', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    config.Following = True
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("following")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
|
376 |
+
|
377 |
+
|
378 |
+
def Lookup(config):
    """Look up profile information for the configured user.

    Bug fix: the original assigned ``config.FOllowing`` (capital-O typo),
    which created a stray attribute and left ``config.Following`` with
    whatever value it already had, so a previously-enabled Following mode
    could leak into the lookup run.
    """
    logme.debug(__name__ + ':Lookup')
    config.Profile = False
    config.Lookup = True
    config.Favorites = False
    config.Following = False  # was `config.FOllowing` — typo fixed
    config.Followers = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("user")
|
389 |
+
|
390 |
+
|
391 |
+
def Profile(config):
    """Scrape the timeline of config.Username."""
    logme.debug(__name__ + ':Profile')
    # Select profile mode exclusively: clear every other mode flag.
    for mode in ('Favorites', 'Following', 'Followers', 'TwitterSearch'):
        setattr(config, mode, False)
    config.Profile = True
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
401 |
+
|
402 |
+
|
403 |
+
def Search(config, callback=None):
    """Run a Twitter search with the configured query; *callback* fires when
    the underlying task completes."""
    logme.debug(__name__ + ':Search')
    # Select search mode exclusively: clear every other mode flag.
    for mode in ('Favorites', 'Following', 'Followers', 'Profile'):
        setattr(config, mode, False)
    config.TwitterSearch = True
    run(config, callback)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
twitter-scraper/twint-master/twint/storage/__init__.py
ADDED
File without changes
|
twitter-scraper/twint-master/twint/storage/db.py
ADDED
@@ -0,0 +1,297 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sqlite3
|
2 |
+
import sys
|
3 |
+
import time
|
4 |
+
import hashlib
|
5 |
+
|
6 |
+
from datetime import datetime
|
7 |
+
|
8 |
+
def Conn(database):
    """Open (or create) the sqlite database at *database* and return the
    connection.  Returns "" when no database path is configured; exits the
    process when init() reports an error (init signals failure by returning
    the error message as a string)."""
    if not database:
        return ""
    print("[+] Inserting into Database: " + str(database))
    conn = init(database)
    if isinstance(conn, str):  # error
        print(conn)
        sys.exit(1)
    return conn
|
19 |
+
|
20 |
+
def init(db):
    """Create the twint SQLite schema in *db* and return the open connection.

    On any failure the exception text is returned as a *str* instead of a
    connection object — callers (see Conn) detect errors with
    ``isinstance(conn, str)``, so that contract is preserved here.
    """
    # One DDL statement per table; executed in order below.  This replaces
    # nine copy-pasted "table_x = ...; cursor.execute(table_x)" pairs.
    ddl = (
        # Scraped user profiles; PK (id, hex_dig) keeps one row per
        # distinct profile state (hex_dig is a hash of the whole profile).
        """
        CREATE TABLE IF NOT EXISTS
            users(
                id integer not null,
                id_str text not null,
                name text,
                username text not null,
                bio text,
                location text,
                url text,
                join_date text not null,
                join_time text not null,
                tweets integer,
                following integer,
                followers integer,
                likes integer,
                media integer,
                private integer not null,
                verified integer not null,
                profile_image_url text not null,
                background_image text,
                hex_dig text not null,
                time_update integer not null,
                CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
            );
        """,
        # Scraped tweets; `translate` is quoted because it could clash
        # with an SQL keyword in some tooling.
        """
        CREATE TABLE IF NOT EXISTS
            tweets (
                id integer not null,
                id_str text not null,
                tweet text default '',
                language text default '',
                conversation_id text not null,
                created_at integer not null,
                date text not null,
                time text not null,
                timezone text not null,
                place text default '',
                replies_count integer,
                likes_count integer,
                retweets_count integer,
                user_id integer not null,
                user_id_str text not null,
                screen_name text not null,
                name text default '',
                link text,
                mentions text,
                hashtags text,
                cashtags text,
                urls text,
                photos text,
                thumbnail text,
                quote_url text,
                video integer,
                geo text,
                near text,
                source text,
                time_update integer not null,
                `translate` text default '',
                trans_src text default '',
                trans_dest text default '',
                PRIMARY KEY (id)
            );
        """,
        # Link table: who retweeted which tweet, and when.
        """
        CREATE TABLE IF NOT EXISTS
            retweets(
                user_id integer not null,
                username text not null,
                tweet_id integer not null,
                retweet_id integer not null,
                retweet_date integer,
                CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id),
                CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id),
                CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id)
            );
        """,
        # Link table: which users a tweet replies to.
        """
        CREATE TABLE IF NOT EXISTS
            replies(
                tweet_id integer not null,
                user_id integer not null,
                username text not null,
                CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
                CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
            );
        """,
        # Link table: tweets a user favorited.
        """
        CREATE TABLE IF NOT EXISTS
            favorites(
                user_id integer not null,
                tweet_id integer not null,
                CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
                CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
                CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
            );
        """,
        # Follower edges by numeric user id.
        """
        CREATE TABLE IF NOT EXISTS
            followers (
                id integer not null,
                follower_id integer not null,
                CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
                CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id)
            );
        """,
        # Following edges by numeric user id.
        """
        CREATE TABLE IF NOT EXISTS
            following (
                id integer not null,
                following_id integer not null,
                CONSTRAINT following_pk PRIMARY KEY (id, following_id),
                CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id)
            );
        """,
        # Follower edges by username (used when only names are scraped).
        """
        CREATE TABLE IF NOT EXISTS
            followers_names (
                user text not null,
                time_update integer not null,
                follower text not null,
                PRIMARY KEY (user, follower)
            );
        """,
        # Following edges by username.
        """
        CREATE TABLE IF NOT EXISTS
            following_names (
                user text not null,
                time_update integer not null,
                follows text not null,
                PRIMARY KEY (user, follows)
            );
        """,
    )

    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()
        for statement in ddl:
            cursor.execute(statement)
        return conn
    except Exception as e:
        # Preserved error contract: return the message, never raise.
        return str(e)
|
182 |
+
|
183 |
+
def fTable(Followers):
    """Map the Followers flag to the matching *_names table name."""
    return "followers_names" if Followers else "following_names"
|
190 |
+
|
191 |
+
def uTable(Followers):
    """Map the Followers flag to the matching id-based edge table name."""
    return "followers" if Followers else "following"
|
198 |
+
|
199 |
+
def follow(conn, Username, Followers, User):
    """Record one follow edge (User -> Username) in the *_names table.

    Duplicate edges violate the table's primary key and are silently
    ignored via the IntegrityError handler.
    """
    try:
        now_ms = round(time.time() * 1000)
        cur = conn.cursor()
        row = (User, now_ms, Username)
        cur.execute(f"INSERT INTO {fTable(Followers)} VALUES(?,?,?)", row)
        conn.commit()
    except sqlite3.IntegrityError:
        pass  # edge already recorded
|
210 |
+
|
211 |
+
def get_hash_id(conn, id):
    """Return the stored ``hex_dig`` profile hash for user *id*, or -1 if unknown.

    Used by user() to decide whether a profile changed since last scrape.
    """
    cursor = conn.cursor()
    cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
    # LIMIT 1 guarantees at most one row — fetchone avoids building a list.
    row = cursor.fetchone()
    return row[0] if row else -1
|
216 |
+
|
217 |
+
def user(conn, config, User):
    """Persist a scraped user profile; duplicate rows are silently skipped.

    A SHA-256 hash of the whole profile (hex_dig) is part of the users
    primary key, so a changed profile inserts a new row while an unchanged
    one is skipped.  When scraping follower/following lists the edge table
    row is also written.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        # Column order must match the users table DDL in init()
        # (18 profile columns, then hex_dig and time_update below).
        user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url,User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image]

        # Fingerprint the full profile so any field change yields a new hash.
        hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest()
        entry = tuple(user) + (hex_dig,time_ms,)
        old_hash = get_hash_id(conn, User.id)

        # Insert only when the user is new (-1) or the profile changed.
        if old_hash == -1 or old_hash != hex_dig:
            query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
            cursor.execute(query, entry)
        else:
            pass

        # Record the follow edge when this profile came from a
        # followers/following scrape (config.User_id is the list owner).
        if config.Followers or config.Following:
            table = uTable(config.Followers)
            query = f"INSERT INTO {table} VALUES(?,?)"
            cursor.execute(query, (config.User_id, int(User.id)))

        conn.commit()
    except sqlite3.IntegrityError:
        pass  # duplicate (id, hex_dig) or duplicate edge — intentionally ignored
|
241 |
+
|
242 |
+
def tweets(conn, Tweet, config):
    """Insert one scraped tweet plus its favorite/retweet/reply link rows.

    Duplicate primary keys (tweet already stored) raise IntegrityError,
    which is deliberately swallowed so re-scrapes are idempotent.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        # Positional tuple: order must match the 33-column tweets table DDL
        # in init().  List-valued fields are flattened to comma-joined text.
        entry = (Tweet.id,
                 Tweet.id_str,
                 Tweet.tweet,
                 Tweet.lang,
                 Tweet.conversation_id,
                 Tweet.datetime,
                 Tweet.datestamp,
                 Tweet.timestamp,
                 Tweet.timezone,
                 Tweet.place,
                 Tweet.replies_count,
                 Tweet.likes_count,
                 Tweet.retweets_count,
                 Tweet.user_id,
                 Tweet.user_id_str,
                 Tweet.username,
                 Tweet.name,
                 Tweet.link,
                 ",".join(Tweet.mentions),
                 ",".join(Tweet.hashtags),
                 ",".join(Tweet.cashtags),
                 ",".join(Tweet.urls),
                 ",".join(Tweet.photos),
                 Tweet.thumbnail,
                 Tweet.quote_url,
                 Tweet.video,
                 Tweet.geo,
                 Tweet.near,
                 Tweet.source,
                 time_ms,
                 Tweet.translate,
                 Tweet.trans_src,
                 Tweet.trans_dest)
        cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)

        # Favorites scrape: link the list owner (config.User_id) to the tweet.
        if config.Favorites:
            query = 'INSERT INTO favorites VALUES(?,?)'
            cursor.execute(query, (config.User_id, Tweet.id))

        # Retweet metadata: store the retweet date as an epoch timestamp.
        if Tweet.retweet:
            query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
            _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
            cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))

        # One replies row per user this tweet replies to.
        if Tweet.reply_to:
            for reply in Tweet.reply_to:
                query = 'INSERT INTO replies VALUES(?,?,?)'
                cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))

        conn.commit()
    except sqlite3.IntegrityError:
        pass  # tweet (or link row) already stored
|
twitter-scraper/twint-master/twint/storage/elasticsearch.py
ADDED
@@ -0,0 +1,364 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
## TODO - Fix Weekday situation
|
2 |
+
from elasticsearch import Elasticsearch, helpers
|
3 |
+
from geopy.geocoders import Nominatim
|
4 |
+
from datetime import datetime
|
5 |
+
import contextlib
|
6 |
+
import sys
|
7 |
+
|
8 |
+
_index_tweet_status = False
|
9 |
+
_index_follow_status = False
|
10 |
+
_index_user_status = False
|
11 |
+
_is_near_def = False
|
12 |
+
_is_location_def = False
|
13 |
+
_near = {}
|
14 |
+
_location = {}
|
15 |
+
|
16 |
+
geolocator = Nominatim(user_agent="twint-1.2")
|
17 |
+
|
18 |
+
class RecycleObject(object):
    """Write-sink implementing the minimal file protocol, discarding all output.

    Used by nostdout() to temporarily replace sys.stdout.
    """

    def write(self, junk):
        pass

    def flush(self):
        pass
|
21 |
+
|
22 |
+
def getLocation(place, **options):
    """Geocode *place* with Nominatim.

    Side effects: with near=True the module-global _near is filled and True
    returned; with location=True the same for _location.  With no option a
    {"lat", "lon"} dict is returned directly, or {} when geocoding fails.
    """
    found = geolocator.geocode(place, timeout=1000)
    if not found:
        return {}
    coords = {"lat": found.latitude, "lon": found.longitude}
    if options.get("near"):
        global _near
        _near = coords
        return True
    if options.get("location"):
        global _location
        _location = coords
        return True
    return coords
|
36 |
+
|
37 |
+
def handleIndexResponse(response):
    """Interpret an Elasticsearch index-creation response dict.

    Returns True when the index is usable: either it already existed
    (ES answers status 400 and we create with ignore=400) or creation and
    shard allocation were both acknowledged.

    Uses dict.get throughout — the original indexed "acknowledged" and
    "shards_acknowledged" directly, raising an uncaught KeyError on
    responses that omit them.
    """
    if response.get("status") == 400:
        # Index already exists — treated as success.
        return True
    if response.get("acknowledged"):
        print("[+] Index \"" + response["index"] + "\" created!")
    else:
        print("[x] error index creation :: storage.elasticsearch.handleIndexCreation")
    if response.get("shards_acknowledged"):
        print("[+] Shards acknowledged, everything is ready to be used!")
        return True
    print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation")
    return False
|
53 |
+
|
54 |
+
def createIndex(config, instance, **scope):
    """Create the Elasticsearch index for the requested scope.

    scope= is one of "tweet", "follow" or "user"; returns True when the
    index is usable (freshly created or already existing — creation is
    issued with ignore=400 and interpreted by handleIndexResponse).
    """
    if scope.get("scope") == "tweet":
        # Mapping for raw tweets.  Keyword fields that should match
        # case/accent-insensitively share the custom hashtag_normalizer
        # defined under settings.analysis below.
        tweets_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "long"},
                    "conversation_id": {"type": "long"},
                    "created_at": {"type": "text"},
                    "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "timezone": {"type": "keyword"},
                    "place": {"type": "keyword"},
                    "location": {"type": "keyword"},
                    "tweet": {"type": "text"},
                    "lang": {"type": "keyword"},
                    "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "user_id_str": {"type": "keyword"},
                    "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "name": {"type": "text"},
                    "profile_image_url": {"type": "text"},
                    "day": {"type": "integer"},
                    "hour": {"type": "integer"},
                    "link": {"type": "text"},
                    "retweet": {"type": "text"},
                    "essid": {"type": "keyword"},
                    "nlikes": {"type": "integer"},
                    "nreplies": {"type": "integer"},
                    "nretweets": {"type": "integer"},
                    "quote_url": {"type": "text"},
                    "video": {"type":"integer"},
                    "thumbnail": {"type":"text"},
                    "search": {"type": "text"},
                    "near": {"type": "text"},
                    "geo_near": {"type": "geo_point"},
                    "geo_tweet": {"type": "geo_point"},
                    "photos": {"type": "text"},
                    "user_rt_id": {"type": "keyword"},
                    "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "source": {"type": "keyword"},
                    "user_rt": {"type": "keyword"},
                    "retweet_id": {"type": "keyword"},
                    "reply_to": {
                        "type": "nested",
                        "properties": {
                            "user_id": {"type": "keyword"},
                            "username": {"type": "keyword"}
                        }
                    },
                    # ignore_malformed: tolerate unparseable retweet dates
                    # instead of rejecting the whole document.
                    "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
                    "urls": {"type": "keyword"},
                    "translate": {"type": "text"},
                    "trans_src": {"type": "keyword"},
                    "trans_dest": {"type": "keyword"},
                }
            },
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "normalizer": {
                        "hashtag_normalizer": {
                            "type": "custom",
                            "char_filter": [],
                            "filter": ["lowercase", "asciifolding"]
                        }
                    }
                }
            }
        }
        # nostdout(): the ES client is chatty; silence it during creation.
        with nostdout():
            resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "follow":
        # Minimal mapping for follow edges (user -> follow per session).
        follow_body = {
            "mappings": {
                "properties": {
                    "user": {"type": "keyword"},
                    "follow": {"type": "keyword"},
                    "essid": {"type": "keyword"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "user":
        # Mapping for user profile snapshots.
        user_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "name": {"type": "keyword"},
                    "username": {"type": "keyword"},
                    "bio": {"type": "text"},
                    "location": {"type": "keyword"},
                    "url": {"type": "text"},
                    "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "tweets": {"type": "integer"},
                    "following": {"type": "integer"},
                    "followers": {"type": "integer"},
                    "likes": {"type": "integer"},
                    "media": {"type": "integer"},
                    "private": {"type": "integer"},
                    "verified": {"type": "integer"},
                    "avatar": {"type": "text"},
                    "background_image": {"type": "text"},
                    "session": {"type": "keyword"},
                    "geo_user": {"type": "geo_point"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
        return handleIndexResponse(resp)
    else:
        print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
        return False
|
175 |
+
|
176 |
+
@contextlib.contextmanager
def nostdout():
    """Silence sys.stdout for the duration of the with-block.

    The original version restored stdout after a bare ``yield``, so an
    exception raised inside the block left stdout permanently redirected
    to the discarding RecycleObject; try/finally guarantees restoration.
    """
    savestdout = sys.stdout
    sys.stdout = RecycleObject()
    try:
        yield
    finally:
        sys.stdout = savestdout
|
182 |
+
|
183 |
+
def weekday(day):
    """Translate an English weekday name to its ISO number (Monday=1 .. Sunday=7).

    Raises KeyError for any other string, matching the original dict lookup.
    """
    names = ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday",
             "Saturday", "Sunday")
    return {name: number for number, name in enumerate(names, start=1)}[day]
|
195 |
+
|
196 |
+
def Tweet(Tweet, config):
    """Index one scraped tweet into Elasticsearch (config.Index_tweets).

    Creates the tweet index on first use (module-global _index_tweet_status)
    and geocodes config.Near/Geo at most once per run into the module-global
    _near.  Document _id combines tweet id and session (Essid).
    """
    global _index_tweet_status
    global _is_near_def
    date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")

    actions = []

    # Some callers pass objects without a retweet attribute.
    try:
        retweet = Tweet.retweet
    except AttributeError:
        retweet = None

    dt = f"{Tweet.datestamp} {Tweet.timestamp}"

    j_data = {
        "_index": config.Index_tweets,
        "_id": str(Tweet.id) + "_raw_" + config.Essid,
        "_source": {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": Tweet.datetime,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            # NOTE: the index mapping declares "lang", but this key has
            # always been "language" — kept as-is for data compatibility.
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": date_obj.weekday(),
            "hour": date_obj.hour,
            "link": Tweet.link,
            "retweet": retweet,
            "essid": config.Essid,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "video": Tweet.video,
            "search": str(config.Search),
            "near": config.Near
        }
    }
    if retweet is not None:
        j_data["_source"].update({"user_rt_id": Tweet.user_rt_id})
        j_data["_source"].update({"user_rt": Tweet.user_rt})
        j_data["_source"].update({"retweet_id": Tweet.retweet_id})
        j_data["_source"].update({"retweet_date": Tweet.retweet_date})
    if Tweet.reply_to:
        j_data["_source"].update({"reply_to": Tweet.reply_to})
    if Tweet.photos:
        j_data["_source"].update({"photos": list(Tweet.photos)})
    if Tweet.thumbnail:
        j_data["_source"].update({"thumbnail": Tweet.thumbnail})
    if Tweet.mentions:
        j_data["_source"].update({"mentions": list(Tweet.mentions)})
    if Tweet.urls:
        j_data["_source"].update({"urls": list(Tweet.urls)})
    if config.Near or config.Geo:
        # Geocode the search area only once per run.
        if not _is_near_def:
            __geo = ""
            __near = ""
            if config.Geo:
                __geo = config.Geo
            if config.Near:
                __near = config.Near
            _is_near_def = getLocation(__near + __geo, near=True)
        if _near:
            j_data["_source"].update({"geo_near": _near})
    if Tweet.place:
        # BUG FIX: the original geocoded Tweet.place twice (two network
        # round-trips); reuse the first result.
        _t_place = getLocation(Tweet.place)
        if _t_place:
            j_data["_source"].update({"geo_tweet": _t_place})
    if Tweet.source:
        # BUG FIX: was Tweet.Source (capital S) — the attribute is spelled
        # lowercase everywhere else (see db.tweets), so this branch raised
        # AttributeError whenever a tweet carried a source string.
        j_data["_source"].update({"source": Tweet.source})
    if config.Translate:
        j_data["_source"].update({"translate": Tweet.translate})
        j_data["_source"].update({"trans_src": Tweet.trans_src})
        j_data["_source"].update({"trans_dest": Tweet.trans_dest})

    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_tweet_status:
        _index_tweet_status = createIndex(config, es, scope="tweet")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
295 |
+
|
296 |
+
def Follow(user, config):
    """Index a single follow edge into the configured follow index."""
    global _index_follow_status

    # Orient the edge: when scraping "following", config.Username is the
    # account doing the following; otherwise it is the one being followed.
    if config.Following:
        _user, _follow = config.Username, user
    else:
        _user, _follow = user, config.Username

    doc = {
        "_index": config.Index_follow,
        "_id": "_".join((_user, _follow, config.Essid)),
        "_source": {
            "user": _user,
            "follow": _follow,
            "essid": config.Essid
        }
    }

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_follow_status:
        _index_follow_status = createIndex(config, es, scope="follow")
    with nostdout():
        helpers.bulk(es, [doc], chunk_size=2000, request_timeout=200)
|
323 |
+
|
324 |
+
def UserProfile(user, config):
    """Index one user profile document into Elasticsearch (config.Index_users).

    Creates the users index on first use and, when config.Location is set,
    geocodes the profile location at most once per run (module-global
    _location, filled by getLocation(..., location=True)).
    """
    global _index_user_status
    global _is_location_def
    actions = []

    j_data = {
        "_index": config.Index_users,
        # user.id alone is not unique across sessions, so join date/time
        # and the session id (Essid) are mixed into the document id.
        "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
        "_source": {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "location": user.location,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "tweets": user.tweets,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": user.background_image,
            "session": config.Essid
        }
    }
    if config.Location:
        # Geocode only once per run; getLocation fills the global _location.
        if not _is_location_def:
            _is_location_def = getLocation(user.location, location=True)
        if _location:
            j_data["_source"].update({"geo_user": _location})
    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_user_status:
        _index_user_status = createIndex(config, es, scope="user")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
twitter-scraper/twint-master/twint/storage/panda.py
ADDED
@@ -0,0 +1,196 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime, pandas as pd, warnings
|
2 |
+
from time import strftime, localtime
|
3 |
+
from twint.tweet import Tweet_formats
|
4 |
+
|
5 |
+
Tweets_df = None
|
6 |
+
Follow_df = None
|
7 |
+
User_df = None
|
8 |
+
|
9 |
+
_object_blocks = {
|
10 |
+
"tweet": [],
|
11 |
+
"user": [],
|
12 |
+
"following": [],
|
13 |
+
"followers": []
|
14 |
+
}
|
15 |
+
|
16 |
+
weekdays = {
|
17 |
+
"Monday": 1,
|
18 |
+
"Tuesday": 2,
|
19 |
+
"Wednesday": 3,
|
20 |
+
"Thursday": 4,
|
21 |
+
"Friday": 5,
|
22 |
+
"Saturday": 6,
|
23 |
+
"Sunday": 7,
|
24 |
+
}
|
25 |
+
|
26 |
+
_type = ""
|
27 |
+
|
28 |
+
def _concat(df, _type):
    """Append the buffered _object_blocks[_type] records to *df*.

    Returns a fresh DataFrame when *df* is None, otherwise the
    concatenation of *df* with the new rows.
    """
    fresh = pd.DataFrame(_object_blocks[_type])
    return fresh if df is None else pd.concat([df, fresh], sort=True)
|
35 |
+
|
36 |
+
def _autoget(_type):
    """Flush the buffered objects of *_type* into the matching module DataFrame.

    "tweet" -> Tweets_df, "followers"/"following" -> Follow_df,
    "user" -> User_df.
    """
    global Tweets_df
    global Follow_df
    global User_df

    if _type == "tweet":
        Tweets_df = _concat(Tweets_df, _type)
    elif _type in ("followers", "following"):
        Follow_df = _concat(Follow_df, _type)
    elif _type == "user":
        User_df = _concat(User_df, _type)
    else:
        # BUG FIX: the original called an undefined name `error(...)` here,
        # turning a bad-type report into a NameError.  Report like the
        # sibling update() does instead.
        print("[x] Wrong type of object passed")
|
49 |
+
|
50 |
+
|
51 |
+
def update(object, config):
    """Buffer one scraped object (tweet / user / follow dict) into _object_blocks.

    The buffered rows are later flushed into the module DataFrames by
    _autoget().  The object kind is dispatched on its runtime class name;
    follow results arrive as a plain dict, with the direction taken from
    config.Following/config.Followers.
    """
    global _type

    if object.__class__.__name__ == "tweet":
        _type = "tweet"
    elif object.__class__.__name__ == "user":
        _type = "user"
    elif object.__class__.__name__ == "dict":
        # Exactly one of the two flags is expected to be set.
        _type = config.Following*"following" + config.Followers*"followers"

    if _type == "tweet":
        Tweet = object
        # Epoch milliseconds, parsed with the project-wide datetime format.
        datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
        # Weekday as ISO number via the module-level weekdays mapping.
        day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
        dt = f"{object.datestamp} {object.timestamp}"
        _data = {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": datetime_ms,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id": Tweet.user_id,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": day,
            "hour": strftime("%H", localtime(datetime_ms/1000)),
            "link": Tweet.link,
            "urls": Tweet.urls,
            "photos": Tweet.photos,
            "video": Tweet.video,
            "thumbnail": Tweet.thumbnail,
            "retweet": Tweet.retweet,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "search": str(config.Search),
            "near": Tweet.near,
            "geo": Tweet.geo,
            "source": Tweet.source,
            "user_rt_id": Tweet.user_rt_id,
            "user_rt": Tweet.user_rt,
            "retweet_id": Tweet.retweet_id,
            "reply_to": Tweet.reply_to,
            "retweet_date": Tweet.retweet_date,
            "translate": Tweet.translate,
            "trans_src": Tweet.trans_src,
            "trans_dest": Tweet.trans_dest
        }
        _object_blocks[_type].append(_data)
    elif _type == "user":
        user = object
        # Some profile payloads lack a background image.
        try:
            background_image = user.background_image
        except:
            background_image = ""
        _data = {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "join_date": user.join_date,
            "join_time": user.join_time,
            "tweets": user.tweets,
            "location": user.location,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": background_image,
        }
        _object_blocks[_type].append(_data)
    elif _type == "followers" or _type == "following":
        # NOTE: unlike the branches above this REPLACES the buffer (it is
        # assigned, not appended) — the whole follow list arrives at once.
        _data = {
            config.Following*"following" + config.Followers*"followers" :
            {config.Username: object[_type]}
        }
        _object_blocks[_type] = _data
    else:
        print("Wrong type of object passed!")
|
147 |
+
|
148 |
+
|
149 |
+
def clean():
    """Drop all buffered records and reset the three module-level DataFrames."""
    global Tweets_df
    global Follow_df
    global User_df
    for key in ("tweet", "following", "followers", "user"):
        _object_blocks[key].clear()
    Tweets_df = None
    Follow_df = None
    User_df = None
|
160 |
+
|
161 |
+
def save(_filename, _dataframe, **options):
    """Persist *_dataframe* to disk as HDF5 (default) or Pickle.

    dataname= names the HDF5 key (default "twint"); type="Pickle" writes
    ``<_filename>.pkl`` instead of ``<_filename>.h5``.
    """
    _dataname = options.get("dataname") or "twint"

    fmt = options.get("type")
    if not fmt:
        # Default: HDF5 store keyed by _dataname.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _store = pd.HDFStore(_filename + ".h5")
            _store[_dataname] = _dataframe
            _store.close()
    elif fmt == "Pickle":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _dataframe.to_pickle(_filename + ".pkl")
    else:
        print("""Please specify: filename, DataFrame, DataFrame name and type
              (HDF5, default, or Pickle)""")
|
180 |
+
|
181 |
+
def read(_filename, **options):
    """Load a DataFrame previously written by save().

    dataname= selects the HDF5 key (default "twint"); type="Pickle" reads
    ``<_filename>.pkl`` instead of ``<_filename>.h5``.
    """
    _dataname = options.get("dataname") or "twint"

    fmt = options.get("type")
    if not fmt:
        _store = pd.HDFStore(_filename + ".h5")
        return _store[_dataname]
    if fmt == "Pickle":
        return pd.read_pickle(_filename + ".pkl")
    print("""Please specify: DataFrame, DataFrame name (twint as default),
          filename and type (HDF5, default, or Pickle""")
|
twitter-scraper/twint-master/twint/storage/write.py
ADDED
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from . import write_meta as meta
|
2 |
+
import csv
|
3 |
+
import json
|
4 |
+
import os
|
5 |
+
|
6 |
+
def outputExt(objType, fType):
    """Build the default output filename suffix, e.g. "/tweets.csv".

    Follower/following results arrive as plain strings, so "str" maps to
    "username".
    """
    kind = "username" if objType == "str" else objType
    return f"/{kind}s.{fType}"
|
12 |
+
|
13 |
+
def addExt(base, objType, fType):
    """Resolve the output path: a *base* without an extension is treated as
    a directory (created if missing) and the default filename is appended."""
    if '.' not in base:
        createDirIfMissing(base)
        base += outputExt(objType, fType)
    return base
|
19 |
+
|
20 |
+
def Text(entry, f):
    """Append *entry* to file *f* as a single line, newlines flattened to spaces.

    The original opened the file inside the print() call and never closed
    it, leaking one file handle per tweet written.
    """
    with open(f, "a", encoding="utf-8") as out:
        print(entry.replace('\n', ' '), file=out)
|
22 |
+
|
23 |
+
def Type(config):
    """Classify what kind of objects this run produces.

    Full user profiles -> "user", follower/following scrapes -> "username",
    everything else -> "tweet".
    """
    if config.User_full:
        return "user"
    if config.Followers or config.Following:
        return "username"
    return "tweet"
|
32 |
+
|
33 |
+
def struct(obj, custom, _type):
    """Return (fieldnames, row) for serializing *obj*.

    When *custom* lists field names, the row is restricted to exactly
    those keys; otherwise the full canonical field set for *_type* is
    used (see write_meta.Fieldnames / write_meta.Data).
    """
    if custom:
        fieldnames = custom
        # Build the full record once instead of once per field — the
        # original called meta.Data(obj, _type) inside the loop.
        data = meta.Data(obj, _type)
        row = {f: data[f] for f in fieldnames}
    else:
        fieldnames = meta.Fieldnames(_type)
        row = meta.Data(obj, _type)

    return fieldnames, row
|
44 |
+
|
45 |
+
def createDirIfMissing(dirname):
    """Create *dirname* (with parents) when it does not exist yet.

    exist_ok=True removes the check-then-create race of the original
    `if not os.path.exists(...): os.makedirs(...)` sequence, which could
    raise FileExistsError if the directory appeared between the two calls.
    """
    os.makedirs(dirname, exist_ok=True)
|
48 |
+
|
49 |
+
def Csv(obj, config):
    """Append one record for *obj* to the configured CSV output file.

    A header row is written first whenever the target file does not
    exist yet; tab separation is used when the config carries a `Tabs`
    attribute.
    """
    record_type = obj.__class__.__name__
    if record_type == "str":
        record_type = "username"
    fieldnames, row = struct(obj, config.Custom[record_type], record_type)

    path = addExt(config.Output, record_type, "csv")
    dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'

    if not os.path.exists(path):
        with open(path, "w", newline='', encoding="utf-8") as fh:
            csv.DictWriter(fh, fieldnames=fieldnames, dialect=dialect).writeheader()

    with open(path, "a", newline='', encoding="utf-8") as fh:
        csv.DictWriter(fh, fieldnames=fieldnames, dialect=dialect).writerow(row)
|
66 |
+
|
67 |
+
def Json(obj, config):
    """Append *obj* to the configured output file as one JSON object per line."""
    record_type = obj.__class__.__name__
    if record_type == "str":
        record_type = "username"
    _, data = struct(obj, config.Custom[record_type], record_type)

    path = addExt(config.Output, record_type, "json")

    with open(path, "a", newline='', encoding="utf-8") as fh:
        fh.write(json.dumps(data, ensure_ascii=False))
        fh.write("\n")
|
twitter-scraper/twint-master/twint/storage/write_meta.py
ADDED
@@ -0,0 +1,151 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def tweetData(t):
    """Flatten a tweet object into a plain dict keyed by output column.

    Key order matches tweetFieldnames(); the id and the count fields are
    coerced to int.
    """
    # (output key, attribute on t, optional converter)
    spec = [
        ("id", "id", int),
        ("conversation_id", "conversation_id", None),
        ("created_at", "datetime", None),
        ("date", "datestamp", None),
        ("time", "timestamp", None),
        ("timezone", "timezone", None),
        ("user_id", "user_id", None),
        ("username", "username", None),
        ("name", "name", None),
        ("place", "place", None),
        ("tweet", "tweet", None),
        ("language", "lang", None),
        ("mentions", "mentions", None),
        ("urls", "urls", None),
        ("photos", "photos", None),
        ("replies_count", "replies_count", int),
        ("retweets_count", "retweets_count", int),
        ("likes_count", "likes_count", int),
        ("hashtags", "hashtags", None),
        ("cashtags", "cashtags", None),
        ("link", "link", None),
        ("retweet", "retweet", None),
        ("quote_url", "quote_url", None),
        ("video", "video", None),
        ("thumbnail", "thumbnail", None),
        ("near", "near", None),
        ("geo", "geo", None),
        ("source", "source", None),
        ("user_rt_id", "user_rt_id", None),
        ("user_rt", "user_rt", None),
        ("retweet_id", "retweet_id", None),
        ("reply_to", "reply_to", None),
        ("retweet_date", "retweet_date", None),
        ("translate", "translate", None),
        ("trans_src", "trans_src", None),
        ("trans_dest", "trans_dest", None),
    ]
    return {key: (conv(getattr(t, attr)) if conv else getattr(t, attr))
            for key, attr, conv in spec}
|
41 |
+
|
42 |
+
def tweetFieldnames():
    """Canonical column order for tweet records (matches tweetData keys)."""
    return ("id conversation_id created_at date time timezone user_id "
            "username name place tweet language mentions urls photos "
            "replies_count retweets_count likes_count hashtags cashtags "
            "link retweet quote_url video thumbnail near geo source "
            "user_rt_id user_rt retweet_id reply_to retweet_date translate "
            "trans_src trans_dest").split()
|
82 |
+
|
83 |
+
def userData(u):
    """Flatten a user object into a plain dict keyed by output column.

    Key order matches userFieldnames(); numeric counters and the id are
    coerced to int.
    """
    return {
        "id": int(u.id),
        "name": u.name,
        "username": u.username,
        "bio": u.bio,
        "location": u.location,
        "url": u.url,
        "join_date": u.join_date,
        "join_time": u.join_time,
        "tweets": int(u.tweets),
        "following": int(u.following),
        "followers": int(u.followers),
        "likes": int(u.likes),
        "media": int(u.media_count),
        "private": u.is_private,
        "verified": u.is_verified,
        "profile_image_url": u.avatar,
        "background_image": u.background_image,
    }
|
104 |
+
|
105 |
+
def userFieldnames():
    """Canonical column order for user records (matches userData keys)."""
    return ("id name username bio location url join_date join_time tweets "
            "following followers likes media private verified "
            "profile_image_url background_image").split()
|
126 |
+
|
127 |
+
def usernameData(u):
    """Wrap a bare username string in the standard single-key record shape."""
    return dict(username=u)
|
129 |
+
|
130 |
+
def usernameFieldnames():
    """Single-column schema used for follower/following username lists."""
    return ["username"]
|
132 |
+
|
133 |
+
def Data(obj, _type):
    """Serialize *obj* into a row dict for its record type.

    "user" and "username" dispatch to their dedicated builders; any other
    value falls through to the tweet builder.
    """
    if _type == "user":
        return userData(obj)
    if _type == "username":
        return usernameData(obj)
    return tweetData(obj)
|
142 |
+
|
143 |
+
def Fieldnames(_type):
    """Return the column list matching Data() for the given record type."""
    if _type == "user":
        return userFieldnames()
    if _type == "username":
        return usernameFieldnames()
    return tweetFieldnames()
|
twitter-scraper/twint-master/twint/token.py
ADDED
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import time
|
3 |
+
|
4 |
+
import requests
|
5 |
+
import logging as logme
|
6 |
+
|
7 |
+
|
8 |
+
class TokenExpiryException(Exception):
    """Raised when the current guest token is detected to have expired."""

    def __init__(self, msg):
        super().__init__(msg)
|
11 |
+
|
12 |
+
|
13 |
+
class RefreshTokenException(Exception):
    """Raised when a fresh guest token could not be obtained (see Token.refresh)."""

    def __init__(self, msg):
        super().__init__(msg)
|
16 |
+
|
17 |
+
|
18 |
+
class Token:
    """Fetches the Twitter guest token the scraper needs for API calls.

    The token is written onto the shared config object
    (``config.Guest_token``); callers invoke :meth:`refresh` and read the
    value from there.
    """

    def __init__(self, config):
        # Dedicated session so cookies gathered across requests are reused.
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
        self.config = config
        self._retries = 5   # extra attempts after the first request
        self._timeout = 10  # per-request timeout, seconds
        self.url = 'https://twitter.com'

    def _request(self):
        """GET self.url with retries and exponential back-off.

        Returns the successful response.  When every attempt fails,
        clears config.Guest_token and raises RefreshTokenException.
        """
        for attempt in range(self._retries + 1):
            # The request is newly prepared on each retry because of potential cookie updates.
            req = self._session.prepare_request(requests.Request('GET', self.url))
            logme.debug(f'Retrieving {req.url}')
            try:
                r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            except requests.exceptions.RequestException as exc:
                # Log at WARNING while retries remain, ERROR on the last one.
                if attempt < self._retries:
                    retrying = ', retrying'
                    level = logme.WARNING
                else:
                    retrying = ''
                    level = logme.ERROR
                logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
            else:
                # NOTE(review): success/msg look like the remnant of a richer
                # validation step — as written, any HTTP response counts as
                # success and msg is always empty.
                success, msg = (True, None)
                msg = f': {msg}' if msg else ''
                if success:
                    logme.debug(f'{req.url} retrieved successfully{msg}')
                    return r
            if attempt < self._retries:
                # TODO : might wanna tweak this back-off timer
                sleep_time = 2.0 * 2 ** attempt
                logme.info(f'Waiting {sleep_time:.0f} seconds')
                time.sleep(sleep_time)
        else:
            # for-else: runs only when the loop exhausted without returning.
            msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
            logme.fatal(msg)
            self.config.Guest_token = None
            raise RefreshTokenException(msg)

    def refresh(self):
        """Obtain a fresh guest token and store it on the config.

        First scrapes the token out of the twitter.com HTML; when absent,
        falls back to POSTing api.twitter.com/1.1/guest/activate.json.
        Raises RefreshTokenException when neither source yields a token.
        """
        logme.debug('Retrieving guest token')
        res = self._request()
        match = re.search(r'\("gt=(\d+);', res.text)
        if match:
            logme.debug('Found guest token in HTML')
            self.config.Guest_token = str(match.group(1))
        else:
            # Fallback: ask the guest/activate endpoint directly, forwarding
            # the csrf cookie ("ct0") from the page fetch above.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
                'authority': 'api.twitter.com',
                'content-length': '0',
                'authorization': self.config.Bearer_token,
                'x-twitter-client-language': 'en',
                'x-csrf-token': res.cookies.get("ct0"),
                'x-twitter-active-user': 'yes',
                'content-type': 'application/x-www-form-urlencoded',
                'accept': '*/*',
                'sec-gpc': '1',
                'origin': 'https://twitter.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://twitter.com/',
                'accept-language': 'en-US',
            }
            self._session.headers.update(headers)
            req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
            res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            if 'guest_token' in res.json():
                logme.debug('Found guest token in JSON')
                self.config.Guest_token = res.json()['guest_token']
            else:
                self.config.Guest_token = None
                raise RefreshTokenException('Could not find the Guest token in HTML')
|
twitter-scraper/twint-master/twint/tweet.py
ADDED
@@ -0,0 +1,166 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from time import strftime, localtime
|
2 |
+
from datetime import datetime, timezone
|
3 |
+
|
4 |
+
import logging as logme
|
5 |
+
from googletransx import Translator
|
6 |
+
# ref.
|
7 |
+
# - https://github.com/x0rzkov/py-googletrans#basic-usage
|
8 |
+
translator = Translator()
|
9 |
+
|
10 |
+
|
11 |
+
class tweet:
    """Plain attribute container for a single scraped tweet.

    Fields are attached dynamically by Tweet(); `type` tags the record
    kind for the output/storage layer.
    """

    type = "tweet"

    def __init__(self):
        pass
|
18 |
+
|
19 |
+
|
20 |
+
def utc_to_local(utc_dt):
    """Interpret a naive datetime as UTC and convert it to local time."""
    aware = utc_dt.replace(tzinfo=timezone.utc)
    return aware.astimezone(tz=None)
|
22 |
+
|
23 |
+
|
24 |
+
# strftime patterns used by Tweet() to render datetime/datestamp/timestamp.
Tweet_formats = {
    'datetime': '%Y-%m-%d %H:%M:%S %Z',
    'datestamp': '%Y-%m-%d',
    'timestamp': '%H:%M:%S'
}
|
29 |
+
|
30 |
+
|
31 |
+
def _get_mentions(tw):
|
32 |
+
"""Extract mentions from tweet
|
33 |
+
"""
|
34 |
+
logme.debug(__name__ + ':get_mentions')
|
35 |
+
try:
|
36 |
+
mentions = [
|
37 |
+
{
|
38 |
+
'screen_name': _mention['screen_name'],
|
39 |
+
'name': _mention['name'],
|
40 |
+
'id': _mention['id_str'],
|
41 |
+
} for _mention in tw['entities']['user_mentions']
|
42 |
+
if tw['display_text_range'][0] < _mention['indices'][0]
|
43 |
+
]
|
44 |
+
except KeyError:
|
45 |
+
mentions = []
|
46 |
+
return mentions
|
47 |
+
|
48 |
+
|
49 |
+
def _get_reply_to(tw):
|
50 |
+
try:
|
51 |
+
reply_to = [
|
52 |
+
{
|
53 |
+
'screen_name': _mention['screen_name'],
|
54 |
+
'name': _mention['name'],
|
55 |
+
'id': _mention['id_str'],
|
56 |
+
} for _mention in tw['entities']['user_mentions']
|
57 |
+
if tw['display_text_range'][0] > _mention['indices'][1]
|
58 |
+
]
|
59 |
+
except KeyError:
|
60 |
+
reply_to = []
|
61 |
+
return reply_to
|
62 |
+
|
63 |
+
|
64 |
+
def getText(tw):
    """Return the tweet body with links spaced out and newlines flattened.

    A space is inserted before every "http" and "pic.twitter" occurrence
    so links glued to preceding words stay separated once newlines are
    replaced by spaces.
    """
    logme.debug(__name__ + ':getText')
    substitutions = (("http", " http"),
                     ("pic.twitter", " pic.twitter"),
                     ("\n", " "))
    text = tw['full_text']
    for old, new in substitutions:
        text = text.replace(old, new)
    return text
|
74 |
+
|
75 |
+
|
76 |
+
def Tweet(tw, config):
    """Create a tweet object from a raw adaptive-search JSON item.

    *tw* is one tweet dict from the Twitter API response (augmented with
    "user_data" / "retweet_data" by the caller); *config* supplies the
    run-level Near/Geo/Source/Translate settings copied onto the record.
    Raises Exception when translation is requested with an invalid
    destination language.
    """
    logme.debug(__name__ + ':Tweet')
    t = tweet()
    t.id = int(tw['id_str'])
    t.id_str = tw["id_str"]
    t.conversation_id = tw["conversation_id_str"]

    # parsing date to user-friendly format
    _dt = tw['created_at']
    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
    _dt = utc_to_local(_dt)
    t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
    # date is of the format year,
    t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
    t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
    t.user_id = int(tw["user_id_str"])
    t.user_id_str = tw["user_id_str"]
    t.username = tw["user_data"]['screen_name']
    t.name = tw["user_data"]['name']
    t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
    # timezone is the local machine's offset, not the tweet's.
    t.timezone = strftime("%z", localtime())
    t.mentions = _get_mentions(tw)
    t.reply_to = _get_reply_to(tw)
    # Optional entity lists: any missing key yields an empty default.
    try:
        t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
    except KeyError:
        t.urls = []
    try:
        t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
                    _img['expanded_url'].find('/photo/') != -1]
    except KeyError:
        t.photos = []
    try:
        t.video = 1 if len(tw['extended_entities']['media']) else 0
    except KeyError:
        t.video = 0
    try:
        t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
    except KeyError:
        t.thumbnail = ''
    t.tweet = getText(tw)
    t.lang = tw['lang']
    try:
        t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
    except KeyError:
        t.hashtags = []
    try:
        t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
    except KeyError:
        t.cashtags = []
    t.replies_count = tw['reply_count']
    t.retweets_count = tw['retweet_count']
    t.likes_count = tw['favorite_count']
    t.link = f"https://twitter.com/{t.username}/status/{t.id}"
    # NOTE(review): if 'retweet_data' exists but lacks 'user_rt_id', neither
    # branch runs and the retweet attributes stay unset — confirm upstream
    # always populates the full retweet_data dict.
    try:
        if 'user_rt_id' in tw['retweet_data']:
            t.retweet = True
            t.retweet_id = tw['retweet_data']['retweet_id']
            t.retweet_date = tw['retweet_data']['retweet_date']
            t.user_rt = tw['retweet_data']['user_rt']
            t.user_rt_id = tw['retweet_data']['user_rt_id']
    except KeyError:
        t.retweet = False
        t.retweet_id = ''
        t.retweet_date = ''
        t.user_rt = ''
        t.user_rt_id = ''
    try:
        t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
    except KeyError:
        # means that the quoted tweet have been deleted
        # NOTE(review): 0 here vs '' above — inconsistent sentinel types.
        t.quote_url = 0
    t.near = config.Near if config.Near else ""
    t.geo = config.Geo if config.Geo else ""
    t.source = config.Source if config.Source else ""
    t.translate = ''
    t.trans_src = ''
    t.trans_dest = ''
    if config.Translate:
        try:
            ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
            t.translate = ts.text
            t.trans_src = ts.src
            t.trans_dest = ts.dest
        # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
        except ValueError as e:
            logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
            raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
    return t
|
twitter-scraper/twint-master/twint/url.py
ADDED
@@ -0,0 +1,195 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import json
|
3 |
+
from sys import platform
|
4 |
+
import logging as logme
|
5 |
+
from urllib.parse import urlencode
|
6 |
+
from urllib.parse import quote
|
7 |
+
|
8 |
+
# Endpoints: the legacy mobile site (HTML pagination URLs) and the
# adaptive search API that Search() queries.
mobile = "https://mobile.twitter.com"
base = "https://api.twitter.com/2/search/adaptive.json"
|
10 |
+
|
11 |
+
|
12 |
+
def _sanitizeQuery(_url, params):
|
13 |
+
_serialQuery = ""
|
14 |
+
_serialQuery = urlencode(params, quote_via=quote)
|
15 |
+
_serialQuery = _url + "?" + _serialQuery
|
16 |
+
return _serialQuery
|
17 |
+
|
18 |
+
|
19 |
+
def _formatDate(date):
|
20 |
+
if "win" in platform:
|
21 |
+
return f'\"{date.split()[0]}\"'
|
22 |
+
try:
|
23 |
+
return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
+
except ValueError:
|
25 |
+
return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
|
26 |
+
|
27 |
+
|
28 |
+
async def Favorites(username, init):
    """Build the mobile-site URL for a user's liked tweets.

    *init* is a pagination cursor; '-1' means first page (no max_id).
    """
    logme.debug(__name__ + ':Favorites')
    suffix = "" if init == '-1' else f"&max_id={init}"
    return f"{mobile}/{username}/favorites?lang=en{suffix}"
|
36 |
+
|
37 |
+
|
38 |
+
async def Followers(username, init):
    """Build the mobile-site URL for a user's followers list.

    *init* is a pagination cursor; '-1' means first page.
    """
    logme.debug(__name__ + ':Followers')
    suffix = "" if init == '-1' else f"&cursor={init}"
    return f"{mobile}/{username}/followers?lang=en{suffix}"
|
46 |
+
|
47 |
+
|
48 |
+
async def Following(username, init):
    """Build the mobile-site URL for the accounts a user follows.

    *init* is a pagination cursor; '-1' means first page.
    """
    logme.debug(__name__ + ':Following')
    suffix = "" if init == '-1' else f"&cursor={init}"
    return f"{mobile}/{username}/following?lang=en{suffix}"
|
56 |
+
|
57 |
+
|
58 |
+
async def MobileProfile(username, init):
    """Build the mobile-site URL for a user's profile timeline.

    *init* is a pagination cursor; '-1' means first page (no max_id).
    """
    logme.debug(__name__ + ':MobileProfile')
    suffix = "" if init == '-1' else f"&max_id={init}"
    return f"{mobile}/{username}?lang=en{suffix}"
|
66 |
+
|
67 |
+
|
68 |
+
async def Search(config, init):
    """Assemble the adaptive-search request for the configured query.

    Translates the config flags into Twitter search operators (from:,
    since:, filter:, min_faves:, ...) appended to the q parameter, plus
    the fixed adaptive.json parameter set.  *init* is the pagination
    cursor.  Returns (url, params, serialized_url).
    """
    logme.debug(__name__ + ':Search')
    url = base
    tweet_count = 100 if not config.Limit else config.Limit
    q = ""
    params = [
        # ('include_blocking', '1'),
        # ('include_blocked_by', '1'),
        # ('include_followed_by', '1'),
        # ('include_want_retweets', '1'),
        # ('include_mute_edge', '1'),
        # ('include_can_dm', '1'),
        ('include_can_media_tag', '1'),
        # ('skip_status', '1'),
        # ('include_cards', '1'),
        ('include_ext_alt_text', 'true'),
        ('include_quote_count', 'true'),
        ('include_reply_count', '1'),
        ('tweet_mode', 'extended'),
        ('include_entities', 'true'),
        ('include_user_entities', 'true'),
        ('include_ext_media_availability', 'true'),
        ('send_error_codes', 'true'),
        ('simple_quoted_tweet', 'true'),
        ('count', tweet_count),
        ('query_source', 'typed_query'),
        # ('pc', '1'),
        ('cursor', str(init)),
        ('spelling_corrections', '1'),
        ('ext', 'mediaStats%2ChighlightedLabel'),
        ('tweet_search_mode', 'live'),  # this can be handled better, maybe take an argument and set it then
    ]
    if not config.Popular_tweets:
        params.append(('f', 'tweets'))
    if config.Lang:
        params.append(("l", config.Lang))
        params.append(("lang", "en"))
    # Each flag below appends one search operator to q; the order of the
    # appends determines the final query string.
    if config.Query:
        q += f" from:{config.Query}"
    if config.Username:
        q += f" from:{config.Username}"
    if config.Geo:
        # geocode: operator rejects embedded spaces.
        config.Geo = config.Geo.replace(" ", "")
        q += f" geocode:{config.Geo}"
    if config.Search:

        q += f" {config.Search}"
    if config.Year:
        q += f" until:{config.Year}-1-1"
    if config.Since:
        q += f" since:{_formatDate(config.Since)}"
    if config.Until:
        q += f" until:{_formatDate(config.Until)}"
    if config.Email:
        q += ' "mail" OR "email" OR'
        q += ' "gmail" OR "e-mail"'
    if config.Phone:
        q += ' "phone" OR "call me" OR "text me"'
    if config.Verified:
        q += " filter:verified"
    if config.To:
        q += f" to:{config.To}"
    if config.All:
        q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
    if config.Near:
        q += f' near:"{config.Near}"'
    if config.Images:
        q += " filter:images"
    if config.Videos:
        q += " filter:videos"
    if config.Media:
        q += " filter:media"
    if config.Replies:
        q += " filter:replies"
    # although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
    if config.Native_retweets:
        q += " filter:nativeretweets"
    if config.Min_likes:
        q += f" min_faves:{config.Min_likes}"
    if config.Min_retweets:
        q += f" min_retweets:{config.Min_retweets}"
    if config.Min_replies:
        q += f" min_replies:{config.Min_replies}"
    if config.Links == "include":
        q += " filter:links"
    elif config.Links == "exclude":
        q += " exclude:links"
    if config.Source:
        q += f" source:\"{config.Source}\""
    if config.Members_list:
        q += f" list:{config.Members_list}"
    if config.Filter_retweets:
        q += f" exclude:nativeretweets exclude:retweets"
    # A custom query replaces everything accumulated above.
    if config.Custom_query:
        q = config.Custom_query

    q = q.strip()
    params.append(("q", q))
    _serialQuery = _sanitizeQuery(url, params)
    return url, params, _serialQuery
|
168 |
+
|
169 |
+
|
170 |
+
def SearchProfile(config, init=None):
    """Build the GraphQL UserTweetsAndReplies request for a profile.

    Returns a (url, params, serialized) triple shaped like Search();
    params is always empty because everything rides inside the
    'variables' JSON blob.  *init*, when a string, is the page cursor.
    """
    logme.debug(__name__ + ':SearchProfile')
    _url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
    variables = {
        "userId": config.User_id,
        "count": 100,
        "includePromotedContent": True,
        "withCommunity": True,
        "withSuperFollowsUserFields": True,
        "withBirdwatchPivots": False,
        "withDownvotePerspective": False,
        "withReactionsMetadata": False,
        "withReactionsPerspective": False,
        "withSuperFollowsTweetFields": True,
        "withVoice": True,
        "withV2Timeline": False,
        "__fs_interactive_text": False,
        "__fs_dont_mention_me_view_api_enabled": False,
    }
    if type(init) is str:
        variables['cursor'] = init
    params = [('variables', json.dumps(variables, separators=(',', ':')))]

    _serialQuery = _sanitizeQuery(_url, params)
    return _serialQuery, [], _serialQuery
|
twitter-scraper/twint-master/twint/user.py
ADDED
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import datetime
|
2 |
+
import logging as logme
|
3 |
+
|
4 |
+
|
5 |
+
class user:
    """Plain attribute container for one scraped user profile.

    Fields are attached dynamically by User(); `type` tags the record
    kind for the storage layer.
    """

    type = "user"

    def __init__(self):
        pass
|
10 |
+
|
11 |
+
|
12 |
+
# strftime patterns used by User() to render the account-creation moment.
User_formats = {
    'join_date': '%Y-%m-%d',
    'join_time': '%H:%M:%S %Z'
}
|
16 |
+
|
17 |
+
|
18 |
+
# ur object must be a json from the endpoint https://api.twitter.com/graphql
|
19 |
+
def User(ur):
    """Build a user object from a GraphQL user JSON payload.

    *ur* must be the JSON returned by the api.twitter.com/graphql user
    endpoint.  Raises KeyError (after logging) when the payload lacks
    the expected 'data' -> 'user' structure.
    """
    logme.debug(__name__ + ':User')
    # `or`, not `and` as originally written: either missing level makes the
    # payload unusable.  With `and`, a payload missing only 'user' slipped
    # past the guard, and one missing 'data' crashed with a raw KeyError
    # before the message below was ever logged.
    if 'data' not in ur or 'user' not in ur['data']:
        msg = 'malformed json! cannot be parsed to get user data'
        logme.fatal(msg)
        raise KeyError(msg)
    _usr = user()
    _node = ur['data']['user']
    _legacy = _node['legacy']

    _usr.id = _node['rest_id']
    _usr.name = _legacy['name']
    _usr.username = _legacy['screen_name']
    _usr.bio = _legacy['description']
    _usr.location = _legacy['location']
    _usr.url = _legacy['url']
    # parse the account-creation timestamp into user-friendly date/time parts
    _dt = datetime.datetime.strptime(_legacy['created_at'], '%a %b %d %H:%M:%S %z %Y')
    _usr.join_date = _dt.strftime(User_formats['join_date'])
    _usr.join_time = _dt.strftime(User_formats['join_time'])

    # counters arrive as JSON numbers/strings; coerce to int explicitly
    _usr.tweets = int(_legacy['statuses_count'])
    _usr.following = int(_legacy['friends_count'])
    _usr.followers = int(_legacy['followers_count'])
    _usr.likes = int(_legacy['favourites_count'])
    _usr.media_count = int(_legacy['media_count'])

    _usr.is_private = _legacy['protected']
    _usr.is_verified = _legacy['verified']
    _usr.avatar = _legacy['profile_image_url_https']
    _usr.background_image = _legacy['profile_banner_url']
    # TODO : future implementation
    # legacy_extended_profile is also available in some cases which can be used to get DOB of user
    return _usr
|
twitter-scraper/twint-master/twint/verbose.py
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
def Count(count, config):
    """Print the end-of-run summary line for the collected records."""
    if config.Followers:
        what = f"all {count} users who follow @{config.Username}"
    elif config.Following:
        what = f"all {count} users who @{config.Username} follows"
    elif config.Favorites:
        what = f"{count} Tweets that @{config.Username} liked"
    else:
        what = f"{count} Tweets_and_replies"
        if config.Username:
            what += f" from @{config.Username}"
    print(f"[+] Finished: Successfully collected {what}.")
|
15 |
+
|
16 |
+
def Elastic(elasticsearch):
    """Announce the Elasticsearch target when indexing is enabled."""
    if not elasticsearch:
        return
    print(f"[+] Indexing to Elasticsearch @ {elasticsearch}")
|
twitter-scraper/twint-master/twitter_scraper.ipynb
ADDED
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "a5361789",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"## Have to install these packages \n"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": null,
|
14 |
+
"id": "c9021300",
|
15 |
+
"metadata": {
|
16 |
+
"scrolled": true
|
17 |
+
},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"%%capture \n",
|
21 |
+
"!pip3 install Twint \n"
|
22 |
+
]
|
23 |
+
},
|
24 |
+
{
|
25 |
+
"cell_type": "markdown",
|
26 |
+
"id": "5c857dbf",
|
27 |
+
"metadata": {},
|
28 |
+
"source": [
|
29 |
+
"## Necessary Imports"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": null,
|
35 |
+
"id": "1413ab2b",
|
36 |
+
"metadata": {},
|
37 |
+
"outputs": [],
|
38 |
+
"source": [
|
39 |
+
"# import asyncio\n",
|
40 |
+
"# import os\n",
|
41 |
+
"# loop = asyncio.get_event_loop()\n",
|
42 |
+
"# loop.is_running()\n",
|
43 |
+
"# import twint\n",
|
44 |
+
"# import nest_asyncio\n",
|
45 |
+
"# nest_asyncio.apply()"
|
46 |
+
]
|
47 |
+
},
|
48 |
+
{
|
49 |
+
"cell_type": "code",
|
50 |
+
"execution_count": null,
|
51 |
+
"id": "d38514f3",
|
52 |
+
"metadata": {},
|
53 |
+
"outputs": [],
|
54 |
+
"source": [
|
55 |
+
"import scrape\n"
|
56 |
+
]
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"cell_type": "code",
|
60 |
+
"execution_count": null,
|
61 |
+
"id": "a7912a91",
|
62 |
+
"metadata": {},
|
63 |
+
"outputs": [],
|
64 |
+
"source": [
|
65 |
+
"from_date=\"2022-6-10 10:30:22\"\n",
|
66 |
+
"to_date= \"2022-6-30\"\n",
|
67 |
+
"num_tweets = 20\n",
|
68 |
+
"_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
|
69 |
+
]
|
70 |
+
},
|
71 |
+
{
|
72 |
+
"cell_type": "code",
|
73 |
+
"execution_count": null,
|
74 |
+
"id": "48d50b46",
|
75 |
+
"metadata": {},
|
76 |
+
"outputs": [],
|
77 |
+
"source": [
|
78 |
+
"tweets= _data.keys()\n",
|
79 |
+
"for i in tweets:\n",
|
80 |
+
" _data[i][\"tweet\"]\n",
|
81 |
+
" print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
|
82 |
+
]
|
83 |
+
},
|
84 |
+
{
|
85 |
+
"cell_type": "code",
|
86 |
+
"execution_count": null,
|
87 |
+
"id": "72cabcb5",
|
88 |
+
"metadata": {},
|
89 |
+
"outputs": [],
|
90 |
+
"source": [
|
91 |
+
"from_date=\"2022-6-10 10:30:22\"\n",
|
92 |
+
"to_date= \"2022-6-30\"\n",
|
93 |
+
"num_tweets = 20\n",
|
94 |
+
"_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
|
95 |
+
]
|
96 |
+
},
|
97 |
+
{
|
98 |
+
"cell_type": "code",
|
99 |
+
"execution_count": null,
|
100 |
+
"id": "549e4fb3",
|
101 |
+
"metadata": {},
|
102 |
+
"outputs": [],
|
103 |
+
"source": [
|
104 |
+
"tweets= _data[\"tweet\"]\n",
|
105 |
+
"for i in tweets:\n",
|
106 |
+
" print(i, \"\\n\", \"__________________________________________________________\")"
|
107 |
+
]
|
108 |
+
},
|
109 |
+
{
|
110 |
+
"cell_type": "code",
|
111 |
+
"execution_count": 3,
|
112 |
+
"id": "733dd44a",
|
113 |
+
"metadata": {},
|
114 |
+
"outputs": [
|
115 |
+
{
|
116 |
+
"name": "stdout",
|
117 |
+
"output_type": "stream",
|
118 |
+
"text": [
|
119 |
+
"Defaulting to user installation because normal site-packages is not writeable\n",
|
120 |
+
"Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
|
121 |
+
"Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
|
122 |
+
"Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
|
123 |
+
"Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
|
124 |
+
"Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
|
125 |
+
"Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
|
126 |
+
]
|
127 |
+
}
|
128 |
+
],
|
129 |
+
"source": [
|
130 |
+
"#%pip install -q snscrape==0.3.4\n",
|
131 |
+
"!pip3 install snscrape\n",
|
132 |
+
"#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
|
133 |
+
]
|
134 |
+
},
|
135 |
+
{
|
136 |
+
"cell_type": "code",
|
137 |
+
"execution_count": 14,
|
138 |
+
"id": "0d16422c",
|
139 |
+
"metadata": {},
|
140 |
+
"outputs": [
|
141 |
+
{
|
142 |
+
"name": "stdout",
|
143 |
+
"output_type": "stream",
|
144 |
+
"text": [
|
145 |
+
"Note: you may need to restart the kernel to use updated packages.\n"
|
146 |
+
]
|
147 |
+
}
|
148 |
+
],
|
149 |
+
"source": [
|
150 |
+
"%pip install -q snscrape==0.3.4\n",
|
151 |
+
"from datetime import date\n",
|
152 |
+
"import os\n",
|
153 |
+
"import pandas as pd\n",
|
154 |
+
"\n",
|
155 |
+
"\n",
|
156 |
+
"def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
|
157 |
+
" if u_or_s.lower() =='u':\n",
|
158 |
+
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
|
159 |
+
" else:\n",
|
160 |
+
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
|
161 |
+
" \n",
|
162 |
+
" os.system(extracted_tweets)\n",
|
163 |
+
" if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
|
164 |
+
" print('No Tweets found')\n",
|
165 |
+
" else:\n",
|
166 |
+
" df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
|
167 |
+
" data_list=[]\n",
|
168 |
+
" for row in df['content'].iteritems():\n",
|
169 |
+
" temp= str(row[0])+str(row[1])\n",
|
170 |
+
" temp= temp.replace(\"\\'\",\"\")\n",
|
171 |
+
" data_list.append(temp)\n",
|
172 |
+
" return data_list\n",
|
173 |
+
"\n"
|
174 |
+
]
|
175 |
+
},
|
176 |
+
{
|
177 |
+
"cell_type": "code",
|
178 |
+
"execution_count": 12,
|
179 |
+
"id": "8e2adb35",
|
180 |
+
"metadata": {},
|
181 |
+
"outputs": [
|
182 |
+
{
|
183 |
+
"name": "stdout",
|
184 |
+
"output_type": "stream",
|
185 |
+
"text": [
|
186 |
+
"No Tweets found\n"
|
187 |
+
]
|
188 |
+
},
|
189 |
+
{
|
190 |
+
"name": "stderr",
|
191 |
+
"output_type": "stream",
|
192 |
+
"text": [
|
193 |
+
"Traceback (most recent call last):\n",
|
194 |
+
" File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
|
195 |
+
" sys.exit(main())\n",
|
196 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
|
197 |
+
" args = parse_args()\n",
|
198 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
|
199 |
+
" import snscrape.modules\n",
|
200 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
|
201 |
+
" _import_modules()\n",
|
202 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
|
203 |
+
" module = importlib.import_module(moduleName)\n",
|
204 |
+
" File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
|
205 |
+
" return _bootstrap._gcd_import(name[level:], package, level)\n",
|
206 |
+
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
|
207 |
+
" class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
|
208 |
+
" File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
|
209 |
+
" raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
|
210 |
+
"TypeError: Multiple inheritance with NamedTuple is not supported\n"
|
211 |
+
]
|
212 |
+
},
|
213 |
+
{
|
214 |
+
"ename": "UnboundLocalError",
|
215 |
+
"evalue": "local variable 'df' referenced before assignment",
|
216 |
+
"output_type": "error",
|
217 |
+
"traceback": [
|
218 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
219 |
+
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
|
220 |
+
"\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
221 |
+
"\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
222 |
+
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
|
223 |
+
]
|
224 |
+
}
|
225 |
+
],
|
226 |
+
"source": [
|
227 |
+
"d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
|
228 |
+
]
|
229 |
+
},
|
230 |
+
{
|
231 |
+
"cell_type": "code",
|
232 |
+
"execution_count": null,
|
233 |
+
"id": "a2c837f4",
|
234 |
+
"metadata": {},
|
235 |
+
"outputs": [],
|
236 |
+
"source": []
|
237 |
+
}
|
238 |
+
],
|
239 |
+
"metadata": {
|
240 |
+
"kernelspec": {
|
241 |
+
"display_name": "Python 3.10.4 64-bit",
|
242 |
+
"language": "python",
|
243 |
+
"name": "python3"
|
244 |
+
},
|
245 |
+
"language_info": {
|
246 |
+
"codemirror_mode": {
|
247 |
+
"name": "ipython",
|
248 |
+
"version": 3
|
249 |
+
},
|
250 |
+
"file_extension": ".py",
|
251 |
+
"mimetype": "text/x-python",
|
252 |
+
"name": "python",
|
253 |
+
"nbconvert_exporter": "python",
|
254 |
+
"pygments_lexer": "ipython3",
|
255 |
+
"version": "3.10.4"
|
256 |
+
},
|
257 |
+
"vscode": {
|
258 |
+
"interpreter": {
|
259 |
+
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
260 |
+
}
|
261 |
+
}
|
262 |
+
},
|
263 |
+
"nbformat": 4,
|
264 |
+
"nbformat_minor": 5
|
265 |
+
}
|