4stra committed
Commit 76b4944
Parents: eceff29 a2b888f

Merge pull request #17 from Demea9000/5-create-twitterscraper-class

Files changed (39)
  1. twitter-scraper/TwitterScraper.py +0 -3
  2. twitter-scraper/twint-master/.github/FUNDING.yml +3 -0
  3. twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md +20 -0
  4. twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +17 -0
  5. twitter-scraper/twint-master/.gitignore +115 -0
  6. twitter-scraper/twint-master/.travis.yml +23 -0
  7. twitter-scraper/twint-master/Dockerfile +10 -0
  8. twitter-scraper/twint-master/LICENSE +21 -0
  9. twitter-scraper/twint-master/MANIFEST.in +1 -0
  10. twitter-scraper/twint-master/README.md +272 -0
  11. twitter-scraper/twint-master/Untitled.ipynb +282 -0
  12. twitter-scraper/twint-master/automate.py +65 -0
  13. twitter-scraper/twint-master/elasticsearch/README.md +5 -0
  14. twitter-scraper/twint-master/scrape.py +102 -0
  15. twitter-scraper/twint-master/scrape__init__.py +14 -0
  16. twitter-scraper/twint-master/setup.py +65 -0
  17. twitter-scraper/twint-master/test.py +92 -0
  18. twitter-scraper/twint-master/twint/__init__.py +32 -0
  19. twitter-scraper/twint-master/twint/__version__.py +3 -0
  20. twitter-scraper/twint-master/twint/cli.py +342 -0
  21. twitter-scraper/twint-master/twint/config.py +87 -0
  22. twitter-scraper/twint-master/twint/datelock.py +44 -0
  23. twitter-scraper/twint-master/twint/feed.py +145 -0
  24. twitter-scraper/twint-master/twint/format.py +91 -0
  25. twitter-scraper/twint-master/twint/get.py +298 -0
  26. twitter-scraper/twint-master/twint/output.py +241 -0
  27. twitter-scraper/twint-master/twint/run.py +412 -0
  28. twitter-scraper/twint-master/twint/storage/__init__.py +0 -0
  29. twitter-scraper/twint-master/twint/storage/db.py +297 -0
  30. twitter-scraper/twint-master/twint/storage/elasticsearch.py +364 -0
  31. twitter-scraper/twint-master/twint/storage/panda.py +196 -0
  32. twitter-scraper/twint-master/twint/storage/write.py +77 -0
  33. twitter-scraper/twint-master/twint/storage/write_meta.py +151 -0
  34. twitter-scraper/twint-master/twint/token.py +94 -0
  35. twitter-scraper/twint-master/twint/tweet.py +166 -0
  36. twitter-scraper/twint-master/twint/url.py +195 -0
  37. twitter-scraper/twint-master/twint/user.py +52 -0
  38. twitter-scraper/twint-master/twint/verbose.py +18 -0
  39. twitter-scraper/twint-master/twitter_scraper.ipynb +265 -0
twitter-scraper/TwitterScraper.py DELETED
@@ -1,3 +0,0 @@
- print("My name is Nils and this is my Test branch")
-
- print("This is my testBranch, you wont see this on main")
twitter-scraper/twint-master/.github/FUNDING.yml ADDED
@@ -0,0 +1,3 @@
+ # These are supported funding model platforms
+ patreon: twintproject
+ custom: paypal.me/noneprivacy
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md ADDED
@@ -0,0 +1,20 @@
+ # Issue Template
+ Please use this template!
+
+ ## Initial Check
+ > If the issue is a request, please specify that in the title (Example: [REQUEST] more features). If this is a question about twint, please specify that in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to twint. Thanks.
+
+ > Make sure you've checked the following:
+
+ - [ ] Python version is 3.6 or later;
+ - [ ] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
+ - [ ] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
+
+ ## Command Ran
+ > Please provide the _exact_ command you ran, including the username/search/code, so I can reproduce the issue.
+
+ ## Description of Issue
+ > Please use **as much detail as possible.**
+
+ ## Environment Details
+ > Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md ADDED
@@ -0,0 +1,17 @@
+ ### Initial Check
+ > If the issue is a request, please specify that in the title (Example: [REQUEST] more features). If this is a question about twint, please specify that in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to twint. Thanks.
+
+ > Make sure you've checked the following:
+
+ - [ ] Python version is 3.6;
+ - [ ] Using the latest version of Twint;
+ - [ ] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
+
+ ### Command Ran
+ > Please provide the _exact_ command you ran, including the username/search/code, so I can reproduce the issue.
+
+ ### Description of Issue
+ > Please use **as much detail as possible.**
+
+ ### Environment Details
+ > Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
twitter-scraper/twint-master/.gitignore ADDED
@@ -0,0 +1,115 @@
+ # Byte-compiled / optimized / DLL files
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ tweets.db
+ # C extensions
+ *.so
+
+ config.ini
+ twint/storage/mysql.py
+
+ # Node Dependency directories
+ node_modules/
+ jspm_packages/
+ tests/
+ # Distribution / packaging
+ .Python
+ env/
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+
+ # PyInstaller
+ # Usually these files are written by a python script from a template
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
+ *.manifest
+ *.spec
+
+ # Installer logs
+ pip-log.txt
+ pip-delete-this-directory.txt
+
+ # Unit test / coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ .hypothesis/
+
+ # Translations
+ *.mo
+ *.pot
+
+ # Django stuff:
+ *.log
+ local_settings.py
+
+ # Flask stuff:
+ instance/
+ .webassets-cache
+
+ # Scrapy stuff:
+ .scrapy
+
+ # Sphinx documentation
+ docs/_build/
+
+ # PyBuilder
+ target/
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # pyenv
+ .python-version
+
+ # celery beat schedule file
+ celerybeat-schedule
+
+ # SageMath parsed files
+ *.sage.py
+
+ # dotenv
+ .env
+
+ # virtualenv
+ .venv
+ venv/
+ ENV/
+
+ # Spyder project settings
+ .spyderproject
+ .spyproject
+
+ # Rope project settings
+ .ropeproject
+
+ # mkdocs documentation
+ /site
+
+ # mypy
+ .mypy_cache/
+
+ # output
+ *.csv
+ *.json
+ *.txt
+
+ test_twint.py
twitter-scraper/twint-master/.travis.yml ADDED
@@ -0,0 +1,23 @@
+ dist: bionic
+ language: python
+ python:
+   - "3.6"
+   - "3.7"
+   - "3.8"
+   - "nightly"
+ matrix:
+   allow_failures:
+     - python: "nightly"
+     - python: "3.8"
+ install:
+   - pip install -r requirements.txt
+ script:
+   - python test.py
+ deploy:
+   provider: pypi
+   user: "codyzacharias"
+   password:
+     secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
+   on:
+     tags: true
+     python: "3.7"
twitter-scraper/twint-master/Dockerfile ADDED
@@ -0,0 +1,10 @@
+ FROM python:3.6-buster
+ LABEL maintainer="codyzacharias@pm.me"
+
+ WORKDIR /root
+
+ RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
+     cd /root/twint && \
+     pip3 install . -r requirements.txt
+
+ CMD /bin/bash
twitter-scraper/twint-master/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2018 Cody Zacharias
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
twitter-scraper/twint-master/MANIFEST.in ADDED
@@ -0,0 +1 @@
+ include README.md LICENSE
twitter-scraper/twint-master/README.md ADDED
@@ -0,0 +1,272 @@
+ 20220207.0
+
+ # About this fork
+
+ [This repository](https://github.com/minamotorin/twint) is a fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint), maintained for my own use.
+
+ Modified by [minamotorin](https://github.com/minamotorin).
+
+ ## Updates from twintproject/twint
+
+ ### twint.token.RefreshTokenException: Could not find the Guest token in HTML
+
+ This problem has not occurred recently.
+
+ #### Related
+
+ - [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
+ - [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
+ - [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
+ - [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
+ - [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
+
+ ### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
+
+ The fix is **not complete**.
+ `twint.run.Profile` will work, but `twint.run.db` will not.
+ This means [`test.py`](./test.py) causes an error.
+
+ I think this is because the fields of the result table are not exactly the same as the traditional ones.
+
+ #### Related
+
+ - [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
+
+ ### [-] TWINT requires Python version 3.6+.
+
+ #### Related
+
+ - [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
+ - [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
+ - [twintproject/twint#1346](https://github.com/twintproject/twint/issues/1346)
+ - [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
+ - [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
+
+ ## References
+
+ - [snscrape](https://github.com/JustAnotherArchivist/snscrape)
+ - [gallery-dl](https://github.com/mikf/gallery-dl)
+
+ ## License
+
+ This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
+
+ ---
+
+ # TWINT - Twitter Intelligence Tool
+ ![2](https://i.imgur.com/iaH3s7z.png)
+ ![3](https://i.imgur.com/hVeCrqL.png)
+
+ [![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social)
+
+ > No authentication. No API. No limits.
+
+ Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
+
+ Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail addresses and phone numbers. I find this very useful, and you can get really creative with it too.
+
+ Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
+
+ ## tl;dr Benefits
+ Some of the benefits of using Twint vs the Twitter API:
+ - Can fetch almost __all__ Tweets (the Twitter API limits you to the last 3200 Tweets only);
+ - Fast initial setup;
+ - Can be used anonymously and without a Twitter sign-up;
+ - **No rate limitations**.
+
+ ## Limits imposed by Twitter
+ Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
+
+ ## Requirements
+ - Python 3.6;
+ - aiohttp;
+ - aiodns;
+ - beautifulsoup4;
+ - cchardet;
+ - dataclasses;
+ - elasticsearch;
+ - pysocks;
+ - pandas (>=0.23.0);
+ - aiohttp_socks;
+ - schedule;
+ - geopy;
+ - fake-useragent;
+ - py-googletransx.
+
+ ## Installing
+
+ **Git:**
+ ```bash
+ git clone --depth=1 https://github.com/twintproject/twint.git
+ cd twint
+ pip3 install . -r requirements.txt
+ ```
+
+ **Pip:**
+ ```bash
+ pip3 install twint
+ ```
+
+ or
+
+ ```bash
+ pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
+ ```
+
+ **Pipenv**:
+ ```bash
+ pipenv install git+https://github.com/twintproject/twint.git#egg=twint
+ ```
+
+ ### March 2, 2021 Update
+
+ **Added**: Dockerfile
+
+ Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
+
+ ## CLI Basic Examples and Combos
+ A few simple examples to help you understand the basics:
+
+ - `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
+ - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
+ - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
+ - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
+ - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
+ - `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
+ - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
+ - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
+ - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
+ - `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
+ - `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1 km around a place in Paris and export them to a csv file.
+ - `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch.
+ - `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
+ - `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
+ - `twint -u username --followers` - Scrape a Twitter user's followers.
+ - `twint -u username --following` - Scrape who a Twitter user follows.
+ - `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweets).
+ - `twint -u username --following --user-full` - Collect full user information of the people a user follows.
+ - `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (gathers ~3200 Tweets, including **retweets** & **replies**).
+ - `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (including retweets) from a user's profile.
+ - `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
+
+ More details about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands).
+
+ ## Module Example
+
+ Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
+
+ ```python
+ import twint
+
+ # Configure
+ c = twint.Config()
+ c.Username = "realDonaldTrump"
+ c.Search = "great"
+
+ # Run
+ twint.run.Search(c)
+ ```
+ > Output
+
+ `955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
+
+ ```python
+ import twint
+
+ c = twint.Config()
+
+ c.Username = "noneprivacy"
+ c.Custom["tweet"] = ["id"]
+ c.Custom["user"] = ["bio"]
+ c.Limit = 10
+ c.Store_csv = True
+ c.Output = "none"
+
+ twint.run.Search(c)
+ ```
+
+ ## Storing Options
+ - Write to file;
+ - CSV;
+ - JSON;
+ - SQLite;
+ - Elasticsearch.
+
+ ## Elasticsearch Setup
+
+ Details on setting up Elasticsearch with Twint are located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
+
+ ## Graph Visualization
+ ![graph](https://i.imgur.com/EEJqB8n.png)
+
+ [Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
+
+ We are developing a Twint Desktop App.
+
+ ![4](https://i.imgur.com/DzcfIgL.png)
+
+ ## FAQ
+ > I tried scraping tweets from a user. I know they exist, but I'm not getting them.
+
+ Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if you are using Twint as a module, add `config.Profile_full = True`. Please note that this process will be quite slow.
+
+ ## More Examples
+
+ #### Followers/Following
+
+ > To get only follower usernames/following usernames
+
+ `twint -u username --followers`
+
+ `twint -u username --following`
+
+ > To get user info of followers/following users
+
+ `twint -u username --followers --user-full`
+
+ `twint -u username --following --user-full`
+
+ #### userlist
+
+ > To get only user info of a user
+
+ `twint -u username --user-full`
+
+ > To get user info of users from a userlist
+
+ `twint --userlist inputlist --user-full`
+
+ #### tweet translation (experimental)
+
+ > To get 100 English tweets and translate them to Italian
+
+ `twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
+
+ or
+
+ ```python
+ import twint
+
+ c = twint.Config()
+ c.Username = "noneprivacy"
+ c.Limit = 100
+ c.Store_csv = True
+ c.Output = "none.csv"
+ c.Lang = "en"
+ c.Translate = True
+ c.TranslateDest = "it"
+ twint.run.Search(c)
+ ```
+
+ Notes:
+ - [Google Translate has some quotas](https://cloud.google.com/translate/quotas)
+
+ ## Featured Blog Posts:
+ - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
+ - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
+ - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
+ - [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
+
+ ## Contact
+
+ If you have any questions, want to join in discussions, or need extra help, you are welcome to join our Twint-focused channel at [OSINT team](https://osint.team).
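
A minimal module-mode sketch of the FAQ workaround above (the username is a placeholder; note that in this commit both the `--profile-full` flag and the `Profile_full` config field are commented out, so the attribute may be a no-op in this fork):

```python
import twint

# Slower, but more complete, timeline scrape per the README FAQ.
c = twint.Config()
c.Username = "example_user"  # placeholder username
c.Profile_full = True        # module equivalent of --profile-full (see note above)
twint.run.Profile(c)
```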
twitter-scraper/twint-master/Untitled.ipynb ADDED
@@ -0,0 +1,282 @@
+ {
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text= \"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär, 5. Invandring, 6. Integration \""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n",
+ "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n",
+ "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n",
+ "Requirement already satisfied: regex in /home/oxygen/snap/jupyter/common/lib/python3.7/site-packages (2022.6.2)\n"
+ ]
+ }
+ ],
+ "source": [
+ "!pip install regex\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['0']"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "re.findall(\"[0-9]+\", tl[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 48,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "'0. Äldrefrågor'"
+ ]
+ },
+ "execution_count": 48,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "tl[0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 49,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "['0', ' Äldrefrågor']"
+ ]
+ },
+ "execution_count": 49,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "f=tl[0].split('.')\n",
+ "\n",
+ "f#int(f[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 29,
+ "metadata": {},
+ "outputs": [
+ {
+ "ename": "NameError",
+ "evalue": "name 'str_topics_to_dict' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "\u001b[0;32m<ipython-input-29-b05d9860dbcf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstr_topics_to_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m: name 'str_topics_to_dict' is not defined"
+ ]
+ }
+ ],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "\n",
+ "def str_topics_to_dict(topics):\n",
+ " topic_list=topics.split(\",\")\n",
+ " ind_topic_dict={}\n",
+ " for i inrange(len(topic_list)): \n",
+ " index_topic_list=\n",
+ " ind=index_topic_list[0]\n",
+ " just_topic=index_topic_list[1][1:]\n",
+ " ind_topic_dict[int(ind)]=just_topic\n",
+ " return ind_topic_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: 'Brottslighet',\n",
+ " 1: 'Miljö',\n",
+ " 2: 'Skola',\n",
+ " 3: 'Sjukvård',\n",
+ " 4: 'Militär',\n",
+ " 5: 'Invandring',\n",
+ " 6: 'Integration '}"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "str_topics_to_dict(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 109,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "' Brottslighet, Miljö, Skola, Sjukvård, Militär stöd, Invandring, Integration '"
+ ]
+ },
+ "execution_count": 109,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "\n",
+ "text=\"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär stöd, 5. Invandring, 6. Integration \"\n",
+ "text=re.sub(r\"(\\n+)\",\" \",text)\n",
+ "text=re.sub(\"(\\.)|\\d+\",\"\",text )\n",
+ "text"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 100,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "[' Brottslighet',\n",
+ " ' Miljö',\n",
+ " ' Skola',\n",
+ " ' Sjukvård',\n",
+ " ' Militär stöd',\n",
+ " ' Invandring',\n",
+ " ' Integration ']"
+ ]
+ },
+ "execution_count": 100,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "text.split(\",\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 116,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import regex as re \n",
+ "def str_topics_to_dict(topics):\n",
+ " text=re.sub(r\"(\\n+)\",\" \",topics)\n",
+ " text=re.sub(\"(\\.)|\\d+\",\"\",topics )\n",
+ " topics=re.sub(r\"(\\n+)|(\\.)|\\d+\",\"\",topics)\n",
+ " topic_list=topics.split(\",\")\n",
+ " ind_topic_dict={}\n",
+ " for i in range(len(topic_list)): \n",
+ " ind=i\n",
+ " just_topic=topic_list[i]\n",
+ " ind_topic_dict[ind]=just_topic\n",
+ " return ind_topic_dict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 117,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "{0: ' Brottslighet',\n",
+ " 1: ' Miljö',\n",
+ " 2: ' Skola',\n",
+ " 3: ' Sjukvård',\n",
+ " 4: ' Militär stöd',\n",
+ " 5: ' Invandring',\n",
+ " 6: ' Integration '}"
+ ]
+ },
+ "execution_count": 117,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "str_topics_to_dict(text)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.7.3"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+ }
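
For reference outside the notebook JSON, a cleaned-up, runnable version of the `str_topics_to_dict` helper the notebook converges on in its final cells (a sketch using the standard `re` module instead of `regex`, with the stray leading whitespace stripped from each topic):

```python
import re

def str_topics_to_dict(topics: str) -> dict:
    """Turn a string like "\\n\\n0. Brottslighet, 1. Miljö, ..." into
    {0: 'Brottslighet', 1: 'Miljö', ...}."""
    # Drop newlines, periods, and the leading index digits, then split on commas.
    cleaned = re.sub(r"(\n+)|(\.)|\d+", "", topics)
    return {i: topic.strip() for i, topic in enumerate(cleaned.split(","))}

text = "\n\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård"
print(str_topics_to_dict(text))  # {0: 'Brottslighet', 1: 'Miljö', 2: 'Skola', 3: 'Sjukvård'}
```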
twitter-scraper/twint-master/automate.py ADDED
@@ -0,0 +1,65 @@
+ import twint
+ import schedule
+ import time
+
+ # You can change the name of each "job" after "def" if you'd like.
+ def jobone():
+     print("Fetching Tweets")
+     c = twint.Config()
+     # choose username (optional)
+     c.Username = "insert username here"
+     # choose search term (optional)
+     c.Search = "insert search term here"
+     # choose beginning time (narrow results)
+     c.Since = "2018-01-01"
+     # set limit on total tweets
+     c.Limit = 1000
+     # makes the csv format properly
+     c.Store_csv = True
+     # format of the csv
+     c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
+     # change the name of the csv file
+     c.Output = "filename.csv"
+     twint.run.Search(c)
+
+ def jobtwo():
+     print("Fetching Tweets")
+     c = twint.Config()
+     # choose username (optional)
+     c.Username = "insert username here"
+     # choose search term (optional)
+     c.Search = "insert search term here"
+     # choose beginning time (narrow results)
+     c.Since = "2018-01-01"
+     # set limit on total tweets
+     c.Limit = 1000
+     # makes the csv format properly
+     c.Store_csv = True
+     # format of the csv
+     c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
+     # change the name of the csv file
+     c.Output = "filename2.csv"
+     twint.run.Search(c)
+
+ # Run once when you start the program.
+
+ jobone()
+ jobtwo()
+
+ # Run every N minute(s), every hour, daily at a time, on a day of the week, or on a day of the week at a time.
+ # Keep "#" in front of the schedules you don't want; remove it to activate one.
+ # Also, replace "jobone" and "jobtwo" with your new function names (if applicable).
+
+ # schedule.every(1).minutes.do(jobone)
+ schedule.every().hour.do(jobone)
+ # schedule.every().day.at("10:30").do(jobone)
+ # schedule.every().monday.do(jobone)
+ # schedule.every().wednesday.at("13:15").do(jobone)
+
+ # schedule.every(1).minutes.do(jobtwo)
+ schedule.every().hour.do(jobtwo)
+ # schedule.every().day.at("10:30").do(jobtwo)
+ # schedule.every().monday.do(jobtwo)
+ # schedule.every().wednesday.at("13:15").do(jobtwo)
+
+ while True:
+     schedule.run_pending()
+     time.sleep(1)
twitter-scraper/twint-master/elasticsearch/README.md ADDED
@@ -0,0 +1,5 @@
+ # Elasticsearch How-To
+
+ ![dashboard](https://i.imgur.com/BEbtdo5.png)
+
+ Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
twitter-scraper/twint-master/scrape.py ADDED
@@ -0,0 +1,102 @@
+ import sys
+ import io
+ import time
+ import asyncio
+ import os
+
+ loop = asyncio.get_event_loop()
+ loop.is_running()
+ import twint
+ import nest_asyncio
+ nest_asyncio.apply()
+ from datetime import date
+
+ class scraper:
+     # Note: defined without self, so this is effectively a static method called on the class itself.
+     def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s', acceptable_range=10):
+         if not isinstance(from_date, str):
+             print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\"")
+             raise TypeError("Incorrect date type!")
+
+         time_out = time.time() + 2 * 60
+         _dict = {}
+         c = twint.Config()
+         if u_or_s.lower() == "u":
+             c.Search = "from:@" + search_str  # user
+         else:
+             c.Search = search_str  # topic
+         c.Pandas = True
+         num_tweets_and_replies = num_tweets
+         c.Count = True
+         for j in range(1, 5):
+             c.Limit = num_tweets_and_replies
+             c.Since = from_date
+             c.Until = to_date
+             c.Hide_output = True
+             # Capture twint's console output so only the summary is printed.
+             old_stdout = sys.stdout
+             new_stdout = io.StringIO()
+             sys.stdout = new_stdout
+             twint.run.Search(c)
+             output = new_stdout.getvalue()
+             sys.stdout = old_stdout
+             print(output[0:-2])
+             tweet_info = twint.output.panda.Tweets_df
+
+             t_count = 0
+             try:
+                 _keys = tweet_info["id"]
+                 # tweet_info is a dataframe with the following columns:
+                 '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
+                 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
+                 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
+                 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
+                 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
+                 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
+                 'trans_dest'],
+                 dtype='object')'''
+
+                 # Skip duplicates and replies (tweets starting with "@").
+                 for i in range(len(_keys)):
+                     if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
+                         pass
+                     else:
+                         _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
+                                                 "date": tweet_info["date"][i],
+                                                 "nlikes": tweet_info["nlikes"][i],
+                                                 "nreplies": tweet_info["nreplies"][i],
+                                                 "nretweets": tweet_info["nretweets"][i], "topic": ""}
+                     if len(list(_dict.keys())) == num_tweets:
+                         break
+             except:
+                 pass
+             print(len(list(_dict.keys())), " of them are Tweets")
+             if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
+                 return _dict
+             # Not enough tweets yet: widen the request and try again.
+             if len(list(_dict.keys())) < num_tweets:
+                 num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
+             else:
+                 break
+             if time_out < time.time():
+                 break
+             if output.startswith("[!] No more data!"):
+                 break
+         return _dict
+
+     def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10):
+         c = twint.Config()
+         c.Username = user_name
+         c.Search = search_str  # topic
+         c.Pandas = True
+         num_tweets_and_replies = num_tweets
+         c.Count = True
+         c.Limit = num_tweets_and_replies
+         c.Since = from_date
+         c.Until = to_date
+         c.Hide_output = True
+         twint.run.Search(c)
+         return twint.output.panda.Tweets_df
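
A usage sketch for the `scraper` class added above (assuming `scrape.py` is importable from the working directory; the search terms and counts are illustrative, and since `get_tweets` is defined without `self`, it is called on the class itself):

```python
from scrape import scraper

# Keyword search: aim for 20 tweets mentioning "pineapple" from 2021, returned as
# {tweet_id: {"tweet": ..., "date": ..., "nlikes": ..., "nreplies": ..., "nretweets": ..., "topic": ""}}.
tweets = scraper.get_tweets("pineapple",
                            from_date="2021-01-01",
                            to_date="2021-12-31",
                            num_tweets=20)
for tweet_id, info in tweets.items():
    print(tweet_id, info["date"], info["tweet"][:60])

# User mode: u_or_s="u" searches "from:@<name>" instead of a keyword.
user_tweets = scraper.get_tweets("jack", num_tweets=10, u_or_s="u")
```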
twitter-scraper/twint-master/scrape__init__.py ADDED
@@ -0,0 +1,14 @@
+ def scraper_libs():
+     import sys
+     import io
+     import time
+     import asyncio
+     import os
+     from tkinter import EXCEPTION
+     from numpy import not_equal
+     loop = asyncio.get_event_loop()
+     loop.is_running()
+     import twint
+     import nest_asyncio
+     nest_asyncio.apply()
+     from datetime import date
twitter-scraper/twint-master/setup.py ADDED
@@ -0,0 +1,65 @@
+ #!/usr/bin/python3
+ from setuptools import setup
+ import io
+ import os
+
+ # Package meta-data
+ NAME = 'twint'
+ DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
+ URL = 'https://github.com/twintproject/twint'
+ EMAIL = 'codyzacharias@pm.me'
+ AUTHOR = 'Cody Zacharias'
+ REQUIRES_PYTHON = '>=3.6.0'
+ VERSION = None
+
+ # Packages required
+ REQUIRED = [
+     'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
+     'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
+     'schedule', 'geopy', 'fake-useragent', 'googletransx'
+ ]
+
+ here = os.path.abspath(os.path.dirname(__file__))
+
+ with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
+     long_description = '\n' + f.read()
+
+ # Load the package's __version__.py
+ about = {}
+ if not VERSION:
+     with open(os.path.join(here, NAME, '__version__.py')) as f:
+         exec(f.read(), about)
+ else:
+     about['__version__'] = VERSION
+
+ setup(
+     name=NAME,
+     version=about['__version__'],
+     description=DESCRIPTION,
+     long_description=long_description,
+     long_description_content_type="text/markdown",
+     author=AUTHOR,
+     author_email=EMAIL,
+     python_requires=REQUIRES_PYTHON,
+     url=URL,
+     packages=['twint', 'twint.storage'],
+     entry_points={
+         'console_scripts': [
+             'twint = twint.cli:run_as_command',
+         ],
+     },
+     install_requires=REQUIRED,
+     dependency_links=[
+         'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
+     ],
+     license='MIT',
+     classifiers=[
+         'License :: OSI Approved :: MIT License',
+         'Programming Language :: Python',
+         'Programming Language :: Python :: 3',
+         'Programming Language :: Python :: 3.6',
+         'Programming Language :: Python :: 3.7',
+         'Programming Language :: Python :: 3.8',
+         'Programming Language :: Python :: Implementation :: CPython',
+     ],
+ )
twitter-scraper/twint-master/test.py ADDED
@@ -0,0 +1,92 @@
+ import twint
+ import os
+
+ '''
+ Test.py - Testing TWINT to make sure everything works.
+ '''
+
+
+ def test_reg(c, run):
+     print("[+] Beginning vanilla test in {}".format(str(run)))
+     run(c)
+
+
+ def test_db(c, run):
+     print("[+] Beginning DB test in {}".format(str(run)))
+     c.Database = "test_twint.db"
+     run(c)
+
+
+ def custom(c, run, _type):
+     print("[+] Beginning custom {} test in {}".format(_type, str(run)))
+     c.Custom['tweet'] = ["id", "username"]
+     c.Custom['user'] = ["id", "username"]
+     run(c)
+
+
+ def test_json(c, run):
+     c.Store_json = True
+     c.Output = "test_twint.json"
+     custom(c, run, "JSON")
+     print("[+] Beginning JSON test in {}".format(str(run)))
+     run(c)
+
+
+ def test_csv(c, run):
+     c.Store_csv = True
+     c.Output = "test_twint.csv"
+     custom(c, run, "CSV")
+     print("[+] Beginning CSV test in {}".format(str(run)))
+     run(c)
+
+
+ def main():
+     c = twint.Config()
+     c.Username = "verified"
+     c.Limit = 20
+     c.Store_object = True
+
+     # Separate objects are necessary.
+
+     f = twint.Config()
+     f.Username = "verified"
+     f.Limit = 20
+     f.Store_object = True
+     f.User_full = True
+
+     runs = [
+         twint.run.Profile,  # this doesn't
+         twint.run.Search,  # this works
+         twint.run.Following,
+         twint.run.Followers,
+         twint.run.Favorites,
+     ]
+
+     tests = [test_reg, test_json, test_csv, test_db]
+
+     # Something breaks if we don't split these up
+
+     for run in runs[:3]:
+         if run == twint.run.Search:
+             c.Since = "2012-1-1 20:30:22"
+             c.Until = "2017-1-1"
+         else:
+             c.Since = ""
+             c.Until = ""
+
+         for test in tests:
+             test(c, run)
+
+     for run in runs[3:]:
+         for test in tests:
+             test(f, run)
+
+     files = ["test_twint.db", "test_twint.json", "test_twint.csv"]
+     for _file in files:
+         os.remove(_file)
+
+     print("[+] Testing complete!")
+
+
+ if __name__ == '__main__':
+     main()
twitter-scraper/twint-master/twint/__init__.py ADDED
@@ -0,0 +1,32 @@
+ '''
+ TWINT - Twitter Intelligence Tool (formerly known as Tweep).
+
+ See wiki on Github for in-depth details.
+ https://github.com/twintproject/twint/wiki
+
+ Licensed under MIT License
+ Copyright (c) 2018 Cody Zacharias
+ '''
+ import logging, os
+
+ from .config import Config
+ from .__version__ import __version__
+ from . import run
+
+ _levels = {
+     'info': logging.INFO,
+     'debug': logging.DEBUG
+ }
+
+ _level = os.getenv('TWINT_DEBUG', 'info')
+ _logLevel = _levels[_level]
+
+ if _level == "debug":
+     logger = logging.getLogger()
+     _output_fn = 'twint.log'
+     logger.setLevel(_logLevel)
+     formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
+     fileHandler = logging.FileHandler(_output_fn)
+     fileHandler.setLevel(_logLevel)
+     fileHandler.setFormatter(formatter)
+     logger.addHandler(fileHandler)
@@ -0,0 +1,3 @@
 
 
 
 
1
+ VERSION = (2, 1, 21)
2
+
3
+ __version__ = '.'.join(map(str, VERSION))
twitter-scraper/twint-master/twint/cli.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ '''
3
+ Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
4
+
5
+ See wiki on Github for in-depth details.
6
+ https://github.com/twintproject/twint/wiki
7
+
8
+ Licensed under MIT License
9
+ Copyright (c) 2018 The Twint Project
10
+ '''
11
+ import sys
12
+ import os
13
+ import argparse
14
+
15
+ from . import run
16
+ from . import config
17
+ from . import storage
18
+
19
+
20
+ def error(_error, message):
21
+ """ Print errors to stdout
22
+ """
23
+ print("[-] {}: {}".format(_error, message))
24
+ sys.exit(0)
25
+
26
+
27
+ def check(args):
28
+ """ Error checking
29
+ """
30
+ if args.username is not None or args.userlist or args.members_list:
31
+ if args.verified:
32
+ error("Contradicting Args",
33
+ "Please use --verified in combination with -s.")
34
+ if args.userid:
35
+ error("Contradicting Args",
36
+ "--userid and -u cannot be used together.")
37
+ if args.all:
38
+ error("Contradicting Args",
39
+ "--all and -u cannot be used together.")
40
+ elif args.search and args.timeline:
41
+ error("Contradicting Args",
42
+ "--s and --tl cannot be used together.")
43
+ elif args.timeline and not args.username:
44
+ error("Error", "-tl cannot be used without -u.")
45
+ elif args.search is None:
46
+ if args.custom_query is not None:
47
+ pass
48
+ elif (args.geo or args.near) is None and not (args.all or args.userid):
49
+ error("Error", "Please use at least -u, -s, -g or --near.")
50
+ elif args.all and args.userid:
51
+ error("Contradicting Args",
52
+ "--all and --userid cannot be used together")
53
+ if args.output is None:
54
+ if args.csv:
55
+ error("Error", "Please specify an output file (Example: -o file.csv).")
56
+ elif args.json:
57
+ error("Error", "Please specify an output file (Example: -o file.json).")
58
+ if args.backoff_exponent <= 0:
59
+ error("Error", "Please specifiy a positive value for backoff_exponent")
60
+ if args.min_wait_time < 0:
61
+ error("Error", "Please specifiy a non negative value for min_wait_time")
62
+
63
+
64
+ def loadUserList(ul, _type):
65
+ """ Concatenate users
66
+ """
67
+ if os.path.exists(os.path.abspath(ul)):
68
+ userlist = open(os.path.abspath(ul), "r").read().splitlines()
69
+ else:
70
+ userlist = ul.split(",")
71
+ if _type == "search":
72
+ un = ""
73
+ for user in userlist:
74
+ un += "%20OR%20from%3A" + user
75
+ return un[15:]
76
+ return userlist
77
+
78
+
79
+ def initialize(args):
80
+ """ Set default values for config from args
81
+ """
82
+ c = config.Config()
83
+ c.Username = args.username
84
+ c.User_id = args.userid
85
+ c.Search = args.search
86
+ c.Geo = args.geo
87
+ c.Location = args.location
88
+ c.Near = args.near
89
+ c.Lang = args.lang
90
+ c.Output = args.output
91
+ c.Elasticsearch = args.elasticsearch
92
+ c.Year = args.year
93
+ c.Since = args.since
94
+ c.Until = args.until
95
+ c.Email = args.email
96
+ c.Phone = args.phone
97
+ c.Verified = args.verified
98
+ c.Store_csv = args.csv
99
+ c.Tabs = args.tabs
100
+ c.Store_json = args.json
101
+ c.Show_hashtags = args.hashtags
102
+ c.Show_cashtags = args.cashtags
103
+ c.Limit = args.limit
104
+ c.Count = args.count
105
+ c.Stats = args.stats
106
+ c.Database = args.database
107
+ c.To = args.to
108
+ c.All = args.all
109
+ c.Essid = args.essid
110
+ c.Format = args.format
111
+ c.User_full = args.user_full
112
+ # c.Profile_full = args.profile_full
113
+ c.Pandas_type = args.pandas_type
114
+ c.Index_tweets = args.index_tweets
115
+ c.Index_follow = args.index_follow
116
+ c.Index_users = args.index_users
117
+ c.Debug = args.debug
118
+ c.Resume = args.resume
119
+ c.Images = args.images
120
+ c.Videos = args.videos
121
+ c.Media = args.media
122
+ c.Replies = args.replies
123
+ c.Pandas_clean = args.pandas_clean
124
+ c.Proxy_host = args.proxy_host
125
+ c.Proxy_port = args.proxy_port
126
+ c.Proxy_type = args.proxy_type
127
+ c.Tor_control_port = args.tor_control_port
128
+ c.Tor_control_password = args.tor_control_password
129
+ c.Retweets = args.retweets
130
+ c.Custom_query = args.custom_query
131
+ c.Popular_tweets = args.popular_tweets
132
+ c.Skip_certs = args.skip_certs
133
+ c.Hide_output = args.hide_output
134
+ c.Native_retweets = args.native_retweets
135
+ c.Min_likes = args.min_likes
136
+ c.Min_retweets = args.min_retweets
137
+ c.Min_replies = args.min_replies
138
+ c.Links = args.links
139
+ c.Source = args.source
140
+ c.Members_list = args.members_list
141
+ c.Filter_retweets = args.filter_retweets
142
+ c.Translate = args.translate
143
+ c.TranslateDest = args.translate_dest
144
+ c.Backoff_exponent = args.backoff_exponent
145
+ c.Min_wait_time = args.min_wait_time
146
+ return c
147
+
148
+
149
+ def options():
150
+ """ Parse arguments
151
+ """
152
+ ap = argparse.ArgumentParser(prog="twint",
153
+ usage="python3 %(prog)s [options]",
154
+ description="TWINT - An Advanced Twitter Scraping Tool.")
155
+ ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
156
+ ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
157
+ ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
158
+ ap.add_argument("--near", help="Near a specified city.")
159
+ ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
160
+ ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
161
+ ap.add_argument("-o", "--output", help="Save output to a file.")
162
+ ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
163
+ ap.add_argument("--year", help="Filter Tweets before specified year.")
164
+ ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
165
+ metavar="DATE")
166
+ ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
167
+ metavar="DATE")
168
+ ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
169
+ ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
170
+ ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
171
+ action="store_true")
172
+ ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
173
+ ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
174
+ ap.add_argument("--json", help="Write as .json file", action="store_true")
175
+ ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
176
+ ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true")
177
+ ap.add_argument("--userid", help="Twitter user id.")
178
+ ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
179
+ ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
180
+ action="store_true")
181
+ ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
182
+ action="store_true")
183
+ ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
184
+ ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
185
+ ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
186
+ ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
187
+ ap.add_argument("--following", help="Scrape a person's follows", action="store_true")
188
+ ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
189
+ ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
190
+ ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
191
+ ap.add_argument("--proxy-port", help="The port of the proxy server.")
192
+ ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
193
+ ap.add_argument("--tor-control-password",
194
+ help="If proxy-host is set to tor, this is the password for the control port",
195
+ default="my_password")
196
+ ap.add_argument("--essid",
197
+ help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
198
+ nargs="?", default="")
199
+ ap.add_argument("--userlist", help="Userlist from list or file.")
200
+ ap.add_argument("--retweets",
201
+ help="Include user's Retweets (Warning: limited).",
202
+ action="store_true")
203
+ ap.add_argument("--format", help="Custom output format (See wiki for details).")
204
+ ap.add_argument("--user-full",
205
+ help="Collect all user information (Use with followers or following only).",
206
+ action="store_true")
207
+ # I am removing this this feature for the time being, because it is no longer required, default method will do this
208
+ # ap.add_argument("--profile-full",
209
+ # help="Slow, but effective method of collecting a user's Tweets and RT.",
210
+ # action="store_true")
211
+ ap.add_argument(
212
+ "-tl",
213
+ "--timeline",
214
+ help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
215
+ action="store_true",
216
+ )
217
+ ap.add_argument("--translate",
218
+ help="Get tweets translated by Google Translate.",
219
+ action="store_true")
220
+ ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
221
+ default="en")
222
+ ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
223
+ ap.add_argument("--pandas-type",
224
+ help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
225
+ ap.add_argument("-it", "--index-tweets",
226
+ help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
227
+ ap.add_argument("-if", "--index-follow",
228
+ help="Custom Elasticsearch Index name for Follows.",
229
+ nargs="?", default="twintgraph")
230
+ ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
231
+                     nargs="?", default="twintuser")
+     ap.add_argument("--debug",
+                     help="Store information in debug logs", action="store_true")
+     ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
+     ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
+     ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
+     ap.add_argument("--media",
+                     help="Display Tweets with only images or videos.", action="store_true")
+     ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
+     ap.add_argument("-pc", "--pandas-clean",
+                     help="Automatically clean the Pandas dataframe at every scrape.")
+     ap.add_argument("-cq", "--custom-query", help="Custom search query.")
+     ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
+                     action="store_true")
+     ap.add_argument("-sc", "--skip-certs", help="Skip certificate verification. Useful for self-signed certificates.",
+                     action="store_false")
+     ap.add_argument("-ho", "--hide-output", help="Hide output; no tweets will be displayed.", action="store_true")
+     ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
+     ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
+     ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
+     ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
+     ap.add_argument("--links", help="Include or exclude tweets containing one or more links. If not specified," +
+                     " you will get tweets both with and without links.")
+     ap.add_argument("--source", help="Filter the tweets for a specific source client.")
+     ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
+     ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
+     ap.add_argument("--backoff-exponent", help="Specify an exponent for the polynomial backoff in case of errors.",
+                     type=float, default=3.0)
+     ap.add_argument("--min-wait-time", type=float, default=15,
+                     help="Specify a minimum wait time in case of a scraping-limit error. This value will be "
+                          "adjusted by twint if the provided value does not satisfy the limit constraints.")
+     args = ap.parse_args()
+ 
+     return args
+ 
+ 
+ def main():
+     """ Main
+     """
+     args = options()
+     check(args)
+ 
+     if args.pandas_clean:
+         storage.panda.clean()
+ 
+     c = initialize(args)
+ 
+     if args.userlist:
+         c.Query = loadUserList(args.userlist, "search")
+ 
+     if args.pandas_clean:
+         storage.panda.clean()
+ 
+     if args.favorites:
+         if args.userlist:
+             _userlist = loadUserList(args.userlist, "favorites")
+             for _user in _userlist:
+                 args.username = _user
+                 c = initialize(args)
+                 run.Favorites(c)
+         else:
+             run.Favorites(c)
+     elif args.following:
+         if args.userlist:
+             _userlist = loadUserList(args.userlist, "following")
+             for _user in _userlist:
+                 args.username = _user
+                 c = initialize(args)
+                 run.Following(c)
+         else:
+             run.Following(c)
+     elif args.followers:
+         if args.userlist:
+             _userlist = loadUserList(args.userlist, "followers")
+             for _user in _userlist:
+                 args.username = _user
+                 c = initialize(args)
+                 run.Followers(c)
+         else:
+             run.Followers(c)
+     elif args.retweets:  # or args.profile_full:
+         if args.userlist:
+             _userlist = loadUserList(args.userlist, "profile")
+             for _user in _userlist:
+                 args.username = _user
+                 c = initialize(args)
+                 run.Profile(c)
+         else:
+             run.Profile(c)
+     elif args.user_full:
+         if args.userlist:
+             _userlist = loadUserList(args.userlist, "userlist")
+             for _user in _userlist:
+                 args.username = _user
+                 c = initialize(args)
+                 run.Lookup(c)
+         else:
+             run.Lookup(c)
+     elif args.timeline:
+         run.Profile(c)
+     else:
+         run.Search(c)
+ 
+ 
+ def run_as_command():
+     if sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor < 6):
+         print("[-] TWINT requires Python version 3.6+.")
+         sys.exit(0)
+ 
+     main()
+ 
+ 
+ if __name__ == '__main__':
+     main()
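+ 
+ # Illustrative invocations (a sketch; assumes the `twint` console entry point and
+ # the -u/-s flags defined earlier in this module):
+ #     twint -u someuser --limit 100
+ #     twint -s "keyword" --min-likes 10 --filter-retweets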
twitter-scraper/twint-master/twint/config.py ADDED
@@ -0,0 +1,87 @@
+ from dataclasses import dataclass, field
+ from typing import Optional
+ 
+ 
+ @dataclass
+ class Config:
+     Username: Optional[str] = None
+     User_id: Optional[str] = None
+     Search: Optional[str] = None
+     Lookup: bool = False
+     Geo: str = ""
+     Location: bool = False
+     Near: Optional[str] = None
+     Lang: Optional[str] = None
+     Output: Optional[str] = None
+     Elasticsearch: object = None
+     Year: Optional[int] = None
+     Since: Optional[str] = None
+     Until: Optional[str] = None
+     Email: Optional[str] = None
+     Phone: Optional[str] = None
+     Verified: bool = False
+     Store_csv: bool = False
+     Store_json: bool = False
+     # default_factory keeps instances from sharing one mutable dict
+     Custom: dict = field(default_factory=lambda: {"tweet": None, "user": None, "username": None})
+     Show_hashtags: bool = False
+     Show_cashtags: bool = False
+     Limit: Optional[int] = None
+     Count: Optional[int] = None
+     Stats: bool = False
+     Database: object = None
+     To: Optional[str] = None
+     All: object = None
+     Debug: bool = False
+     Format: Optional[str] = None
+     Essid: str = ""
+     Profile: bool = False
+     Followers: bool = False
+     Following: bool = False
+     Favorites: bool = False
+     TwitterSearch: bool = False
+     User_full: bool = False
+     # Profile_full: bool = False
+     Store_object: bool = False
+     Store_object_tweets_list: Optional[list] = None
+     Store_object_users_list: Optional[list] = None
+     Store_object_follow_list: Optional[list] = None
+     Pandas_type: Optional[type] = None
+     Pandas: bool = False
+     Index_tweets: str = "twinttweets"
+     Index_follow: str = "twintgraph"
+     Index_users: str = "twintuser"
+     Retries_count: int = 10
+     Resume: object = None
+     Images: bool = False
+     Videos: bool = False
+     Media: bool = False
+     Replies: bool = False
+     Pandas_clean: bool = True
+     Lowercase: bool = True
+     Pandas_au: bool = True
+     Proxy_host: str = ""
+     Proxy_port: int = 0
+     Proxy_type: object = None
+     Tor_control_port: int = 9051
+     Tor_control_password: Optional[str] = None
+     Retweets: bool = False
+     Query: Optional[str] = None
+     Hide_output: bool = False
+     Custom_query: str = ""
+     Popular_tweets: bool = False
+     Skip_certs: bool = False
+     Native_retweets: bool = False
+     Min_likes: int = 0
+     Min_retweets: int = 0
+     Min_replies: int = 0
+     Links: Optional[str] = None
+     Source: Optional[str] = None
+     Members_list: Optional[str] = None
+     Filter_retweets: bool = False
+     Translate: bool = False
+     TranslateSrc: str = "en"
+     TranslateDest: str = "en"
+     Backoff_exponent: float = 3.0
+     Min_wait_time: int = 0
+     Bearer_token: Optional[str] = None
+     Guest_token: Optional[str] = None
+     deleted: Optional[list] = None
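+ 
+ # Illustrative usage (a sketch, not part of this module): a typical programmatic
+ # search using the run module shown later in this diff.
+ #
+ #     import twint
+ #     c = twint.Config()
+ #     c.Search = "data science"
+ #     c.Limit = 20
+ #     c.Store_csv = True
+ #     c.Output = "tweets.csv"
+ #     twint.run.Search(c)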
twitter-scraper/twint-master/twint/datelock.py ADDED
@@ -0,0 +1,44 @@
+ import datetime
+ 
+ import logging as logme
+ 
+ from .tweet import utc_to_local
+ 
+ 
+ class Datelock:
+     until = None
+     since = None
+     _since_def_user = None
+ 
+ 
+ def convertToDateTime(string):
+     dateTimeList = string.split()
+     ListLength = len(dateTimeList)
+     if ListLength == 2:
+         return string
+     if ListLength == 1:
+         return string + " 00:00:00"
+     else:
+         return ""
+ 
+ 
+ def Set(Until, Since):
+     logme.debug(__name__ + ':Set')
+     d = Datelock()
+ 
+     if Until:
+         d.until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
+         d.until = utc_to_local(d.until)
+     else:
+         d.until = datetime.datetime.today()
+ 
+     if Since:
+         d.since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S")
+         d.since = utc_to_local(d.since)
+         d._since_def_user = True
+     else:
+         d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S")
+         d.since = utc_to_local(d.since)
+         d._since_def_user = False
+ 
+     return d
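+ 
+ # Illustrative behavior (a sketch based on the functions above):
+ #     convertToDateTime("2020-01-01")          -> "2020-01-01 00:00:00"
+ #     convertToDateTime("2020-01-01 12:30:00") -> "2020-01-01 12:30:00"
+ #     Set("2020-06-01", "2020-01-01")          -> Datelock with since/until parsed
+ #                                                 and converted to local time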
twitter-scraper/twint-master/twint/feed.py ADDED
@@ -0,0 +1,145 @@
+ import time
+ from datetime import datetime
+ 
+ from bs4 import BeautifulSoup
+ from re import findall
+ from json import loads
+ 
+ import logging as logme
+ 
+ from .tweet import utc_to_local, Tweet_formats
+ 
+ 
+ class NoMoreTweetsException(Exception):
+     def __init__(self, msg):
+         super().__init__(msg)
+ 
+ 
+ def Follow(response):
+     logme.debug(__name__ + ':Follow')
+     soup = BeautifulSoup(response, "html.parser")
+     follow = soup.find_all("td", "info fifty screenname")
+     cursor = soup.find_all("div", "w-button-more")
+     try:
+         cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
+     except IndexError:
+         logme.critical(__name__ + ':Follow:IndexError')
+ 
+     return follow, cursor
+ 
+ 
+ # TODO: this won't be used by --profile-full anymore. If it isn't used anywhere else, perhaps remove it in the future.
+ def Mobile(response):
+     logme.debug(__name__ + ':Mobile')
+     soup = BeautifulSoup(response, "html.parser")
+     tweets = soup.find_all("span", "metadata")
+     max_id = soup.find_all("div", "w-button-more")
+     try:
+         max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
+     except Exception as e:
+         logme.critical(__name__ + ':Mobile:' + str(e))
+ 
+     return tweets, max_id
+ 
+ 
+ def MobileFav(response):
+     soup = BeautifulSoup(response, "html.parser")
+     tweets = soup.find_all("table", "tweet")
+     max_id = soup.find_all("div", "w-button-more")
+     try:
+         max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
+     except Exception as e:
+         print(str(e) + " [x] feed.MobileFav")
+ 
+     return tweets, max_id
+ 
+ 
+ def _get_cursor(response):
+     if isinstance(response, dict):  # case 1
+         try:
+             next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
+                 'operation']['cursor']['value']
+         except KeyError:
+             # this is needed because after the first request the location of the cursor changes
+             next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
+                 'cursor']['value']
+     else:  # case 2
+         next_cursor = response[-1]['content']['value']
+     return next_cursor
+ 
+ 
+ def Json(response):
+     logme.debug(__name__ + ':Json')
+     json_response = loads(response)
+     html = json_response["items_html"]
+     soup = BeautifulSoup(html, "html.parser")
+     feed = soup.find_all("div", "tweet")
+     return feed, json_response["min_position"]
+ 
+ 
+ def parse_tweets(config, response):
+     logme.debug(__name__ + ':parse_tweets')
+     response = loads(response)
+     feed = []
+     if 'globalObjects' in response:
+         if len(response['globalObjects']['tweets']) == 0:
+             msg = 'No more data!'
+             raise NoMoreTweetsException(msg)
+         for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
+             # this will handle the cases when the timeline entry is a tweet
+             if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
+                                                              timeline_entry['entryId'].startswith('tweet-')):
+                 if 'tweet' in timeline_entry['content']['item']['content']:
+                     _id = timeline_entry['content']['item']['content']['tweet']['id']
+                     # skip the ads
+                     if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
+                         continue
+                 elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
+                         timeline_entry['content']['item']['content']['tombstone']:
+                     _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
+                 else:
+                     _id = None
+                 if _id is None:
+                     raise ValueError('Unable to find ID of tweet in timeline.')
+                 try:
+                     temp_obj = response['globalObjects']['tweets'][_id]
+                 except KeyError:
+                     logme.info('encountered a deleted tweet with id {}'.format(_id))
+                     config.deleted.append(_id)
+                     continue
+                 temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
+                 if 'retweeted_status_id_str' in temp_obj:
+                     rt_id = temp_obj['retweeted_status_id_str']
+                     _dt = response['globalObjects']['tweets'][rt_id]['created_at']
+                     _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
+                     _dt = utc_to_local(_dt)
+                     _dt = str(_dt.strftime(Tweet_formats['datetime']))
+                     temp_obj['retweet_data'] = {
+                         'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
+                         'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
+                         'retweet_id': rt_id,
+                         'retweet_date': _dt,
+                     }
+                 feed.append(temp_obj)
+         next_cursor = _get_cursor(response)  # case 1
+     else:
+         response = response['data']['user']['result']['timeline']
+         entries = response['timeline']['instructions']
+         for e in entries:
+             if e.get('entries'):
+                 entries = e['entries']
+                 break
+         if len(entries) == 2:
+             msg = 'No more data!'
+             raise NoMoreTweetsException(msg)
+         for timeline_entry in entries:
+             if timeline_entry['content'].get('itemContent'):
+                 try:
+                     temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
+                     temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result'][
+                         'core']['user_results']['result']['legacy']
+                     feed.append(temp_obj)
+                 except KeyError:
+                     # a bare `next` here was a silent no-op; skip entries that lack the expected fields
+                     continue
+         next_cursor = _get_cursor(entries)  # case 2
+     return feed, next_cursor
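+ 
+ # Illustrative flow (a sketch): run.Twint.Feed() (see run.py below) calls
+ # parse_tweets(config, response) repeatedly, feeding the returned next_cursor
+ # back in as the pagination token until NoMoreTweetsException is raised.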
twitter-scraper/twint-master/twint/format.py ADDED
@@ -0,0 +1,91 @@
+ import logging as logme
+ 
+ 
+ def Tweet(config, t):
+     if config.Format:
+         logme.debug(__name__ + ':Tweet:Format')
+         output = config.Format.replace("{id}", t.id_str)
+         output = output.replace("{conversation_id}", t.conversation_id)
+         output = output.replace("{date}", t.datestamp)
+         output = output.replace("{time}", t.timestamp)
+         output = output.replace("{user_id}", t.user_id_str)
+         output = output.replace("{username}", t.username)
+         output = output.replace("{name}", t.name)
+         output = output.replace("{place}", t.place)
+         output = output.replace("{timezone}", t.timezone)
+         output = output.replace("{urls}", ",".join(t.urls))
+         output = output.replace("{photos}", ",".join(t.photos))
+         output = output.replace("{video}", str(t.video))
+         output = output.replace("{thumbnail}", t.thumbnail)
+         output = output.replace("{tweet}", t.tweet)
+         output = output.replace("{language}", t.lang)
+         output = output.replace("{hashtags}", ",".join(t.hashtags))
+         output = output.replace("{cashtags}", ",".join(t.cashtags))
+         # str() guards against integer counts, which str.replace would reject
+         output = output.replace("{replies}", str(t.replies_count))
+         output = output.replace("{retweets}", str(t.retweets_count))
+         output = output.replace("{likes}", str(t.likes_count))
+         output = output.replace("{link}", t.link)
+         output = output.replace("{is_retweet}", str(t.retweet))
+         output = output.replace("{user_rt_id}", str(t.user_rt_id))
+         output = output.replace("{quote_url}", t.quote_url)
+         output = output.replace("{near}", t.near)
+         output = output.replace("{geo}", t.geo)
+         output = output.replace("{mentions}", ",".join(t.mentions))
+         output = output.replace("{translate}", t.translate)
+         output = output.replace("{trans_src}", t.trans_src)
+         output = output.replace("{trans_dest}", t.trans_dest)
+     else:
+         logme.debug(__name__ + ':Tweet:notFormat')
+         output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
+ 
+         # TODO: someone who is familiar with this code needs to take a look at what this is <also see tweet.py>
+         # if t.retweet:
+         #     output += "RT "
+ 
+         output += f"<{t.username}> {t.tweet}"
+ 
+         if config.Show_hashtags:
+             hashtags = ",".join(t.hashtags)
+             output += f" {hashtags}"
+         if config.Show_cashtags:
+             cashtags = ",".join(t.cashtags)
+             output += f" {cashtags}"
+         if config.Stats:
+             output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
+         if config.Translate:
+             output += f" {t.translate} {t.trans_src} {t.trans_dest}"
+     return output
+ 
+ 
+ def User(_format, u):
+     if _format:
+         logme.debug(__name__ + ':User:Format')
+         output = _format.replace("{id}", str(u.id))
+         output = output.replace("{name}", u.name)
+         output = output.replace("{username}", u.username)
+         output = output.replace("{bio}", u.bio)
+         output = output.replace("{location}", u.location)
+         output = output.replace("{url}", u.url)
+         output = output.replace("{join_date}", u.join_date)
+         output = output.replace("{join_time}", u.join_time)
+         output = output.replace("{tweets}", str(u.tweets))
+         output = output.replace("{following}", str(u.following))
+         output = output.replace("{followers}", str(u.followers))
+         output = output.replace("{likes}", str(u.likes))
+         output = output.replace("{media}", str(u.media_count))
+         output = output.replace("{private}", str(u.is_private))
+         output = output.replace("{verified}", str(u.is_verified))
+         output = output.replace("{avatar}", u.avatar)
+         if u.background_image:
+             output = output.replace("{background_image}", u.background_image)
+         else:
+             output = output.replace("{background_image}", "")
+     else:
+         logme.debug(__name__ + ':User:notFormat')
+         output = f"{u.id} | {u.name} | @{u.username} | Private: "
+         output += f"{u.is_private} | Verified: {u.is_verified} |"
+         output += f" Bio: {u.bio} | Location: {u.location} | Url: "
+         output += f"{u.url} | Joined: {u.join_date} {u.join_time} "
+         output += f"| Tweets: {u.tweets} | Following: {u.following}"
+         output += f" | Followers: {u.followers} | Likes: {u.likes} "
+         output += f"| Media: {u.media_count} | Avatar: {u.avatar}"
+ 
+     return output
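+ 
+ # Illustrative custom format (a sketch): with
+ #     config.Format = "{date} {time} <{username}> {tweet}"
+ # Tweet() would yield e.g. "2020-01-01 12:00:00 <someuser> hello world".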
twitter-scraper/twint-master/twint/get.py ADDED
@@ -0,0 +1,298 @@
+ from async_timeout import timeout
+ from datetime import datetime
+ from bs4 import BeautifulSoup
+ import sys
+ import socket
+ import aiohttp
+ from fake_useragent import UserAgent
+ import asyncio
+ import concurrent.futures
+ import random
+ from json import loads, dumps
+ from aiohttp_socks import ProxyConnector, ProxyType
+ from urllib.parse import quote
+ import time
+ 
+ from . import url
+ from .output import Tweets, Users
+ from .token import TokenExpiryException
+ 
+ import logging as logme
+ 
+ httpproxy = None
+ 
+ user_agent_list = [
+     # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/60.0.3112.113 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/60.0.3112.90 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/60.0.3112.90 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/60.0.3112.90 Safari/537.36',
+     # 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/44.0.2403.157 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/60.0.3112.113 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/57.0.2987.133 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/57.0.2987.133 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/55.0.2883.87 Safari/537.36',
+     # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
+     # ' Chrome/55.0.2883.87 Safari/537.36',
+ 
+     'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
+     'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
+     'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
+     'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
+     'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
+     'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
+     'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
+     'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
+     'CLR 3.5.30729)',
+ ]
+ 
+ 
+ # Convert a python `dict` to JSON and percent-encode it so it can be passed in the URL as a parameter;
+ # some URLs require this format.
+ def dict_to_url(dct):
+     return quote(dumps(dct))
+ 
+ 
+ def get_connector(config):
+     logme.debug(__name__ + ':get_connector')
+     _connector = None
+     if config.Proxy_host:
+         if config.Proxy_host.lower() == "tor":
+             _connector = ProxyConnector(
+                 host='127.0.0.1',
+                 port=9050,
+                 rdns=True)
+         elif config.Proxy_port and config.Proxy_type:
+             if config.Proxy_type.lower() == "socks5":
+                 _type = ProxyType.SOCKS5
+             elif config.Proxy_type.lower() == "socks4":
+                 _type = ProxyType.SOCKS4
+             elif config.Proxy_type.lower() == "http":
+                 global httpproxy
+                 httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
+                 return _connector
+             else:
+                 logme.critical("get_connector:proxy-type-error")
+                 print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
+                 sys.exit(1)
+             _connector = ProxyConnector(
+                 proxy_type=_type,
+                 host=config.Proxy_host,
+                 port=config.Proxy_port,
+                 rdns=True)
+         else:
+             logme.critical(__name__ + ':get_connector:proxy-port-type-error')
+             print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
+             sys.exit(1)
+     else:
+         if config.Proxy_port or config.Proxy_type:
+             logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
+             print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
+             sys.exit(1)
+ 
+     return _connector
+ 
+ 
+ async def RequestUrl(config, init, headers=None):
+     logme.debug(__name__ + ':RequestUrl')
+     _connector = get_connector(config)
+     _serialQuery = ""
+     params = []
+     _url = ""
+     _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
+     # run.py passes extra headers (e.g. a User-Agent) on retries; merge them in
+     if headers:
+         _headers.extend(headers)
+ 
+     # TODO : do this later
+     if config.Profile:
+         logme.debug(__name__ + ':RequestUrl:Profile')
+         _url, params, _serialQuery = url.SearchProfile(config, init)
+     elif config.TwitterSearch:
+         logme.debug(__name__ + ':RequestUrl:TwitterSearch')
+         _url, params, _serialQuery = await url.Search(config, init)
+     else:
+         if config.Following:
+             logme.debug(__name__ + ':RequestUrl:Following')
+             _url = await url.Following(config.Username, init)
+         elif config.Followers:
+             logme.debug(__name__ + ':RequestUrl:Followers')
+             _url = await url.Followers(config.Username, init)
+         else:
+             logme.debug(__name__ + ':RequestUrl:Favorites')
+             _url = await url.Favorites(config.Username, init)
+         _serialQuery = _url
+ 
+     response = await Request(_url, params=params, connector=_connector, headers=_headers)
+ 
+     if config.Debug:
+         print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))
+ 
+     return response
+ 
+ 
+ def ForceNewTorIdentity(config):
+     logme.debug(__name__ + ':ForceNewTorIdentity')
+     try:
+         tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
+         tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
+         response = tor_c.recv(1024)
+         if response != b'250 OK\r\n250 OK\r\n':
+             sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
+             logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
+     except Exception as e:
+         logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
+         sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
+         sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
+ 
+ 
+ async def Request(_url, connector=None, params=None, headers=None):
+     logme.debug(__name__ + ':Request:Connector')
+     async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
+         return await Response(session, _url, params)
+ 
+ 
+ async def Response(session, _url, params=None):
+     logme.debug(__name__ + ':Response')
+     retries = 5
+     wait = 10  # no firm basis; might work with 0
+     for attempt in range(retries + 1):
+         try:
+             with timeout(120):
+                 async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
+                     resp = await response.text()
+                     if response.status == 429:  # 429 means Too Many Requests, i.e. the rate limit was exceeded
+                         raise TokenExpiryException(loads(resp)['errors'][0]['message'])
+                     return resp
+         except aiohttp.client_exceptions.ClientConnectorError as exc:
+             if attempt < retries:
+                 retrying = ', retrying'
+                 level = logme.WARNING
+             else:
+                 retrying = ''
+                 level = logme.ERROR
+             logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
+             if attempt < retries:
+                 time.sleep(wait)
+             else:
+                 logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
+                 raise TokenExpiryException(f'{exc!r}')
+ 
+ 
+ async def RandomUserAgent(wa=None):
+     logme.debug(__name__ + ':RandomUserAgent')
+     try:
+         if wa:
+             return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
+         return UserAgent(verify_ssl=False, use_cache_server=False).random
+     except Exception:
+         return random.choice(user_agent_list)
+ 
+ 
+ async def Username(_id, bearer_token, guest_token):
+     logme.debug(__name__ + ':Username')
+     _dct = {'userId': _id, 'withHighlightedLabel': False}
+     _url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
+     _headers = {
+         'authorization': bearer_token,
+         'x-guest-token': guest_token,
+     }
+     r = await Request(_url, headers=_headers)
+     j_r = loads(r)
+     username = j_r['data']['user']['legacy']['screen_name']
+     return username
+ 
+ 
+ async def Tweet(url, config, conn):
+     logme.debug(__name__ + ':Tweet')
+     try:
+         response = await Request(url)
+         soup = BeautifulSoup(response, "html.parser")
+         tweets = soup.find_all("div", "tweet")
+         await Tweets(tweets, config, conn, url)
+     except Exception as e:
+         logme.critical(__name__ + ':Tweet:' + str(e))
+ 
+ 
+ async def User(username, config, conn, user_id=False):
+     logme.debug(__name__ + ':User')
+     _dct = {'screen_name': username, 'withHighlightedLabel': False}
+     _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}' \
+         .format(dict_to_url(_dct))
+     _headers = {
+         'authorization': config.Bearer_token,
+         'x-guest-token': config.Guest_token,
+     }
+     try:
+         response = await Request(_url, headers=_headers)
+         j_r = loads(response)
+         if user_id:
+             try:
+                 _id = j_r['data']['user']['rest_id']
+                 return _id
+             except KeyError as e:
+                 logme.critical(__name__ + ':User:' + str(e))
+                 return
+         await Users(j_r, config, conn)
+     except Exception as e:
+         logme.critical(__name__ + ':User:' + str(e))
+         raise
+ 
+ 
+ def Limit(Limit, count):
+     logme.debug(__name__ + ':Limit')
+     if Limit is not None and count >= int(Limit):
+         return True
+ 
+ 
+ async def Multi(feed, config, conn):
+     logme.debug(__name__ + ':Multi')
+     count = 0
+     try:
+         with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
+             loop = asyncio.get_event_loop()
+             futures = []
+             for tweet in feed:
+                 count += 1
+                 # Profile_full is commented out in config.py; default to False if the attribute is absent
+                 if config.Favorites or getattr(config, 'Profile_full', False):
+                     logme.debug(__name__ + ':Multi:Favorites-profileFull')
+                     link = tweet.find("a")["href"]
+                     url = f"https://twitter.com{link}&lang=en"
+                 elif config.User_full:
+                     logme.debug(__name__ + ':Multi:userFull')
+                     username = tweet.find("a")["name"]
+                     url = f"http://twitter.com/{username}?lang=en"
+                 else:
+                     logme.debug(__name__ + ':Multi:else-url')
+                     link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
+                     url = f"https://twitter.com{link}?lang=en"
+ 
+                 if config.User_full:
+                     logme.debug(__name__ + ':Multi:user-full-Run')
+                     futures.append(loop.run_in_executor(executor, await User(url, config, conn)))
+                 else:
+                     logme.debug(__name__ + ':Multi:notUser-full-Run')
+                     futures.append(loop.run_in_executor(executor, await Tweet(url, config, conn)))
+             logme.debug(__name__ + ':Multi:asyncioGather')
+             await asyncio.gather(*futures)
+     except Exception as e:
+         # TODO: fix error not error
+         # print(str(e) + " [x] get.Multi")
+         # will return "'NoneType' object is not callable"
+         # but still works
+         # logme.critical(__name__+':Multi:' + str(e))
+         pass
+ 
+     return count
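+ 
+ # Illustrative (a sketch): dict_to_url({'userId': '123'}) returns
+ # '%7B%22userId%22%3A%20%22123%22%7D', i.e. the JSON string percent-encoded
+ # for use as a query-string parameter.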
twitter-scraper/twint-master/twint/output.py ADDED
@@ -0,0 +1,241 @@
+ from datetime import datetime
+ 
+ from . import format, get
+ from .tweet import Tweet
+ from .user import User
+ from .storage import db, elasticsearch, write, panda
+ 
+ import logging as logme
+ 
+ follows_list = []
+ tweets_list = []
+ users_list = []
+ 
+ author_list = set()
+ 
+ # used by Pandas
+ _follows_object = {}
+ 
+ 
+ def _formatDateTime(datetimestamp):
+     try:
+         return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
+     except ValueError:
+         return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
+ 
+ 
+ def _clean_follow_list():
+     logme.debug(__name__ + ':clean_follow_list')
+     global _follows_object
+     _follows_object = {}
+ 
+ 
+ def clean_lists():
+     logme.debug(__name__ + ':clean_lists')
+     global follows_list
+     global tweets_list
+     global users_list
+     follows_list = []
+     tweets_list = []
+     users_list = []
+ 
+ 
+ def datecheck(datetimestamp, config):
+     logme.debug(__name__ + ':datecheck')
+     if config.Since:
+         logme.debug(__name__ + ':datecheck:SinceTrue')
+         d = _formatDateTime(datetimestamp)
+         s = _formatDateTime(config.Since)
+         if d < s:
+             return False
+     if config.Until:
+         logme.debug(__name__ + ':datecheck:UntilTrue')
+         d = _formatDateTime(datetimestamp)
+         s = _formatDateTime(config.Until)
+         if d > s:
+             return False
+     logme.debug(__name__ + ':datecheck:dateRangeFalse')
+     return True
+ 
+ 
+ # TODO: in this method we need to delete the quoted tweets, because Twitter also sends the quoted tweets in the
+ # `tweets` list along with the other tweets
+ def is_tweet(tw):
+     try:
+         tw["data-item-id"]
+         logme.debug(__name__ + ':is_tweet:True')
+         return True
+     except Exception:
+         logme.critical(__name__ + ':is_tweet:False')
+         return False
+ 
+ 
+ def _output(obj, output, config, **extra):
+     logme.debug(__name__ + ':_output')
+     if config.Lowercase:
+         if isinstance(obj, str):
+             logme.debug(__name__ + ':_output:Lowercase:username')
+             obj = obj.lower()
+         elif obj.__class__.__name__ == "user":
+             logme.debug(__name__ + ':_output:Lowercase:user')
+             pass
+         elif obj.__class__.__name__ == "tweet":
+             logme.debug(__name__ + ':_output:Lowercase:tweet')
+             obj.username = obj.username.lower()
+             author_list.update({obj.username})
+             for dct in obj.mentions:
+                 for key, val in dct.items():
+                     dct[key] = val.lower()
+             for i in range(len(obj.hashtags)):
+                 obj.hashtags[i] = obj.hashtags[i].lower()
+             for i in range(len(obj.cashtags)):
+                 obj.cashtags[i] = obj.cashtags[i].lower()
+         else:
+             logme.info('_output:Lowercase:hiddenTweetFound')
+             print("[x] Hidden tweet found, account suspended due to violation of TOS")
+             return
+     if config.Output is not None:
+         if config.Store_csv:
+             try:
+                 write.Csv(obj, config)
+                 logme.debug(__name__ + ':_output:CSV')
+             except Exception as e:
+                 logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
+                 print(str(e) + " [x] output._output")
+         elif config.Store_json:
+             write.Json(obj, config)
+             logme.debug(__name__ + ':_output:JSON')
+         else:
+             write.Text(output, config.Output)
+             logme.debug(__name__ + ':_output:Text')
+ 
+     if config.Elasticsearch:
+         logme.debug(__name__ + ':_output:Elasticsearch')
+         print("", end=".", flush=True)
+     else:
+         if not config.Hide_output:
+             try:
+                 print(output.replace('\n', ' '))
+             except UnicodeEncodeError:
+                 logme.critical(__name__ + ':_output:UnicodeEncodeError')
+                 print("unicode error [x] output._output")
+ 
+ 
+ async def checkData(tweet, config, conn):
+     logme.debug(__name__ + ':checkData')
+     tweet = Tweet(tweet, config)
+     if not tweet.datestamp:
+         logme.critical(__name__ + ':checkData:hiddenTweetFound')
+         print("[x] Hidden tweet found, account suspended due to violation of TOS")
+         return
+     if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
+         output = format.Tweet(config, tweet)
+         if config.Database:
+             logme.debug(__name__ + ':checkData:Database')
+             db.tweets(conn, tweet, config)
+         if config.Pandas:
+             logme.debug(__name__ + ':checkData:Pandas')
+             panda.update(tweet, config)
+         if config.Store_object:
+             logme.debug(__name__ + ':checkData:Store_object')
+             if hasattr(config.Store_object_tweets_list, 'append'):
+                 config.Store_object_tweets_list.append(tweet)
+             else:
+                 tweets_list.append(tweet)
+         if config.Elasticsearch:
+             logme.debug(__name__ + ':checkData:Elasticsearch')
+             elasticsearch.Tweet(tweet, config)
+         _output(tweet, output, config)
+     # else:
+     #     logme.critical(__name__+':checkData:copyrightedTweet')
+ 
+ 
+ async def Tweets(tweets, config, conn, url=''):
+     # `url` is accepted (and currently unused) so the call in get.Tweet matches this signature
+     logme.debug(__name__ + ':Tweets')
+     if config.Favorites or config.Location:
+         logme.debug(__name__ + ':Tweets:fav+full+loc')
+         for tw in tweets:
+             await checkData(tw, config, conn)
+     elif config.TwitterSearch or config.Profile:
+         logme.debug(__name__ + ':Tweets:TwitterSearch')
+         await checkData(tweets, config, conn)
+     else:
+         logme.debug(__name__ + ':Tweets:else')
+         if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
+             await checkData(tweets, config, conn)
+ 
+ 
+ async def Users(u, config, conn):
+     logme.debug(__name__ + ':User')
+     global users_list
+ 
+     user = User(u)
+     output = format.User(config.Format, user)
+ 
+     if config.Database:
+         logme.debug(__name__ + ':User:Database')
+         db.user(conn, config, user)
+ 
+     if config.Elasticsearch:
+         logme.debug(__name__ + ':User:Elasticsearch')
+         _save_date = user.join_date
+         _save_time = user.join_time
+         user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
+         user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
+         elasticsearch.UserProfile(user, config)
+         user.join_date = _save_date
+         user.join_time = _save_time
+ 
+     if config.Store_object:
+         logme.debug(__name__ + ':User:Store_object')
+         if hasattr(config.Store_object_follow_list, 'append'):
+             config.Store_object_follow_list.append(user)
+         elif hasattr(config.Store_object_users_list, 'append'):
+             config.Store_object_users_list.append(user)
+         else:
+             users_list.append(user)  # twint.user.user
+ 
+     if config.Pandas:
+         logme.debug(__name__ + ':User:Pandas+user')
+         panda.update(user, config)
+ 
+     _output(user, output, config)
+ 
+ 
+ async def Username(username, config, conn):
+     logme.debug(__name__ + ':Username')
+     global _follows_object
+     global follows_list
+     follow_var = config.Following * "following" + config.Followers * "followers"
+ 
+     if config.Database:
+         logme.debug(__name__ + ':Username:Database')
+         db.follow(conn, config.Username, config.Followers, username)
+ 
+     if config.Elasticsearch:
+         logme.debug(__name__ + ':Username:Elasticsearch')
+         elasticsearch.Follow(username, config)
+ 
+     if config.Store_object:
+         if hasattr(config.Store_object_follow_list, 'append'):
+             config.Store_object_follow_list.append(username)
+         else:
+             follows_list.append(username)  # twint.user.user
+ 
+     if config.Pandas:
+         logme.debug(__name__ + ':Username:object+pandas')
+         try:
+             _ = _follows_object[config.Username][follow_var]
+         except KeyError:
+             _follows_object.update({config.Username: {follow_var: []}})
+         _follows_object[config.Username][follow_var].append(username)
+         if config.Pandas_au:
+             logme.debug(__name__ + ':Username:object+pandas+au')
+             panda.update(_follows_object[config.Username], config)
+     _output(username, username, config)
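+ 
+ # Illustrative (a sketch): with config.Store_object = True and
+ # config.Store_object_tweets_list = [], checkData() appends each scraped tweet
+ # to that list instead of the module-level tweets_list.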
twitter-scraper/twint-master/twint/run.py ADDED
@@ -0,0 +1,412 @@
+ import sys
+ import os
+ import datetime
+ from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
+ 
+ from . import datelock, feed, get, output, verbose, storage
+ from .token import TokenExpiryException
+ from . import token
+ from .storage import db
+ from .feed import NoMoreTweetsException
+ 
+ import logging as logme
+ 
+ import time
+ 
+ bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
+          '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
+ 
+ 
+ class Twint:
+     def __init__(self, config):
+         logme.debug(__name__ + ':Twint:__init__')
+         if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
+             logme.debug(__name__ + ':Twint:__init__:Resume')
+             self.init = self.get_resume(config.Resume)
+         else:
+             self.init = -1
+ 
+         config.deleted = []
+         self.feed: list = [-1]
+         self.count = 0
+         self.user_agent = ""
+         self.config = config
+         self.config.Bearer_token = bearer
+         # TODO: might have to make some adjustments for it to work with multi-threading
+         # USAGE: to get a new guest token simply do `self.token.refresh()`
+         self.token = token.Token(config)
+         self.token.refresh()
+         self.conn = db.Conn(config.Database)
+         self.d = datelock.Set(self.config.Until, self.config.Since)
+         verbose.Elastic(config.Elasticsearch)
+ 
+         if self.config.Store_object:
+             logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
+             output._clean_follow_list()
+ 
+         if self.config.Pandas_clean:
+             logme.debug(__name__ + ':Twint:__init__:pandas_clean')
+             storage.panda.clean()
+ 
+     def get_resume(self, resumeFile):
+         if not os.path.exists(resumeFile):
+             return '-1'
+         with open(resumeFile, 'r') as rFile:
+             _init = rFile.readlines()[-1].strip('\n')
+             return _init
+ 
+     async def Feed(self):
+         logme.debug(__name__ + ':Twint:Feed')
+         consecutive_errors_count = 0
+         while True:
+             # this will receive a JSON string, parse it into a `dict` and do the required stuff
+             try:
+                 response = await get.RequestUrl(self.config, self.init)
+             except TokenExpiryException as e:
+                 logme.debug(__name__ + 'Twint:Feed:' + str(e))
+                 self.token.refresh()
+                 response = await get.RequestUrl(self.config, self.init)
+ 
+             if self.config.Debug:
+                 print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
+ 
+             self.feed = []
+             try:
+                 if self.config.Favorites:
+                     self.feed, self.init = feed.MobileFav(response)
+                     favorite_err_cnt = 0
+                     if len(self.feed) == 0 and len(self.init) == 0:
+                         while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
+                             self.user_agent = await get.RandomUserAgent(wa=False)
+                             response = await get.RequestUrl(self.config, self.init,
+                                                             headers=[("User-Agent", self.user_agent)])
+                             self.feed, self.init = feed.MobileFav(response)
+                             favorite_err_cnt += 1
+                             time.sleep(1)
+                         if favorite_err_cnt == 5:
+                             print("Favorite page could not be fetched")
+                     if not self.count % 40:
+                         time.sleep(5)
+                 elif self.config.Followers or self.config.Following:
+                     self.feed, self.init = feed.Follow(response)
+                     if not self.count % 40:
+                         time.sleep(5)
+                 elif self.config.Profile or self.config.TwitterSearch:
+                     try:
+                         self.feed, self.init = feed.parse_tweets(self.config, response)
+                     except NoMoreTweetsException as e:
+                         logme.debug(__name__ + ':Twint:Feed:' + str(e))
+                         print('[!] ' + str(e) + ' Scraping will stop now.')
+                         print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
+                         break
+                 break
+             except TimeoutError as e:
+                 if self.config.Proxy_host.lower() == "tor":
+                     print("[?] Timed out, changing Tor identity...")
+                     if self.config.Tor_control_password is None:
+                         logme.critical(__name__ + ':Twint:Feed:tor-password')
+                         sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
+                         sys.stderr.write(
+                             "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
+                             "-controller-interface-directly\r\n")
+                         break
+                     else:
+                         get.ForceNewTorIdentity(self.config)
+                         continue
+                 else:
+                     logme.critical(__name__ + ':Twint:Feed:' + str(e))
+                     print(str(e))
+                     break
+             except Exception as e:
+                 if self.config.Profile or self.config.Favorites:
+                     print("[!] Twitter does not return more data, scrape stops here.")
+                     break
+ 
+                 logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
+                 # Sometimes Twitter says there is no data. But it's a lie.
+                 # raise
+                 consecutive_errors_count += 1
+                 if consecutive_errors_count < self.config.Retries_count:
+                     # skip to the next iteration if the wait time does not satisfy the limit constraints
+                     delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)
+ 
+                     # if the delay is less than the user-set minimum wait time, replace the delay
+                     if self.config.Min_wait_time > delay:
+                         delay = self.config.Min_wait_time
+ 
+                     sys.stderr.write('sleeping for {} secs\n'.format(delay))
+                     time.sleep(delay)
+                     self.user_agent = await get.RandomUserAgent(wa=True)
+                     continue
+                 logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
+                 sys.stderr.write(str(e) + " [x] run.Feed")
+                 sys.stderr.write(
+                     "[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
+                     "we will investigate it!")
+                 break
+         if self.config.Resume:
+             print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
+ 
+     async def follow(self):
+         await self.Feed()
+         if self.config.User_full:
+             logme.debug(__name__ + ':Twint:follow:userFull')
+             self.count += await get.Multi(self.feed, self.config, self.conn)
+         else:
+             logme.debug(__name__ + ':Twint:follow:notUserFull')
+             for user in self.feed:
+                 self.count += 1
+                 username = user.find("a")["name"]
+                 await output.Username(username, self.config, self.conn)
+ 
+     async def favorite(self):
+         logme.debug(__name__ + ':Twint:favorite')
+         await self.Feed()
+         favorited_tweets_list = []
+         for tweet in self.feed:
+             tweet_dict = {}
+             self.count += 1
+             try:
+                 tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
+                 t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
+                 tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
+                 tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ', '')
+                 tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
+                 date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
+                 # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
+                 # date_str = test_dates[3]
+                 if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"):  # 25m 1h
+                     dateu = str(datetime.date.today())
+                     tweet_dict['date'] = dateu
+                 elif ',' in date_str:  # Aug 21, 2019
+                     sp = date_str.replace(',', '').split(' ')
+                     date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
+                     dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
+                     tweet_dict['date'] = dateu
+                 elif len(date_str.split(' ')) == 3:  # 28 Jun 19
+                     sp = date_str.split(' ')
+                     if len(sp[2]) == 2:
+                         sp[2] = '20' + sp[2]
+                     date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
+                     dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
+                     tweet_dict['date'] = dateu
+                 else:  # Aug 21
+                     sp = date_str.split(' ')
+                     date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
+                     dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
+                     tweet_dict['date'] = dateu
+ 
+                 favorited_tweets_list.append(tweet_dict)
+ 
+             except Exception as e:
+                 logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
+                 print("Failed to parse a favorited tweet: " + str(e))
+ 
+         try:
+             self.config.favorited_tweets_list += favorited_tweets_list
+         except AttributeError:
+             self.config.favorited_tweets_list = favorited_tweets_list
+ 
+     async def profile(self):
+         await self.Feed()
+         logme.debug(__name__ + ':Twint:profile')
+         for tweet in self.feed:
+             self.count += 1
+             await output.Tweets(tweet, self.config, self.conn)
+ 
+     async def tweets(self):
+         await self.Feed()
+         # TODO: need to take care of this later
+         if self.config.Location:
+             logme.debug(__name__ + ':Twint:tweets:location')
+             self.count += await get.Multi(self.feed, self.config, self.conn)
+         else:
+             logme.debug(__name__ + ':Twint:tweets:notLocation')
+             for tweet in self.feed:
+                 self.count += 1
+                 await output.Tweets(tweet, self.config, self.conn)
+ 
+     async def main(self, callback=None):
+         task = ensure_future(self.run())  # Might be changed to create_task in 3.7+.
+ 
+         if callback:
+             task.add_done_callback(callback)
+ 
+         await task
+ 
+     async def run(self):
+         if self.config.TwitterSearch:
+             self.user_agent = await get.RandomUserAgent(wa=True)
+         else:
+             self.user_agent = await get.RandomUserAgent()
+ 
+         if self.config.User_id is not None and self.config.Username is None:
+             logme.debug(__name__ + ':Twint:main:user_id')
+             self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
+                                                       self.config.Guest_token)
+ 
+         if self.config.Username is not None and self.config.User_id is None:
+             logme.debug(__name__ + ':Twint:main:username')
+             self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
+             if self.config.User_id is None:
+                 raise ValueError("Cannot find twitter account with name = " + self.config.Username)
+ 
+         # TODO: will need to modify it to work with the new endpoints
+         if self.config.TwitterSearch and self.config.Since and self.config.Until:
+             logme.debug(__name__ + ':Twint:main:search+since+until')
+             while self.d.since < self.d.until:
+                 self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
+                 self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
+                 if len(self.feed) > 0:
+                     await self.tweets()
+                 else:
+                     logme.debug(__name__ + ':Twint:main:gettingNewTweets')
+                     break
+ 
+                 if get.Limit(self.config.Limit, self.count):
+                     break
+         elif self.config.Lookup:
+             await self.Lookup()
+         else:
+             logme.debug(__name__ + ':Twint:main:not-search+since+until')
+             while True:
+                 if len(self.feed) > 0:
+                     if self.config.Followers or self.config.Following:
+                         logme.debug(__name__ + ':Twint:main:follow')
+                         await self.follow()
+                     elif self.config.Favorites:
+                         logme.debug(__name__ + ':Twint:main:favorites')
+                         await self.favorite()
+                     elif self.config.Profile:
+                         logme.debug(__name__ + ':Twint:main:profile')
+                         await self.profile()
+                     elif self.config.TwitterSearch:
+                         logme.debug(__name__ + ':Twint:main:twitter-search')
+                         await self.tweets()
+                 else:
+                     logme.debug(__name__ + ':Twint:main:no-more-tweets')
+                     break
+ 
+                 # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
+                 if get.Limit(self.config.Limit, self.count):
+                     logme.debug(__name__ + ':Twint:main:reachedLimit')
+                     break
+ 
+         if self.config.Count:
+             verbose.Count(self.count, self.config)
+ 
+     async def Lookup(self):
+         logme.debug(__name__ + ':Twint:Lookup')
+ 
+         try:
+             if self.config.User_id is not None and self.config.Username is None:
+                 logme.debug(__name__ + ':Twint:Lookup:user_id')
+                 self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
+                                                           self.config.Guest_token)
+             await get.User(self.config.Username, self.config, db.Conn(self.config.Database))
+ 
+         except Exception as e:
+             logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
+             raise
+ 
+ 
+ def run(config, callback=None):
+     logme.debug(__name__ + ':run')
+     try:
+         get_event_loop()
+     except RuntimeError as e:
+         if "no current event loop" in str(e):
+             set_event_loop(new_event_loop())
+         else:
+             logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
+             raise
+     except Exception as e:
+         logme.exception(
+             __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
+         raise
+ 
+     get_event_loop().run_until_complete(Twint(config).main(callback))
+ 
+ 
+ def Favorites(config):
+     logme.debug(__name__ + ':Favorites')
+     config.Favorites = True
+     config.Following = False
+     config.Followers = False
+     config.Profile = False
+     config.TwitterSearch = False
+     run(config)
+     if config.Pandas_au:
+         storage.panda._autoget("tweet")
+ 
+ 
+ def Followers(config):
+     logme.debug(__name__ + ':Followers')
+     config.Followers = True
+     config.Following = False
+     config.Profile = False
+     config.Favorites = False
+     config.TwitterSearch = False
+     run(config)
+     if config.Pandas_au:
+         storage.panda._autoget("followers")
+         if config.User_full:
+             storage.panda._autoget("user")
+     if config.Pandas_clean and not config.Store_object:
+         # storage.panda.clean()
+         output._clean_follow_list()
+ 
+ 
+ def Following(config):
+     logme.debug(__name__ + ':Following')
+     config.Following = True
+     config.Followers = False
+     config.Profile = False
+     config.Favorites = False
+     config.TwitterSearch = False
+     run(config)
+     if config.Pandas_au:
+         storage.panda._autoget("following")
+         if config.User_full:
+             storage.panda._autoget("user")
+     if config.Pandas_clean and not config.Store_object:
+         # storage.panda.clean()
+         output._clean_follow_list()
+ 
+ 
+ def Lookup(config):
+     logme.debug(__name__ + ':Lookup')
+     config.Profile = False
+     config.Lookup = True
+     config.Favorites = False
+     config.Following = False
+     config.Followers = False
+     config.TwitterSearch = False
+     run(config)
+     if config.Pandas_au:
+         storage.panda._autoget("user")
+ 
+ 
+ def Profile(config):
+     logme.debug(__name__ + ':Profile')
+     config.Profile = True
+     config.Favorites = False
+     config.Following = False
+     config.Followers = False
+     config.TwitterSearch = False
+     run(config)
+     if config.Pandas_au:
+         storage.panda._autoget("tweet")
+ 
+ 
+ def Search(config, callback=None):
+     logme.debug(__name__ + ':Search')
+     config.TwitterSearch = True
+     config.Favorites = False
+     config.Following = False
+     config.Followers = False
+     config.Profile = False
+     run(config, callback)
+     if config.Pandas_au:
+         storage.panda._autoget("tweet")
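+ 
+ # Illustrative (a sketch): Search() accepts an optional callback that is attached
+ # via task.add_done_callback and receives the finished task, e.g.
+ #     twint.run.Search(c, callback=lambda task: print("scrape finished"))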
twitter-scraper/twint-master/twint/storage/__init__.py ADDED
File without changes
twitter-scraper/twint-master/twint/storage/db.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ import sqlite3
+ import sys
+ import time
+ import hashlib
+
+ from datetime import datetime
+
+ def Conn(database):
+     if database:
+         print("[+] Inserting into Database: " + str(database))
+         conn = init(database)
+         if isinstance(conn, str):  # error
+             print(conn)
+             sys.exit(1)
+     else:
+         conn = ""
+
+     return conn
+
+ def init(db):
+     try:
+         conn = sqlite3.connect(db)
+         cursor = conn.cursor()
+
+         table_users = """
+             CREATE TABLE IF NOT EXISTS
+                 users(
+                     id integer not null,
+                     id_str text not null,
+                     name text,
+                     username text not null,
+                     bio text,
+                     location text,
+                     url text,
+                     join_date text not null,
+                     join_time text not null,
+                     tweets integer,
+                     following integer,
+                     followers integer,
+                     likes integer,
+                     media integer,
+                     private integer not null,
+                     verified integer not null,
+                     profile_image_url text not null,
+                     background_image text,
+                     hex_dig text not null,
+                     time_update integer not null,
+                     CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
+                 );
+         """
+         cursor.execute(table_users)
+
+         table_tweets = """
+             CREATE TABLE IF NOT EXISTS
+                 tweets (
+                     id integer not null,
+                     id_str text not null,
+                     tweet text default '',
+                     language text default '',
+                     conversation_id text not null,
+                     created_at integer not null,
+                     date text not null,
+                     time text not null,
+                     timezone text not null,
+                     place text default '',
+                     replies_count integer,
+                     likes_count integer,
+                     retweets_count integer,
+                     user_id integer not null,
+                     user_id_str text not null,
+                     screen_name text not null,
+                     name text default '',
+                     link text,
+                     mentions text,
+                     hashtags text,
+                     cashtags text,
+                     urls text,
+                     photos text,
+                     thumbnail text,
+                     quote_url text,
+                     video integer,
+                     geo text,
+                     near text,
+                     source text,
+                     time_update integer not null,
+                     `translate` text default '',
+                     trans_src text default '',
+                     trans_dest text default '',
+                     PRIMARY KEY (id)
+                 );
+         """
+         cursor.execute(table_tweets)
+
+         table_retweets = """
+             CREATE TABLE IF NOT EXISTS
+                 retweets(
+                     user_id integer not null,
+                     username text not null,
+                     tweet_id integer not null,
+                     retweet_id integer not null,
+                     retweet_date integer,
+                     CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id),
+                     CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id),
+                     CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id)
+                 );
+         """
+         cursor.execute(table_retweets)
+
+         table_reply_to = """
+             CREATE TABLE IF NOT EXISTS
+                 replies(
+                     tweet_id integer not null,
+                     user_id integer not null,
+                     username text not null,
+                     CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
+                     CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
+                 );
+         """
+         cursor.execute(table_reply_to)
+
+         table_favorites = """
+             CREATE TABLE IF NOT EXISTS
+                 favorites(
+                     user_id integer not null,
+                     tweet_id integer not null,
+                     CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
+                     CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
+                     CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
+                 );
+         """
+         cursor.execute(table_favorites)
+
+         table_followers = """
+             CREATE TABLE IF NOT EXISTS
+                 followers (
+                     id integer not null,
+                     follower_id integer not null,
+                     CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
+                     CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
+                     CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id)
+                 );
+         """
+         cursor.execute(table_followers)
+
+         table_following = """
+             CREATE TABLE IF NOT EXISTS
+                 following (
+                     id integer not null,
+                     following_id integer not null,
+                     CONSTRAINT following_pk PRIMARY KEY (id, following_id),
+                     CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
+                     CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id)
+                 );
+         """
+         cursor.execute(table_following)
+
+         table_followers_names = """
+             CREATE TABLE IF NOT EXISTS
+                 followers_names (
+                     user text not null,
+                     time_update integer not null,
+                     follower text not null,
+                     PRIMARY KEY (user, follower)
+                 );
+         """
+         cursor.execute(table_followers_names)
+
+         table_following_names = """
+             CREATE TABLE IF NOT EXISTS
+                 following_names (
+                     user text not null,
+                     time_update integer not null,
+                     follows text not null,
+                     PRIMARY KEY (user, follows)
+                 );
+         """
+         cursor.execute(table_following_names)
+
+         return conn
+     except Exception as e:
+         return str(e)
+
+ def fTable(Followers):
+     if Followers:
+         table = "followers_names"
+     else:
+         table = "following_names"
+
+     return table
+
+ def uTable(Followers):
+     if Followers:
+         table = "followers"
+     else:
+         table = "following"
+
+     return table
+
+ def follow(conn, Username, Followers, User):
+     try:
+         time_ms = round(time.time()*1000)
+         cursor = conn.cursor()
+         entry = (User, time_ms, Username,)
+         table = fTable(Followers)
+         query = f"INSERT INTO {table} VALUES(?,?,?)"
+         cursor.execute(query, entry)
+         conn.commit()
+     except sqlite3.IntegrityError:
+         pass
+
+ def get_hash_id(conn, id):
+     cursor = conn.cursor()
+     cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
+     resultset = cursor.fetchall()
+     return resultset[0][0] if resultset else -1
+
+ def user(conn, config, User):
+     try:
+         time_ms = round(time.time()*1000)
+         cursor = conn.cursor()
+         user = [int(User.id), User.id, User.name, User.username, User.bio,
+                 User.location, User.url, User.join_date, User.join_time,
+                 User.tweets, User.following, User.followers, User.likes,
+                 User.media_count, User.is_private, User.is_verified,
+                 User.avatar, User.background_image]
+
+         hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest()
+         entry = tuple(user) + (hex_dig, time_ms,)
+         old_hash = get_hash_id(conn, User.id)
+
+         if old_hash == -1 or old_hash != hex_dig:
+             query = "INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
+             cursor.execute(query, entry)
+         else:
+             pass
+
+         if config.Followers or config.Following:
+             table = uTable(config.Followers)
+             query = f"INSERT INTO {table} VALUES(?,?)"
+             cursor.execute(query, (config.User_id, int(User.id)))
+
+         conn.commit()
+     except sqlite3.IntegrityError:
+         pass
+
+ def tweets(conn, Tweet, config):
+     try:
+         time_ms = round(time.time()*1000)
+         cursor = conn.cursor()
+         entry = (Tweet.id,
+                  Tweet.id_str,
+                  Tweet.tweet,
+                  Tweet.lang,
+                  Tweet.conversation_id,
+                  Tweet.datetime,
+                  Tweet.datestamp,
+                  Tweet.timestamp,
+                  Tweet.timezone,
+                  Tweet.place,
+                  Tweet.replies_count,
+                  Tweet.likes_count,
+                  Tweet.retweets_count,
+                  Tweet.user_id,
+                  Tweet.user_id_str,
+                  Tweet.username,
+                  Tweet.name,
+                  Tweet.link,
+                  ",".join(Tweet.mentions),
+                  ",".join(Tweet.hashtags),
+                  ",".join(Tweet.cashtags),
+                  ",".join(Tweet.urls),
+                  ",".join(Tweet.photos),
+                  Tweet.thumbnail,
+                  Tweet.quote_url,
+                  Tweet.video,
+                  Tweet.geo,
+                  Tweet.near,
+                  Tweet.source,
+                  time_ms,
+                  Tweet.translate,
+                  Tweet.trans_src,
+                  Tweet.trans_dest)
+         cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
+
+         if config.Favorites:
+             query = 'INSERT INTO favorites VALUES(?,?)'
+             cursor.execute(query, (config.User_id, Tweet.id))
+
+         if Tweet.retweet:
+             query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
+             _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
+             cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))
+
+         if Tweet.reply_to:
+             for reply in Tweet.reply_to:
+                 query = 'INSERT INTO replies VALUES(?,?,?)'
+                 cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))
+
+         conn.commit()
+     except sqlite3.IntegrityError:
+         pass
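A minimal usage sketch for this module (not part of the diff; the database file name is hypothetical): Conn() creates the schema on first run, and follow() records one edge in followers_names following the (conn, Username, Followers, User) signature above.

    import twint.storage.db as db   # assumes the package layout shown in this diff

    conn = db.Conn("twint.db")                  # prints the banner and creates all tables
    db.follow(conn, "jack", True, "somefan")    # inserts a row into followers_names
    print(db.get_hash_id(conn, 12))             # -1 until a matching users row exists
    conn.close()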
twitter-scraper/twint-master/twint/storage/elasticsearch.py ADDED
@@ -0,0 +1,364 @@
+ ## TODO - Fix Weekday situation
+ from elasticsearch import Elasticsearch, helpers
+ from geopy.geocoders import Nominatim
+ from datetime import datetime
+ import contextlib
+ import sys
+
+ _index_tweet_status = False
+ _index_follow_status = False
+ _index_user_status = False
+ _is_near_def = False
+ _is_location_def = False
+ _near = {}
+ _location = {}
+
+ geolocator = Nominatim(user_agent="twint-1.2")
+
+ class RecycleObject(object):
+     def write(self, junk): pass
+     def flush(self): pass
+
+ def getLocation(place, **options):
+     location = geolocator.geocode(place, timeout=1000)
+     if location:
+         if options.get("near"):
+             global _near
+             _near = {"lat": location.latitude, "lon": location.longitude}
+             return True
+         elif options.get("location"):
+             global _location
+             _location = {"lat": location.latitude, "lon": location.longitude}
+             return True
+         return {"lat": location.latitude, "lon": location.longitude}
+     else:
+         return {}
+
+ def handleIndexResponse(response):
+     try:
+         if response["status"] == 400:
+             return True
+     except KeyError:
+         pass
+     if response["acknowledged"]:
+         print("[+] Index \"" + response["index"] + "\" created!")
+     else:
+         print("[x] error index creation :: storage.elasticsearch.handleIndexResponse")
+     if response["shards_acknowledged"]:
+         print("[+] Shards acknowledged, everything is ready to be used!")
+         return True
+     else:
+         print("[x] error with shards :: storage.elasticsearch.handleIndexResponse")
+         return False
+
+ def createIndex(config, instance, **scope):
+     if scope.get("scope") == "tweet":
+         tweets_body = {
+             "mappings": {
+                 "properties": {
+                     "id": {"type": "long"},
+                     "conversation_id": {"type": "long"},
+                     "created_at": {"type": "text"},
+                     "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
+                     "timezone": {"type": "keyword"},
+                     "place": {"type": "keyword"},
+                     "location": {"type": "keyword"},
+                     "tweet": {"type": "text"},
+                     "lang": {"type": "keyword"},
+                     "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
+                     "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
+                     "user_id_str": {"type": "keyword"},
+                     "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
+                     "name": {"type": "text"},
+                     "profile_image_url": {"type": "text"},
+                     "day": {"type": "integer"},
+                     "hour": {"type": "integer"},
+                     "link": {"type": "text"},
+                     "retweet": {"type": "text"},
+                     "essid": {"type": "keyword"},
+                     "nlikes": {"type": "integer"},
+                     "nreplies": {"type": "integer"},
+                     "nretweets": {"type": "integer"},
+                     "quote_url": {"type": "text"},
+                     "video": {"type": "integer"},
+                     "thumbnail": {"type": "text"},
+                     "search": {"type": "text"},
+                     "near": {"type": "text"},
+                     "geo_near": {"type": "geo_point"},
+                     "geo_tweet": {"type": "geo_point"},
+                     "photos": {"type": "text"},
+                     "user_rt_id": {"type": "keyword"},
+                     "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
+                     "source": {"type": "keyword"},
+                     "user_rt": {"type": "keyword"},
+                     "retweet_id": {"type": "keyword"},
+                     "reply_to": {
+                         "type": "nested",
+                         "properties": {
+                             "user_id": {"type": "keyword"},
+                             "username": {"type": "keyword"}
+                         }
+                     },
+                     "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
+                     "urls": {"type": "keyword"},
+                     "translate": {"type": "text"},
+                     "trans_src": {"type": "keyword"},
+                     "trans_dest": {"type": "keyword"},
+                 }
+             },
+             "settings": {
+                 "number_of_shards": 1,
+                 "analysis": {
+                     "normalizer": {
+                         "hashtag_normalizer": {
+                             "type": "custom",
+                             "char_filter": [],
+                             "filter": ["lowercase", "asciifolding"]
+                         }
+                     }
+                 }
+             }
+         }
+         with nostdout():
+             resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
+         return handleIndexResponse(resp)
+     elif scope.get("scope") == "follow":
+         follow_body = {
+             "mappings": {
+                 "properties": {
+                     "user": {"type": "keyword"},
+                     "follow": {"type": "keyword"},
+                     "essid": {"type": "keyword"}
+                 }
+             },
+             "settings": {
+                 "number_of_shards": 1
+             }
+         }
+         with nostdout():
+             resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
+         return handleIndexResponse(resp)
+     elif scope.get("scope") == "user":
+         user_body = {
+             "mappings": {
+                 "properties": {
+                     "id": {"type": "keyword"},
+                     "name": {"type": "keyword"},
+                     "username": {"type": "keyword"},
+                     "bio": {"type": "text"},
+                     "location": {"type": "keyword"},
+                     "url": {"type": "text"},
+                     "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
+                     "tweets": {"type": "integer"},
+                     "following": {"type": "integer"},
+                     "followers": {"type": "integer"},
+                     "likes": {"type": "integer"},
+                     "media": {"type": "integer"},
+                     "private": {"type": "integer"},
+                     "verified": {"type": "integer"},
+                     "avatar": {"type": "text"},
+                     "background_image": {"type": "text"},
+                     "session": {"type": "keyword"},
+                     "geo_user": {"type": "geo_point"}
+                 }
+             },
+             "settings": {
+                 "number_of_shards": 1
+             }
+         }
+         with nostdout():
+             resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
+         return handleIndexResponse(resp)
+     else:
+         print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
+         return False
+
+ @contextlib.contextmanager
+ def nostdout():
+     savestdout = sys.stdout
+     sys.stdout = RecycleObject()
+     yield
+     sys.stdout = savestdout
+
+ def weekday(day):
+     weekdays = {
+         "Monday": 1,
+         "Tuesday": 2,
+         "Wednesday": 3,
+         "Thursday": 4,
+         "Friday": 5,
+         "Saturday": 6,
+         "Sunday": 7,
+     }
+
+     return weekdays[day]
+
+ def Tweet(Tweet, config):
+     global _index_tweet_status
+     global _is_near_def
+     date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")
+
+     actions = []
+
+     try:
+         retweet = Tweet.retweet
+     except AttributeError:
+         retweet = None
+
+     dt = f"{Tweet.datestamp} {Tweet.timestamp}"
+
+     j_data = {
+         "_index": config.Index_tweets,
+         "_id": str(Tweet.id) + "_raw_" + config.Essid,
+         "_source": {
+             "id": str(Tweet.id),
+             "conversation_id": Tweet.conversation_id,
+             "created_at": Tweet.datetime,
+             "date": dt,
+             "timezone": Tweet.timezone,
+             "place": Tweet.place,
+             "tweet": Tweet.tweet,
+             "language": Tweet.lang,
+             "hashtags": Tweet.hashtags,
+             "cashtags": Tweet.cashtags,
+             "user_id_str": Tweet.user_id_str,
+             "username": Tweet.username,
+             "name": Tweet.name,
+             "day": date_obj.weekday(),
+             "hour": date_obj.hour,
+             "link": Tweet.link,
+             "retweet": retweet,
+             "essid": config.Essid,
+             "nlikes": int(Tweet.likes_count),
+             "nreplies": int(Tweet.replies_count),
+             "nretweets": int(Tweet.retweets_count),
+             "quote_url": Tweet.quote_url,
+             "video": Tweet.video,
+             "search": str(config.Search),
+             "near": config.Near
+         }
+     }
+     if retweet is not None:
+         j_data["_source"].update({"user_rt_id": Tweet.user_rt_id})
+         j_data["_source"].update({"user_rt": Tweet.user_rt})
+         j_data["_source"].update({"retweet_id": Tweet.retweet_id})
+         j_data["_source"].update({"retweet_date": Tweet.retweet_date})
+     if Tweet.reply_to:
+         j_data["_source"].update({"reply_to": Tweet.reply_to})
+     if Tweet.photos:
+         _photos = []
+         for photo in Tweet.photos:
+             _photos.append(photo)
+         j_data["_source"].update({"photos": _photos})
+     if Tweet.thumbnail:
+         j_data["_source"].update({"thumbnail": Tweet.thumbnail})
+     if Tweet.mentions:
+         _mentions = []
+         for mention in Tweet.mentions:
+             _mentions.append(mention)
+         j_data["_source"].update({"mentions": _mentions})
+     if Tweet.urls:
+         _urls = []
+         for url in Tweet.urls:
+             _urls.append(url)
+         j_data["_source"].update({"urls": _urls})
+     if config.Near or config.Geo:
+         if not _is_near_def:
+             __geo = ""
+             __near = ""
+             if config.Geo:
+                 __geo = config.Geo
+             if config.Near:
+                 __near = config.Near
+             _is_near_def = getLocation(__near + __geo, near=True)
+     if _near:
+         j_data["_source"].update({"geo_near": _near})
+     if Tweet.place:
+         _t_place = getLocation(Tweet.place)
+         if _t_place:
+             j_data["_source"].update({"geo_tweet": _t_place})
+     if Tweet.source:
+         j_data["_source"].update({"source": Tweet.source})
+     if config.Translate:
+         j_data["_source"].update({"translate": Tweet.translate})
+         j_data["_source"].update({"trans_src": Tweet.trans_src})
+         j_data["_source"].update({"trans_dest": Tweet.trans_dest})
+
+     actions.append(j_data)
+
+     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
+     if not _index_tweet_status:
+         _index_tweet_status = createIndex(config, es, scope="tweet")
+     with nostdout():
+         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
+     actions = []
+
+ def Follow(user, config):
+     global _index_follow_status
+     actions = []
+
+     if config.Following:
+         _user = config.Username
+         _follow = user
+     else:
+         _user = user
+         _follow = config.Username
+     j_data = {
+         "_index": config.Index_follow,
+         "_id": _user + "_" + _follow + "_" + config.Essid,
+         "_source": {
+             "user": _user,
+             "follow": _follow,
+             "essid": config.Essid
+         }
+     }
+     actions.append(j_data)
+
+     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
+     if not _index_follow_status:
+         _index_follow_status = createIndex(config, es, scope="follow")
+     with nostdout():
+         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
+     actions = []
+
+ def UserProfile(user, config):
+     global _index_user_status
+     global _is_location_def
+     actions = []
+
+     j_data = {
+         "_index": config.Index_users,
+         "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
+         "_source": {
+             "id": user.id,
+             "name": user.name,
+             "username": user.username,
+             "bio": user.bio,
+             "location": user.location,
+             "url": user.url,
+             "join_datetime": user.join_date + " " + user.join_time,
+             "tweets": user.tweets,
+             "following": user.following,
+             "followers": user.followers,
+             "likes": user.likes,
+             "media": user.media_count,
+             "private": user.is_private,
+             "verified": user.is_verified,
+             "avatar": user.avatar,
+             "background_image": user.background_image,
+             "session": config.Essid
+         }
+     }
+     if config.Location:
+         if not _is_location_def:
+             _is_location_def = getLocation(user.location, location=True)
+         if _location:
+             j_data["_source"].update({"geo_user": _location})
+     actions.append(j_data)
+
+     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
+     if not _index_user_status:
+         _index_user_status = createIndex(config, es, scope="user")
+     with nostdout():
+         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
+     actions = []
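A hypothetical sketch of driving createIndex() directly, assuming a local node at the URL below and the elasticsearch-py 7.x client this code targets; only the one config attribute the "tweet" branch reads is stubbed.

    from types import SimpleNamespace
    from elasticsearch import Elasticsearch
    import twint.storage.elasticsearch as es_store   # assumed module path

    config = SimpleNamespace(Index_tweets="twint-tweets")
    es = Elasticsearch("http://localhost:9200")      # assumption: ES running locally
    es_store.createIndex(config, es, scope="tweet")  # prints "[+] Index ... created!"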
twitter-scraper/twint-master/twint/storage/panda.py ADDED
@@ -0,0 +1,196 @@
+ import datetime, pandas as pd, warnings
+ from time import strftime, localtime
+ from twint.tweet import Tweet_formats
+
+ Tweets_df = None
+ Follow_df = None
+ User_df = None
+
+ _object_blocks = {
+     "tweet": [],
+     "user": [],
+     "following": [],
+     "followers": []
+ }
+
+ weekdays = {
+     "Monday": 1,
+     "Tuesday": 2,
+     "Wednesday": 3,
+     "Thursday": 4,
+     "Friday": 5,
+     "Saturday": 6,
+     "Sunday": 7,
+ }
+
+ _type = ""
+
+ def _concat(df, _type):
+     if df is None:
+         df = pd.DataFrame(_object_blocks[_type])
+     else:
+         _df = pd.DataFrame(_object_blocks[_type])
+         df = pd.concat([df, _df], sort=True)
+     return df
+
+ def _autoget(_type):
+     global Tweets_df
+     global Follow_df
+     global User_df
+
+     if _type == "tweet":
+         Tweets_df = _concat(Tweets_df, _type)
+     elif _type == "followers" or _type == "following":
+         Follow_df = _concat(Follow_df, _type)
+     elif _type == "user":
+         User_df = _concat(User_df, _type)
+     else:
+         print("[x] Wrong type of object passed")
+
+
+ def update(object, config):
+     global _type
+
+     #try:
+     #    _type = ((object.__class__.__name__ == "tweet")*"tweet" +
+     #             (object.__class__.__name__ == "user")*"user")
+     #except AttributeError:
+     #    _type = config.Following*"following" + config.Followers*"followers"
+     if object.__class__.__name__ == "tweet":
+         _type = "tweet"
+     elif object.__class__.__name__ == "user":
+         _type = "user"
+     elif object.__class__.__name__ == "dict":
+         _type = config.Following*"following" + config.Followers*"followers"
+
+     if _type == "tweet":
+         Tweet = object
+         datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
+         day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
+         dt = f"{object.datestamp} {object.timestamp}"
+         _data = {
+             "id": str(Tweet.id),
+             "conversation_id": Tweet.conversation_id,
+             "created_at": datetime_ms,
+             "date": dt,
+             "timezone": Tweet.timezone,
+             "place": Tweet.place,
+             "tweet": Tweet.tweet,
+             "language": Tweet.lang,
+             "hashtags": Tweet.hashtags,
+             "cashtags": Tweet.cashtags,
+             "user_id": Tweet.user_id,
+             "user_id_str": Tweet.user_id_str,
+             "username": Tweet.username,
+             "name": Tweet.name,
+             "day": day,
+             "hour": strftime("%H", localtime(datetime_ms/1000)),
+             "link": Tweet.link,
+             "urls": Tweet.urls,
+             "photos": Tweet.photos,
+             "video": Tweet.video,
+             "thumbnail": Tweet.thumbnail,
+             "retweet": Tweet.retweet,
+             "nlikes": int(Tweet.likes_count),
+             "nreplies": int(Tweet.replies_count),
+             "nretweets": int(Tweet.retweets_count),
+             "quote_url": Tweet.quote_url,
+             "search": str(config.Search),
+             "near": Tweet.near,
+             "geo": Tweet.geo,
+             "source": Tweet.source,
+             "user_rt_id": Tweet.user_rt_id,
+             "user_rt": Tweet.user_rt,
+             "retweet_id": Tweet.retweet_id,
+             "reply_to": Tweet.reply_to,
+             "retweet_date": Tweet.retweet_date,
+             "translate": Tweet.translate,
+             "trans_src": Tweet.trans_src,
+             "trans_dest": Tweet.trans_dest
+         }
+         _object_blocks[_type].append(_data)
+     elif _type == "user":
+         user = object
+         try:
+             background_image = user.background_image
+         except AttributeError:
+             background_image = ""
+         _data = {
+             "id": user.id,
+             "name": user.name,
+             "username": user.username,
+             "bio": user.bio,
+             "url": user.url,
+             "join_datetime": user.join_date + " " + user.join_time,
+             "join_date": user.join_date,
+             "join_time": user.join_time,
+             "tweets": user.tweets,
+             "location": user.location,
+             "following": user.following,
+             "followers": user.followers,
+             "likes": user.likes,
+             "media": user.media_count,
+             "private": user.is_private,
+             "verified": user.is_verified,
+             "avatar": user.avatar,
+             "background_image": background_image,
+         }
+         _object_blocks[_type].append(_data)
+     elif _type == "followers" or _type == "following":
+         _data = {
+             config.Following*"following" + config.Followers*"followers":
+                 {config.Username: object[_type]}
+         }
+         _object_blocks[_type] = _data
+     else:
+         print("Wrong type of object passed!")
+
+
+ def clean():
+     global Tweets_df
+     global Follow_df
+     global User_df
+     _object_blocks["tweet"].clear()
+     _object_blocks["following"].clear()
+     _object_blocks["followers"].clear()
+     _object_blocks["user"].clear()
+     Tweets_df = None
+     Follow_df = None
+     User_df = None
+
+ def save(_filename, _dataframe, **options):
+     if options.get("dataname"):
+         _dataname = options.get("dataname")
+     else:
+         _dataname = "twint"
+
+     if not options.get("type"):
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             _store = pd.HDFStore(_filename + ".h5")
+             _store[_dataname] = _dataframe
+             _store.close()
+     elif options.get("type") == "Pickle":
+         with warnings.catch_warnings():
+             warnings.simplefilter("ignore")
+             _dataframe.to_pickle(_filename + ".pkl")
+     else:
+         print("""Please specify: filename, DataFrame, DataFrame name and type
+               (HDF5, default, or Pickle)""")
+
+ def read(_filename, **options):
+     if not options.get("dataname"):
+         _dataname = "twint"
+     else:
+         _dataname = options.get("dataname")
+
+     if not options.get("type"):
+         _store = pd.HDFStore(_filename + ".h5")
+         _df = _store[_dataname]
+         return _df
+     elif options.get("type") == "Pickle":
+         _df = pd.read_pickle(_filename + ".pkl")
+         return _df
+     else:
+         print("""Please specify: DataFrame, DataFrame name (twint as default),
+               filename and type (HDF5, default, or Pickle)""")
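A minimal round-trip sketch for save()/read() (file names hypothetical). The default path writes HDF5 under the key "twint" and needs the tables package installed; type="Pickle" switches formats. Importing the module also pulls in twint.tweet, so its googletransx dependency must be present.

    import pandas as pd
    import twint.storage.panda as panda

    df = pd.DataFrame({"tweet": ["hello"], "username": ["jack"]})
    panda.save("tweets_out", df)                  # writes tweets_out.h5
    same = panda.read("tweets_out")               # reads key "twint" back
    panda.save("tweets_out", df, type="Pickle")   # writes tweets_out.pkl instead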
twitter-scraper/twint-master/twint/storage/write.py ADDED
@@ -0,0 +1,77 @@
+ from . import write_meta as meta
+ import csv
+ import json
+ import os
+
+ def outputExt(objType, fType):
+     if objType == "str":
+         objType = "username"
+     outExt = f"/{objType}s.{fType}"
+
+     return outExt
+
+ def addExt(base, objType, fType):
+     if len(base.split('.')) == 1:
+         createDirIfMissing(base)
+         base += outputExt(objType, fType)
+
+     return base
+
+ def Text(entry, f):
+     print(entry.replace('\n', ' '), file=open(f, "a", encoding="utf-8"))
+
+ def Type(config):
+     if config.User_full:
+         _type = "user"
+     elif config.Followers or config.Following:
+         _type = "username"
+     else:
+         _type = "tweet"
+
+     return _type
+
+ def struct(obj, custom, _type):
+     if custom:
+         fieldnames = custom
+         row = {}
+         for f in fieldnames:
+             row[f] = meta.Data(obj, _type)[f]
+     else:
+         fieldnames = meta.Fieldnames(_type)
+         row = meta.Data(obj, _type)
+
+     return fieldnames, row
+
+ def createDirIfMissing(dirname):
+     if not os.path.exists(dirname):
+         os.makedirs(dirname)
+
+ def Csv(obj, config):
+     _obj_type = obj.__class__.__name__
+     if _obj_type == "str":
+         _obj_type = "username"
+     fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)
+
+     base = addExt(config.Output, _obj_type, "csv")
+     dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'
+
+     if not (os.path.exists(base)):
+         with open(base, "w", newline='', encoding="utf-8") as csv_file:
+             writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
+             writer.writeheader()
+
+     with open(base, "a", newline='', encoding="utf-8") as csv_file:
+         writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
+         writer.writerow(row)
+
+ def Json(obj, config):
+     _obj_type = obj.__class__.__name__
+     if _obj_type == "str":
+         _obj_type = "username"
+     _, data = struct(obj, config.Custom[_obj_type], _obj_type)
+
+     base = addExt(config.Output, _obj_type, "json")
+
+     with open(base, "a", newline='', encoding="utf-8") as json_file:
+         json.dump(data, json_file, ensure_ascii=False)
+         json_file.write("\n")
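A hypothetical sketch of Csv() with a stub config (file name and fields are stand-ins): passing a plain string routes through the "username" metadata, and the header is written only when the file does not exist yet.

    from types import SimpleNamespace
    import twint.storage.write as write

    config = SimpleNamespace(Output="out.csv", Custom={"username": None})
    write.Csv("jack", config)   # header on first run, then one row per call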
twitter-scraper/twint-master/twint/storage/write_meta.py ADDED
@@ -0,0 +1,151 @@
+ def tweetData(t):
+     data = {
+         "id": int(t.id),
+         "conversation_id": t.conversation_id,
+         "created_at": t.datetime,
+         "date": t.datestamp,
+         "time": t.timestamp,
+         "timezone": t.timezone,
+         "user_id": t.user_id,
+         "username": t.username,
+         "name": t.name,
+         "place": t.place,
+         "tweet": t.tweet,
+         "language": t.lang,
+         "mentions": t.mentions,
+         "urls": t.urls,
+         "photos": t.photos,
+         "replies_count": int(t.replies_count),
+         "retweets_count": int(t.retweets_count),
+         "likes_count": int(t.likes_count),
+         "hashtags": t.hashtags,
+         "cashtags": t.cashtags,
+         "link": t.link,
+         "retweet": t.retweet,
+         "quote_url": t.quote_url,
+         "video": t.video,
+         "thumbnail": t.thumbnail,
+         "near": t.near,
+         "geo": t.geo,
+         "source": t.source,
+         "user_rt_id": t.user_rt_id,
+         "user_rt": t.user_rt,
+         "retweet_id": t.retweet_id,
+         "reply_to": t.reply_to,
+         "retweet_date": t.retweet_date,
+         "translate": t.translate,
+         "trans_src": t.trans_src,
+         "trans_dest": t.trans_dest,
+     }
+     return data
+
+ def tweetFieldnames():
+     fieldnames = [
+         "id",
+         "conversation_id",
+         "created_at",
+         "date",
+         "time",
+         "timezone",
+         "user_id",
+         "username",
+         "name",
+         "place",
+         "tweet",
+         "language",
+         "mentions",
+         "urls",
+         "photos",
+         "replies_count",
+         "retweets_count",
+         "likes_count",
+         "hashtags",
+         "cashtags",
+         "link",
+         "retweet",
+         "quote_url",
+         "video",
+         "thumbnail",
+         "near",
+         "geo",
+         "source",
+         "user_rt_id",
+         "user_rt",
+         "retweet_id",
+         "reply_to",
+         "retweet_date",
+         "translate",
+         "trans_src",
+         "trans_dest"
+     ]
+     return fieldnames
+
+ def userData(u):
+     data = {
+         "id": int(u.id),
+         "name": u.name,
+         "username": u.username,
+         "bio": u.bio,
+         "location": u.location,
+         "url": u.url,
+         "join_date": u.join_date,
+         "join_time": u.join_time,
+         "tweets": int(u.tweets),
+         "following": int(u.following),
+         "followers": int(u.followers),
+         "likes": int(u.likes),
+         "media": int(u.media_count),
+         "private": u.is_private,
+         "verified": u.is_verified,
+         "profile_image_url": u.avatar,
+         "background_image": u.background_image
+     }
+     return data
+
+ def userFieldnames():
+     fieldnames = [
+         "id",
+         "name",
+         "username",
+         "bio",
+         "location",
+         "url",
+         "join_date",
+         "join_time",
+         "tweets",
+         "following",
+         "followers",
+         "likes",
+         "media",
+         "private",
+         "verified",
+         "profile_image_url",
+         "background_image"
+     ]
+     return fieldnames
+
+ def usernameData(u):
+     return {"username": u}
+
+ def usernameFieldnames():
+     return ["username"]
+
+ def Data(obj, _type):
+     if _type == "user":
+         ret = userData(obj)
+     elif _type == "username":
+         ret = usernameData(obj)
+     else:
+         ret = tweetData(obj)
+
+     return ret
+
+ def Fieldnames(_type):
+     if _type == "user":
+         ret = userFieldnames()
+     elif _type == "username":
+         ret = usernameFieldnames()
+     else:
+         ret = tweetFieldnames()
+
+     return ret
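The Data()/Fieldnames() pair dispatches on the _type string; a quick sketch of the simplest path (the "username" branch, which wraps a plain string):

    import twint.storage.write_meta as meta

    print(meta.Fieldnames("username"))    # ['username']
    print(meta.Data("jack", "username"))  # {'username': 'jack'}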
twitter-scraper/twint-master/twint/token.py ADDED
@@ -0,0 +1,94 @@
+ import re
+ import time
+
+ import requests
+ import logging as logme
+
+
+ class TokenExpiryException(Exception):
+     def __init__(self, msg):
+         super().__init__(msg)
+
+
+ class RefreshTokenException(Exception):
+     def __init__(self, msg):
+         super().__init__(msg)
+
+
+ class Token:
+     def __init__(self, config):
+         self._session = requests.Session()
+         self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
+         self.config = config
+         self._retries = 5
+         self._timeout = 10
+         self.url = 'https://twitter.com'
+
+     def _request(self):
+         for attempt in range(self._retries + 1):
+             # The request is newly prepared on each retry because of potential cookie updates.
+             req = self._session.prepare_request(requests.Request('GET', self.url))
+             logme.debug(f'Retrieving {req.url}')
+             try:
+                 r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
+             except requests.exceptions.RequestException as exc:
+                 if attempt < self._retries:
+                     retrying = ', retrying'
+                     level = logme.WARNING
+                 else:
+                     retrying = ''
+                     level = logme.ERROR
+                 logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
+             else:
+                 success, msg = (True, None)
+                 msg = f': {msg}' if msg else ''
+
+                 if success:
+                     logme.debug(f'{req.url} retrieved successfully{msg}')
+                     return r
+             if attempt < self._retries:
+                 # TODO : might wanna tweak this back-off timer
+                 sleep_time = 2.0 * 2 ** attempt
+                 logme.info(f'Waiting {sleep_time:.0f} seconds')
+                 time.sleep(sleep_time)
+         else:
+             msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
+             logme.fatal(msg)
+             self.config.Guest_token = None
+             raise RefreshTokenException(msg)
+
+     def refresh(self):
+         logme.debug('Retrieving guest token')
+         res = self._request()
+         match = re.search(r'\("gt=(\d+);', res.text)
+         if match:
+             logme.debug('Found guest token in HTML')
+             self.config.Guest_token = str(match.group(1))
+         else:
+             headers = {
+                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
+                 'authority': 'api.twitter.com',
+                 'content-length': '0',
+                 'authorization': self.config.Bearer_token,
+                 'x-twitter-client-language': 'en',
+                 'x-csrf-token': res.cookies.get("ct0"),
+                 'x-twitter-active-user': 'yes',
+                 'content-type': 'application/x-www-form-urlencoded',
+                 'accept': '*/*',
+                 'sec-gpc': '1',
+                 'origin': 'https://twitter.com',
+                 'sec-fetch-site': 'same-site',
+                 'sec-fetch-mode': 'cors',
+                 'sec-fetch-dest': 'empty',
+                 'referer': 'https://twitter.com/',
+                 'accept-language': 'en-US',
+             }
+             self._session.headers.update(headers)
+             req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
+             res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
+             if 'guest_token' in res.json():
+                 logme.debug('Found guest token in JSON')
+                 self.config.Guest_token = res.json()['guest_token']
+             else:
+                 self.config.Guest_token = None
+                 raise RefreshTokenException('Could not find the Guest token in HTML or JSON')
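A usage sketch, assuming network access and a valid bearer token (the value below is a placeholder): refresh() fills config.Guest_token, or raises RefreshTokenException when every attempt fails.

    from types import SimpleNamespace
    import twint.token as token   # assumed module path

    config = SimpleNamespace(Bearer_token="Bearer AAAA...placeholder", Guest_token=None)
    tok = token.Token(config)
    tok.refresh()                 # may raise RefreshTokenException on failure
    print(config.Guest_token)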
twitter-scraper/twint-master/twint/tweet.py ADDED
@@ -0,0 +1,166 @@
+ from time import strftime, localtime
+ from datetime import datetime, timezone
+
+ import logging as logme
+ from googletransx import Translator
+ # ref.
+ # - https://github.com/x0rzkov/py-googletrans#basic-usage
+ translator = Translator()
+
+
+ class tweet:
+     """Define Tweet class
+     """
+     type = "tweet"
+
+     def __init__(self):
+         pass
+
+
+ def utc_to_local(utc_dt):
+     return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None)
+
+
+ Tweet_formats = {
+     'datetime': '%Y-%m-%d %H:%M:%S %Z',
+     'datestamp': '%Y-%m-%d',
+     'timestamp': '%H:%M:%S'
+ }
+
+
+ def _get_mentions(tw):
+     """Extract mentions from tweet
+     """
+     logme.debug(__name__ + ':get_mentions')
+     try:
+         mentions = [
+             {
+                 'screen_name': _mention['screen_name'],
+                 'name': _mention['name'],
+                 'id': _mention['id_str'],
+             } for _mention in tw['entities']['user_mentions']
+             if tw['display_text_range'][0] < _mention['indices'][0]
+         ]
+     except KeyError:
+         mentions = []
+     return mentions
+
+
+ def _get_reply_to(tw):
+     try:
+         reply_to = [
+             {
+                 'screen_name': _mention['screen_name'],
+                 'name': _mention['name'],
+                 'id': _mention['id_str'],
+             } for _mention in tw['entities']['user_mentions']
+             if tw['display_text_range'][0] > _mention['indices'][1]
+         ]
+     except KeyError:
+         reply_to = []
+     return reply_to
+
+
+ def getText(tw):
+     """Replace some text
+     """
+     logme.debug(__name__ + ':getText')
+     text = tw['full_text']
+     text = text.replace("http", " http")
+     text = text.replace("pic.twitter", " pic.twitter")
+     text = text.replace("\n", " ")
+
+     return text
+
+
+ def Tweet(tw, config):
+     """Create Tweet object
+     """
+     logme.debug(__name__ + ':Tweet')
+     t = tweet()
+     t.id = int(tw['id_str'])
+     t.id_str = tw["id_str"]
+     t.conversation_id = tw["conversation_id_str"]
+
+     # parsing date to user-friendly format
+     _dt = tw['created_at']
+     _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
+     _dt = utc_to_local(_dt)
+     t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
+     # date is of the format year-month-day
+     t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
+     t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
+     t.user_id = int(tw["user_id_str"])
+     t.user_id_str = tw["user_id_str"]
+     t.username = tw["user_data"]['screen_name']
+     t.name = tw["user_data"]['name']
+     t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
+     t.timezone = strftime("%z", localtime())
+     t.mentions = _get_mentions(tw)
+     t.reply_to = _get_reply_to(tw)
+     try:
+         t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
+     except KeyError:
+         t.urls = []
+     try:
+         t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
+                     _img['expanded_url'].find('/photo/') != -1]
+     except KeyError:
+         t.photos = []
+     try:
+         t.video = 1 if len(tw['extended_entities']['media']) else 0
+     except KeyError:
+         t.video = 0
+     try:
+         t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
+     except KeyError:
+         t.thumbnail = ''
+     t.tweet = getText(tw)
+     t.lang = tw['lang']
+     try:
+         t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
+     except KeyError:
+         t.hashtags = []
+     try:
+         t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
+     except KeyError:
+         t.cashtags = []
+     t.replies_count = tw['reply_count']
+     t.retweets_count = tw['retweet_count']
+     t.likes_count = tw['favorite_count']
+     t.link = f"https://twitter.com/{t.username}/status/{t.id}"
+     try:
+         if 'user_rt_id' in tw['retweet_data']:
+             t.retweet = True
+             t.retweet_id = tw['retweet_data']['retweet_id']
+             t.retweet_date = tw['retweet_data']['retweet_date']
+             t.user_rt = tw['retweet_data']['user_rt']
+             t.user_rt_id = tw['retweet_data']['user_rt_id']
+     except KeyError:
+         t.retweet = False
+         t.retweet_id = ''
+         t.retweet_date = ''
+         t.user_rt = ''
+         t.user_rt_id = ''
+     try:
+         t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
+     except KeyError:
+         # means that the quoted tweet has been deleted
+         t.quote_url = 0
+     t.near = config.Near if config.Near else ""
+     t.geo = config.Geo if config.Geo else ""
+     t.source = config.Source if config.Source else ""
+     t.translate = ''
+     t.trans_src = ''
+     t.trans_dest = ''
+     if config.Translate:
+         try:
+             ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
+             t.translate = ts.text
+             t.trans_src = ts.src
+             t.trans_dest = ts.dest
+         # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
+         except ValueError as e:
+             logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
+             raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
+     return t
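A small sketch of the text cleanup getText() performs on a raw API dict (stub input below; importing the module needs googletransx installed, since the translator is built at import time):

    from twint.tweet import getText

    tw = {'full_text': 'line one\nline two pic.twitter.com/abc'}
    print(getText(tw))   # newlines become spaces and media links get padded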
twitter-scraper/twint-master/twint/url.py ADDED
@@ -0,0 +1,195 @@
+ import datetime
+ import json
+ from sys import platform
+ import logging as logme
+ from urllib.parse import urlencode
+ from urllib.parse import quote
+
+ mobile = "https://mobile.twitter.com"
+ base = "https://api.twitter.com/2/search/adaptive.json"
+
+
+ def _sanitizeQuery(_url, params):
+     _serialQuery = ""
+     _serialQuery = urlencode(params, quote_via=quote)
+     _serialQuery = _url + "?" + _serialQuery
+     return _serialQuery
+
+
+ def _formatDate(date):
+     if "win" in platform:
+         return f'\"{date.split()[0]}\"'
+     try:
+         return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
+     except ValueError:
+         return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
+
+
+ async def Favorites(username, init):
+     logme.debug(__name__ + ':Favorites')
+     url = f"{mobile}/{username}/favorites?lang=en"
+
+     if init != '-1':
+         url += f"&max_id={init}"
+
+     return url
+
+
+ async def Followers(username, init):
+     logme.debug(__name__ + ':Followers')
+     url = f"{mobile}/{username}/followers?lang=en"
+
+     if init != '-1':
+         url += f"&cursor={init}"
+
+     return url
+
+
+ async def Following(username, init):
+     logme.debug(__name__ + ':Following')
+     url = f"{mobile}/{username}/following?lang=en"
+
+     if init != '-1':
+         url += f"&cursor={init}"
+
+     return url
+
+
+ async def MobileProfile(username, init):
+     logme.debug(__name__ + ':MobileProfile')
+     url = f"{mobile}/{username}?lang=en"
+
+     if init != '-1':
+         url += f"&max_id={init}"
+
+     return url
+
+
+ async def Search(config, init):
+     logme.debug(__name__ + ':Search')
+     url = base
+     tweet_count = 100 if not config.Limit else config.Limit
+     q = ""
+     params = [
+         # ('include_blocking', '1'),
+         # ('include_blocked_by', '1'),
+         # ('include_followed_by', '1'),
+         # ('include_want_retweets', '1'),
+         # ('include_mute_edge', '1'),
+         # ('include_can_dm', '1'),
+         ('include_can_media_tag', '1'),
+         # ('skip_status', '1'),
+         # ('include_cards', '1'),
+         ('include_ext_alt_text', 'true'),
+         ('include_quote_count', 'true'),
+         ('include_reply_count', '1'),
+         ('tweet_mode', 'extended'),
+         ('include_entities', 'true'),
+         ('include_user_entities', 'true'),
+         ('include_ext_media_availability', 'true'),
+         ('send_error_codes', 'true'),
+         ('simple_quoted_tweet', 'true'),
+         ('count', tweet_count),
+         ('query_source', 'typed_query'),
+         # ('pc', '1'),
+         ('cursor', str(init)),
+         ('spelling_corrections', '1'),
+         ('ext', 'mediaStats%2ChighlightedLabel'),
+         ('tweet_search_mode', 'live'),  # this can be handled better, maybe take an argument and set it then
+     ]
+     if not config.Popular_tweets:
+         params.append(('f', 'tweets'))
+     if config.Lang:
+         params.append(("l", config.Lang))
+         params.append(("lang", "en"))
+     if config.Query:
+         q += f" from:{config.Query}"
+     if config.Username:
+         q += f" from:{config.Username}"
+     if config.Geo:
+         config.Geo = config.Geo.replace(" ", "")
+         q += f" geocode:{config.Geo}"
+     if config.Search:
+         q += f" {config.Search}"
+     if config.Year:
+         q += f" until:{config.Year}-1-1"
+     if config.Since:
+         q += f" since:{_formatDate(config.Since)}"
+     if config.Until:
+         q += f" until:{_formatDate(config.Until)}"
+     if config.Email:
+         q += ' "mail" OR "email" OR'
+         q += ' "gmail" OR "e-mail"'
+     if config.Phone:
+         q += ' "phone" OR "call me" OR "text me"'
+     if config.Verified:
+         q += " filter:verified"
+     if config.To:
+         q += f" to:{config.To}"
+     if config.All:
+         q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
+     if config.Near:
+         q += f' near:"{config.Near}"'
+     if config.Images:
+         q += " filter:images"
+     if config.Videos:
+         q += " filter:videos"
+     if config.Media:
+         q += " filter:media"
+     if config.Replies:
+         q += " filter:replies"
+     # this filter can still be used, but it seemed broken in preliminary testing; needs more testing
+     if config.Native_retweets:
+         q += " filter:nativeretweets"
+     if config.Min_likes:
+         q += f" min_faves:{config.Min_likes}"
+     if config.Min_retweets:
+         q += f" min_retweets:{config.Min_retweets}"
+     if config.Min_replies:
+         q += f" min_replies:{config.Min_replies}"
+     if config.Links == "include":
+         q += " filter:links"
+     elif config.Links == "exclude":
+         q += " exclude:links"
+     if config.Source:
+         q += f" source:\"{config.Source}\""
+     if config.Members_list:
+         q += f" list:{config.Members_list}"
+     if config.Filter_retweets:
+         q += " exclude:nativeretweets exclude:retweets"
+     if config.Custom_query:
+         q = config.Custom_query
+
+     q = q.strip()
+     params.append(("q", q))
+     _serialQuery = _sanitizeQuery(url, params)
+     return url, params, _serialQuery
+
+
+ def SearchProfile(config, init=None):
+     logme.debug(__name__ + ':SearchProfile')
+     _url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
+     tweet_count = 100
+     variables = {
+         "userId": config.User_id,
+         "count": tweet_count,
+         "includePromotedContent": True,
+         "withCommunity": True,
+         "withSuperFollowsUserFields": True,
+         "withBirdwatchPivots": False,
+         "withDownvotePerspective": False,
+         "withReactionsMetadata": False,
+         "withReactionsPerspective": False,
+         "withSuperFollowsTweetFields": True,
+         "withVoice": True,
+         "withV2Timeline": False,
+         "__fs_interactive_text": False,
+         "__fs_dont_mention_me_view_api_enabled": False,
+     }
+     if type(init) == str:
+         variables['cursor'] = init
+     params = [('variables', json.dumps(variables, separators=(',', ':')))]
+
+     _serialQuery = _sanitizeQuery(_url, params)
+     return _serialQuery, [], _serialQuery
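Search() is a coroutine, so a quick way to inspect the query it builds is to stub only the config fields it reads and run it once (all names below are stand-ins; no network access is needed):

    import asyncio
    from types import SimpleNamespace
    import twint.url as url

    fields = ["Limit", "Popular_tweets", "Lang", "Query", "Username", "Geo", "Search",
              "Year", "Since", "Until", "Email", "Phone", "Verified", "To", "All",
              "Near", "Images", "Videos", "Media", "Replies", "Native_retweets",
              "Min_likes", "Min_retweets", "Min_replies", "Links", "Source",
              "Members_list", "Filter_retweets", "Custom_query"]
    config = SimpleNamespace(**{f: None for f in fields})
    config.Username, config.Search = "jack", "snow"
    _, params, serial = asyncio.run(url.Search(config, init=-1))
    print(serial)   # the fully encoded adaptive.json request URL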
twitter-scraper/twint-master/twint/user.py ADDED
@@ -0,0 +1,52 @@
+ import datetime
+ import logging as logme
+
+
+ class user:
+     type = "user"
+
+     def __init__(self):
+         pass
+
+
+ User_formats = {
+     'join_date': '%Y-%m-%d',
+     'join_time': '%H:%M:%S %Z'
+ }
+
+
+ # ur object must be a json from the endpoint https://api.twitter.com/graphql
+ def User(ur):
+     logme.debug(__name__ + ':User')
+     if 'data' not in ur or 'user' not in ur['data']:
+         msg = 'malformed json! cannot be parsed to get user data'
+         logme.fatal(msg)
+         raise KeyError(msg)
+     _usr = user()
+     _usr.id = ur['data']['user']['rest_id']
+     _usr.name = ur['data']['user']['legacy']['name']
+     _usr.username = ur['data']['user']['legacy']['screen_name']
+     _usr.bio = ur['data']['user']['legacy']['description']
+     _usr.location = ur['data']['user']['legacy']['location']
+     _usr.url = ur['data']['user']['legacy']['url']
+     # parsing date to user-friendly format
+     _dt = ur['data']['user']['legacy']['created_at']
+     _dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
+     # date is of the format year-month-day
+     _usr.join_date = _dt.strftime(User_formats['join_date'])
+     _usr.join_time = _dt.strftime(User_formats['join_time'])
+
+     # :type `int`
+     _usr.tweets = int(ur['data']['user']['legacy']['statuses_count'])
+     _usr.following = int(ur['data']['user']['legacy']['friends_count'])
+     _usr.followers = int(ur['data']['user']['legacy']['followers_count'])
+     _usr.likes = int(ur['data']['user']['legacy']['favourites_count'])
+     _usr.media_count = int(ur['data']['user']['legacy']['media_count'])
+
+     _usr.is_private = ur['data']['user']['legacy']['protected']
+     _usr.is_verified = ur['data']['user']['legacy']['verified']
+     _usr.avatar = ur['data']['user']['legacy']['profile_image_url_https']
+     _usr.background_image = ur['data']['user']['legacy']['profile_banner_url']
+     # TODO : future implementation
+     # legacy_extended_profile is also available in some cases which can be used to get DOB of user
+     return _usr
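A sketch of User() fed a hand-built stub of the GraphQL response shape it expects (all field values below are fabricated placeholders):

    from twint.user import User   # assumes the package layout shown in this diff

    ur = {"data": {"user": {"rest_id": "12", "legacy": {
        "name": "jack", "screen_name": "jack", "description": "", "location": "",
        "url": None, "created_at": "Tue Mar 21 20:50:14 +0000 2006",
        "statuses_count": 1, "friends_count": 2, "followers_count": 3,
        "favourites_count": 4, "media_count": 0, "protected": False, "verified": True,
        "profile_image_url_https": "", "profile_banner_url": ""}}}}
    u = User(ur)
    print(u.username, u.join_date)   # jack 2006-03-21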
twitter-scraper/twint-master/twint/verbose.py ADDED
@@ -0,0 +1,18 @@
+ def Count(count, config):
+     msg = "[+] Finished: Successfully collected "
+     if config.Followers:
+         msg += f"all {count} users who follow @{config.Username}"
+     elif config.Following:
+         msg += f"all {count} users who @{config.Username} follows"
+     elif config.Favorites:
+         msg += f"{count} Tweets that @{config.Username} liked"
+     else:
+         msg += f"{count} Tweets_and_replies"
+         if config.Username:
+             msg += f" from @{config.Username}"
+     msg += "."
+     print(msg)
+
+ def Elastic(elasticsearch):
+     if elasticsearch:
+         print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))
twitter-scraper/twint-master/twitter_scraper.ipynb ADDED
@@ -0,0 +1,265 @@
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "a5361789",
+    "metadata": {},
+    "source": [
+     "## Have to install these packages \n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "c9021300",
+    "metadata": {
+     "scrolled": true
+    },
+    "outputs": [],
+    "source": [
+     "%%capture \n",
+     "!pip3 install Twint \n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "5c857dbf",
+    "metadata": {},
+    "source": [
+     "## Necessary Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "1413ab2b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "# import asyncio\n",
+     "# import os\n",
+     "# loop = asyncio.get_event_loop()\n",
+     "# loop.is_running()\n",
+     "# import twint\n",
+     "# import nest_asyncio\n",
+     "# nest_asyncio.apply()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "d38514f3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import scrape\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a7912a91",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from_date=\"2022-6-10 10:30:22\"\n",
+     "to_date= \"2022-6-30\"\n",
+     "num_tweets = 20\n",
+     "_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "48d50b46",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tweets= _data.keys()\n",
+     "for i in tweets:\n",
+     "    _data[i][\"tweet\"]\n",
+     "    print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "72cabcb5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "from_date=\"2022-6-10 10:30:22\"\n",
+     "to_date= \"2022-6-30\"\n",
+     "num_tweets = 20\n",
+     "_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "549e4fb3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tweets= _data[\"tweet\"]\n",
+     "for i in tweets:\n",
+     "    print(i, \"\\n\", \"__________________________________________________________\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "733dd44a",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Defaulting to user installation because normal site-packages is not writeable\n",
+       "Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
+       "Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
+       "Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
+       "Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
+       "Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
+       "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
+      ]
+     }
+    ],
+    "source": [
+     "#%pip install -q snscrape==0.3.4\n",
+     "!pip3 install snscrape\n",
+     "#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 14,
+    "id": "0d16422c",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "Note: you may need to restart the kernel to use updated packages.\n"
+      ]
+     }
+    ],
+    "source": [
+     "%pip install -q snscrape==0.3.4\n",
+     "from datetime import date\n",
+     "import os\n",
+     "import pandas as pd\n",
+     "\n",
+     "\n",
+     "def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
+     "    if u_or_s.lower() =='u':\n",
+     "        extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
+     "    else:\n",
+     "        extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
+     "    \n",
+     "    os.system(extracted_tweets)\n",
+     "    if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
+     "        print('No Tweets found')\n",
+     "    else:\n",
+     "        df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
+     "        data_list=[]\n",
+     "        for row in df['content'].iteritems():\n",
+     "            temp= str(row[0])+str(row[1])\n",
+     "            temp= temp.replace(\"\\'\",\"\")\n",
+     "            data_list.append(temp)\n",
+     "        return data_list\n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 12,
+    "id": "8e2adb35",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "No Tweets found\n"
+      ]
+     },
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Traceback (most recent call last):\n",
+       "  File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
+       "    sys.exit(main())\n",
+       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
+       "    args = parse_args()\n",
+       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
+       "    import snscrape.modules\n",
+       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
+       "    _import_modules()\n",
+       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
+       "    module = importlib.import_module(moduleName)\n",
+       "  File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
+       "    return _bootstrap._gcd_import(name[level:], package, level)\n",
+       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
+       "    class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
+       "  File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
+       "    raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
+       "TypeError: Multiple inheritance with NamedTuple is not supported\n"
+      ]
+     },
+     {
+      "ename": "UnboundLocalError",
+      "evalue": "local variable 'df' referenced before assignment",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+       "\u001b[0;31mUnboundLocalError\u001b[0m                         Traceback (most recent call last)",
+       "\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
+       "\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+       "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
+      ]
+     }
+    ],
+    "source": [
+     "d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "a2c837f4",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3.10.4 64-bit",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.4"
+   },
+   "vscode": {
+    "interpreter": {
+     "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }