Spaces:
Runtime error
Runtime error
Mosa
committed on
Commit
•
35c6ca6
1
Parent(s):
b24e23b
fdsfkl
Browse files- twitter-scraper/scrape.py +103 -0
- twitter-scraper/twint-master/.github/FUNDING.yml +0 -3
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md +0 -20
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +0 -17
- twitter-scraper/twint-master/.gitignore +0 -115
- twitter-scraper/twint-master/.travis.yml +0 -23
- twitter-scraper/twint-master/Dockerfile +0 -10
- twitter-scraper/twint-master/LICENSE +0 -21
- twitter-scraper/twint-master/MANIFEST.in +0 -1
- twitter-scraper/twint-master/README.md +0 -272
- twitter-scraper/twint-master/Untitled.ipynb +0 -282
- twitter-scraper/twint-master/automate.py +0 -65
- twitter-scraper/twint-master/elasticsearch/README.md +0 -5
- twitter-scraper/twint-master/scrape.py +0 -102
- twitter-scraper/twint-master/scrape__init__.py +0 -14
- twitter-scraper/twint-master/setup.py +0 -65
- twitter-scraper/twint-master/test.py +0 -92
- twitter-scraper/twint-master/twint/__init__.py +0 -32
- twitter-scraper/twint-master/twint/__version__.py +0 -3
- twitter-scraper/twint-master/twint/cli.py +0 -342
- twitter-scraper/twint-master/twint/config.py +0 -87
- twitter-scraper/twint-master/twint/datelock.py +0 -44
- twitter-scraper/twint-master/twint/feed.py +0 -145
- twitter-scraper/twint-master/twint/format.py +0 -91
- twitter-scraper/twint-master/twint/get.py +0 -298
- twitter-scraper/twint-master/twint/output.py +0 -241
- twitter-scraper/twint-master/twint/run.py +0 -412
- twitter-scraper/twint-master/twint/storage/__init__.py +0 -0
- twitter-scraper/twint-master/twint/storage/db.py +0 -297
- twitter-scraper/twint-master/twint/storage/elasticsearch.py +0 -364
- twitter-scraper/twint-master/twint/storage/panda.py +0 -196
- twitter-scraper/twint-master/twint/storage/write.py +0 -77
- twitter-scraper/twint-master/twint/storage/write_meta.py +0 -151
- twitter-scraper/twint-master/twint/token.py +0 -94
- twitter-scraper/twint-master/twint/tweet.py +0 -166
- twitter-scraper/twint-master/twint/url.py +0 -195
- twitter-scraper/twint-master/twint/user.py +0 -52
- twitter-scraper/twint-master/twint/verbose.py +0 -18
- twitter-scraper/{twint-master/twitter_scraper.ipynb → twitter_scraper.ipynb} +0 -0
- twitter_scraper/twint_master/elasticsearch/dashboard.json +18 -0
- twitter_scraper/twint_master/elasticsearch/index-follow.json +15 -0
- twitter_scraper/twint_master/elasticsearch/index-tweets.json +48 -0
- twitter_scraper/twint_master/elasticsearch/index-user.json +33 -0
- twitter_scraper/twint_master/elasticsearch/visualizations.json +100 -0
- twitter_scraper/twint_master/extracted-tweets.txt +5 -0
- twitter_scraper/twint_master/requirements.txt +13 -0
twitter-scraper/scrape.py
ADDED
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
from tkinter import EXCEPTION
|
3 |
+
import twint
|
4 |
+
from datetime import date
|
5 |
+
import pandas as pd
|
6 |
+
import sys
|
7 |
+
import io
|
8 |
+
import time
|
9 |
+
class scraper:
    """Thin wrapper around twint for scraping tweets into pandas DataFrames."""

    @staticmethod
    def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
                   acceptable_range=10):
        """Scrape tweets for a topic or a user's timeline, dropping replies.

        Parameters
        ----------
        search_str : str
            Search keyword (when ``u_or_s`` != 'u') or a username
            (when ``u_or_s`` == 'u').
        from_date, to_date : str
            Inclusive date bounds in "yyyy-mm-dd" form.
        num_tweets : int
            Maximum number of tweets to request (twint's ``Limit``).
        u_or_s : str
            'u' to scrape one user's tweets, anything else for topic search.
        acceptable_range : int
            Unused; kept for backward compatibility with existing callers.

        Returns
        -------
        pandas.DataFrame
            twint's result frame with reply rows (text starting with '@')
            dropped in place.

        Raises
        ------
        TypeError
            If ``from_date`` or ``to_date`` is not a string.
        """
        # The original check `(type(from_date) or ...) is not type("str")`
        # short-circuited on the first truthy operand and so only ever
        # inspected from_date; validate both bounds explicitly.
        if not (isinstance(from_date, str) and isinstance(to_date, str)):
            print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
            # The old `raise EXCEPTION(...)` called tkinter's integer constant,
            # which itself produced a TypeError at runtime -- raise it directly.
            raise TypeError("Incorrect date type Exception!")

        config = twint.Config()
        if u_or_s.lower() == "u":
            config.Search = "from:@" + search_str  # restrict to one account's tweets
        else:
            config.Search = search_str  # free-text topic search
        config.Pandas = True  # collect results into twint's pandas store
        config.Count = True
        config.Limit = num_tweets
        config.Since = from_date
        config.Until = to_date
        config.Hide_output = True

        # twint still prints a summary line even with Hide_output; capture it
        # so we can echo a trimmed copy. Restore stdout in a finally block so
        # a failing search does not leave the interpreter's stdout hijacked.
        captured = io.StringIO()
        original_stdout = sys.stdout
        sys.stdout = captured
        try:
            twint.run.Search(config)
        finally:
            sys.stdout = original_stdout
        print(captured.getvalue()[0:-2])

        tweet_info = twint.output.panda.Tweets_df
        # Rows whose text starts with '@' are replies, not standalone tweets.
        reply_labels = [i for i, text in enumerate(tweet_info['tweet'])
                        if text.startswith("@")]
        tweet_info.drop(reply_labels, axis=0, inplace=True)
        print(len(tweet_info['tweet']), " of them are Tweets")
        return tweet_info

    @staticmethod
    def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()),
                                  num_tweets=10):
        """Scrape up to ``num_tweets`` tweets from ``user_name`` matching ``search_str``.

        Unlike :meth:`get_tweets`, replies are NOT filtered out.

        Returns
        -------
        pandas.DataFrame
            twint's raw result frame for the query.
        """
        config = twint.Config()
        config.Username = user_name
        config.Search = search_str  # topic
        config.Pandas = True
        config.Count = True
        config.Limit = num_tweets
        config.Since = from_date
        config.Until = to_date
        config.Hide_output = True
        twint.run.Search(config)
        return twint.output.panda.Tweets_df
|
twitter-scraper/twint-master/.github/FUNDING.yml
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
# These are supported funding model platforms
|
2 |
-
patreon: twintproject
|
3 |
-
custom: paypal.me/noneprivacy
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
# Issue Template
|
2 |
-
Please use this template!
|
3 |
-
|
4 |
-
## Initial Check
|
5 |
-
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
6 |
-
|
7 |
-
>Make sure you've checked the following:
|
8 |
-
|
9 |
-
- [] Python version is 3.6 or later;
|
10 |
-
- [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
|
11 |
-
- [] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
|
12 |
-
|
13 |
-
## Command Ran
|
14 |
-
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
15 |
-
|
16 |
-
## Description of Issue
|
17 |
-
>Please use **as much detail as possible.**
|
18 |
-
|
19 |
-
## Environment Details
|
20 |
-
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
### Initial Check
|
2 |
-
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
3 |
-
|
4 |
-
>Make sure you've checked the following:
|
5 |
-
|
6 |
-
- [] Python version is 3.6;
|
7 |
-
- [] Using the latest version of Twint;
|
8 |
-
- [] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
|
9 |
-
|
10 |
-
### Command Ran
|
11 |
-
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
12 |
-
|
13 |
-
### Description of Issue
|
14 |
-
>Please use **as much detail as possible.**
|
15 |
-
|
16 |
-
### Environment Details
|
17 |
-
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.gitignore
DELETED
@@ -1,115 +0,0 @@
|
|
1 |
-
# Byte-compiled / optimized / DLL files
|
2 |
-
__pycache__/
|
3 |
-
*.py[cod]
|
4 |
-
*$py.class
|
5 |
-
tweets.db
|
6 |
-
# C extensions
|
7 |
-
*.so
|
8 |
-
|
9 |
-
config.ini
|
10 |
-
twint/storage/mysql.py
|
11 |
-
|
12 |
-
# Node Dependency directories
|
13 |
-
node_modules/
|
14 |
-
jspm_packages/
|
15 |
-
tests/
|
16 |
-
# Distribution / packaging
|
17 |
-
.Python
|
18 |
-
env/
|
19 |
-
build/
|
20 |
-
develop-eggs/
|
21 |
-
dist/
|
22 |
-
downloads/
|
23 |
-
eggs/
|
24 |
-
.eggs/
|
25 |
-
lib/
|
26 |
-
lib64/
|
27 |
-
parts/
|
28 |
-
sdist/
|
29 |
-
var/
|
30 |
-
wheels/
|
31 |
-
*.egg-info/
|
32 |
-
.installed.cfg
|
33 |
-
*.egg
|
34 |
-
|
35 |
-
# PyInstaller
|
36 |
-
# Usually these files are written by a python script from a template
|
37 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
-
*.manifest
|
39 |
-
*.spec
|
40 |
-
|
41 |
-
# Installer logs
|
42 |
-
pip-log.txt
|
43 |
-
pip-delete-this-directory.txt
|
44 |
-
|
45 |
-
# Unit test / coverage reports
|
46 |
-
htmlcov/
|
47 |
-
.tox/
|
48 |
-
.coverage
|
49 |
-
.coverage.*
|
50 |
-
.cache
|
51 |
-
nosetests.xml
|
52 |
-
coverage.xml
|
53 |
-
*.cover
|
54 |
-
.hypothesis/
|
55 |
-
|
56 |
-
# Translations
|
57 |
-
*.mo
|
58 |
-
*.pot
|
59 |
-
|
60 |
-
# Django stuff:
|
61 |
-
*.log
|
62 |
-
local_settings.py
|
63 |
-
|
64 |
-
# Flask stuff:
|
65 |
-
instance/
|
66 |
-
.webassets-cache
|
67 |
-
|
68 |
-
# Scrapy stuff:
|
69 |
-
.scrapy
|
70 |
-
|
71 |
-
# Sphinx documentation
|
72 |
-
docs/_build/
|
73 |
-
|
74 |
-
# PyBuilder
|
75 |
-
target/
|
76 |
-
|
77 |
-
# Jupyter Notebook
|
78 |
-
.ipynb_checkpoints
|
79 |
-
|
80 |
-
# pyenv
|
81 |
-
.python-version
|
82 |
-
|
83 |
-
# celery beat schedule file
|
84 |
-
celerybeat-schedule
|
85 |
-
|
86 |
-
# SageMath parsed files
|
87 |
-
*.sage.py
|
88 |
-
|
89 |
-
# dotenv
|
90 |
-
.env
|
91 |
-
|
92 |
-
# virtualenv
|
93 |
-
.venv
|
94 |
-
venv/
|
95 |
-
ENV/
|
96 |
-
|
97 |
-
# Spyder project settings
|
98 |
-
.spyderproject
|
99 |
-
.spyproject
|
100 |
-
|
101 |
-
# Rope project settings
|
102 |
-
.ropeproject
|
103 |
-
|
104 |
-
# mkdocs documentation
|
105 |
-
/site
|
106 |
-
|
107 |
-
# mypy
|
108 |
-
.mypy_cache/
|
109 |
-
|
110 |
-
# output
|
111 |
-
*.csv
|
112 |
-
*.json
|
113 |
-
*.txt
|
114 |
-
|
115 |
-
test_twint.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.travis.yml
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
dist: bionic
|
2 |
-
language: python
|
3 |
-
python:
|
4 |
-
- "3.6"
|
5 |
-
- "3.7"
|
6 |
-
- "3.8"
|
7 |
-
- "nightly"
|
8 |
-
matrix:
|
9 |
-
allow_failures:
|
10 |
-
- python: "nightly"
|
11 |
-
- python: "3.8"
|
12 |
-
install:
|
13 |
-
- pip install -r requirements.txt
|
14 |
-
script:
|
15 |
-
- python test.py
|
16 |
-
deploy:
|
17 |
-
provider: pypi
|
18 |
-
user: "codyzacharias"
|
19 |
-
password:
|
20 |
-
secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
|
21 |
-
on:
|
22 |
-
tags: true
|
23 |
-
python: "3.7"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/Dockerfile
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
FROM python:3.6-buster
|
2 |
-
LABEL maintainer="codyzacharias@pm.me"
|
3 |
-
|
4 |
-
WORKDIR /root
|
5 |
-
|
6 |
-
RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
|
7 |
-
cd /root/twint && \
|
8 |
-
pip3 install . -r requirements.txt
|
9 |
-
|
10 |
-
CMD /bin/bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/LICENSE
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
MIT License
|
2 |
-
|
3 |
-
Copyright (c) 2018 Cody Zacharias
|
4 |
-
|
5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
of this software and associated documentation files (the "Software"), to deal
|
7 |
-
in the Software without restriction, including without limitation the rights
|
8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
copies of the Software, and to permit persons to whom the Software is
|
10 |
-
furnished to do so, subject to the following conditions:
|
11 |
-
|
12 |
-
The above copyright notice and this permission notice shall be included in all
|
13 |
-
copies or substantial portions of the Software.
|
14 |
-
|
15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/MANIFEST.in
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
include README.md LICENSE
|
|
|
|
twitter-scraper/twint-master/README.md
DELETED
@@ -1,272 +0,0 @@
|
|
1 |
-
20220207.0
|
2 |
-
|
3 |
-
# About this fork
|
4 |
-
|
5 |
-
[This repository](https://github.com/minamotorin/twint) is the fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint) and for myself.
|
6 |
-
|
7 |
-
Modified by [minamotorin](https://github.com/minamotorin).
|
8 |
-
|
9 |
-
## Updates from twintproject/twint
|
10 |
-
|
11 |
-
### twint.token.RefreshTokenException: Could not find the Guest token in HTML
|
12 |
-
|
13 |
-
This problem doesn't happen recently.
|
14 |
-
|
15 |
-
#### Related
|
16 |
-
|
17 |
-
- [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
|
18 |
-
- [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
|
19 |
-
- [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
|
20 |
-
- [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
|
21 |
-
- [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
|
22 |
-
|
23 |
-
### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
|
24 |
-
|
25 |
-
The fix is **not complete**.
|
26 |
-
`twint.run.Profile` will work but `twint.run.db` will not.
|
27 |
-
This means [`test.py`](./test.py) causes an error.
|
28 |
-
|
29 |
-
I think this is because the fields of the result table are not exactly the same as the traditional ones.
|
30 |
-
|
31 |
-
#### Related
|
32 |
-
|
33 |
-
- [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
|
34 |
-
|
35 |
-
### [-] TWINT requires Python version 3.6+.
|
36 |
-
|
37 |
-
#### Related
|
38 |
-
|
39 |
-
- [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
|
40 |
-
- [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
|
41 |
-
- [twintproject/twint#1346](https://github.com/twintproject/twint/issues/1346)
|
42 |
-
- [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
|
43 |
-
- [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
|
44 |
-
|
45 |
-
## References
|
46 |
-
|
47 |
-
- [snscrape](https://github.com/JustAnotherArchivist/snscrape)
|
48 |
-
- [gallery-dl](https://github.com/mikf/gallery-dl)
|
49 |
-
|
50 |
-
## License
|
51 |
-
|
52 |
-
This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
|
53 |
-
|
54 |
-
---
|
55 |
-
|
56 |
-
# TWINT - Twitter Intelligence Tool
|
57 |
-
![2](https://i.imgur.com/iaH3s7z.png)
|
58 |
-
![3](https://i.imgur.com/hVeCrqL.png)
|
59 |
-
|
60 |
-
[![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social)
|
61 |
-
|
62 |
-
>No authentication. No API. No limits.
|
63 |
-
|
64 |
-
Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
|
65 |
-
|
66 |
-
Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too.
|
67 |
-
|
68 |
-
Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
|
69 |
-
|
70 |
-
## tl;dr Benefits
|
71 |
-
Some of the benefits of using Twint vs Twitter API:
|
72 |
-
- Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only);
|
73 |
-
- Fast initial setup;
|
74 |
-
- Can be used anonymously and without Twitter sign up;
|
75 |
-
- **No rate limitations**.
|
76 |
-
|
77 |
-
## Limits imposed by Twitter
|
78 |
-
Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
|
79 |
-
|
80 |
-
## Requirements
|
81 |
-
- Python 3.6;
|
82 |
-
- aiohttp;
|
83 |
-
- aiodns;
|
84 |
-
- beautifulsoup4;
|
85 |
-
- cchardet;
|
86 |
-
- dataclasses
|
87 |
-
- elasticsearch;
|
88 |
-
- pysocks;
|
89 |
-
- pandas (>=0.23.0);
|
90 |
-
- aiohttp_socks;
|
91 |
-
- schedule;
|
92 |
-
- geopy;
|
93 |
-
- fake-useragent;
|
94 |
-
- py-googletransx.
|
95 |
-
|
96 |
-
## Installing
|
97 |
-
|
98 |
-
**Git:**
|
99 |
-
```bash
|
100 |
-
git clone --depth=1 https://github.com/twintproject/twint.git
|
101 |
-
cd twint
|
102 |
-
pip3 install . -r requirements.txt
|
103 |
-
```
|
104 |
-
|
105 |
-
**Pip:**
|
106 |
-
```bash
|
107 |
-
pip3 install twint
|
108 |
-
```
|
109 |
-
|
110 |
-
or
|
111 |
-
|
112 |
-
```bash
|
113 |
-
pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
|
114 |
-
```
|
115 |
-
|
116 |
-
**Pipenv**:
|
117 |
-
```bash
|
118 |
-
pipenv install git+https://github.com/twintproject/twint.git#egg=twint
|
119 |
-
```
|
120 |
-
|
121 |
-
### March 2, 2021 Update
|
122 |
-
|
123 |
-
**Added**: Dockerfile
|
124 |
-
|
125 |
-
Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
|
126 |
-
|
127 |
-
## CLI Basic Examples and Combos
|
128 |
-
A few simple examples to help you understand the basics:
|
129 |
-
|
130 |
-
- `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
|
131 |
-
- `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
|
132 |
-
- `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
|
133 |
-
- `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
|
134 |
-
- `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
|
135 |
-
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
|
136 |
-
- `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
|
137 |
-
- `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
|
138 |
-
- `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
|
139 |
-
- `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
|
140 |
-
- `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
|
141 |
-
- `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch
|
142 |
-
- `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
|
143 |
-
- `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
|
144 |
-
- `twint -u username --followers` - Scrape a Twitter user's followers.
|
145 |
-
- `twint -u username --following` - Scrape who a Twitter user follows.
|
146 |
-
- `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweets).
|
147 |
-
- `twint -u username --following --user-full` - Collect full user information a person follows
|
148 |
-
- `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**).
|
149 |
-
- `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
|
150 |
-
- `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
|
151 |
-
|
152 |
-
More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands)
|
153 |
-
|
154 |
-
## Module Example
|
155 |
-
|
156 |
-
Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
|
157 |
-
|
158 |
-
```python
|
159 |
-
import twint
|
160 |
-
|
161 |
-
# Configure
|
162 |
-
c = twint.Config()
|
163 |
-
c.Username = "realDonaldTrump"
|
164 |
-
c.Search = "great"
|
165 |
-
|
166 |
-
# Run
|
167 |
-
twint.run.Search(c)
|
168 |
-
```
|
169 |
-
> Output
|
170 |
-
|
171 |
-
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
|
172 |
-
|
173 |
-
```python
|
174 |
-
import twint
|
175 |
-
|
176 |
-
c = twint.Config()
|
177 |
-
|
178 |
-
c.Username = "noneprivacy"
|
179 |
-
c.Custom["tweet"] = ["id"]
|
180 |
-
c.Custom["user"] = ["bio"]
|
181 |
-
c.Limit = 10
|
182 |
-
c.Store_csv = True
|
183 |
-
c.Output = "none"
|
184 |
-
|
185 |
-
twint.run.Search(c)
|
186 |
-
```
|
187 |
-
|
188 |
-
## Storing Options
|
189 |
-
- Write to file;
|
190 |
-
- CSV;
|
191 |
-
- JSON;
|
192 |
-
- SQLite;
|
193 |
-
- Elasticsearch.
|
194 |
-
|
195 |
-
## Elasticsearch Setup
|
196 |
-
|
197 |
-
Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
|
198 |
-
|
199 |
-
## Graph Visualization
|
200 |
-
![graph](https://i.imgur.com/EEJqB8n.png)
|
201 |
-
|
202 |
-
[Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
|
203 |
-
|
204 |
-
We are developing a Twint Desktop App.
|
205 |
-
|
206 |
-
![4](https://i.imgur.com/DzcfIgL.png)
|
207 |
-
|
208 |
-
## FAQ
|
209 |
-
> I tried scraping tweets from a user, I know that they exist but I'm not getting them
|
210 |
-
|
211 |
-
Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow.
|
212 |
-
## More Examples
|
213 |
-
|
214 |
-
#### Followers/Following
|
215 |
-
|
216 |
-
> To get only follower usernames/following usernames
|
217 |
-
|
218 |
-
`twint -u username --followers`
|
219 |
-
|
220 |
-
`twint -u username --following`
|
221 |
-
|
222 |
-
> To get user info of followers/following users
|
223 |
-
|
224 |
-
`twint -u username --followers --user-full`
|
225 |
-
|
226 |
-
`twint -u username --following --user-full`
|
227 |
-
|
228 |
-
#### userlist
|
229 |
-
|
230 |
-
> To get only user info of user
|
231 |
-
|
232 |
-
`twint -u username --user-full`
|
233 |
-
|
234 |
-
> To get user info of users from a userlist
|
235 |
-
|
236 |
-
`twint --userlist inputlist --user-full`
|
237 |
-
|
238 |
-
|
239 |
-
#### tweet translation (experimental)
|
240 |
-
|
241 |
-
> To get 100 english tweets and translate them to italian
|
242 |
-
|
243 |
-
`twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
|
244 |
-
|
245 |
-
or
|
246 |
-
|
247 |
-
```python
|
248 |
-
import twint
|
249 |
-
|
250 |
-
c = twint.Config()
|
251 |
-
c.Username = "noneprivacy"
|
252 |
-
c.Limit = 100
|
253 |
-
c.Store_csv = True
|
254 |
-
c.Output = "none.csv"
|
255 |
-
c.Lang = "en"
|
256 |
-
c.Translate = True
|
257 |
-
c.TranslateDest = "it"
|
258 |
-
twint.run.Search(c)
|
259 |
-
```
|
260 |
-
|
261 |
-
Notes:
|
262 |
-
- [Google translate has some quotas](https://cloud.google.com/translate/quotas)
|
263 |
-
|
264 |
-
## Featured Blog Posts:
|
265 |
-
- [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
|
266 |
-
- [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
|
267 |
-
- [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
|
268 |
-
- [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
|
269 |
-
|
270 |
-
## Contact
|
271 |
-
|
272 |
-
If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/Untitled.ipynb
DELETED
@@ -1,282 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 67,
|
6 |
-
"metadata": {},
|
7 |
-
"outputs": [],
|
8 |
-
"source": [
|
9 |
-
"text= \"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär, 5. Invandring, 6. Integration \""
|
10 |
-
]
|
11 |
-
},
|
12 |
-
{
|
13 |
-
"cell_type": "code",
|
14 |
-
"execution_count": 17,
|
15 |
-
"metadata": {},
|
16 |
-
"outputs": [
|
17 |
-
{
|
18 |
-
"name": "stdout",
|
19 |
-
"output_type": "stream",
|
20 |
-
"text": [
|
21 |
-
"WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n",
|
22 |
-
"Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n",
|
23 |
-
"To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n",
|
24 |
-
"Requirement already satisfied: regex in /home/oxygen/snap/jupyter/common/lib/python3.7/site-packages (2022.6.2)\n"
|
25 |
-
]
|
26 |
-
}
|
27 |
-
],
|
28 |
-
"source": [
|
29 |
-
"!pip install regex\n"
|
30 |
-
]
|
31 |
-
},
|
32 |
-
{
|
33 |
-
"cell_type": "code",
|
34 |
-
"execution_count": 15,
|
35 |
-
"metadata": {},
|
36 |
-
"outputs": [
|
37 |
-
{
|
38 |
-
"data": {
|
39 |
-
"text/plain": [
|
40 |
-
"['0']"
|
41 |
-
]
|
42 |
-
},
|
43 |
-
"execution_count": 15,
|
44 |
-
"metadata": {},
|
45 |
-
"output_type": "execute_result"
|
46 |
-
}
|
47 |
-
],
|
48 |
-
"source": [
|
49 |
-
"re.findall(\"[0-9]+\", tl[0])"
|
50 |
-
]
|
51 |
-
},
|
52 |
-
{
|
53 |
-
"cell_type": "code",
|
54 |
-
"execution_count": 48,
|
55 |
-
"metadata": {},
|
56 |
-
"outputs": [
|
57 |
-
{
|
58 |
-
"data": {
|
59 |
-
"text/plain": [
|
60 |
-
"'0. Äldrefrågor'"
|
61 |
-
]
|
62 |
-
},
|
63 |
-
"execution_count": 48,
|
64 |
-
"metadata": {},
|
65 |
-
"output_type": "execute_result"
|
66 |
-
}
|
67 |
-
],
|
68 |
-
"source": [
|
69 |
-
"tl[0]"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
{
|
73 |
-
"cell_type": "code",
|
74 |
-
"execution_count": 49,
|
75 |
-
"metadata": {},
|
76 |
-
"outputs": [
|
77 |
-
{
|
78 |
-
"data": {
|
79 |
-
"text/plain": [
|
80 |
-
"['0', ' Äldrefrågor']"
|
81 |
-
]
|
82 |
-
},
|
83 |
-
"execution_count": 49,
|
84 |
-
"metadata": {},
|
85 |
-
"output_type": "execute_result"
|
86 |
-
}
|
87 |
-
],
|
88 |
-
"source": [
|
89 |
-
"f=tl[0].split('.')\n",
|
90 |
-
"\n",
|
91 |
-
"f#int(f[0])"
|
92 |
-
]
|
93 |
-
},
|
94 |
-
{
|
95 |
-
"cell_type": "code",
|
96 |
-
"execution_count": 29,
|
97 |
-
"metadata": {},
|
98 |
-
"outputs": [
|
99 |
-
{
|
100 |
-
"ename": "NameError",
|
101 |
-
"evalue": "name 'str_topics_to_dict' is not defined",
|
102 |
-
"output_type": "error",
|
103 |
-
"traceback": [
|
104 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
105 |
-
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
106 |
-
"\u001b[0;32m<ipython-input-29-b05d9860dbcf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstr_topics_to_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
107 |
-
"\u001b[0;31mNameError\u001b[0m: name 'str_topics_to_dict' is not defined"
|
108 |
-
]
|
109 |
-
}
|
110 |
-
],
|
111 |
-
"source": []
|
112 |
-
},
|
113 |
-
{
|
114 |
-
"cell_type": "code",
|
115 |
-
"execution_count": 65,
|
116 |
-
"metadata": {},
|
117 |
-
"outputs": [],
|
118 |
-
"source": [
|
119 |
-
"\n",
|
120 |
-
"def str_topics_to_dict(topics):\n",
|
121 |
-
" topic_list=topics.split(\",\")\n",
|
122 |
-
" ind_topic_dict={}\n",
|
123 |
-
" for i inrange(len(topic_list)): \n",
|
124 |
-
" index_topic_list=\n",
|
125 |
-
" ind=index_topic_list[0]\n",
|
126 |
-
" just_topic=index_topic_list[1][1:]\n",
|
127 |
-
" ind_topic_dict[int(ind)]=just_topic\n",
|
128 |
-
" return ind_topic_dict"
|
129 |
-
]
|
130 |
-
},
|
131 |
-
{
|
132 |
-
"cell_type": "code",
|
133 |
-
"execution_count": 68,
|
134 |
-
"metadata": {},
|
135 |
-
"outputs": [
|
136 |
-
{
|
137 |
-
"data": {
|
138 |
-
"text/plain": [
|
139 |
-
"{0: 'Brottslighet',\n",
|
140 |
-
" 1: 'Miljö',\n",
|
141 |
-
" 2: 'Skola',\n",
|
142 |
-
" 3: 'Sjukvård',\n",
|
143 |
-
" 4: 'Militär',\n",
|
144 |
-
" 5: 'Invandring',\n",
|
145 |
-
" 6: 'Integration '}"
|
146 |
-
]
|
147 |
-
},
|
148 |
-
"execution_count": 68,
|
149 |
-
"metadata": {},
|
150 |
-
"output_type": "execute_result"
|
151 |
-
}
|
152 |
-
],
|
153 |
-
"source": [
|
154 |
-
"str_topics_to_dict(text)"
|
155 |
-
]
|
156 |
-
},
|
157 |
-
{
|
158 |
-
"cell_type": "code",
|
159 |
-
"execution_count": 109,
|
160 |
-
"metadata": {},
|
161 |
-
"outputs": [
|
162 |
-
{
|
163 |
-
"data": {
|
164 |
-
"text/plain": [
|
165 |
-
"' Brottslighet, Miljö, Skola, Sjukvård, Militär stöd, Invandring, Integration '"
|
166 |
-
]
|
167 |
-
},
|
168 |
-
"execution_count": 109,
|
169 |
-
"metadata": {},
|
170 |
-
"output_type": "execute_result"
|
171 |
-
}
|
172 |
-
],
|
173 |
-
"source": [
|
174 |
-
"\n",
|
175 |
-
"text=\"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär stöd, 5. Invandring, 6. Integration \"\n",
|
176 |
-
"text=re.sub(r\"(\\n+)\",\" \",text)\n",
|
177 |
-
"text=re.sub(\"(\\.)|\\d+\",\"\",text )\n",
|
178 |
-
"text"
|
179 |
-
]
|
180 |
-
},
|
181 |
-
{
|
182 |
-
"cell_type": "code",
|
183 |
-
"execution_count": 100,
|
184 |
-
"metadata": {},
|
185 |
-
"outputs": [
|
186 |
-
{
|
187 |
-
"data": {
|
188 |
-
"text/plain": [
|
189 |
-
"[' Brottslighet',\n",
|
190 |
-
" ' Miljö',\n",
|
191 |
-
" ' Skola',\n",
|
192 |
-
" ' Sjukvård',\n",
|
193 |
-
" ' Militär stöd',\n",
|
194 |
-
" ' Invandring',\n",
|
195 |
-
" ' Integration ']"
|
196 |
-
]
|
197 |
-
},
|
198 |
-
"execution_count": 100,
|
199 |
-
"metadata": {},
|
200 |
-
"output_type": "execute_result"
|
201 |
-
}
|
202 |
-
],
|
203 |
-
"source": [
|
204 |
-
"text.split(\",\")"
|
205 |
-
]
|
206 |
-
},
|
207 |
-
{
|
208 |
-
"cell_type": "code",
|
209 |
-
"execution_count": 116,
|
210 |
-
"metadata": {},
|
211 |
-
"outputs": [],
|
212 |
-
"source": [
|
213 |
-
"import regex as re \n",
|
214 |
-
"def str_topics_to_dict(topics):\n",
|
215 |
-
" text=re.sub(r\"(\\n+)\",\" \",topics)\n",
|
216 |
-
" text=re.sub(\"(\\.)|\\d+\",\"\",topics )\n",
|
217 |
-
" topics=re.sub(r\"(\\n+)|(\\.)|\\d+\",\"\",topics)\n",
|
218 |
-
" topic_list=topics.split(\",\")\n",
|
219 |
-
" ind_topic_dict={}\n",
|
220 |
-
" for i in range(len(topic_list)): \n",
|
221 |
-
" ind=i\n",
|
222 |
-
" just_topic=topic_list[i]\n",
|
223 |
-
" ind_topic_dict[ind]=just_topic\n",
|
224 |
-
" return ind_topic_dict"
|
225 |
-
]
|
226 |
-
},
|
227 |
-
{
|
228 |
-
"cell_type": "code",
|
229 |
-
"execution_count": 117,
|
230 |
-
"metadata": {},
|
231 |
-
"outputs": [
|
232 |
-
{
|
233 |
-
"data": {
|
234 |
-
"text/plain": [
|
235 |
-
"{0: ' Brottslighet',\n",
|
236 |
-
" 1: ' Miljö',\n",
|
237 |
-
" 2: ' Skola',\n",
|
238 |
-
" 3: ' Sjukvård',\n",
|
239 |
-
" 4: ' Militär stöd',\n",
|
240 |
-
" 5: ' Invandring',\n",
|
241 |
-
" 6: ' Integration '}"
|
242 |
-
]
|
243 |
-
},
|
244 |
-
"execution_count": 117,
|
245 |
-
"metadata": {},
|
246 |
-
"output_type": "execute_result"
|
247 |
-
}
|
248 |
-
],
|
249 |
-
"source": [
|
250 |
-
"str_topics_to_dict(text)"
|
251 |
-
]
|
252 |
-
},
|
253 |
-
{
|
254 |
-
"cell_type": "code",
|
255 |
-
"execution_count": null,
|
256 |
-
"metadata": {},
|
257 |
-
"outputs": [],
|
258 |
-
"source": []
|
259 |
-
}
|
260 |
-
],
|
261 |
-
"metadata": {
|
262 |
-
"kernelspec": {
|
263 |
-
"display_name": "Python 3",
|
264 |
-
"language": "python",
|
265 |
-
"name": "python3"
|
266 |
-
},
|
267 |
-
"language_info": {
|
268 |
-
"codemirror_mode": {
|
269 |
-
"name": "ipython",
|
270 |
-
"version": 3
|
271 |
-
},
|
272 |
-
"file_extension": ".py",
|
273 |
-
"mimetype": "text/x-python",
|
274 |
-
"name": "python",
|
275 |
-
"nbconvert_exporter": "python",
|
276 |
-
"pygments_lexer": "ipython3",
|
277 |
-
"version": "3.7.3"
|
278 |
-
}
|
279 |
-
},
|
280 |
-
"nbformat": 4,
|
281 |
-
"nbformat_minor": 2
|
282 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/automate.py
DELETED
@@ -1,65 +0,0 @@
|
|
1 |
-
import twint
|
2 |
-
import schedule
|
3 |
-
import time
|
4 |
-
|
5 |
-
# you can change the name of each "job" after "def" if you'd like.
|
6 |
-
def jobone():
|
7 |
-
print ("Fetching Tweets")
|
8 |
-
c = twint.Config()
|
9 |
-
# choose username (optional)
|
10 |
-
c.Username = "insert username here"
|
11 |
-
# choose search term (optional)
|
12 |
-
c.Search = "insert search term here"
|
13 |
-
# choose beginning time (narrow results)
|
14 |
-
c.Since = "2018-01-01"
|
15 |
-
# set limit on total tweets
|
16 |
-
c.Limit = 1000
|
17 |
-
# no idea, but makes the csv format properly
|
18 |
-
c.Store_csv = True
|
19 |
-
# format of the csv
|
20 |
-
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
|
21 |
-
# change the name of the csv file
|
22 |
-
c.Output = "filename.csv"
|
23 |
-
twint.run.Search(c)
|
24 |
-
|
25 |
-
def jobtwo():
|
26 |
-
print ("Fetching Tweets")
|
27 |
-
c = twint.Config()
|
28 |
-
# choose username (optional)
|
29 |
-
c.Username = "insert username here"
|
30 |
-
# choose search term (optional)
|
31 |
-
c.Search = "insert search term here"
|
32 |
-
# choose beginning time (narrow results)
|
33 |
-
c.Since = "2018-01-01"
|
34 |
-
# set limit on total tweets
|
35 |
-
c.Limit = 1000
|
36 |
-
# no idea, but makes the csv format properly
|
37 |
-
c.Store_csv = True
|
38 |
-
# format of the csv
|
39 |
-
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
|
40 |
-
# change the name of the csv file
|
41 |
-
c.Output = "filename2.csv"
|
42 |
-
twint.run.Search(c)
|
43 |
-
|
44 |
-
# run once when you start the program
|
45 |
-
|
46 |
-
jobone()
|
47 |
-
jobtwo()
|
48 |
-
|
49 |
-
# run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable)
|
50 |
-
|
51 |
-
# schedule.every(1).minutes.do(jobone)
|
52 |
-
schedule.every().hour.do(jobone)
|
53 |
-
# schedule.every().day.at("10:30").do(jobone)
|
54 |
-
# schedule.every().monday.do(jobone)
|
55 |
-
# schedule.every().wednesday.at("13:15").do(jobone)
|
56 |
-
|
57 |
-
# schedule.every(1).minutes.do(jobtwo)
|
58 |
-
schedule.every().hour.do(jobtwo)
|
59 |
-
# schedule.every().day.at("10:30").do(jobtwo)
|
60 |
-
# schedule.every().monday.do(jobtwo)
|
61 |
-
# schedule.every().wednesday.at("13:15").do(jobtwo)
|
62 |
-
|
63 |
-
while True:
|
64 |
-
schedule.run_pending()
|
65 |
-
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/elasticsearch/README.md
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
# Elasticsearch How-To
|
2 |
-
|
3 |
-
![dashboard](https://i.imgur.com/BEbtdo5.png)
|
4 |
-
|
5 |
-
Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/scrape.py
DELETED
@@ -1,102 +0,0 @@
|
|
1 |
-
import sys
|
2 |
-
import io
|
3 |
-
import time
|
4 |
-
import asyncio
|
5 |
-
import os
|
6 |
-
from tkinter import EXCEPTION
|
7 |
-
from numpy import not_equal
|
8 |
-
|
9 |
-
loop = asyncio.get_event_loop()
|
10 |
-
loop.is_running()
|
11 |
-
import twint
|
12 |
-
import nest_asyncio
|
13 |
-
|
14 |
-
nest_asyncio.apply()
|
15 |
-
from datetime import date
|
16 |
-
|
17 |
-
|
18 |
-
class scraper:
|
19 |
-
def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10, u_or_s='s',
|
20 |
-
acceptable_range=10):
|
21 |
-
|
22 |
-
if (type(from_date) or type("str")) is not type("str"):
|
23 |
-
print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
|
24 |
-
raise EXCEPTION("Incorrect date type Exception!")
|
25 |
-
|
26 |
-
time_out = time.time() + 2 * 60
|
27 |
-
_dict = {}
|
28 |
-
c = twint.Config()
|
29 |
-
if u_or_s.lower() == "u":
|
30 |
-
c.Search = "from:@" + search_str # topic
|
31 |
-
else:
|
32 |
-
c.Search = search_str # topic
|
33 |
-
c.Pandas = True
|
34 |
-
num_tweets_and_replies = num_tweets
|
35 |
-
c.Count = True
|
36 |
-
for j in range(1, 5):
|
37 |
-
c.Limit = num_tweets_and_replies
|
38 |
-
c.Since = from_date
|
39 |
-
c.Until = to_date
|
40 |
-
c.Hide_output = True
|
41 |
-
old_stdout = sys.stdout
|
42 |
-
new_stdout = io.StringIO()
|
43 |
-
sys.stdout = new_stdout
|
44 |
-
twint.run.Search(c)
|
45 |
-
output = new_stdout.getvalue()
|
46 |
-
sys.stdout = old_stdout
|
47 |
-
print(output[0:-2])
|
48 |
-
tweet_info = twint.output.panda.Tweets_df
|
49 |
-
|
50 |
-
t_count = 0
|
51 |
-
try:
|
52 |
-
_keys = tweet_info["id"]
|
53 |
-
# tweet infor is a dataframe with fallowing columns
|
54 |
-
'''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
55 |
-
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
56 |
-
'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
|
57 |
-
'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
|
58 |
-
'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
|
59 |
-
'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
|
60 |
-
'trans_dest'],
|
61 |
-
dtype='object')'''
|
62 |
-
|
63 |
-
for i in range(len(_keys)):
|
64 |
-
if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
|
65 |
-
pass
|
66 |
-
else:
|
67 |
-
_dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
|
68 |
-
"date": tweet_info["date"][i],
|
69 |
-
"nlikes": tweet_info["nlikes"][i],
|
70 |
-
"nreplies": tweet_info["nreplies"][i],
|
71 |
-
"nretweets": tweet_info["nretweets"][i], "topic": ""}
|
72 |
-
if len(list(_dict.keys())) == num_tweets:
|
73 |
-
break
|
74 |
-
except:
|
75 |
-
pass
|
76 |
-
print(len(list(_dict.keys())), " of them are Tweets")
|
77 |
-
if (num_tweets - len(list(_dict.keys()))) < acceptable_range:
|
78 |
-
return _dict
|
79 |
-
if len(list(_dict.keys())) < num_tweets:
|
80 |
-
num_tweets_and_replies = num_tweets_and_replies + 100 * 3 ** j
|
81 |
-
else:
|
82 |
-
break
|
83 |
-
if time_out < time.time():
|
84 |
-
break
|
85 |
-
if output.startswith("[!] No more data!"):
|
86 |
-
break
|
87 |
-
return _dict
|
88 |
-
|
89 |
-
def string_search_user_tweets(user_name, search_str, from_date="2006-07-01", to_date=str(date.today()),
|
90 |
-
num_tweets=10):
|
91 |
-
c = twint.Config()
|
92 |
-
c.Username = user_name
|
93 |
-
c.Search = search_str # topic
|
94 |
-
c.Pandas = True
|
95 |
-
num_tweets_and_replies = num_tweets
|
96 |
-
c.Count = True
|
97 |
-
c.Limit = num_tweets_and_replies
|
98 |
-
c.Since = from_date
|
99 |
-
c.Until = to_date
|
100 |
-
c.Hide_output = True
|
101 |
-
twint.run.Search(c)
|
102 |
-
return twint.output.panda.Tweets_df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/scrape__init__.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
def scraper_libs():
|
2 |
-
import sys
|
3 |
-
import io
|
4 |
-
import time
|
5 |
-
import asyncio
|
6 |
-
import os
|
7 |
-
from tkinter import EXCEPTION
|
8 |
-
from numpy import not_equal
|
9 |
-
loop = asyncio.get_event_loop()
|
10 |
-
loop.is_running()
|
11 |
-
import twint
|
12 |
-
import nest_asyncio
|
13 |
-
nest_asyncio.apply()
|
14 |
-
from datetime import date
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/setup.py
DELETED
@@ -1,65 +0,0 @@
|
|
1 |
-
#!/usr/bin/python3
|
2 |
-
from setuptools import setup
|
3 |
-
import io
|
4 |
-
import os
|
5 |
-
|
6 |
-
# Package meta-data
|
7 |
-
NAME = 'twint'
|
8 |
-
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
|
9 |
-
URL = 'https://github.com/twintproject/twint'
|
10 |
-
EMAIL = 'codyzacharias@pm.me'
|
11 |
-
AUTHOR = 'Cody Zacharias'
|
12 |
-
REQUIRES_PYTHON = '>=3.6.0'
|
13 |
-
VERSION = None
|
14 |
-
|
15 |
-
# Packages required
|
16 |
-
REQUIRED = [
|
17 |
-
'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
|
18 |
-
'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
|
19 |
-
'schedule', 'geopy', 'fake-useragent', 'googletransx'
|
20 |
-
]
|
21 |
-
|
22 |
-
here = os.path.abspath(os.path.dirname(__file__))
|
23 |
-
|
24 |
-
with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
|
25 |
-
long_description = '\n' + f.read()
|
26 |
-
|
27 |
-
# Load the package's __version__.py
|
28 |
-
about = {}
|
29 |
-
if not VERSION:
|
30 |
-
with open(os.path.join(here, NAME, '__version__.py')) as f:
|
31 |
-
exec(f.read(), about)
|
32 |
-
else:
|
33 |
-
about['__version__'] = VERSION
|
34 |
-
|
35 |
-
setup(
|
36 |
-
name=NAME,
|
37 |
-
version=about['__version__'],
|
38 |
-
description=DESCRIPTION,
|
39 |
-
long_description=long_description,
|
40 |
-
long_description_content_type="text/markdown",
|
41 |
-
author=AUTHOR,
|
42 |
-
author_email=EMAIL,
|
43 |
-
python_requires=REQUIRES_PYTHON,
|
44 |
-
url=URL,
|
45 |
-
packages=['twint', 'twint.storage'],
|
46 |
-
entry_points={
|
47 |
-
'console_scripts': [
|
48 |
-
'twint = twint.cli:run_as_command',
|
49 |
-
],
|
50 |
-
},
|
51 |
-
install_requires=REQUIRED,
|
52 |
-
dependency_links=[
|
53 |
-
'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
|
54 |
-
],
|
55 |
-
license='MIT',
|
56 |
-
classifiers=[
|
57 |
-
'License :: OSI Approved :: MIT License',
|
58 |
-
'Programming Language :: Python',
|
59 |
-
'Programming Language :: Python :: 3',
|
60 |
-
'Programming Language :: Python :: 3.6',
|
61 |
-
'Programming Language :: Python :: 3.7',
|
62 |
-
'Programming Language :: Python :: 3.8',
|
63 |
-
'Programming Language :: Python :: Implementation :: CPython',
|
64 |
-
],
|
65 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/test.py
DELETED
@@ -1,92 +0,0 @@
|
|
1 |
-
import twint
|
2 |
-
import os
|
3 |
-
|
4 |
-
'''
|
5 |
-
Test.py - Testing TWINT to make sure everything works.
|
6 |
-
'''
|
7 |
-
|
8 |
-
|
9 |
-
def test_reg(c, run):
|
10 |
-
print("[+] Beginning vanilla test in {}".format(str(run)))
|
11 |
-
run(c)
|
12 |
-
|
13 |
-
|
14 |
-
def test_db(c, run):
|
15 |
-
print("[+] Beginning DB test in {}".format(str(run)))
|
16 |
-
c.Database = "test_twint.db"
|
17 |
-
run(c)
|
18 |
-
|
19 |
-
|
20 |
-
def custom(c, run, _type):
|
21 |
-
print("[+] Beginning custom {} test in {}".format(_type, str(run)))
|
22 |
-
c.Custom['tweet'] = ["id", "username"]
|
23 |
-
c.Custom['user'] = ["id", "username"]
|
24 |
-
run(c)
|
25 |
-
|
26 |
-
|
27 |
-
def test_json(c, run):
|
28 |
-
c.Store_json = True
|
29 |
-
c.Output = "test_twint.json"
|
30 |
-
custom(c, run, "JSON")
|
31 |
-
print("[+] Beginning JSON test in {}".format(str(run)))
|
32 |
-
run(c)
|
33 |
-
|
34 |
-
|
35 |
-
def test_csv(c, run):
|
36 |
-
c.Store_csv = True
|
37 |
-
c.Output = "test_twint.csv"
|
38 |
-
custom(c, run, "CSV")
|
39 |
-
print("[+] Beginning CSV test in {}".format(str(run)))
|
40 |
-
run(c)
|
41 |
-
|
42 |
-
|
43 |
-
def main():
|
44 |
-
c = twint.Config()
|
45 |
-
c.Username = "verified"
|
46 |
-
c.Limit = 20
|
47 |
-
c.Store_object = True
|
48 |
-
|
49 |
-
# Separate objects are necessary.
|
50 |
-
|
51 |
-
f = twint.Config()
|
52 |
-
f.Username = "verified"
|
53 |
-
f.Limit = 20
|
54 |
-
f.Store_object = True
|
55 |
-
f.User_full = True
|
56 |
-
|
57 |
-
runs = [
|
58 |
-
twint.run.Profile, # this doesn't
|
59 |
-
twint.run.Search, # this works
|
60 |
-
twint.run.Following,
|
61 |
-
twint.run.Followers,
|
62 |
-
twint.run.Favorites,
|
63 |
-
]
|
64 |
-
|
65 |
-
tests = [test_reg, test_json, test_csv, test_db]
|
66 |
-
|
67 |
-
# Something breaks if we don't split these up
|
68 |
-
|
69 |
-
for run in runs[:3]:
|
70 |
-
if run == twint.run.Search:
|
71 |
-
c.Since = "2012-1-1 20:30:22"
|
72 |
-
c.Until = "2017-1-1"
|
73 |
-
else:
|
74 |
-
c.Since = ""
|
75 |
-
c.Until = ""
|
76 |
-
|
77 |
-
for test in tests:
|
78 |
-
test(c, run)
|
79 |
-
|
80 |
-
for run in runs[3:]:
|
81 |
-
for test in tests:
|
82 |
-
test(f, run)
|
83 |
-
|
84 |
-
files = ["test_twint.db", "test_twint.json", "test_twint.csv"]
|
85 |
-
for _file in files:
|
86 |
-
os.remove(_file)
|
87 |
-
|
88 |
-
print("[+] Testing complete!")
|
89 |
-
|
90 |
-
|
91 |
-
if __name__ == '__main__':
|
92 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/__init__.py
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
TWINT - Twitter Intelligence Tool (formerly known as Tweep).
|
3 |
-
|
4 |
-
See wiki on Github for in-depth details.
|
5 |
-
https://github.com/twintproject/twint/wiki
|
6 |
-
|
7 |
-
Licensed under MIT License
|
8 |
-
Copyright (c) 2018 Cody Zacharias
|
9 |
-
'''
|
10 |
-
import logging, os
|
11 |
-
|
12 |
-
from .config import Config
|
13 |
-
from .__version__ import __version__
|
14 |
-
from . import run
|
15 |
-
|
16 |
-
_levels = {
|
17 |
-
'info': logging.INFO,
|
18 |
-
'debug': logging.DEBUG
|
19 |
-
}
|
20 |
-
|
21 |
-
_level = os.getenv('TWINT_DEBUG', 'info')
|
22 |
-
_logLevel = _levels[_level]
|
23 |
-
|
24 |
-
if _level == "debug":
|
25 |
-
logger = logging.getLogger()
|
26 |
-
_output_fn = 'twint.log'
|
27 |
-
logger.setLevel(_logLevel)
|
28 |
-
formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
|
29 |
-
fileHandler = logging.FileHandler(_output_fn)
|
30 |
-
fileHandler.setLevel(_logLevel)
|
31 |
-
fileHandler.setFormatter(formatter)
|
32 |
-
logger.addHandler(fileHandler)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/__version__.py
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
VERSION = (2, 1, 21)
|
2 |
-
|
3 |
-
__version__ = '.'.join(map(str, VERSION))
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/cli.py
DELETED
@@ -1,342 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
'''
|
3 |
-
Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
|
4 |
-
|
5 |
-
See wiki on Github for in-depth details.
|
6 |
-
https://github.com/twintproject/twint/wiki
|
7 |
-
|
8 |
-
Licensed under MIT License
|
9 |
-
Copyright (c) 2018 The Twint Project
|
10 |
-
'''
|
11 |
-
import sys
|
12 |
-
import os
|
13 |
-
import argparse
|
14 |
-
|
15 |
-
from . import run
|
16 |
-
from . import config
|
17 |
-
from . import storage
|
18 |
-
|
19 |
-
|
20 |
-
def error(_error, message):
|
21 |
-
""" Print errors to stdout
|
22 |
-
"""
|
23 |
-
print("[-] {}: {}".format(_error, message))
|
24 |
-
sys.exit(0)
|
25 |
-
|
26 |
-
|
27 |
-
def check(args):
|
28 |
-
""" Error checking
|
29 |
-
"""
|
30 |
-
if args.username is not None or args.userlist or args.members_list:
|
31 |
-
if args.verified:
|
32 |
-
error("Contradicting Args",
|
33 |
-
"Please use --verified in combination with -s.")
|
34 |
-
if args.userid:
|
35 |
-
error("Contradicting Args",
|
36 |
-
"--userid and -u cannot be used together.")
|
37 |
-
if args.all:
|
38 |
-
error("Contradicting Args",
|
39 |
-
"--all and -u cannot be used together.")
|
40 |
-
elif args.search and args.timeline:
|
41 |
-
error("Contradicting Args",
|
42 |
-
"--s and --tl cannot be used together.")
|
43 |
-
elif args.timeline and not args.username:
|
44 |
-
error("Error", "-tl cannot be used without -u.")
|
45 |
-
elif args.search is None:
|
46 |
-
if args.custom_query is not None:
|
47 |
-
pass
|
48 |
-
elif (args.geo or args.near) is None and not (args.all or args.userid):
|
49 |
-
error("Error", "Please use at least -u, -s, -g or --near.")
|
50 |
-
elif args.all and args.userid:
|
51 |
-
error("Contradicting Args",
|
52 |
-
"--all and --userid cannot be used together")
|
53 |
-
if args.output is None:
|
54 |
-
if args.csv:
|
55 |
-
error("Error", "Please specify an output file (Example: -o file.csv).")
|
56 |
-
elif args.json:
|
57 |
-
error("Error", "Please specify an output file (Example: -o file.json).")
|
58 |
-
if args.backoff_exponent <= 0:
|
59 |
-
error("Error", "Please specifiy a positive value for backoff_exponent")
|
60 |
-
if args.min_wait_time < 0:
|
61 |
-
error("Error", "Please specifiy a non negative value for min_wait_time")
|
62 |
-
|
63 |
-
|
64 |
-
def loadUserList(ul, _type):
|
65 |
-
""" Concatenate users
|
66 |
-
"""
|
67 |
-
if os.path.exists(os.path.abspath(ul)):
|
68 |
-
userlist = open(os.path.abspath(ul), "r").read().splitlines()
|
69 |
-
else:
|
70 |
-
userlist = ul.split(",")
|
71 |
-
if _type == "search":
|
72 |
-
un = ""
|
73 |
-
for user in userlist:
|
74 |
-
un += "%20OR%20from%3A" + user
|
75 |
-
return un[15:]
|
76 |
-
return userlist
|
77 |
-
|
78 |
-
|
79 |
-
def initialize(args):
|
80 |
-
""" Set default values for config from args
|
81 |
-
"""
|
82 |
-
c = config.Config()
|
83 |
-
c.Username = args.username
|
84 |
-
c.User_id = args.userid
|
85 |
-
c.Search = args.search
|
86 |
-
c.Geo = args.geo
|
87 |
-
c.Location = args.location
|
88 |
-
c.Near = args.near
|
89 |
-
c.Lang = args.lang
|
90 |
-
c.Output = args.output
|
91 |
-
c.Elasticsearch = args.elasticsearch
|
92 |
-
c.Year = args.year
|
93 |
-
c.Since = args.since
|
94 |
-
c.Until = args.until
|
95 |
-
c.Email = args.email
|
96 |
-
c.Phone = args.phone
|
97 |
-
c.Verified = args.verified
|
98 |
-
c.Store_csv = args.csv
|
99 |
-
c.Tabs = args.tabs
|
100 |
-
c.Store_json = args.json
|
101 |
-
c.Show_hashtags = args.hashtags
|
102 |
-
c.Show_cashtags = args.cashtags
|
103 |
-
c.Limit = args.limit
|
104 |
-
c.Count = args.count
|
105 |
-
c.Stats = args.stats
|
106 |
-
c.Database = args.database
|
107 |
-
c.To = args.to
|
108 |
-
c.All = args.all
|
109 |
-
c.Essid = args.essid
|
110 |
-
c.Format = args.format
|
111 |
-
c.User_full = args.user_full
|
112 |
-
# c.Profile_full = args.profile_full
|
113 |
-
c.Pandas_type = args.pandas_type
|
114 |
-
c.Index_tweets = args.index_tweets
|
115 |
-
c.Index_follow = args.index_follow
|
116 |
-
c.Index_users = args.index_users
|
117 |
-
c.Debug = args.debug
|
118 |
-
c.Resume = args.resume
|
119 |
-
c.Images = args.images
|
120 |
-
c.Videos = args.videos
|
121 |
-
c.Media = args.media
|
122 |
-
c.Replies = args.replies
|
123 |
-
c.Pandas_clean = args.pandas_clean
|
124 |
-
c.Proxy_host = args.proxy_host
|
125 |
-
c.Proxy_port = args.proxy_port
|
126 |
-
c.Proxy_type = args.proxy_type
|
127 |
-
c.Tor_control_port = args.tor_control_port
|
128 |
-
c.Tor_control_password = args.tor_control_password
|
129 |
-
c.Retweets = args.retweets
|
130 |
-
c.Custom_query = args.custom_query
|
131 |
-
c.Popular_tweets = args.popular_tweets
|
132 |
-
c.Skip_certs = args.skip_certs
|
133 |
-
c.Hide_output = args.hide_output
|
134 |
-
c.Native_retweets = args.native_retweets
|
135 |
-
c.Min_likes = args.min_likes
|
136 |
-
c.Min_retweets = args.min_retweets
|
137 |
-
c.Min_replies = args.min_replies
|
138 |
-
c.Links = args.links
|
139 |
-
c.Source = args.source
|
140 |
-
c.Members_list = args.members_list
|
141 |
-
c.Filter_retweets = args.filter_retweets
|
142 |
-
c.Translate = args.translate
|
143 |
-
c.TranslateDest = args.translate_dest
|
144 |
-
c.Backoff_exponent = args.backoff_exponent
|
145 |
-
c.Min_wait_time = args.min_wait_time
|
146 |
-
return c
|
147 |
-
|
148 |
-
|
149 |
-
def options():
|
150 |
-
""" Parse arguments
|
151 |
-
"""
|
152 |
-
ap = argparse.ArgumentParser(prog="twint",
|
153 |
-
usage="python3 %(prog)s [options]",
|
154 |
-
description="TWINT - An Advanced Twitter Scraping Tool.")
|
155 |
-
ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
|
156 |
-
ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
|
157 |
-
ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
|
158 |
-
ap.add_argument("--near", help="Near a specified city.")
|
159 |
-
ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
|
160 |
-
ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
|
161 |
-
ap.add_argument("-o", "--output", help="Save output to a file.")
|
162 |
-
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
|
163 |
-
ap.add_argument("--year", help="Filter Tweets before specified year.")
|
164 |
-
ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
|
165 |
-
metavar="DATE")
|
166 |
-
ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
|
167 |
-
metavar="DATE")
|
168 |
-
ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
|
169 |
-
ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
|
170 |
-
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
|
171 |
-
action="store_true")
|
172 |
-
ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
|
173 |
-
ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
|
174 |
-
ap.add_argument("--json", help="Write as .json file", action="store_true")
|
175 |
-
ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
|
176 |
-
ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true")
|
177 |
-
ap.add_argument("--userid", help="Twitter user id.")
|
178 |
-
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
|
179 |
-
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
|
180 |
-
action="store_true")
|
181 |
-
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
|
182 |
-
action="store_true")
|
183 |
-
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
|
184 |
-
ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
|
185 |
-
ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
|
186 |
-
ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
|
187 |
-
ap.add_argument("--following", help="Scrape a person's follows", action="store_true")
|
188 |
-
ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
|
189 |
-
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
|
190 |
-
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
|
191 |
-
ap.add_argument("--proxy-port", help="The port of the proxy server.")
|
192 |
-
ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
|
193 |
-
ap.add_argument("--tor-control-password",
|
194 |
-
help="If proxy-host is set to tor, this is the password for the control port",
|
195 |
-
default="my_password")
|
196 |
-
ap.add_argument("--essid",
|
197 |
-
help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
|
198 |
-
nargs="?", default="")
|
199 |
-
ap.add_argument("--userlist", help="Userlist from list or file.")
|
200 |
-
ap.add_argument("--retweets",
|
201 |
-
help="Include user's Retweets (Warning: limited).",
|
202 |
-
action="store_true")
|
203 |
-
ap.add_argument("--format", help="Custom output format (See wiki for details).")
|
204 |
-
ap.add_argument("--user-full",
|
205 |
-
help="Collect all user information (Use with followers or following only).",
|
206 |
-
action="store_true")
|
207 |
-
# I am removing this this feature for the time being, because it is no longer required, default method will do this
|
208 |
-
# ap.add_argument("--profile-full",
|
209 |
-
# help="Slow, but effective method of collecting a user's Tweets and RT.",
|
210 |
-
# action="store_true")
|
211 |
-
ap.add_argument(
|
212 |
-
"-tl",
|
213 |
-
"--timeline",
|
214 |
-
help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
|
215 |
-
action="store_true",
|
216 |
-
)
|
217 |
-
ap.add_argument("--translate",
|
218 |
-
help="Get tweets translated by Google Translate.",
|
219 |
-
action="store_true")
|
220 |
-
ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
|
221 |
-
default="en")
|
222 |
-
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
|
223 |
-
ap.add_argument("--pandas-type",
|
224 |
-
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
|
225 |
-
ap.add_argument("-it", "--index-tweets",
|
226 |
-
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
|
227 |
-
ap.add_argument("-if", "--index-follow",
|
228 |
-
help="Custom Elasticsearch Index name for Follows.",
|
229 |
-
nargs="?", default="twintgraph")
|
230 |
-
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
|
231 |
-
nargs="?", default="twintuser")
|
232 |
-
ap.add_argument("--debug",
|
233 |
-
help="Store information in debug logs", action="store_true")
|
234 |
-
ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
|
235 |
-
ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
|
236 |
-
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
|
237 |
-
ap.add_argument("--media",
|
238 |
-
help="Display Tweets with only images or videos.", action="store_true")
|
239 |
-
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
|
240 |
-
ap.add_argument("-pc", "--pandas-clean",
|
241 |
-
help="Automatically clean Pandas dataframe at every scrape.")
|
242 |
-
ap.add_argument("-cq", "--custom-query", help="Custom search query.")
|
243 |
-
ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
|
244 |
-
action="store_true")
|
245 |
-
ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false")
|
246 |
-
ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true")
|
247 |
-
ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
|
248 |
-
ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
|
249 |
-
ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
|
250 |
-
ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
|
251 |
-
ap.add_argument("--links", help="Include or exclude tweets containing one o more links. If not specified" +
|
252 |
-
" you will get both tweets that might contain links or not.")
|
253 |
-
ap.add_argument("--source", help="Filter the tweets for specific source client.")
|
254 |
-
ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
|
255 |
-
ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
|
256 |
-
ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.",
|
257 |
-
type=float, default=3.0)
|
258 |
-
ap.add_argument("--min-wait-time", type=float, default=15,
|
259 |
-
help="specifiy a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints")
|
260 |
-
args = ap.parse_args()
|
261 |
-
|
262 |
-
return args
|
263 |
-
|
264 |
-
|
265 |
-
def main():
|
266 |
-
""" Main
|
267 |
-
"""
|
268 |
-
args = options()
|
269 |
-
check(args)
|
270 |
-
|
271 |
-
if args.pandas_clean:
|
272 |
-
storage.panda.clean()
|
273 |
-
|
274 |
-
c = initialize(args)
|
275 |
-
|
276 |
-
if args.userlist:
|
277 |
-
c.Query = loadUserList(args.userlist, "search")
|
278 |
-
|
279 |
-
if args.pandas_clean:
|
280 |
-
storage.panda.clean()
|
281 |
-
|
282 |
-
if args.favorites:
|
283 |
-
if args.userlist:
|
284 |
-
_userlist = loadUserList(args.userlist, "favorites")
|
285 |
-
for _user in _userlist:
|
286 |
-
args.username = _user
|
287 |
-
c = initialize(args)
|
288 |
-
run.Favorites(c)
|
289 |
-
else:
|
290 |
-
run.Favorites(c)
|
291 |
-
elif args.following:
|
292 |
-
if args.userlist:
|
293 |
-
_userlist = loadUserList(args.userlist, "following")
|
294 |
-
for _user in _userlist:
|
295 |
-
args.username = _user
|
296 |
-
c = initialize(args)
|
297 |
-
run.Following(c)
|
298 |
-
else:
|
299 |
-
run.Following(c)
|
300 |
-
elif args.followers:
|
301 |
-
if args.userlist:
|
302 |
-
_userlist = loadUserList(args.userlist, "followers")
|
303 |
-
for _user in _userlist:
|
304 |
-
args.username = _user
|
305 |
-
c = initialize(args)
|
306 |
-
run.Followers(c)
|
307 |
-
else:
|
308 |
-
run.Followers(c)
|
309 |
-
elif args.retweets: # or args.profile_full:
|
310 |
-
if args.userlist:
|
311 |
-
_userlist = loadUserList(args.userlist, "profile")
|
312 |
-
for _user in _userlist:
|
313 |
-
args.username = _user
|
314 |
-
c = initialize(args)
|
315 |
-
run.Profile(c)
|
316 |
-
else:
|
317 |
-
run.Profile(c)
|
318 |
-
elif args.user_full:
|
319 |
-
if args.userlist:
|
320 |
-
_userlist = loadUserList(args.userlist, "userlist")
|
321 |
-
for _user in _userlist:
|
322 |
-
args.username = _user
|
323 |
-
c = initialize(args)
|
324 |
-
run.Lookup(c)
|
325 |
-
else:
|
326 |
-
run.Lookup(c)
|
327 |
-
elif args.timeline:
|
328 |
-
run.Profile(c)
|
329 |
-
else:
|
330 |
-
run.Search(c)
|
331 |
-
|
332 |
-
|
333 |
-
def run_as_command():
|
334 |
-
if(sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor < 6)):
|
335 |
-
print("[-] TWINT requires Python version 3.6+.")
|
336 |
-
sys.exit(0)
|
337 |
-
|
338 |
-
main()
|
339 |
-
|
340 |
-
|
341 |
-
if __name__ == '__main__':
|
342 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/config.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
from dataclasses import dataclass
|
2 |
-
from typing import Optional
|
3 |
-
|
4 |
-
@dataclass
|
5 |
-
class Config:
|
6 |
-
Username: Optional[str] = None
|
7 |
-
User_id: Optional[str] = None
|
8 |
-
Search: Optional[str] = None
|
9 |
-
Lookup: bool = False
|
10 |
-
Geo: str = ""
|
11 |
-
Location: bool = False
|
12 |
-
Near: str = None
|
13 |
-
Lang: Optional[str] = None
|
14 |
-
Output: Optional[str] = None
|
15 |
-
Elasticsearch: object = None
|
16 |
-
Year: Optional[int] = None
|
17 |
-
Since: Optional[str] = None
|
18 |
-
Until: Optional[str] = None
|
19 |
-
Email: Optional[str] = None
|
20 |
-
Phone: Optional[str] = None
|
21 |
-
Verified: bool = False
|
22 |
-
Store_csv: bool = False
|
23 |
-
Store_json: bool = False
|
24 |
-
Custom = {"tweet": None, "user": None, "username": None}
|
25 |
-
Show_hashtags: bool = False
|
26 |
-
Show_cashtags: bool = False
|
27 |
-
Limit: Optional[int] = None
|
28 |
-
Count: Optional[int] = None
|
29 |
-
Stats: bool = False
|
30 |
-
Database: object = None
|
31 |
-
To: str = None
|
32 |
-
All = None
|
33 |
-
Debug: bool = False
|
34 |
-
Format = None
|
35 |
-
Essid: str = ""
|
36 |
-
Profile: bool = False
|
37 |
-
Followers: bool = False
|
38 |
-
Following: bool = False
|
39 |
-
Favorites: bool = False
|
40 |
-
TwitterSearch: bool = False
|
41 |
-
User_full: bool = False
|
42 |
-
# Profile_full: bool = False
|
43 |
-
Store_object: bool = False
|
44 |
-
Store_object_tweets_list: list = None
|
45 |
-
Store_object_users_list: list = None
|
46 |
-
Store_object_follow_list: list = None
|
47 |
-
Pandas_type: type = None
|
48 |
-
Pandas: bool = False
|
49 |
-
Index_tweets: str = "twinttweets"
|
50 |
-
Index_follow: str = "twintgraph"
|
51 |
-
Index_users: str = "twintuser"
|
52 |
-
Retries_count: int = 10
|
53 |
-
Resume: object = None
|
54 |
-
Images: bool = False
|
55 |
-
Videos: bool = False
|
56 |
-
Media: bool = False
|
57 |
-
Replies: bool = False
|
58 |
-
Pandas_clean: bool = True
|
59 |
-
Lowercase: bool = True
|
60 |
-
Pandas_au: bool = True
|
61 |
-
Proxy_host: str = ""
|
62 |
-
Proxy_port: int = 0
|
63 |
-
Proxy_type: object = None
|
64 |
-
Tor_control_port: int = 9051
|
65 |
-
Tor_control_password: str = None
|
66 |
-
Retweets: bool = False
|
67 |
-
Query: str = None
|
68 |
-
Hide_output: bool = False
|
69 |
-
Custom_query: str = ""
|
70 |
-
Popular_tweets: bool = False
|
71 |
-
Skip_certs: bool = False
|
72 |
-
Native_retweets: bool = False
|
73 |
-
Min_likes: int = 0
|
74 |
-
Min_retweets: int = 0
|
75 |
-
Min_replies: int = 0
|
76 |
-
Links: Optional[str] = None
|
77 |
-
Source: Optional[str] = None
|
78 |
-
Members_list: Optional[str] = None
|
79 |
-
Filter_retweets: bool = False
|
80 |
-
Translate: bool = False
|
81 |
-
TranslateSrc: str = "en"
|
82 |
-
TranslateDest: str = "en"
|
83 |
-
Backoff_exponent: float = 3.0
|
84 |
-
Min_wait_time: int = 0
|
85 |
-
Bearer_token: str = None
|
86 |
-
Guest_token: str = None
|
87 |
-
deleted: list = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/datelock.py
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
|
3 |
-
import logging as logme
|
4 |
-
|
5 |
-
from .tweet import utc_to_local
|
6 |
-
|
7 |
-
|
8 |
-
class Datelock:
|
9 |
-
until = None
|
10 |
-
since = None
|
11 |
-
_since_def_user = None
|
12 |
-
|
13 |
-
|
14 |
-
def convertToDateTime(string):
|
15 |
-
dateTimeList = string.split()
|
16 |
-
ListLength = len(dateTimeList)
|
17 |
-
if ListLength == 2:
|
18 |
-
return string
|
19 |
-
if ListLength == 1:
|
20 |
-
return string + " 00:00:00"
|
21 |
-
else:
|
22 |
-
return ""
|
23 |
-
|
24 |
-
|
25 |
-
def Set(Until, Since):
|
26 |
-
logme.debug(__name__+':Set')
|
27 |
-
d = Datelock()
|
28 |
-
|
29 |
-
if Until:
|
30 |
-
d.until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
|
31 |
-
d.until = utc_to_local(d.until)
|
32 |
-
else:
|
33 |
-
d.until = datetime.datetime.today()
|
34 |
-
|
35 |
-
if Since:
|
36 |
-
d.since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S")
|
37 |
-
d.since = utc_to_local(d.since)
|
38 |
-
d._since_def_user = True
|
39 |
-
else:
|
40 |
-
d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S")
|
41 |
-
d.since = utc_to_local(d.since)
|
42 |
-
d._since_def_user = False
|
43 |
-
|
44 |
-
return d
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/feed.py
DELETED
@@ -1,145 +0,0 @@
|
|
1 |
-
import time
|
2 |
-
from datetime import datetime
|
3 |
-
|
4 |
-
from bs4 import BeautifulSoup
|
5 |
-
from re import findall
|
6 |
-
from json import loads
|
7 |
-
|
8 |
-
import logging as logme
|
9 |
-
|
10 |
-
from .tweet import utc_to_local, Tweet_formats
|
11 |
-
|
12 |
-
|
13 |
-
class NoMoreTweetsException(Exception):
|
14 |
-
def __init__(self, msg):
|
15 |
-
super().__init__(msg)
|
16 |
-
|
17 |
-
|
18 |
-
def Follow(response):
|
19 |
-
logme.debug(__name__ + ':Follow')
|
20 |
-
soup = BeautifulSoup(response, "html.parser")
|
21 |
-
follow = soup.find_all("td", "info fifty screenname")
|
22 |
-
cursor = soup.find_all("div", "w-button-more")
|
23 |
-
try:
|
24 |
-
cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
|
25 |
-
except IndexError:
|
26 |
-
logme.critical(__name__ + ':Follow:IndexError')
|
27 |
-
|
28 |
-
return follow, cursor
|
29 |
-
|
30 |
-
|
31 |
-
# TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future
|
32 |
-
def Mobile(response):
|
33 |
-
logme.debug(__name__ + ':Mobile')
|
34 |
-
soup = BeautifulSoup(response, "html.parser")
|
35 |
-
tweets = soup.find_all("span", "metadata")
|
36 |
-
max_id = soup.find_all("div", "w-button-more")
|
37 |
-
try:
|
38 |
-
max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
|
39 |
-
except Exception as e:
|
40 |
-
logme.critical(__name__ + ':Mobile:' + str(e))
|
41 |
-
|
42 |
-
return tweets, max_id
|
43 |
-
|
44 |
-
|
45 |
-
def MobileFav(response):
|
46 |
-
soup = BeautifulSoup(response, "html.parser")
|
47 |
-
tweets = soup.find_all("table", "tweet")
|
48 |
-
max_id = soup.find_all("div", "w-button-more")
|
49 |
-
try:
|
50 |
-
max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
|
51 |
-
except Exception as e:
|
52 |
-
print(str(e) + " [x] feed.MobileFav")
|
53 |
-
|
54 |
-
return tweets, max_id
|
55 |
-
|
56 |
-
|
57 |
-
def _get_cursor(response):
|
58 |
-
if isinstance(response, dict): # case 1
|
59 |
-
try:
|
60 |
-
next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
|
61 |
-
'operation']['cursor']['value']
|
62 |
-
except KeyError:
|
63 |
-
# this is needed because after the first request location of cursor is changed
|
64 |
-
next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
|
65 |
-
'cursor']['value']
|
66 |
-
else: # case 2
|
67 |
-
next_cursor = response[-1]['content']['value']
|
68 |
-
return next_cursor
|
69 |
-
|
70 |
-
|
71 |
-
def Json(response):
|
72 |
-
logme.debug(__name__ + ':Json')
|
73 |
-
json_response = loads(response)
|
74 |
-
html = json_response["items_html"]
|
75 |
-
soup = BeautifulSoup(html, "html.parser")
|
76 |
-
feed = soup.find_all("div", "tweet")
|
77 |
-
return feed, json_response["min_position"]
|
78 |
-
|
79 |
-
|
80 |
-
def parse_tweets(config, response):
|
81 |
-
logme.debug(__name__ + ':parse_tweets')
|
82 |
-
response = loads(response)
|
83 |
-
feed = []
|
84 |
-
if 'globalObjects' in response:
|
85 |
-
if len(response['globalObjects']['tweets']) == 0:
|
86 |
-
msg = 'No more data!'
|
87 |
-
raise NoMoreTweetsException(msg)
|
88 |
-
for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
|
89 |
-
# this will handle the cases when the timeline entry is a tweet
|
90 |
-
if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
|
91 |
-
timeline_entry['entryId'].startswith('tweet-')):
|
92 |
-
if 'tweet' in timeline_entry['content']['item']['content']:
|
93 |
-
_id = timeline_entry['content']['item']['content']['tweet']['id']
|
94 |
-
# skip the ads
|
95 |
-
if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
|
96 |
-
continue
|
97 |
-
elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
|
98 |
-
timeline_entry['content']['item']['content']['tombstone']:
|
99 |
-
_id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
|
100 |
-
else:
|
101 |
-
_id = None
|
102 |
-
if _id is None:
|
103 |
-
raise ValueError('Unable to find ID of tweet in timeline.')
|
104 |
-
try:
|
105 |
-
temp_obj = response['globalObjects']['tweets'][_id]
|
106 |
-
except KeyError:
|
107 |
-
logme.info('encountered a deleted tweet with id {}'.format(_id))
|
108 |
-
|
109 |
-
config.deleted.append(_id)
|
110 |
-
continue
|
111 |
-
temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
|
112 |
-
if 'retweeted_status_id_str' in temp_obj:
|
113 |
-
rt_id = temp_obj['retweeted_status_id_str']
|
114 |
-
_dt = response['globalObjects']['tweets'][rt_id]['created_at']
|
115 |
-
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
|
116 |
-
_dt = utc_to_local(_dt)
|
117 |
-
_dt = str(_dt.strftime(Tweet_formats['datetime']))
|
118 |
-
temp_obj['retweet_data'] = {
|
119 |
-
'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
|
120 |
-
'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
|
121 |
-
'retweet_id': rt_id,
|
122 |
-
'retweet_date': _dt,
|
123 |
-
}
|
124 |
-
feed.append(temp_obj)
|
125 |
-
next_cursor = _get_cursor(response) # case 1
|
126 |
-
else:
|
127 |
-
response = response['data']['user']['result']['timeline']
|
128 |
-
entries = response['timeline']['instructions']
|
129 |
-
for e in entries:
|
130 |
-
if e.get('entries'):
|
131 |
-
entries = e['entries']
|
132 |
-
break
|
133 |
-
if len(entries) == 2:
|
134 |
-
msg = 'No more data!'
|
135 |
-
raise NoMoreTweetsException(msg)
|
136 |
-
for timeline_entry in entries:
|
137 |
-
if timeline_entry['content'].get('itemContent'):
|
138 |
-
try:
|
139 |
-
temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
|
140 |
-
temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result']['core']['user_results']['result']['legacy']
|
141 |
-
feed.append(temp_obj)
|
142 |
-
except KeyError: # doubtful
|
143 |
-
next
|
144 |
-
next_cursor = _get_cursor(entries) # case 2
|
145 |
-
return feed, next_cursor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/format.py
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
import logging as logme
|
2 |
-
|
3 |
-
def Tweet(config, t):
|
4 |
-
if config.Format:
|
5 |
-
logme.debug(__name__+':Tweet:Format')
|
6 |
-
output = config.Format.replace("{id}", t.id_str)
|
7 |
-
output = output.replace("{conversation_id}", t.conversation_id)
|
8 |
-
output = output.replace("{date}", t.datestamp)
|
9 |
-
output = output.replace("{time}", t.timestamp)
|
10 |
-
output = output.replace("{user_id}", t.user_id_str)
|
11 |
-
output = output.replace("{username}", t.username)
|
12 |
-
output = output.replace("{name}", t.name)
|
13 |
-
output = output.replace("{place}", t.place)
|
14 |
-
output = output.replace("{timezone}", t.timezone)
|
15 |
-
output = output.replace("{urls}", ",".join(t.urls))
|
16 |
-
output = output.replace("{photos}", ",".join(t.photos))
|
17 |
-
output = output.replace("{video}", str(t.video))
|
18 |
-
output = output.replace("{thumbnail}", t.thumbnail)
|
19 |
-
output = output.replace("{tweet}", t.tweet)
|
20 |
-
output = output.replace("{language}", t.lang)
|
21 |
-
output = output.replace("{hashtags}", ",".join(t.hashtags))
|
22 |
-
output = output.replace("{cashtags}", ",".join(t.cashtags))
|
23 |
-
output = output.replace("{replies}", t.replies_count)
|
24 |
-
output = output.replace("{retweets}", t.retweets_count)
|
25 |
-
output = output.replace("{likes}", t.likes_count)
|
26 |
-
output = output.replace("{link}", t.link)
|
27 |
-
output = output.replace("{is_retweet}", str(t.retweet))
|
28 |
-
output = output.replace("{user_rt_id}", str(t.user_rt_id))
|
29 |
-
output = output.replace("{quote_url}", t.quote_url)
|
30 |
-
output = output.replace("{near}", t.near)
|
31 |
-
output = output.replace("{geo}", t.geo)
|
32 |
-
output = output.replace("{mentions}", ",".join(t.mentions))
|
33 |
-
output = output.replace("{translate}", t.translate)
|
34 |
-
output = output.replace("{trans_src}", t.trans_src)
|
35 |
-
output = output.replace("{trans_dest}", t.trans_dest)
|
36 |
-
else:
|
37 |
-
logme.debug(__name__+':Tweet:notFormat')
|
38 |
-
output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
|
39 |
-
|
40 |
-
# TODO: someone who is familiar with this code, needs to take a look at what this is <also see tweet.py>
|
41 |
-
# if t.retweet:
|
42 |
-
# output += "RT "
|
43 |
-
|
44 |
-
output += f"<{t.username}> {t.tweet}"
|
45 |
-
|
46 |
-
if config.Show_hashtags:
|
47 |
-
hashtags = ",".join(t.hashtags)
|
48 |
-
output += f" {hashtags}"
|
49 |
-
if config.Show_cashtags:
|
50 |
-
cashtags = ",".join(t.cashtags)
|
51 |
-
output += f" {cashtags}"
|
52 |
-
if config.Stats:
|
53 |
-
output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
|
54 |
-
if config.Translate:
|
55 |
-
output += f" {t.translate} {t.trans_src} {t.trans_dest}"
|
56 |
-
return output
|
57 |
-
|
58 |
-
def User(_format, u):
|
59 |
-
if _format:
|
60 |
-
logme.debug(__name__+':User:Format')
|
61 |
-
output = _format.replace("{id}", str(u.id))
|
62 |
-
output = output.replace("{name}", u.name)
|
63 |
-
output = output.replace("{username}", u.username)
|
64 |
-
output = output.replace("{bio}", u.bio)
|
65 |
-
output = output.replace("{location}", u.location)
|
66 |
-
output = output.replace("{url}", u.url)
|
67 |
-
output = output.replace("{join_date}", u.join_date)
|
68 |
-
output = output.replace("{join_time}", u.join_time)
|
69 |
-
output = output.replace("{tweets}", str(u.tweets))
|
70 |
-
output = output.replace("{following}", str(u.following))
|
71 |
-
output = output.replace("{followers}", str(u.followers))
|
72 |
-
output = output.replace("{likes}", str(u.likes))
|
73 |
-
output = output.replace("{media}", str(u.media_count))
|
74 |
-
output = output.replace("{private}", str(u.is_private))
|
75 |
-
output = output.replace("{verified}", str(u.is_verified))
|
76 |
-
output = output.replace("{avatar}", u.avatar)
|
77 |
-
if u.background_image:
|
78 |
-
output = output.replace("{background_image}", u.background_image)
|
79 |
-
else:
|
80 |
-
output = output.replace("{background_image}", "")
|
81 |
-
else:
|
82 |
-
logme.debug(__name__+':User:notFormat')
|
83 |
-
output = f"{u.id} | {u.name} | @{u.username} | Private: "
|
84 |
-
output += f"{u.is_private} | Verified: {u.is_verified} |"
|
85 |
-
output += f" Bio: {u.bio} | Location: {u.location} | Url: "
|
86 |
-
output += f"{u.url} | Joined: {u.join_date} {u.join_time} "
|
87 |
-
output += f"| Tweets: {u.tweets} | Following: {u.following}"
|
88 |
-
output += f" | Followers: {u.followers} | Likes: {u.likes} "
|
89 |
-
output += f"| Media: {u.media_count} | Avatar: {u.avatar}"
|
90 |
-
|
91 |
-
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/get.py
DELETED
@@ -1,298 +0,0 @@
|
|
1 |
-
from async_timeout import timeout
|
2 |
-
from datetime import datetime
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
import sys
|
5 |
-
import socket
|
6 |
-
import aiohttp
|
7 |
-
from fake_useragent import UserAgent
|
8 |
-
import asyncio
|
9 |
-
import concurrent.futures
|
10 |
-
import random
|
11 |
-
from json import loads, dumps
|
12 |
-
from aiohttp_socks import ProxyConnector, ProxyType
|
13 |
-
from urllib.parse import quote
|
14 |
-
import time
|
15 |
-
|
16 |
-
from . import url
|
17 |
-
from .output import Tweets, Users
|
18 |
-
from .token import TokenExpiryException
|
19 |
-
|
20 |
-
import logging as logme
|
21 |
-
|
22 |
-
httpproxy = None
|
23 |
-
|
24 |
-
user_agent_list = [
|
25 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
26 |
-
# ' Chrome/60.0.3112.113 Safari/537.36',
|
27 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
28 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
29 |
-
# 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
30 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
31 |
-
# 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
32 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
33 |
-
# 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
34 |
-
# ' Chrome/44.0.2403.157 Safari/537.36',
|
35 |
-
# 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
36 |
-
# ' Chrome/60.0.3112.113 Safari/537.36',
|
37 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
38 |
-
# ' Chrome/57.0.2987.133 Safari/537.36',
|
39 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
40 |
-
# ' Chrome/57.0.2987.133 Safari/537.36',
|
41 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
42 |
-
# ' Chrome/55.0.2883.87 Safari/537.36',
|
43 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
44 |
-
# ' Chrome/55.0.2883.87 Safari/537.36',
|
45 |
-
|
46 |
-
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
|
47 |
-
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
48 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
|
49 |
-
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
50 |
-
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
51 |
-
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
52 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
|
53 |
-
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
54 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
|
55 |
-
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
|
56 |
-
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
|
57 |
-
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
|
58 |
-
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
|
59 |
-
'CLR 3.5.30729)',
|
60 |
-
]
|
61 |
-
|
62 |
-
|
63 |
-
# function to convert python `dict` to json and then encode it to be passed in the url as a parameter
|
64 |
-
# some urls require this format
|
65 |
-
def dict_to_url(dct):
|
66 |
-
return quote(dumps(dct))
|
67 |
-
|
68 |
-
|
69 |
-
def get_connector(config):
|
70 |
-
logme.debug(__name__ + ':get_connector')
|
71 |
-
_connector = None
|
72 |
-
if config.Proxy_host:
|
73 |
-
if config.Proxy_host.lower() == "tor":
|
74 |
-
_connector = ProxyConnector(
|
75 |
-
host='127.0.0.1',
|
76 |
-
port=9050,
|
77 |
-
rdns=True)
|
78 |
-
elif config.Proxy_port and config.Proxy_type:
|
79 |
-
if config.Proxy_type.lower() == "socks5":
|
80 |
-
_type = ProxyType.SOCKS5
|
81 |
-
elif config.Proxy_type.lower() == "socks4":
|
82 |
-
_type = ProxyType.SOCKS4
|
83 |
-
elif config.Proxy_type.lower() == "http":
|
84 |
-
global httpproxy
|
85 |
-
httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
|
86 |
-
return _connector
|
87 |
-
else:
|
88 |
-
logme.critical("get_connector:proxy-type-error")
|
89 |
-
print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
|
90 |
-
sys.exit(1)
|
91 |
-
_connector = ProxyConnector(
|
92 |
-
proxy_type=_type,
|
93 |
-
host=config.Proxy_host,
|
94 |
-
port=config.Proxy_port,
|
95 |
-
rdns=True)
|
96 |
-
else:
|
97 |
-
logme.critical(__name__ + ':get_connector:proxy-port-type-error')
|
98 |
-
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
|
99 |
-
sys.exit(1)
|
100 |
-
else:
|
101 |
-
if config.Proxy_port or config.Proxy_type:
|
102 |
-
logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
|
103 |
-
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
|
104 |
-
sys.exit(1)
|
105 |
-
|
106 |
-
return _connector
|
107 |
-
|
108 |
-
|
109 |
-
async def RequestUrl(config, init):
|
110 |
-
logme.debug(__name__ + ':RequestUrl')
|
111 |
-
_connector = get_connector(config)
|
112 |
-
_serialQuery = ""
|
113 |
-
params = []
|
114 |
-
_url = ""
|
115 |
-
_headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
|
116 |
-
|
117 |
-
# TODO : do this later
|
118 |
-
if config.Profile:
|
119 |
-
logme.debug(__name__ + ':RequestUrl:Profile')
|
120 |
-
_url, params, _serialQuery = url.SearchProfile(config, init)
|
121 |
-
elif config.TwitterSearch:
|
122 |
-
logme.debug(__name__ + ':RequestUrl:TwitterSearch')
|
123 |
-
_url, params, _serialQuery = await url.Search(config, init)
|
124 |
-
else:
|
125 |
-
if config.Following:
|
126 |
-
logme.debug(__name__ + ':RequestUrl:Following')
|
127 |
-
_url = await url.Following(config.Username, init)
|
128 |
-
elif config.Followers:
|
129 |
-
logme.debug(__name__ + ':RequestUrl:Followers')
|
130 |
-
_url = await url.Followers(config.Username, init)
|
131 |
-
else:
|
132 |
-
logme.debug(__name__ + ':RequestUrl:Favorites')
|
133 |
-
_url = await url.Favorites(config.Username, init)
|
134 |
-
_serialQuery = _url
|
135 |
-
|
136 |
-
response = await Request(_url, params=params, connector=_connector, headers=_headers)
|
137 |
-
|
138 |
-
if config.Debug:
|
139 |
-
print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))
|
140 |
-
|
141 |
-
return response
|
142 |
-
|
143 |
-
|
144 |
-
def ForceNewTorIdentity(config):
|
145 |
-
logme.debug(__name__ + ':ForceNewTorIdentity')
|
146 |
-
try:
|
147 |
-
tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
|
148 |
-
tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
|
149 |
-
response = tor_c.recv(1024)
|
150 |
-
if response != b'250 OK\r\n250 OK\r\n':
|
151 |
-
sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
|
152 |
-
logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
|
153 |
-
except Exception as e:
|
154 |
-
logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
|
155 |
-
sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
|
156 |
-
sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
|
157 |
-
|
158 |
-
|
159 |
-
async def Request(_url, connector=None, params=None, headers=None):
|
160 |
-
logme.debug(__name__ + ':Request:Connector')
|
161 |
-
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
|
162 |
-
return await Response(session, _url, params)
|
163 |
-
|
164 |
-
|
165 |
-
async def Response(session, _url, params=None):
|
166 |
-
logme.debug(__name__ + ':Response')
|
167 |
-
retries = 5
|
168 |
-
wait = 10 # No basis, maybe work with 0
|
169 |
-
for attempt in range(retries + 1):
|
170 |
-
try:
|
171 |
-
with timeout(120):
|
172 |
-
async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
|
173 |
-
resp = await response.text()
|
174 |
-
if response.status == 429: # 429 implies Too many requests i.e. Rate Limit Exceeded
|
175 |
-
raise TokenExpiryException(loads(resp)['errors'][0]['message'])
|
176 |
-
return resp
|
177 |
-
except aiohttp.client_exceptions.ClientConnectorError as exc:
|
178 |
-
if attempt < retries:
|
179 |
-
retrying = ', retrying'
|
180 |
-
level = logme.WARNING
|
181 |
-
else:
|
182 |
-
retrying = ''
|
183 |
-
level = logme.ERROR
|
184 |
-
logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
|
185 |
-
if attempt < retries:
|
186 |
-
time.sleep(wait)
|
187 |
-
else:
|
188 |
-
logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
|
189 |
-
raise TokenExpiryException(f'{exc!r}')
|
190 |
-
|
191 |
-
|
192 |
-
async def RandomUserAgent(wa=None):
|
193 |
-
logme.debug(__name__ + ':RandomUserAgent')
|
194 |
-
try:
|
195 |
-
if wa:
|
196 |
-
return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
|
197 |
-
return UserAgent(verify_ssl=False, use_cache_server=False).random
|
198 |
-
except:
|
199 |
-
return random.choice(user_agent_list)
|
200 |
-
|
201 |
-
|
202 |
-
async def Username(_id, bearer_token, guest_token):
|
203 |
-
logme.debug(__name__ + ':Username')
|
204 |
-
_dct = {'userId': _id, 'withHighlightedLabel': False}
|
205 |
-
_url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
|
206 |
-
_headers = {
|
207 |
-
'authorization': bearer_token,
|
208 |
-
'x-guest-token': guest_token,
|
209 |
-
}
|
210 |
-
r = await Request(_url, headers=_headers)
|
211 |
-
j_r = loads(r)
|
212 |
-
username = j_r['data']['user']['legacy']['screen_name']
|
213 |
-
return username
|
214 |
-
|
215 |
-
|
216 |
-
async def Tweet(url, config, conn):
|
217 |
-
logme.debug(__name__ + ':Tweet')
|
218 |
-
try:
|
219 |
-
response = await Request(url)
|
220 |
-
soup = BeautifulSoup(response, "html.parser")
|
221 |
-
tweets = soup.find_all("div", "tweet")
|
222 |
-
await Tweets(tweets, config, conn, url)
|
223 |
-
except Exception as e:
|
224 |
-
logme.critical(__name__ + ':Tweet:' + str(e))
|
225 |
-
|
226 |
-
|
227 |
-
async def User(username, config, conn, user_id=False):
|
228 |
-
logme.debug(__name__ + ':User')
|
229 |
-
_dct = {'screen_name': username, 'withHighlightedLabel': False}
|
230 |
-
_url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
|
231 |
-
.format(dict_to_url(_dct))
|
232 |
-
_headers = {
|
233 |
-
'authorization': config.Bearer_token,
|
234 |
-
'x-guest-token': config.Guest_token,
|
235 |
-
}
|
236 |
-
try:
|
237 |
-
response = await Request(_url, headers=_headers)
|
238 |
-
j_r = loads(response)
|
239 |
-
if user_id:
|
240 |
-
try:
|
241 |
-
_id = j_r['data']['user']['rest_id']
|
242 |
-
return _id
|
243 |
-
except KeyError as e:
|
244 |
-
logme.critical(__name__ + ':User:' + str(e))
|
245 |
-
return
|
246 |
-
await Users(j_r, config, conn)
|
247 |
-
except Exception as e:
|
248 |
-
logme.critical(__name__ + ':User:' + str(e))
|
249 |
-
raise
|
250 |
-
|
251 |
-
|
252 |
-
def Limit(Limit, count):
|
253 |
-
logme.debug(__name__ + ':Limit')
|
254 |
-
if Limit is not None and count >= int(Limit):
|
255 |
-
return True
|
256 |
-
|
257 |
-
|
258 |
-
async def Multi(feed, config, conn):
|
259 |
-
logme.debug(__name__ + ':Multi')
|
260 |
-
count = 0
|
261 |
-
try:
|
262 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
|
263 |
-
loop = asyncio.get_event_loop()
|
264 |
-
futures = []
|
265 |
-
for tweet in feed:
|
266 |
-
count += 1
|
267 |
-
if config.Favorites or config.Profile_full:
|
268 |
-
logme.debug(__name__ + ':Multi:Favorites-profileFull')
|
269 |
-
link = tweet.find("a")["href"]
|
270 |
-
url = f"https://twitter.com{link}&lang=en"
|
271 |
-
elif config.User_full:
|
272 |
-
logme.debug(__name__ + ':Multi:userFull')
|
273 |
-
username = tweet.find("a")["name"]
|
274 |
-
url = f"http://twitter.com/{username}?lang=en"
|
275 |
-
else:
|
276 |
-
logme.debug(__name__ + ':Multi:else-url')
|
277 |
-
link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
|
278 |
-
url = f"https://twitter.com{link}?lang=en"
|
279 |
-
|
280 |
-
if config.User_full:
|
281 |
-
logme.debug(__name__ + ':Multi:user-full-Run')
|
282 |
-
futures.append(loop.run_in_executor(executor, await User(url,
|
283 |
-
config, conn)))
|
284 |
-
else:
|
285 |
-
logme.debug(__name__ + ':Multi:notUser-full-Run')
|
286 |
-
futures.append(loop.run_in_executor(executor, await Tweet(url,
|
287 |
-
config, conn)))
|
288 |
-
logme.debug(__name__ + ':Multi:asyncioGather')
|
289 |
-
await asyncio.gather(*futures)
|
290 |
-
except Exception as e:
|
291 |
-
# TODO: fix error not error
|
292 |
-
# print(str(e) + " [x] get.Multi")
|
293 |
-
# will return "'NoneType' object is not callable"
|
294 |
-
# but still works
|
295 |
-
# logme.critical(__name__+':Multi:' + str(e))
|
296 |
-
pass
|
297 |
-
|
298 |
-
return count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/output.py
DELETED
@@ -1,241 +0,0 @@
|
|
1 |
-
from datetime import datetime
|
2 |
-
|
3 |
-
from . import format, get
|
4 |
-
from .tweet import Tweet
|
5 |
-
from .user import User
|
6 |
-
from .storage import db, elasticsearch, write, panda
|
7 |
-
|
8 |
-
import logging as logme
|
9 |
-
|
10 |
-
follows_list = []
|
11 |
-
tweets_list = []
|
12 |
-
users_list = []
|
13 |
-
|
14 |
-
author_list = {''}
|
15 |
-
author_list.pop()
|
16 |
-
|
17 |
-
# used by Pandas
|
18 |
-
_follows_object = {}
|
19 |
-
|
20 |
-
|
21 |
-
def _formatDateTime(datetimestamp):
|
22 |
-
try:
|
23 |
-
return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
-
except ValueError:
|
25 |
-
return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
|
26 |
-
|
27 |
-
|
28 |
-
def _clean_follow_list():
|
29 |
-
logme.debug(__name__ + ':clean_follow_list')
|
30 |
-
global _follows_object
|
31 |
-
_follows_object = {}
|
32 |
-
|
33 |
-
|
34 |
-
def clean_lists():
|
35 |
-
logme.debug(__name__ + ':clean_lists')
|
36 |
-
global follows_list
|
37 |
-
global tweets_list
|
38 |
-
global users_list
|
39 |
-
follows_list = []
|
40 |
-
tweets_list = []
|
41 |
-
users_list = []
|
42 |
-
|
43 |
-
|
44 |
-
def datecheck(datetimestamp, config):
|
45 |
-
logme.debug(__name__ + ':datecheck')
|
46 |
-
if config.Since:
|
47 |
-
logme.debug(__name__ + ':datecheck:SinceTrue')
|
48 |
-
|
49 |
-
d = _formatDateTime(datetimestamp)
|
50 |
-
s = _formatDateTime(config.Since)
|
51 |
-
|
52 |
-
if d < s:
|
53 |
-
return False
|
54 |
-
if config.Until:
|
55 |
-
logme.debug(__name__ + ':datecheck:UntilTrue')
|
56 |
-
|
57 |
-
d = _formatDateTime(datetimestamp)
|
58 |
-
s = _formatDateTime(config.Until)
|
59 |
-
|
60 |
-
if d > s:
|
61 |
-
return False
|
62 |
-
logme.debug(__name__ + ':datecheck:dateRangeFalse')
|
63 |
-
return True
|
64 |
-
|
65 |
-
|
66 |
-
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
|
67 |
-
# `tweets` list along with the other tweets
|
68 |
-
def is_tweet(tw):
|
69 |
-
try:
|
70 |
-
tw["data-item-id"]
|
71 |
-
logme.debug(__name__ + ':is_tweet:True')
|
72 |
-
return True
|
73 |
-
except:
|
74 |
-
logme.critical(__name__ + ':is_tweet:False')
|
75 |
-
return False
|
76 |
-
|
77 |
-
|
78 |
-
def _output(obj, output, config, **extra):
|
79 |
-
logme.debug(__name__ + ':_output')
|
80 |
-
if config.Lowercase:
|
81 |
-
if isinstance(obj, str):
|
82 |
-
logme.debug(__name__ + ':_output:Lowercase:username')
|
83 |
-
obj = obj.lower()
|
84 |
-
elif obj.__class__.__name__ == "user":
|
85 |
-
logme.debug(__name__ + ':_output:Lowercase:user')
|
86 |
-
pass
|
87 |
-
elif obj.__class__.__name__ == "tweet":
|
88 |
-
logme.debug(__name__ + ':_output:Lowercase:tweet')
|
89 |
-
obj.username = obj.username.lower()
|
90 |
-
author_list.update({obj.username})
|
91 |
-
for dct in obj.mentions:
|
92 |
-
for key, val in dct.items():
|
93 |
-
dct[key] = val.lower()
|
94 |
-
for i in range(len(obj.hashtags)):
|
95 |
-
obj.hashtags[i] = obj.hashtags[i].lower()
|
96 |
-
for i in range(len(obj.cashtags)):
|
97 |
-
obj.cashtags[i] = obj.cashtags[i].lower()
|
98 |
-
else:
|
99 |
-
logme.info('_output:Lowercase:hiddenTweetFound')
|
100 |
-
print("[x] Hidden tweet found, account suspended due to violation of TOS")
|
101 |
-
return
|
102 |
-
if config.Output != None:
|
103 |
-
if config.Store_csv:
|
104 |
-
try:
|
105 |
-
write.Csv(obj, config)
|
106 |
-
logme.debug(__name__ + ':_output:CSV')
|
107 |
-
except Exception as e:
|
108 |
-
logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
|
109 |
-
print(str(e) + " [x] output._output")
|
110 |
-
elif config.Store_json:
|
111 |
-
write.Json(obj, config)
|
112 |
-
logme.debug(__name__ + ':_output:JSON')
|
113 |
-
else:
|
114 |
-
write.Text(output, config.Output)
|
115 |
-
logme.debug(__name__ + ':_output:Text')
|
116 |
-
|
117 |
-
if config.Elasticsearch:
|
118 |
-
logme.debug(__name__ + ':_output:Elasticsearch')
|
119 |
-
print("", end=".", flush=True)
|
120 |
-
else:
|
121 |
-
if not config.Hide_output:
|
122 |
-
try:
|
123 |
-
print(output.replace('\n', ' '))
|
124 |
-
except UnicodeEncodeError:
|
125 |
-
logme.critical(__name__ + ':_output:UnicodeEncodeError')
|
126 |
-
print("unicode error [x] output._output")
|
127 |
-
|
128 |
-
|
129 |
-
async def checkData(tweet, config, conn):
|
130 |
-
logme.debug(__name__ + ':checkData')
|
131 |
-
tweet = Tweet(tweet, config)
|
132 |
-
if not tweet.datestamp:
|
133 |
-
logme.critical(__name__ + ':checkData:hiddenTweetFound')
|
134 |
-
print("[x] Hidden tweet found, account suspended due to violation of TOS")
|
135 |
-
return
|
136 |
-
if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
|
137 |
-
output = format.Tweet(config, tweet)
|
138 |
-
if config.Database:
|
139 |
-
logme.debug(__name__ + ':checkData:Database')
|
140 |
-
db.tweets(conn, tweet, config)
|
141 |
-
if config.Pandas:
|
142 |
-
logme.debug(__name__ + ':checkData:Pandas')
|
143 |
-
panda.update(tweet, config)
|
144 |
-
if config.Store_object:
|
145 |
-
logme.debug(__name__ + ':checkData:Store_object')
|
146 |
-
if hasattr(config.Store_object_tweets_list, 'append'):
|
147 |
-
config.Store_object_tweets_list.append(tweet)
|
148 |
-
else:
|
149 |
-
tweets_list.append(tweet)
|
150 |
-
if config.Elasticsearch:
|
151 |
-
logme.debug(__name__ + ':checkData:Elasticsearch')
|
152 |
-
elasticsearch.Tweet(tweet, config)
|
153 |
-
_output(tweet, output, config)
|
154 |
-
# else:
|
155 |
-
# logme.critical(__name__+':checkData:copyrightedTweet')
|
156 |
-
|
157 |
-
|
158 |
-
async def Tweets(tweets, config, conn):
|
159 |
-
logme.debug(__name__ + ':Tweets')
|
160 |
-
if config.Favorites or config.Location:
|
161 |
-
logme.debug(__name__ + ':Tweets:fav+full+loc')
|
162 |
-
for tw in tweets:
|
163 |
-
await checkData(tw, config, conn)
|
164 |
-
elif config.TwitterSearch or config.Profile:
|
165 |
-
logme.debug(__name__ + ':Tweets:TwitterSearch')
|
166 |
-
await checkData(tweets, config, conn)
|
167 |
-
else:
|
168 |
-
logme.debug(__name__ + ':Tweets:else')
|
169 |
-
if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
|
170 |
-
await checkData(tweets, config, conn)
|
171 |
-
|
172 |
-
|
173 |
-
async def Users(u, config, conn):
|
174 |
-
logme.debug(__name__ + ':User')
|
175 |
-
global users_list
|
176 |
-
|
177 |
-
user = User(u)
|
178 |
-
output = format.User(config.Format, user)
|
179 |
-
|
180 |
-
if config.Database:
|
181 |
-
logme.debug(__name__ + ':User:Database')
|
182 |
-
db.user(conn, config, user)
|
183 |
-
|
184 |
-
if config.Elasticsearch:
|
185 |
-
logme.debug(__name__ + ':User:Elasticsearch')
|
186 |
-
_save_date = user.join_date
|
187 |
-
_save_time = user.join_time
|
188 |
-
user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
|
189 |
-
user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
|
190 |
-
elasticsearch.UserProfile(user, config)
|
191 |
-
user.join_date = _save_date
|
192 |
-
user.join_time = _save_time
|
193 |
-
|
194 |
-
if config.Store_object:
|
195 |
-
logme.debug(__name__ + ':User:Store_object')
|
196 |
-
|
197 |
-
if hasattr(config.Store_object_follow_list, 'append'):
|
198 |
-
config.Store_object_follow_list.append(user)
|
199 |
-
elif hasattr(config.Store_object_users_list, 'append'):
|
200 |
-
config.Store_object_users_list.append(user)
|
201 |
-
else:
|
202 |
-
users_list.append(user) # twint.user.user
|
203 |
-
|
204 |
-
if config.Pandas:
|
205 |
-
logme.debug(__name__ + ':User:Pandas+user')
|
206 |
-
panda.update(user, config)
|
207 |
-
|
208 |
-
_output(user, output, config)
|
209 |
-
|
210 |
-
|
211 |
-
async def Username(username, config, conn):
|
212 |
-
logme.debug(__name__ + ':Username')
|
213 |
-
global _follows_object
|
214 |
-
global follows_list
|
215 |
-
follow_var = config.Following * "following" + config.Followers * "followers"
|
216 |
-
|
217 |
-
if config.Database:
|
218 |
-
logme.debug(__name__ + ':Username:Database')
|
219 |
-
db.follow(conn, config.Username, config.Followers, username)
|
220 |
-
|
221 |
-
if config.Elasticsearch:
|
222 |
-
logme.debug(__name__ + ':Username:Elasticsearch')
|
223 |
-
elasticsearch.Follow(username, config)
|
224 |
-
|
225 |
-
if config.Store_object:
|
226 |
-
if hasattr(config.Store_object_follow_list, 'append'):
|
227 |
-
config.Store_object_follow_list.append(username)
|
228 |
-
else:
|
229 |
-
follows_list.append(username) # twint.user.user
|
230 |
-
|
231 |
-
if config.Pandas:
|
232 |
-
logme.debug(__name__ + ':Username:object+pandas')
|
233 |
-
try:
|
234 |
-
_ = _follows_object[config.Username][follow_var]
|
235 |
-
except KeyError:
|
236 |
-
_follows_object.update({config.Username: {follow_var: []}})
|
237 |
-
_follows_object[config.Username][follow_var].append(username)
|
238 |
-
if config.Pandas_au:
|
239 |
-
logme.debug(__name__ + ':Username:object+pandas+au')
|
240 |
-
panda.update(_follows_object[config.Username], config)
|
241 |
-
_output(username, username, config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/run.py
DELETED
@@ -1,412 +0,0 @@
|
|
1 |
-
import sys, os, datetime
|
2 |
-
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
|
3 |
-
|
4 |
-
from . import datelock, feed, get, output, verbose, storage
|
5 |
-
from .token import TokenExpiryException
|
6 |
-
from . import token
|
7 |
-
from .storage import db
|
8 |
-
from .feed import NoMoreTweetsException
|
9 |
-
|
10 |
-
import logging as logme
|
11 |
-
|
12 |
-
import time
|
13 |
-
|
14 |
-
bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
|
15 |
-
'%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
16 |
-
|
17 |
-
|
18 |
-
class Twint:
|
19 |
-
def __init__(self, config):
|
20 |
-
logme.debug(__name__ + ':Twint:__init__')
|
21 |
-
if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
|
22 |
-
logme.debug(__name__ + ':Twint:__init__:Resume')
|
23 |
-
self.init = self.get_resume(config.Resume)
|
24 |
-
else:
|
25 |
-
self.init = -1
|
26 |
-
|
27 |
-
config.deleted = []
|
28 |
-
self.feed: list = [-1]
|
29 |
-
self.count = 0
|
30 |
-
self.user_agent = ""
|
31 |
-
self.config = config
|
32 |
-
self.config.Bearer_token = bearer
|
33 |
-
# TODO might have to make some adjustments for it to work with multi-treading
|
34 |
-
# USAGE : to get a new guest token simply do `self.token.refresh()`
|
35 |
-
self.token = token.Token(config)
|
36 |
-
self.token.refresh()
|
37 |
-
self.conn = db.Conn(config.Database)
|
38 |
-
self.d = datelock.Set(self.config.Until, self.config.Since)
|
39 |
-
verbose.Elastic(config.Elasticsearch)
|
40 |
-
|
41 |
-
if self.config.Store_object:
|
42 |
-
logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
|
43 |
-
output._clean_follow_list()
|
44 |
-
|
45 |
-
if self.config.Pandas_clean:
|
46 |
-
logme.debug(__name__ + ':Twint:__init__:pandas_clean')
|
47 |
-
storage.panda.clean()
|
48 |
-
|
49 |
-
def get_resume(self, resumeFile):
|
50 |
-
if not os.path.exists(resumeFile):
|
51 |
-
return '-1'
|
52 |
-
with open(resumeFile, 'r') as rFile:
|
53 |
-
_init = rFile.readlines()[-1].strip('\n')
|
54 |
-
return _init
|
55 |
-
|
56 |
-
async def Feed(self):
|
57 |
-
logme.debug(__name__ + ':Twint:Feed')
|
58 |
-
consecutive_errors_count = 0
|
59 |
-
while True:
|
60 |
-
# this will receive a JSON string, parse it into a `dict` and do the required stuff
|
61 |
-
try:
|
62 |
-
response = await get.RequestUrl(self.config, self.init)
|
63 |
-
except TokenExpiryException as e:
|
64 |
-
logme.debug(__name__ + 'Twint:Feed:' + str(e))
|
65 |
-
self.token.refresh()
|
66 |
-
response = await get.RequestUrl(self.config, self.init)
|
67 |
-
|
68 |
-
if self.config.Debug:
|
69 |
-
print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
|
70 |
-
|
71 |
-
self.feed = []
|
72 |
-
try:
|
73 |
-
if self.config.Favorites:
|
74 |
-
self.feed, self.init = feed.MobileFav(response)
|
75 |
-
favorite_err_cnt = 0
|
76 |
-
if len(self.feed) == 0 and len(self.init) == 0:
|
77 |
-
while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
|
78 |
-
self.user_agent = await get.RandomUserAgent(wa=False)
|
79 |
-
response = await get.RequestUrl(self.config, self.init,
|
80 |
-
headers=[("User-Agent", self.user_agent)])
|
81 |
-
self.feed, self.init = feed.MobileFav(response)
|
82 |
-
favorite_err_cnt += 1
|
83 |
-
time.sleep(1)
|
84 |
-
if favorite_err_cnt == 5:
|
85 |
-
print("Favorite page could not be fetched")
|
86 |
-
if not self.count % 40:
|
87 |
-
time.sleep(5)
|
88 |
-
elif self.config.Followers or self.config.Following:
|
89 |
-
self.feed, self.init = feed.Follow(response)
|
90 |
-
if not self.count % 40:
|
91 |
-
time.sleep(5)
|
92 |
-
elif self.config.Profile or self.config.TwitterSearch:
|
93 |
-
try:
|
94 |
-
self.feed, self.init = feed.parse_tweets(self.config, response)
|
95 |
-
except NoMoreTweetsException as e:
|
96 |
-
logme.debug(__name__ + ':Twint:Feed:' + str(e))
|
97 |
-
print('[!] ' + str(e) + ' Scraping will stop now.')
|
98 |
-
print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
|
99 |
-
break
|
100 |
-
break
|
101 |
-
except TimeoutError as e:
|
102 |
-
if self.config.Proxy_host.lower() == "tor":
|
103 |
-
print("[?] Timed out, changing Tor identity...")
|
104 |
-
if self.config.Tor_control_password is None:
|
105 |
-
logme.critical(__name__ + ':Twint:Feed:tor-password')
|
106 |
-
sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
|
107 |
-
sys.stderr.write(
|
108 |
-
"Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
|
109 |
-
"-controller-interface-directly\r\n")
|
110 |
-
break
|
111 |
-
else:
|
112 |
-
get.ForceNewTorIdentity(self.config)
|
113 |
-
continue
|
114 |
-
else:
|
115 |
-
logme.critical(__name__ + ':Twint:Feed:' + str(e))
|
116 |
-
print(str(e))
|
117 |
-
break
|
118 |
-
except Exception as e:
|
119 |
-
if self.config.Profile or self.config.Favorites:
|
120 |
-
print("[!] Twitter does not return more data, scrape stops here.")
|
121 |
-
break
|
122 |
-
|
123 |
-
logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
|
124 |
-
# Sometimes Twitter says there is no data. But it's a lie.
|
125 |
-
# raise
|
126 |
-
consecutive_errors_count += 1
|
127 |
-
if consecutive_errors_count < self.config.Retries_count:
|
128 |
-
# skip to the next iteration if wait time does not satisfy limit constraints
|
129 |
-
delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)
|
130 |
-
|
131 |
-
# if the delay is less than users set min wait time then replace delay
|
132 |
-
if self.config.Min_wait_time > delay:
|
133 |
-
delay = self.config.Min_wait_time
|
134 |
-
|
135 |
-
sys.stderr.write('sleeping for {} secs\n'.format(delay))
|
136 |
-
time.sleep(delay)
|
137 |
-
self.user_agent = await get.RandomUserAgent(wa=True)
|
138 |
-
continue
|
139 |
-
logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
|
140 |
-
sys.stderr.write(str(e) + " [x] run.Feed")
|
141 |
-
sys.stderr.write(
|
142 |
-
"[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
|
143 |
-
"we will investigate it!")
|
144 |
-
break
|
145 |
-
if self.config.Resume:
|
146 |
-
print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
|
147 |
-
|
148 |
-
async def follow(self):
|
149 |
-
await self.Feed()
|
150 |
-
if self.config.User_full:
|
151 |
-
logme.debug(__name__ + ':Twint:follow:userFull')
|
152 |
-
self.count += await get.Multi(self.feed, self.config, self.conn)
|
153 |
-
else:
|
154 |
-
logme.debug(__name__ + ':Twint:follow:notUserFull')
|
155 |
-
for user in self.feed:
|
156 |
-
self.count += 1
|
157 |
-
username = user.find("a")["name"]
|
158 |
-
await output.Username(username, self.config, self.conn)
|
159 |
-
|
160 |
-
async def favorite(self):
|
161 |
-
logme.debug(__name__ + ':Twint:favorite')
|
162 |
-
await self.Feed()
|
163 |
-
favorited_tweets_list = []
|
164 |
-
for tweet in self.feed:
|
165 |
-
tweet_dict = {}
|
166 |
-
self.count += 1
|
167 |
-
try:
|
168 |
-
tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
|
169 |
-
t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
|
170 |
-
tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
|
171 |
-
tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
|
172 |
-
'')
|
173 |
-
tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
|
174 |
-
date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
|
175 |
-
# test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
|
176 |
-
# date_str = test_dates[3]
|
177 |
-
if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"): # 25m 1h
|
178 |
-
dateu = str(datetime.date.today())
|
179 |
-
tweet_dict['date'] = dateu
|
180 |
-
elif ',' in date_str: # Aug 21, 2019
|
181 |
-
sp = date_str.replace(',', '').split(' ')
|
182 |
-
date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
|
183 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
184 |
-
tweet_dict['date'] = dateu
|
185 |
-
elif len(date_str.split(' ')) == 3: # 28 Jun 19
|
186 |
-
sp = date_str.split(' ')
|
187 |
-
if len(sp[2]) == 2:
|
188 |
-
sp[2] = '20' + sp[2]
|
189 |
-
date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
|
190 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
191 |
-
tweet_dict['date'] = dateu
|
192 |
-
else: # Aug 21
|
193 |
-
sp = date_str.split(' ')
|
194 |
-
date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
|
195 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
196 |
-
tweet_dict['date'] = dateu
|
197 |
-
|
198 |
-
favorited_tweets_list.append(tweet_dict)
|
199 |
-
|
200 |
-
except Exception as e:
|
201 |
-
logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
|
202 |
-
print("shit: ", date_str, " ", str(e))
|
203 |
-
|
204 |
-
try:
|
205 |
-
self.config.favorited_tweets_list += favorited_tweets_list
|
206 |
-
except AttributeError:
|
207 |
-
self.config.favorited_tweets_list = favorited_tweets_list
|
208 |
-
|
209 |
-
async def profile(self):
|
210 |
-
await self.Feed()
|
211 |
-
logme.debug(__name__ + ':Twint:profile')
|
212 |
-
for tweet in self.feed:
|
213 |
-
self.count += 1
|
214 |
-
await output.Tweets(tweet, self.config, self.conn)
|
215 |
-
|
216 |
-
async def tweets(self):
|
217 |
-
await self.Feed()
|
218 |
-
# TODO : need to take care of this later
|
219 |
-
if self.config.Location:
|
220 |
-
logme.debug(__name__ + ':Twint:tweets:location')
|
221 |
-
self.count += await get.Multi(self.feed, self.config, self.conn)
|
222 |
-
else:
|
223 |
-
logme.debug(__name__ + ':Twint:tweets:notLocation')
|
224 |
-
for tweet in self.feed:
|
225 |
-
self.count += 1
|
226 |
-
await output.Tweets(tweet, self.config, self.conn)
|
227 |
-
|
228 |
-
async def main(self, callback=None):
|
229 |
-
|
230 |
-
task = ensure_future(self.run()) # Might be changed to create_task in 3.7+.
|
231 |
-
|
232 |
-
if callback:
|
233 |
-
task.add_done_callback(callback)
|
234 |
-
|
235 |
-
await task
|
236 |
-
|
237 |
-
    async def run(self):
        """Top-level scrape driver.

        Resolves whichever of (Username, User_id) is missing from the other,
        then loops the configured mode (search / follow / favorites / profile /
        lookup) until the feed is exhausted or the tweet limit is hit.
        """
        # Search endpoints get a browser-like agent (wa=True); other paths use
        # a generic one.
        if self.config.TwitterSearch:
            self.user_agent = await get.RandomUserAgent(wa=True)
        else:
            self.user_agent = await get.RandomUserAgent()

        if self.config.User_id is not None and self.config.Username is None:
            logme.debug(__name__ + ':Twint:main:user_id')
            self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                      self.config.Guest_token)

        if self.config.Username is not None and self.config.User_id is None:
            logme.debug(__name__ + ':Twint:main:username')

            self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
            if self.config.User_id is None:
                raise ValueError("Cannot find twitter account with name = " + self.config.Username)

        # TODO : will need to modify it to work with the new endpoints
        if self.config.TwitterSearch and self.config.Since and self.config.Until:
            logme.debug(__name__ + ':Twint:main:search+since+until')
            # Walk the date window; self.d holds the datelock since/until bounds.
            while self.d.since < self.d.until:
                self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
                self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
                if len(self.feed) > 0:
                    await self.tweets()
                else:
                    # Empty feed: no more results for this window.
                    logme.debug(__name__ + ':Twint:main:gettingNewTweets')
                    break

                if get.Limit(self.config.Limit, self.count):
                    break
        elif self.config.Lookup:
            await self.Lookup()
        else:
            logme.debug(__name__ + ':Twint:main:not-search+since+until')
            while True:
                if len(self.feed) > 0:
                    # Exactly one mode flag is expected to be set by the
                    # module-level entry points (Followers/Favorites/...).
                    if self.config.Followers or self.config.Following:
                        logme.debug(__name__ + ':Twint:main:follow')
                        await self.follow()
                    elif self.config.Favorites:
                        logme.debug(__name__ + ':Twint:main:favorites')
                        await self.favorite()
                    elif self.config.Profile:
                        logme.debug(__name__ + ':Twint:main:profile')
                        await self.profile()
                    elif self.config.TwitterSearch:
                        logme.debug(__name__ + ':Twint:main:twitter-search')
                        await self.tweets()
                else:
                    logme.debug(__name__ + ':Twint:main:no-more-tweets')
                    break

                # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
                if get.Limit(self.config.Limit, self.count):
                    logme.debug(__name__ + ':Twint:main:reachedLimit')
                    break

        if self.config.Count:
            verbose.Count(self.count, self.config)
|
298 |
-
|
299 |
-
    async def Lookup(self):
        """Resolve the username from a user id when only the id is configured,
        then fetch and store the full user profile.

        Any failure is logged with a traceback and re-raised to the caller.
        """
        logme.debug(__name__ + ':Twint:Lookup')

        try:
            if self.config.User_id is not None and self.config.Username is None:
                logme.debug(__name__ + ':Twint:Lookup:user_id')
                self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                          self.config.Guest_token)
            # Fetch the profile; db.Conn returns "" when no database is set.
            await get.User(self.config.Username, self.config, db.Conn(self.config.Database))

        except Exception as e:
            logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
            raise
|
312 |
-
|
313 |
-
|
314 |
-
def run(config, callback=None):
    """Ensure the current thread has an asyncio event loop, then drive
    ``Twint(config).main(callback)`` to completion on it."""
    logme.debug(__name__ + ':run')
    try:
        get_event_loop()
    except RuntimeError as e:
        # Worker threads have no loop by default; create one on demand.
        if "no current event loop" not in str(e):
            logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
            raise
        set_event_loop(new_event_loop())
    except Exception as e:
        logme.exception(
            __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
        raise

    get_event_loop().run_until_complete(Twint(config).main(callback))
|
330 |
-
|
331 |
-
|
332 |
-
def Favorites(config):
    """Scrape the tweets a user has liked; all other mode flags are cleared."""
    logme.debug(__name__ + ':Favorites')
    config.Favorites = True
    for mode in ('Following', 'Followers', 'Profile', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
342 |
-
|
343 |
-
|
344 |
-
def Followers(config):
    """Scrape a user's followers; all other mode flags are cleared."""
    logme.debug(__name__ + ':Followers')
    config.Followers = True
    for mode in ('Following', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("followers")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
|
359 |
-
|
360 |
-
|
361 |
-
def Following(config):
    """Scrape the accounts a user follows; all other mode flags are cleared."""
    logme.debug(__name__ + ':Following')
    config.Following = True
    for mode in ('Followers', 'Profile', 'Favorites', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("following")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
|
376 |
-
|
377 |
-
|
378 |
-
def Lookup(config):
    """Look up and store a single user's profile information.

    Bug fix: the original assigned ``config.FOllowing`` (capital O), which
    created a stray attribute and left the real ``config.Following`` flag
    untouched, so a previous Following run could leak into this one.
    """
    logme.debug(__name__ + ':Lookup')
    config.Profile = False
    config.Lookup = True
    config.Favorites = False
    config.Following = False  # was misspelled 'FOllowing'
    config.Followers = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("user")
|
389 |
-
|
390 |
-
|
391 |
-
def Profile(config):
    """Scrape a user's own timeline; all other mode flags are cleared."""
    logme.debug(__name__ + ':Profile')
    config.Profile = True
    for mode in ('Favorites', 'Following', 'Followers', 'TwitterSearch'):
        setattr(config, mode, False)
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
401 |
-
|
402 |
-
|
403 |
-
def Search(config, callback=None):
    """Run a standard Twitter search scrape; all other mode flags are cleared."""
    logme.debug(__name__ + ':Search')
    config.TwitterSearch = True
    for mode in ('Favorites', 'Following', 'Followers', 'Profile'):
        setattr(config, mode, False)
    run(config, callback)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/__init__.py
DELETED
File without changes
|
twitter-scraper/twint-master/twint/storage/db.py
DELETED
@@ -1,297 +0,0 @@
|
|
1 |
-
import sqlite3
|
2 |
-
import sys
|
3 |
-
import time
|
4 |
-
import hashlib
|
5 |
-
|
6 |
-
from datetime import datetime
|
7 |
-
|
8 |
-
def Conn(database):
    """Open the configured sqlite database.

    Returns the live connection, or "" when no database path is configured.
    Exits the process when init() reports an error (it returns the error
    message as a string instead of a connection).
    """
    if not database:
        return ""
    print("[+] Inserting into Database: " + str(database))
    conn = init(database)
    if isinstance(conn, str):  # error text, not a connection
        print(conn)
        sys.exit(1)
    return conn
|
19 |
-
|
20 |
-
def init(db):
    """Open (or create) the sqlite database and ensure every table exists.

    Returns the open connection on success; on any failure returns the error
    message as a string — callers detect this with isinstance(conn, str).
    """
    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()

        # Users are versioned by a content hash (hex_dig), so profile changes
        # append a new snapshot row instead of overwriting history.
        table_users = """
            CREATE TABLE IF NOT EXISTS
                users(
                    id integer not null,
                    id_str text not null,
                    name text,
                    username text not null,
                    bio text,
                    location text,
                    url text,
                    join_date text not null,
                    join_time text not null,
                    tweets integer,
                    following integer,
                    followers integer,
                    likes integer,
                    media integer,
                    private integer not null,
                    verified integer not null,
                    profile_image_url text not null,
                    background_image text,
                    hex_dig text not null,
                    time_update integer not null,
                    CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
                );
            """
        cursor.execute(table_users)

        # One row per tweet; list-valued fields (mentions, hashtags, ...) are
        # stored comma-joined by tweets() below.
        table_tweets = """
            CREATE TABLE IF NOT EXISTS
                tweets (
                    id integer not null,
                    id_str text not null,
                    tweet text default '',
                    language text default '',
                    conversation_id text not null,
                    created_at integer not null,
                    date text not null,
                    time text not null,
                    timezone text not null,
                    place text default '',
                    replies_count integer,
                    likes_count integer,
                    retweets_count integer,
                    user_id integer not null,
                    user_id_str text not null,
                    screen_name text not null,
                    name text default '',
                    link text,
                    mentions text,
                    hashtags text,
                    cashtags text,
                    urls text,
                    photos text,
                    thumbnail text,
                    quote_url text,
                    video integer,
                    geo text,
                    near text,
                    source text,
                    time_update integer not null,
                    `translate` text default '',
                    trans_src text default '',
                    trans_dest text default '',
                    PRIMARY KEY (id)
                );
            """
        cursor.execute(table_tweets)

        table_retweets = """
            CREATE TABLE IF NOT EXISTS
                retweets(
                    user_id integer not null,
                    username text not null,
                    tweet_id integer not null,
                    retweet_id integer not null,
                    retweet_date integer,
                    CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id),
                    CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_retweets)

        table_reply_to = """
            CREATE TABLE IF NOT EXISTS
                replies(
                    tweet_id integer not null,
                    user_id integer not null,
                    username text not null,
                    CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_reply_to)

        table_favorites = """
            CREATE TABLE IF NOT EXISTS
                favorites(
                    user_id integer not null,
                    tweet_id integer not null,
                    CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
                    CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_favorites)

        # Follow edges by numeric id (followers/following) ...
        table_followers = """
            CREATE TABLE IF NOT EXISTS
                followers (
                    id integer not null,
                    follower_id integer not null,
                    CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
                    CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                    CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id)
                );
            """
        cursor.execute(table_followers)

        table_following = """
            CREATE TABLE IF NOT EXISTS
                following (
                    id integer not null,
                    following_id integer not null,
                    CONSTRAINT following_pk PRIMARY KEY (id, following_id),
                    CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                    CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id)
                );
            """
        cursor.execute(table_following)

        # ... and by username (followers_names/following_names), filled by follow().
        table_followers_names = """
            CREATE TABLE IF NOT EXISTS
                followers_names (
                    user text not null,
                    time_update integer not null,
                    follower text not null,
                    PRIMARY KEY (user, follower)
                );
            """
        cursor.execute(table_followers_names)

        table_following_names = """
            CREATE TABLE IF NOT EXISTS
                following_names (
                    user text not null,
                    time_update integer not null,
                    follows text not null,
                    PRIMARY KEY (user, follows)
                );
            """
        cursor.execute(table_following_names)

        return conn
    except Exception as e:
        # Error is reported as a string; Conn() turns it into an exit.
        return str(e)
|
182 |
-
|
183 |
-
def fTable(Followers):
    """Name of the username-based follow table for the given direction."""
    return "followers_names" if Followers else "following_names"
|
190 |
-
|
191 |
-
def uTable(Followers):
    """Name of the id-based follow table for the given direction."""
    return "followers" if Followers else "following"
|
198 |
-
|
199 |
-
def follow(conn, Username, Followers, User):
    """Record one follow edge (user, timestamp, follower/follows) by name.

    Duplicate edges are silently skipped via the table's primary key.
    """
    try:
        now_ms = round(time.time() * 1000)
        row = (User, now_ms, Username,)
        query = f"INSERT INTO {fTable(Followers)} VALUES(?,?,?)"
        conn.cursor().execute(query, row)
        conn.commit()
    except sqlite3.IntegrityError:
        # Edge already stored.
        pass
|
210 |
-
|
211 |
-
def get_hash_id(conn, id):
    """Return the most recently stored profile hash for a user id, or -1 when
    the user has never been inserted."""
    cur = conn.cursor()
    cur.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
    rows = cur.fetchall()
    if not rows:
        return -1
    return rows[0][0]
|
216 |
-
|
217 |
-
def user(conn, config, User):
    """Insert a user profile snapshot, deduplicated by content hash, and
    record the follow edge when scraping followers/following.

    A sha256 over all profile fields versions the row: a new snapshot is only
    inserted when the profile content changed since the last stored hash.
    Duplicate-key inserts are silently ignored.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        # Field order must match the users table column order in init().
        user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url,User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image]

        hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest()
        entry = tuple(user) + (hex_dig,time_ms,)
        old_hash = get_hash_id(conn, User.id)

        # Insert only if the user is new (-1) or the profile content changed.
        if old_hash == -1 or old_hash != hex_dig:
            query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
            cursor.execute(query, entry)
        else:
            pass

        if config.Followers or config.Following:
            # Edge direction picks the id-based table (followers/following).
            table = uTable(config.Followers)
            query = f"INSERT INTO {table} VALUES(?,?)"
            cursor.execute(query, (config.User_id, int(User.id)))

        conn.commit()
    except sqlite3.IntegrityError:
        # Snapshot or follow edge already stored.
        pass
|
241 |
-
|
242 |
-
def tweets(conn, Tweet, config):
    """Insert one tweet row plus its favorites/retweet/reply link rows.

    List-valued fields are stored comma-joined; duplicate primary keys are
    silently ignored so re-scrapes are idempotent.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        # Order must match the tweets table column order in init() (33 columns).
        entry = (Tweet.id,
                 Tweet.id_str,
                 Tweet.tweet,
                 Tweet.lang,
                 Tweet.conversation_id,
                 Tweet.datetime,
                 Tweet.datestamp,
                 Tweet.timestamp,
                 Tweet.timezone,
                 Tweet.place,
                 Tweet.replies_count,
                 Tweet.likes_count,
                 Tweet.retweets_count,
                 Tweet.user_id,
                 Tweet.user_id_str,
                 Tweet.username,
                 Tweet.name,
                 Tweet.link,
                 ",".join(Tweet.mentions),
                 ",".join(Tweet.hashtags),
                 ",".join(Tweet.cashtags),
                 ",".join(Tweet.urls),
                 ",".join(Tweet.photos),
                 Tweet.thumbnail,
                 Tweet.quote_url,
                 Tweet.video,
                 Tweet.geo,
                 Tweet.near,
                 Tweet.source,
                 time_ms,
                 Tweet.translate,
                 Tweet.trans_src,
                 Tweet.trans_dest)
        cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)

        if config.Favorites:
            # Favorites scrape: link the configured user to this tweet.
            query = 'INSERT INTO favorites VALUES(?,?)'
            cursor.execute(query, (config.User_id, Tweet.id))

        if Tweet.retweet:
            # Store the retweet edge with the retweet date as a unix timestamp.
            query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
            _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
            cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))

        if Tweet.reply_to:
            # One row per mentioned reply target.
            for reply in Tweet.reply_to:
                query = 'INSERT INTO replies VALUES(?,?,?)'
                cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))

        conn.commit()
    except sqlite3.IntegrityError:
        # Tweet (or a link row) already stored.
        pass
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/elasticsearch.py
DELETED
@@ -1,364 +0,0 @@
|
|
1 |
-
## TODO - Fix Weekday situation
|
2 |
-
from elasticsearch import Elasticsearch, helpers
|
3 |
-
from geopy.geocoders import Nominatim
|
4 |
-
from datetime import datetime
|
5 |
-
import contextlib
|
6 |
-
import sys
|
7 |
-
|
8 |
-
# One-shot flags: flipped to True after each index's first successful create
# so the create call is only issued once per process.
_index_tweet_status = False
_index_follow_status = False
_index_user_status = False
# Geocoding caches: the near/location lookups are resolved once per run and
# the resulting coordinates reused for every indexed document.
_is_near_def = False
_is_location_def = False
_near = {}
_location = {}

# Shared Nominatim client used by getLocation().
geolocator = Nominatim(user_agent="twint-1.2")
|
17 |
-
|
18 |
-
class RecycleObject(object):
    """Minimal file-like sink that silently discards everything written to it.

    Used by nostdout() to temporarily mute sys.stdout.
    """

    def write(self, junk):
        pass

    def flush(self):
        pass
|
21 |
-
|
22 |
-
def getLocation(place, **options):
    """Geocode a place name with Nominatim.

    With near=True or location=True the coordinates are stashed in the
    matching module-level cache (_near / _location) and True is returned;
    otherwise the {"lat", "lon"} dict is returned directly, or {} when the
    lookup finds nothing.
    """
    location = geolocator.geocode(place,timeout=1000)
    if location:
        if options.get("near"):
            global _near
            _near = {"lat": location.latitude, "lon": location.longitude}
            return True
        elif options.get("location"):
            global _location
            _location = {"lat": location.latitude, "lon": location.longitude}
            return True
        # No cache option requested: hand the coordinates straight back.
        return {"lat": location.latitude, "lon": location.longitude}
    else:
        return {}
|
36 |
-
|
37 |
-
def handleIndexResponse(response):
    """Interpret an Elasticsearch index-creation response.

    Returns True when the index is usable: either it already existed
    (HTTP 400) or it was created and its shards were acknowledged.
    """
    if response.get("status") == 400:
        # Index already exists — nothing to do.
        return True
    if response["acknowledged"]:
        print("[+] Index \"" + response["index"] + "\" created!")
    else:
        print("[x] error index creation :: storage.elasticsearch.handleIndexCreation")
    if not response["shards_acknowledged"]:
        print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation")
        return False
    print("[+] Shards acknowledged, everything is ready to be used!")
    return True
|
53 |
-
|
54 |
-
def createIndex(config, instance, **scope):
    """Create the tweets / follow / user index with its mapping if missing.

    scope= selects which index ("tweet", "follow" or "user"); the create call
    uses ignore=400 so an already-existing index is treated as success (see
    handleIndexResponse). Returns True when the index is ready, False on any
    other failure or an unknown scope.
    """
    if scope.get("scope") == "tweet":
        tweets_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "long"},
                    "conversation_id": {"type": "long"},
                    "created_at": {"type": "text"},
                    "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "timezone": {"type": "keyword"},
                    "place": {"type": "keyword"},
                    "location": {"type": "keyword"},
                    "tweet": {"type": "text"},
                    "lang": {"type": "keyword"},
                    # Tags/usernames are lowercased+ascii-folded via the
                    # custom normalizer below so lookups are case-insensitive.
                    "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "user_id_str": {"type": "keyword"},
                    "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "name": {"type": "text"},
                    "profile_image_url": {"type": "text"},
                    "day": {"type": "integer"},
                    "hour": {"type": "integer"},
                    "link": {"type": "text"},
                    "retweet": {"type": "text"},
                    "essid": {"type": "keyword"},
                    "nlikes": {"type": "integer"},
                    "nreplies": {"type": "integer"},
                    "nretweets": {"type": "integer"},
                    "quote_url": {"type": "text"},
                    "video": {"type":"integer"},
                    "thumbnail": {"type":"text"},
                    "search": {"type": "text"},
                    "near": {"type": "text"},
                    "geo_near": {"type": "geo_point"},
                    "geo_tweet": {"type": "geo_point"},
                    "photos": {"type": "text"},
                    "user_rt_id": {"type": "keyword"},
                    "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "source": {"type": "keyword"},
                    "user_rt": {"type": "keyword"},
                    "retweet_id": {"type": "keyword"},
                    "reply_to": {
                        "type": "nested",
                        "properties": {
                            "user_id": {"type": "keyword"},
                            "username": {"type": "keyword"}
                        }
                    },
                    "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
                    "urls": {"type": "keyword"},
                    "translate": {"type": "text"},
                    "trans_src": {"type": "keyword"},
                    "trans_dest": {"type": "keyword"},
                }
            },
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "normalizer": {
                        "hashtag_normalizer": {
                            "type": "custom",
                            "char_filter": [],
                            "filter": ["lowercase", "asciifolding"]
                        }
                    }
                }
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "follow":
        follow_body = {
            "mappings": {
                "properties": {
                    "user": {"type": "keyword"},
                    "follow": {"type": "keyword"},
                    "essid": {"type": "keyword"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "user":
        user_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "name": {"type": "keyword"},
                    "username": {"type": "keyword"},
                    "bio": {"type": "text"},
                    "location": {"type": "keyword"},
                    "url": {"type": "text"},
                    "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "tweets": {"type": "integer"},
                    "following": {"type": "integer"},
                    "followers": {"type": "integer"},
                    "likes": {"type": "integer"},
                    "media": {"type": "integer"},
                    "private": {"type": "integer"},
                    "verified": {"type": "integer"},
                    "avatar": {"type": "text"},
                    "background_image": {"type": "text"},
                    "session": {"type": "keyword"},
                    "geo_user": {"type": "geo_point"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
        return handleIndexResponse(resp)
    else:
        print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
        return False
|
175 |
-
|
176 |
-
@contextlib.contextmanager
def nostdout():
    """Silence sys.stdout for the duration of the managed block.

    Bug fix: the original restored stdout only on the normal path, so an
    exception raised inside the block left the process with a discarded
    stdout for the rest of its life. Restoring in ``finally`` makes the
    swap exception-safe.
    """
    savestdout = sys.stdout
    sys.stdout = RecycleObject()
    try:
        yield
    finally:
        sys.stdout = savestdout
|
182 |
-
|
183 |
-
def weekday(day):
    """Map an English weekday name to its ISO number (Monday=1 ... Sunday=7).

    Raises KeyError for anything that is not a full capitalized day name.
    """
    return {
        "Monday": 1,
        "Tuesday": 2,
        "Wednesday": 3,
        "Thursday": 4,
        "Friday": 5,
        "Saturday": 6,
        "Sunday": 7,
    }[day]
|
195 |
-
|
196 |
-
def Tweet(Tweet, config):
    """Bulk-index a single tweet document into Elasticsearch.

    Builds the action payload from the Tweet object, lazily creates the
    tweets index on first use, and attaches geo points when a near/geo
    filter or a tweet place is available.

    Bug fix: the optional "source" field read ``Tweet.Source`` (capital S),
    which raised AttributeError for every tweet that actually carried a
    source value — the attribute is ``Tweet.source``, as the guard on the
    same line (and storage/db.py) uses. Also: the place geocode result is
    now reused instead of issuing the same Nominatim lookup twice.
    """
    global _index_tweet_status
    global _is_near_def
    date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")

    actions = []

    try:
        retweet = Tweet.retweet
    except AttributeError:
        retweet = None

    dt = f"{Tweet.datestamp} {Tweet.timestamp}"

    j_data = {
        "_index": config.Index_tweets,
        "_id": str(Tweet.id) + "_raw_" + config.Essid,
        "_source": {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": Tweet.datetime,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": date_obj.weekday(),
            "hour": date_obj.hour,
            "link": Tweet.link,
            "retweet": retweet,
            "essid": config.Essid,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "video": Tweet.video,
            "search": str(config.Search),
            "near": config.Near
        }
    }
    # Optional fields: only attached when present on the tweet.
    if retweet is not None:
        j_data["_source"].update({"user_rt_id": Tweet.user_rt_id})
        j_data["_source"].update({"user_rt": Tweet.user_rt})
        j_data["_source"].update({"retweet_id": Tweet.retweet_id})
        j_data["_source"].update({"retweet_date": Tweet.retweet_date})
    if Tweet.reply_to:
        j_data["_source"].update({"reply_to": Tweet.reply_to})
    if Tweet.photos:
        j_data["_source"].update({"photos": list(Tweet.photos)})
    if Tweet.thumbnail:
        j_data["_source"].update({"thumbnail": Tweet.thumbnail})
    if Tweet.mentions:
        j_data["_source"].update({"mentions": list(Tweet.mentions)})
    if Tweet.urls:
        j_data["_source"].update({"urls": list(Tweet.urls)})
    if config.Near or config.Geo:
        # The near/geo filter is geocoded once per run and cached in _near.
        if not _is_near_def:
            __geo = ""
            __near = ""
            if config.Geo:
                __geo = config.Geo
            if config.Near:
                __near = config.Near
            _is_near_def = getLocation(__near + __geo, near=True)
        if _near:
            j_data["_source"].update({"geo_near": _near})
    if Tweet.place:
        _t_place = getLocation(Tweet.place)
        if _t_place:
            # Reuse the lookup result (the original geocoded twice here).
            j_data["_source"].update({"geo_tweet": _t_place})
    if Tweet.source:
        j_data["_source"].update({"source": Tweet.source})  # fixed: was Tweet.Source
    if config.Translate:
        j_data["_source"].update({"translate": Tweet.translate})
        j_data["_source"].update({"trans_src": Tweet.trans_src})
        j_data["_source"].update({"trans_dest": Tweet.trans_dest})

    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_tweet_status:
        _index_tweet_status = createIndex(config, es, scope="tweet")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
295 |
-
|
296 |
-
def Follow(user, config):
    """Index a single follow edge into Elasticsearch.

    The edge direction depends on config.Following: when scraping accounts
    the configured user follows, the configured user is the "user" side;
    otherwise `user` is the follower.
    """
    global _index_follow_status
    actions = []

    if config.Following:
        _user = config.Username
        _follow = user
    else:
        _user = user
        _follow = config.Username
    j_data = {
        "_index": config.Index_follow,
        # Deterministic id so re-scrapes overwrite instead of duplicating.
        "_id": _user + "_" + _follow + "_" + config.Essid,
        "_source": {
            "user": _user,
            "follow": _follow,
            "essid": config.Essid
        }
    }
    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_follow_status:
        # Create the follow index once per process.
        _index_follow_status = createIndex(config, es, scope="follow")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
323 |
-
|
324 |
-
def UserProfile(user, config):
|
325 |
-
global _index_user_status
|
326 |
-
global _is_location_def
|
327 |
-
actions = []
|
328 |
-
|
329 |
-
j_data = {
|
330 |
-
"_index": config.Index_users,
|
331 |
-
"_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
|
332 |
-
"_source": {
|
333 |
-
"id": user.id,
|
334 |
-
"name": user.name,
|
335 |
-
"username": user.username,
|
336 |
-
"bio": user.bio,
|
337 |
-
"location": user.location,
|
338 |
-
"url": user.url,
|
339 |
-
"join_datetime": user.join_date + " " + user.join_time,
|
340 |
-
"tweets": user.tweets,
|
341 |
-
"following": user.following,
|
342 |
-
"followers": user.followers,
|
343 |
-
"likes": user.likes,
|
344 |
-
"media": user.media_count,
|
345 |
-
"private": user.is_private,
|
346 |
-
"verified": user.is_verified,
|
347 |
-
"avatar": user.avatar,
|
348 |
-
"background_image": user.background_image,
|
349 |
-
"session": config.Essid
|
350 |
-
}
|
351 |
-
}
|
352 |
-
if config.Location:
|
353 |
-
if not _is_location_def:
|
354 |
-
_is_location_def = getLocation(user.location, location=True)
|
355 |
-
if _location:
|
356 |
-
j_data["_source"].update({"geo_user": _location})
|
357 |
-
actions.append(j_data)
|
358 |
-
|
359 |
-
es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
|
360 |
-
if not _index_user_status:
|
361 |
-
_index_user_status = createIndex(config, es, scope="user")
|
362 |
-
with nostdout():
|
363 |
-
helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
|
364 |
-
actions = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/panda.py
DELETED
@@ -1,196 +0,0 @@
|
|
1 |
-
import datetime, pandas as pd, warnings
|
2 |
-
from time import strftime, localtime
|
3 |
-
from twint.tweet import Tweet_formats
|
4 |
-
|
5 |
-
Tweets_df = None
|
6 |
-
Follow_df = None
|
7 |
-
User_df = None
|
8 |
-
|
9 |
-
_object_blocks = {
|
10 |
-
"tweet": [],
|
11 |
-
"user": [],
|
12 |
-
"following": [],
|
13 |
-
"followers": []
|
14 |
-
}
|
15 |
-
|
16 |
-
weekdays = {
|
17 |
-
"Monday": 1,
|
18 |
-
"Tuesday": 2,
|
19 |
-
"Wednesday": 3,
|
20 |
-
"Thursday": 4,
|
21 |
-
"Friday": 5,
|
22 |
-
"Saturday": 6,
|
23 |
-
"Sunday": 7,
|
24 |
-
}
|
25 |
-
|
26 |
-
_type = ""
|
27 |
-
|
28 |
-
def _concat(df, _type):
|
29 |
-
if df is None:
|
30 |
-
df = pd.DataFrame(_object_blocks[_type])
|
31 |
-
else:
|
32 |
-
_df = pd.DataFrame(_object_blocks[_type])
|
33 |
-
df = pd.concat([df, _df], sort=True)
|
34 |
-
return df
|
35 |
-
|
36 |
-
def _autoget(_type):
|
37 |
-
global Tweets_df
|
38 |
-
global Follow_df
|
39 |
-
global User_df
|
40 |
-
|
41 |
-
if _type == "tweet":
|
42 |
-
Tweets_df = _concat(Tweets_df, _type)
|
43 |
-
elif _type == "followers" or _type == "following":
|
44 |
-
Follow_df = _concat(Follow_df, _type)
|
45 |
-
elif _type == "user":
|
46 |
-
User_df = _concat(User_df, _type)
|
47 |
-
else:
|
48 |
-
error("[x] Wrong type of object passed")
|
49 |
-
|
50 |
-
|
51 |
-
def update(object, config):
|
52 |
-
global _type
|
53 |
-
|
54 |
-
#try:
|
55 |
-
# _type = ((object.__class__.__name__ == "tweet")*"tweet" +
|
56 |
-
# (object.__class__.__name__ == "user")*"user")
|
57 |
-
#except AttributeError:
|
58 |
-
# _type = config.Following*"following" + config.Followers*"followers"
|
59 |
-
if object.__class__.__name__ == "tweet":
|
60 |
-
_type = "tweet"
|
61 |
-
elif object.__class__.__name__ == "user":
|
62 |
-
_type = "user"
|
63 |
-
elif object.__class__.__name__ == "dict":
|
64 |
-
_type = config.Following*"following" + config.Followers*"followers"
|
65 |
-
|
66 |
-
if _type == "tweet":
|
67 |
-
Tweet = object
|
68 |
-
datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
|
69 |
-
day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
|
70 |
-
dt = f"{object.datestamp} {object.timestamp}"
|
71 |
-
_data = {
|
72 |
-
"id": str(Tweet.id),
|
73 |
-
"conversation_id": Tweet.conversation_id,
|
74 |
-
"created_at": datetime_ms,
|
75 |
-
"date": dt,
|
76 |
-
"timezone": Tweet.timezone,
|
77 |
-
"place": Tweet.place,
|
78 |
-
"tweet": Tweet.tweet,
|
79 |
-
"language": Tweet.lang,
|
80 |
-
"hashtags": Tweet.hashtags,
|
81 |
-
"cashtags": Tweet.cashtags,
|
82 |
-
"user_id": Tweet.user_id,
|
83 |
-
"user_id_str": Tweet.user_id_str,
|
84 |
-
"username": Tweet.username,
|
85 |
-
"name": Tweet.name,
|
86 |
-
"day": day,
|
87 |
-
"hour": strftime("%H", localtime(datetime_ms/1000)),
|
88 |
-
"link": Tweet.link,
|
89 |
-
"urls": Tweet.urls,
|
90 |
-
"photos": Tweet.photos,
|
91 |
-
"video": Tweet.video,
|
92 |
-
"thumbnail": Tweet.thumbnail,
|
93 |
-
"retweet": Tweet.retweet,
|
94 |
-
"nlikes": int(Tweet.likes_count),
|
95 |
-
"nreplies": int(Tweet.replies_count),
|
96 |
-
"nretweets": int(Tweet.retweets_count),
|
97 |
-
"quote_url": Tweet.quote_url,
|
98 |
-
"search": str(config.Search),
|
99 |
-
"near": Tweet.near,
|
100 |
-
"geo": Tweet.geo,
|
101 |
-
"source": Tweet.source,
|
102 |
-
"user_rt_id": Tweet.user_rt_id,
|
103 |
-
"user_rt": Tweet.user_rt,
|
104 |
-
"retweet_id": Tweet.retweet_id,
|
105 |
-
"reply_to": Tweet.reply_to,
|
106 |
-
"retweet_date": Tweet.retweet_date,
|
107 |
-
"translate": Tweet.translate,
|
108 |
-
"trans_src": Tweet.trans_src,
|
109 |
-
"trans_dest": Tweet.trans_dest
|
110 |
-
}
|
111 |
-
_object_blocks[_type].append(_data)
|
112 |
-
elif _type == "user":
|
113 |
-
user = object
|
114 |
-
try:
|
115 |
-
background_image = user.background_image
|
116 |
-
except:
|
117 |
-
background_image = ""
|
118 |
-
_data = {
|
119 |
-
"id": user.id,
|
120 |
-
"name": user.name,
|
121 |
-
"username": user.username,
|
122 |
-
"bio": user.bio,
|
123 |
-
"url": user.url,
|
124 |
-
"join_datetime": user.join_date + " " + user.join_time,
|
125 |
-
"join_date": user.join_date,
|
126 |
-
"join_time": user.join_time,
|
127 |
-
"tweets": user.tweets,
|
128 |
-
"location": user.location,
|
129 |
-
"following": user.following,
|
130 |
-
"followers": user.followers,
|
131 |
-
"likes": user.likes,
|
132 |
-
"media": user.media_count,
|
133 |
-
"private": user.is_private,
|
134 |
-
"verified": user.is_verified,
|
135 |
-
"avatar": user.avatar,
|
136 |
-
"background_image": background_image,
|
137 |
-
}
|
138 |
-
_object_blocks[_type].append(_data)
|
139 |
-
elif _type == "followers" or _type == "following":
|
140 |
-
_data = {
|
141 |
-
config.Following*"following" + config.Followers*"followers" :
|
142 |
-
{config.Username: object[_type]}
|
143 |
-
}
|
144 |
-
_object_blocks[_type] = _data
|
145 |
-
else:
|
146 |
-
print("Wrong type of object passed!")
|
147 |
-
|
148 |
-
|
149 |
-
def clean():
|
150 |
-
global Tweets_df
|
151 |
-
global Follow_df
|
152 |
-
global User_df
|
153 |
-
_object_blocks["tweet"].clear()
|
154 |
-
_object_blocks["following"].clear()
|
155 |
-
_object_blocks["followers"].clear()
|
156 |
-
_object_blocks["user"].clear()
|
157 |
-
Tweets_df = None
|
158 |
-
Follow_df = None
|
159 |
-
User_df = None
|
160 |
-
|
161 |
-
def save(_filename, _dataframe, **options):
|
162 |
-
if options.get("dataname"):
|
163 |
-
_dataname = options.get("dataname")
|
164 |
-
else:
|
165 |
-
_dataname = "twint"
|
166 |
-
|
167 |
-
if not options.get("type"):
|
168 |
-
with warnings.catch_warnings():
|
169 |
-
warnings.simplefilter("ignore")
|
170 |
-
_store = pd.HDFStore(_filename + ".h5")
|
171 |
-
_store[_dataname] = _dataframe
|
172 |
-
_store.close()
|
173 |
-
elif options.get("type") == "Pickle":
|
174 |
-
with warnings.catch_warnings():
|
175 |
-
warnings.simplefilter("ignore")
|
176 |
-
_dataframe.to_pickle(_filename + ".pkl")
|
177 |
-
else:
|
178 |
-
print("""Please specify: filename, DataFrame, DataFrame name and type
|
179 |
-
(HDF5, default, or Pickle)""")
|
180 |
-
|
181 |
-
def read(_filename, **options):
|
182 |
-
if not options.get("dataname"):
|
183 |
-
_dataname = "twint"
|
184 |
-
else:
|
185 |
-
_dataname = options.get("dataname")
|
186 |
-
|
187 |
-
if not options.get("type"):
|
188 |
-
_store = pd.HDFStore(_filename + ".h5")
|
189 |
-
_df = _store[_dataname]
|
190 |
-
return _df
|
191 |
-
elif options.get("type") == "Pickle":
|
192 |
-
_df = pd.read_pickle(_filename + ".pkl")
|
193 |
-
return _df
|
194 |
-
else:
|
195 |
-
print("""Please specify: DataFrame, DataFrame name (twint as default),
|
196 |
-
filename and type (HDF5, default, or Pickle""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/write.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
from . import write_meta as meta
|
2 |
-
import csv
|
3 |
-
import json
|
4 |
-
import os
|
5 |
-
|
6 |
-
def outputExt(objType, fType):
|
7 |
-
if objType == "str":
|
8 |
-
objType = "username"
|
9 |
-
outExt = f"/{objType}s.{fType}"
|
10 |
-
|
11 |
-
return outExt
|
12 |
-
|
13 |
-
def addExt(base, objType, fType):
|
14 |
-
if len(base.split('.')) == 1:
|
15 |
-
createDirIfMissing(base)
|
16 |
-
base += outputExt(objType, fType)
|
17 |
-
|
18 |
-
return base
|
19 |
-
|
20 |
-
def Text(entry, f):
|
21 |
-
print(entry.replace('\n', ' '), file=open(f, "a", encoding="utf-8"))
|
22 |
-
|
23 |
-
def Type(config):
|
24 |
-
if config.User_full:
|
25 |
-
_type = "user"
|
26 |
-
elif config.Followers or config.Following:
|
27 |
-
_type = "username"
|
28 |
-
else:
|
29 |
-
_type = "tweet"
|
30 |
-
|
31 |
-
return _type
|
32 |
-
|
33 |
-
def struct(obj, custom, _type):
|
34 |
-
if custom:
|
35 |
-
fieldnames = custom
|
36 |
-
row = {}
|
37 |
-
for f in fieldnames:
|
38 |
-
row[f] = meta.Data(obj, _type)[f]
|
39 |
-
else:
|
40 |
-
fieldnames = meta.Fieldnames(_type)
|
41 |
-
row = meta.Data(obj, _type)
|
42 |
-
|
43 |
-
return fieldnames, row
|
44 |
-
|
45 |
-
def createDirIfMissing(dirname):
|
46 |
-
if not os.path.exists(dirname):
|
47 |
-
os.makedirs(dirname)
|
48 |
-
|
49 |
-
def Csv(obj, config):
|
50 |
-
_obj_type = obj.__class__.__name__
|
51 |
-
if _obj_type == "str":
|
52 |
-
_obj_type = "username"
|
53 |
-
fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)
|
54 |
-
|
55 |
-
base = addExt(config.Output, _obj_type, "csv")
|
56 |
-
dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'
|
57 |
-
|
58 |
-
if not (os.path.exists(base)):
|
59 |
-
with open(base, "w", newline='', encoding="utf-8") as csv_file:
|
60 |
-
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
|
61 |
-
writer.writeheader()
|
62 |
-
|
63 |
-
with open(base, "a", newline='', encoding="utf-8") as csv_file:
|
64 |
-
writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
|
65 |
-
writer.writerow(row)
|
66 |
-
|
67 |
-
def Json(obj, config):
|
68 |
-
_obj_type = obj.__class__.__name__
|
69 |
-
if _obj_type == "str":
|
70 |
-
_obj_type = "username"
|
71 |
-
null, data = struct(obj, config.Custom[_obj_type], _obj_type)
|
72 |
-
|
73 |
-
base = addExt(config.Output, _obj_type, "json")
|
74 |
-
|
75 |
-
with open(base, "a", newline='', encoding="utf-8") as json_file:
|
76 |
-
json.dump(data, json_file, ensure_ascii=False)
|
77 |
-
json_file.write("\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/write_meta.py
DELETED
@@ -1,151 +0,0 @@
|
|
1 |
-
def tweetData(t):
|
2 |
-
data = {
|
3 |
-
"id": int(t.id),
|
4 |
-
"conversation_id": t.conversation_id,
|
5 |
-
"created_at": t.datetime,
|
6 |
-
"date": t.datestamp,
|
7 |
-
"time": t.timestamp,
|
8 |
-
"timezone": t.timezone,
|
9 |
-
"user_id": t.user_id,
|
10 |
-
"username": t.username,
|
11 |
-
"name": t.name,
|
12 |
-
"place": t.place,
|
13 |
-
"tweet": t.tweet,
|
14 |
-
"language": t.lang,
|
15 |
-
"mentions": t.mentions,
|
16 |
-
"urls": t.urls,
|
17 |
-
"photos": t.photos,
|
18 |
-
"replies_count": int(t.replies_count),
|
19 |
-
"retweets_count": int(t.retweets_count),
|
20 |
-
"likes_count": int(t.likes_count),
|
21 |
-
"hashtags": t.hashtags,
|
22 |
-
"cashtags": t.cashtags,
|
23 |
-
"link": t.link,
|
24 |
-
"retweet": t.retweet,
|
25 |
-
"quote_url": t.quote_url,
|
26 |
-
"video": t.video,
|
27 |
-
"thumbnail": t.thumbnail,
|
28 |
-
"near": t.near,
|
29 |
-
"geo": t.geo,
|
30 |
-
"source": t.source,
|
31 |
-
"user_rt_id": t.user_rt_id,
|
32 |
-
"user_rt": t.user_rt,
|
33 |
-
"retweet_id": t.retweet_id,
|
34 |
-
"reply_to": t.reply_to,
|
35 |
-
"retweet_date": t.retweet_date,
|
36 |
-
"translate": t.translate,
|
37 |
-
"trans_src": t.trans_src,
|
38 |
-
"trans_dest": t.trans_dest,
|
39 |
-
}
|
40 |
-
return data
|
41 |
-
|
42 |
-
def tweetFieldnames():
|
43 |
-
fieldnames = [
|
44 |
-
"id",
|
45 |
-
"conversation_id",
|
46 |
-
"created_at",
|
47 |
-
"date",
|
48 |
-
"time",
|
49 |
-
"timezone",
|
50 |
-
"user_id",
|
51 |
-
"username",
|
52 |
-
"name",
|
53 |
-
"place",
|
54 |
-
"tweet",
|
55 |
-
"language",
|
56 |
-
"mentions",
|
57 |
-
"urls",
|
58 |
-
"photos",
|
59 |
-
"replies_count",
|
60 |
-
"retweets_count",
|
61 |
-
"likes_count",
|
62 |
-
"hashtags",
|
63 |
-
"cashtags",
|
64 |
-
"link",
|
65 |
-
"retweet",
|
66 |
-
"quote_url",
|
67 |
-
"video",
|
68 |
-
"thumbnail",
|
69 |
-
"near",
|
70 |
-
"geo",
|
71 |
-
"source",
|
72 |
-
"user_rt_id",
|
73 |
-
"user_rt",
|
74 |
-
"retweet_id",
|
75 |
-
"reply_to",
|
76 |
-
"retweet_date",
|
77 |
-
"translate",
|
78 |
-
"trans_src",
|
79 |
-
"trans_dest"
|
80 |
-
]
|
81 |
-
return fieldnames
|
82 |
-
|
83 |
-
def userData(u):
|
84 |
-
data = {
|
85 |
-
"id": int(u.id),
|
86 |
-
"name": u.name,
|
87 |
-
"username": u.username,
|
88 |
-
"bio": u.bio,
|
89 |
-
"location": u.location,
|
90 |
-
"url": u.url,
|
91 |
-
"join_date": u.join_date,
|
92 |
-
"join_time": u.join_time,
|
93 |
-
"tweets": int(u.tweets),
|
94 |
-
"following": int(u.following),
|
95 |
-
"followers": int(u.followers),
|
96 |
-
"likes": int(u.likes),
|
97 |
-
"media": int(u.media_count),
|
98 |
-
"private": u.is_private,
|
99 |
-
"verified": u.is_verified,
|
100 |
-
"profile_image_url": u.avatar,
|
101 |
-
"background_image": u.background_image
|
102 |
-
}
|
103 |
-
return data
|
104 |
-
|
105 |
-
def userFieldnames():
|
106 |
-
fieldnames = [
|
107 |
-
"id",
|
108 |
-
"name",
|
109 |
-
"username",
|
110 |
-
"bio",
|
111 |
-
"location",
|
112 |
-
"url",
|
113 |
-
"join_date",
|
114 |
-
"join_time",
|
115 |
-
"tweets",
|
116 |
-
"following",
|
117 |
-
"followers",
|
118 |
-
"likes",
|
119 |
-
"media",
|
120 |
-
"private",
|
121 |
-
"verified",
|
122 |
-
"profile_image_url",
|
123 |
-
"background_image"
|
124 |
-
]
|
125 |
-
return fieldnames
|
126 |
-
|
127 |
-
def usernameData(u):
|
128 |
-
return {"username": u}
|
129 |
-
|
130 |
-
def usernameFieldnames():
|
131 |
-
return ["username"]
|
132 |
-
|
133 |
-
def Data(obj, _type):
|
134 |
-
if _type == "user":
|
135 |
-
ret = userData(obj)
|
136 |
-
elif _type == "username":
|
137 |
-
ret = usernameData(obj)
|
138 |
-
else:
|
139 |
-
ret = tweetData(obj)
|
140 |
-
|
141 |
-
return ret
|
142 |
-
|
143 |
-
def Fieldnames(_type):
|
144 |
-
if _type == "user":
|
145 |
-
ret = userFieldnames()
|
146 |
-
elif _type == "username":
|
147 |
-
ret = usernameFieldnames()
|
148 |
-
else:
|
149 |
-
ret = tweetFieldnames()
|
150 |
-
|
151 |
-
return ret
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/token.py
DELETED
@@ -1,94 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import time
|
3 |
-
|
4 |
-
import requests
|
5 |
-
import logging as logme
|
6 |
-
|
7 |
-
|
8 |
-
class TokenExpiryException(Exception):
|
9 |
-
def __init__(self, msg):
|
10 |
-
super().__init__(msg)
|
11 |
-
|
12 |
-
|
13 |
-
class RefreshTokenException(Exception):
|
14 |
-
def __init__(self, msg):
|
15 |
-
super().__init__(msg)
|
16 |
-
|
17 |
-
|
18 |
-
class Token:
|
19 |
-
def __init__(self, config):
|
20 |
-
self._session = requests.Session()
|
21 |
-
self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
|
22 |
-
self.config = config
|
23 |
-
self._retries = 5
|
24 |
-
self._timeout = 10
|
25 |
-
self.url = 'https://twitter.com'
|
26 |
-
|
27 |
-
def _request(self):
|
28 |
-
for attempt in range(self._retries + 1):
|
29 |
-
# The request is newly prepared on each retry because of potential cookie updates.
|
30 |
-
req = self._session.prepare_request(requests.Request('GET', self.url))
|
31 |
-
logme.debug(f'Retrieving {req.url}')
|
32 |
-
try:
|
33 |
-
r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
|
34 |
-
except requests.exceptions.RequestException as exc:
|
35 |
-
if attempt < self._retries:
|
36 |
-
retrying = ', retrying'
|
37 |
-
level = logme.WARNING
|
38 |
-
else:
|
39 |
-
retrying = ''
|
40 |
-
level = logme.ERROR
|
41 |
-
logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
|
42 |
-
else:
|
43 |
-
success, msg = (True, None)
|
44 |
-
msg = f': {msg}' if msg else ''
|
45 |
-
|
46 |
-
if success:
|
47 |
-
logme.debug(f'{req.url} retrieved successfully{msg}')
|
48 |
-
return r
|
49 |
-
if attempt < self._retries:
|
50 |
-
# TODO : might wanna tweak this back-off timer
|
51 |
-
sleep_time = 2.0 * 2 ** attempt
|
52 |
-
logme.info(f'Waiting {sleep_time:.0f} seconds')
|
53 |
-
time.sleep(sleep_time)
|
54 |
-
else:
|
55 |
-
msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
|
56 |
-
logme.fatal(msg)
|
57 |
-
self.config.Guest_token = None
|
58 |
-
raise RefreshTokenException(msg)
|
59 |
-
|
60 |
-
def refresh(self):
|
61 |
-
logme.debug('Retrieving guest token')
|
62 |
-
res = self._request()
|
63 |
-
match = re.search(r'\("gt=(\d+);', res.text)
|
64 |
-
if match:
|
65 |
-
logme.debug('Found guest token in HTML')
|
66 |
-
self.config.Guest_token = str(match.group(1))
|
67 |
-
else:
|
68 |
-
headers = {
|
69 |
-
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
|
70 |
-
'authority': 'api.twitter.com',
|
71 |
-
'content-length': '0',
|
72 |
-
'authorization': self.config.Bearer_token,
|
73 |
-
'x-twitter-client-language': 'en',
|
74 |
-
'x-csrf-token': res.cookies.get("ct0"),
|
75 |
-
'x-twitter-active-user': 'yes',
|
76 |
-
'content-type': 'application/x-www-form-urlencoded',
|
77 |
-
'accept': '*/*',
|
78 |
-
'sec-gpc': '1',
|
79 |
-
'origin': 'https://twitter.com',
|
80 |
-
'sec-fetch-site': 'same-site',
|
81 |
-
'sec-fetch-mode': 'cors',
|
82 |
-
'sec-fetch-dest': 'empty',
|
83 |
-
'referer': 'https://twitter.com/',
|
84 |
-
'accept-language': 'en-US',
|
85 |
-
}
|
86 |
-
self._session.headers.update(headers)
|
87 |
-
req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
|
88 |
-
res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
|
89 |
-
if 'guest_token' in res.json():
|
90 |
-
logme.debug('Found guest token in JSON')
|
91 |
-
self.config.Guest_token = res.json()['guest_token']
|
92 |
-
else:
|
93 |
-
self.config.Guest_token = None
|
94 |
-
raise RefreshTokenException('Could not find the Guest token in HTML')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/tweet.py
DELETED
@@ -1,166 +0,0 @@
|
|
1 |
-
from time import strftime, localtime
|
2 |
-
from datetime import datetime, timezone
|
3 |
-
|
4 |
-
import logging as logme
|
5 |
-
from googletransx import Translator
|
6 |
-
# ref.
|
7 |
-
# - https://github.com/x0rzkov/py-googletrans#basic-usage
|
8 |
-
translator = Translator()
|
9 |
-
|
10 |
-
|
11 |
-
class tweet:
|
12 |
-
"""Define Tweet class
|
13 |
-
"""
|
14 |
-
type = "tweet"
|
15 |
-
|
16 |
-
def __init__(self):
|
17 |
-
pass
|
18 |
-
|
19 |
-
|
20 |
-
def utc_to_local(utc_dt):
|
21 |
-
return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None)
|
22 |
-
|
23 |
-
|
24 |
-
Tweet_formats = {
|
25 |
-
'datetime': '%Y-%m-%d %H:%M:%S %Z',
|
26 |
-
'datestamp': '%Y-%m-%d',
|
27 |
-
'timestamp': '%H:%M:%S'
|
28 |
-
}
|
29 |
-
|
30 |
-
|
31 |
-
def _get_mentions(tw):
|
32 |
-
"""Extract mentions from tweet
|
33 |
-
"""
|
34 |
-
logme.debug(__name__ + ':get_mentions')
|
35 |
-
try:
|
36 |
-
mentions = [
|
37 |
-
{
|
38 |
-
'screen_name': _mention['screen_name'],
|
39 |
-
'name': _mention['name'],
|
40 |
-
'id': _mention['id_str'],
|
41 |
-
} for _mention in tw['entities']['user_mentions']
|
42 |
-
if tw['display_text_range'][0] < _mention['indices'][0]
|
43 |
-
]
|
44 |
-
except KeyError:
|
45 |
-
mentions = []
|
46 |
-
return mentions
|
47 |
-
|
48 |
-
|
49 |
-
def _get_reply_to(tw):
|
50 |
-
try:
|
51 |
-
reply_to = [
|
52 |
-
{
|
53 |
-
'screen_name': _mention['screen_name'],
|
54 |
-
'name': _mention['name'],
|
55 |
-
'id': _mention['id_str'],
|
56 |
-
} for _mention in tw['entities']['user_mentions']
|
57 |
-
if tw['display_text_range'][0] > _mention['indices'][1]
|
58 |
-
]
|
59 |
-
except KeyError:
|
60 |
-
reply_to = []
|
61 |
-
return reply_to
|
62 |
-
|
63 |
-
|
64 |
-
def getText(tw):
|
65 |
-
"""Replace some text
|
66 |
-
"""
|
67 |
-
logme.debug(__name__ + ':getText')
|
68 |
-
text = tw['full_text']
|
69 |
-
text = text.replace("http", " http")
|
70 |
-
text = text.replace("pic.twitter", " pic.twitter")
|
71 |
-
text = text.replace("\n", " ")
|
72 |
-
|
73 |
-
return text
|
74 |
-
|
75 |
-
|
76 |
-
def Tweet(tw, config):
|
77 |
-
"""Create Tweet object
|
78 |
-
"""
|
79 |
-
logme.debug(__name__ + ':Tweet')
|
80 |
-
t = tweet()
|
81 |
-
t.id = int(tw['id_str'])
|
82 |
-
t.id_str = tw["id_str"]
|
83 |
-
t.conversation_id = tw["conversation_id_str"]
|
84 |
-
|
85 |
-
# parsing date to user-friendly format
|
86 |
-
_dt = tw['created_at']
|
87 |
-
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
|
88 |
-
_dt = utc_to_local(_dt)
|
89 |
-
t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
|
90 |
-
# date is of the format year,
|
91 |
-
t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
|
92 |
-
t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
|
93 |
-
t.user_id = int(tw["user_id_str"])
|
94 |
-
t.user_id_str = tw["user_id_str"]
|
95 |
-
t.username = tw["user_data"]['screen_name']
|
96 |
-
t.name = tw["user_data"]['name']
|
97 |
-
t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
|
98 |
-
t.timezone = strftime("%z", localtime())
|
99 |
-
t.mentions = _get_mentions(tw)
|
100 |
-
t.reply_to = _get_reply_to(tw)
|
101 |
-
try:
|
102 |
-
t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
|
103 |
-
except KeyError:
|
104 |
-
t.urls = []
|
105 |
-
try:
|
106 |
-
t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
|
107 |
-
_img['expanded_url'].find('/photo/') != -1]
|
108 |
-
except KeyError:
|
109 |
-
t.photos = []
|
110 |
-
try:
|
111 |
-
t.video = 1 if len(tw['extended_entities']['media']) else 0
|
112 |
-
except KeyError:
|
113 |
-
t.video = 0
|
114 |
-
try:
|
115 |
-
t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
|
116 |
-
except KeyError:
|
117 |
-
t.thumbnail = ''
|
118 |
-
t.tweet = getText(tw)
|
119 |
-
t.lang = tw['lang']
|
120 |
-
try:
|
121 |
-
t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
|
122 |
-
except KeyError:
|
123 |
-
t.hashtags = []
|
124 |
-
try:
|
125 |
-
t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
|
126 |
-
except KeyError:
|
127 |
-
t.cashtags = []
|
128 |
-
t.replies_count = tw['reply_count']
|
129 |
-
t.retweets_count = tw['retweet_count']
|
130 |
-
t.likes_count = tw['favorite_count']
|
131 |
-
t.link = f"https://twitter.com/{t.username}/status/{t.id}"
|
132 |
-
try:
|
133 |
-
if 'user_rt_id' in tw['retweet_data']:
|
134 |
-
t.retweet = True
|
135 |
-
t.retweet_id = tw['retweet_data']['retweet_id']
|
136 |
-
t.retweet_date = tw['retweet_data']['retweet_date']
|
137 |
-
t.user_rt = tw['retweet_data']['user_rt']
|
138 |
-
t.user_rt_id = tw['retweet_data']['user_rt_id']
|
139 |
-
except KeyError:
|
140 |
-
t.retweet = False
|
141 |
-
t.retweet_id = ''
|
142 |
-
t.retweet_date = ''
|
143 |
-
t.user_rt = ''
|
144 |
-
t.user_rt_id = ''
|
145 |
-
try:
|
146 |
-
t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
|
147 |
-
except KeyError:
|
148 |
-
# means that the quoted tweet have been deleted
|
149 |
-
t.quote_url = 0
|
150 |
-
t.near = config.Near if config.Near else ""
|
151 |
-
t.geo = config.Geo if config.Geo else ""
|
152 |
-
t.source = config.Source if config.Source else ""
|
153 |
-
t.translate = ''
|
154 |
-
t.trans_src = ''
|
155 |
-
t.trans_dest = ''
|
156 |
-
if config.Translate:
|
157 |
-
try:
|
158 |
-
ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
|
159 |
-
t.translate = ts.text
|
160 |
-
t.trans_src = ts.src
|
161 |
-
t.trans_dest = ts.dest
|
162 |
-
# ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
|
163 |
-
except ValueError as e:
|
164 |
-
logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
|
165 |
-
raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
|
166 |
-
return t
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/url.py
DELETED
@@ -1,195 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
import json
|
3 |
-
from sys import platform
|
4 |
-
import logging as logme
|
5 |
-
from urllib.parse import urlencode
|
6 |
-
from urllib.parse import quote
|
7 |
-
|
8 |
-
mobile = "https://mobile.twitter.com"
|
9 |
-
base = "https://api.twitter.com/2/search/adaptive.json"
|
10 |
-
|
11 |
-
|
12 |
-
def _sanitizeQuery(_url, params):
|
13 |
-
_serialQuery = ""
|
14 |
-
_serialQuery = urlencode(params, quote_via=quote)
|
15 |
-
_serialQuery = _url + "?" + _serialQuery
|
16 |
-
return _serialQuery
|
17 |
-
|
18 |
-
|
19 |
-
def _formatDate(date):
|
20 |
-
if "win" in platform:
|
21 |
-
return f'\"{date.split()[0]}\"'
|
22 |
-
try:
|
23 |
-
return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
-
except ValueError:
|
25 |
-
return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
|
26 |
-
|
27 |
-
|
28 |
-
async def Favorites(username, init):
|
29 |
-
logme.debug(__name__ + ':Favorites')
|
30 |
-
url = f"{mobile}/{username}/favorites?lang=en"
|
31 |
-
|
32 |
-
if init != '-1':
|
33 |
-
url += f"&max_id={init}"
|
34 |
-
|
35 |
-
return url
|
36 |
-
|
37 |
-
|
38 |
-
async def Followers(username, init):
|
39 |
-
logme.debug(__name__ + ':Followers')
|
40 |
-
url = f"{mobile}/{username}/followers?lang=en"
|
41 |
-
|
42 |
-
if init != '-1':
|
43 |
-
url += f"&cursor={init}"
|
44 |
-
|
45 |
-
return url
|
46 |
-
|
47 |
-
|
48 |
-
async def Following(username, init):
|
49 |
-
logme.debug(__name__ + ':Following')
|
50 |
-
url = f"{mobile}/{username}/following?lang=en"
|
51 |
-
|
52 |
-
if init != '-1':
|
53 |
-
url += f"&cursor={init}"
|
54 |
-
|
55 |
-
return url
|
56 |
-
|
57 |
-
|
58 |
-
async def MobileProfile(username, init):
|
59 |
-
logme.debug(__name__ + ':MobileProfile')
|
60 |
-
url = f"{mobile}/{username}?lang=en"
|
61 |
-
|
62 |
-
if init != '-1':
|
63 |
-
url += f"&max_id={init}"
|
64 |
-
|
65 |
-
return url
|
66 |
-
|
67 |
-
|
68 |
-
async def Search(config, init):
|
69 |
-
logme.debug(__name__ + ':Search')
|
70 |
-
url = base
|
71 |
-
tweet_count = 100 if not config.Limit else config.Limit
|
72 |
-
q = ""
|
73 |
-
params = [
|
74 |
-
# ('include_blocking', '1'),
|
75 |
-
# ('include_blocked_by', '1'),
|
76 |
-
# ('include_followed_by', '1'),
|
77 |
-
# ('include_want_retweets', '1'),
|
78 |
-
# ('include_mute_edge', '1'),
|
79 |
-
# ('include_can_dm', '1'),
|
80 |
-
('include_can_media_tag', '1'),
|
81 |
-
# ('skip_status', '1'),
|
82 |
-
# ('include_cards', '1'),
|
83 |
-
('include_ext_alt_text', 'true'),
|
84 |
-
('include_quote_count', 'true'),
|
85 |
-
('include_reply_count', '1'),
|
86 |
-
('tweet_mode', 'extended'),
|
87 |
-
('include_entities', 'true'),
|
88 |
-
('include_user_entities', 'true'),
|
89 |
-
('include_ext_media_availability', 'true'),
|
90 |
-
('send_error_codes', 'true'),
|
91 |
-
('simple_quoted_tweet', 'true'),
|
92 |
-
('count', tweet_count),
|
93 |
-
('query_source', 'typed_query'),
|
94 |
-
# ('pc', '1'),
|
95 |
-
('cursor', str(init)),
|
96 |
-
('spelling_corrections', '1'),
|
97 |
-
('ext', 'mediaStats%2ChighlightedLabel'),
|
98 |
-
('tweet_search_mode', 'live'), # this can be handled better, maybe take an argument and set it then
|
99 |
-
]
|
100 |
-
if not config.Popular_tweets:
|
101 |
-
params.append(('f', 'tweets'))
|
102 |
-
if config.Lang:
|
103 |
-
params.append(("l", config.Lang))
|
104 |
-
params.append(("lang", "en"))
|
105 |
-
if config.Query:
|
106 |
-
q += f" from:{config.Query}"
|
107 |
-
if config.Username:
|
108 |
-
q += f" from:{config.Username}"
|
109 |
-
if config.Geo:
|
110 |
-
config.Geo = config.Geo.replace(" ", "")
|
111 |
-
q += f" geocode:{config.Geo}"
|
112 |
-
if config.Search:
|
113 |
-
|
114 |
-
q += f" {config.Search}"
|
115 |
-
if config.Year:
|
116 |
-
q += f" until:{config.Year}-1-1"
|
117 |
-
if config.Since:
|
118 |
-
q += f" since:{_formatDate(config.Since)}"
|
119 |
-
if config.Until:
|
120 |
-
q += f" until:{_formatDate(config.Until)}"
|
121 |
-
if config.Email:
|
122 |
-
q += ' "mail" OR "email" OR'
|
123 |
-
q += ' "gmail" OR "e-mail"'
|
124 |
-
if config.Phone:
|
125 |
-
q += ' "phone" OR "call me" OR "text me"'
|
126 |
-
if config.Verified:
|
127 |
-
q += " filter:verified"
|
128 |
-
if config.To:
|
129 |
-
q += f" to:{config.To}"
|
130 |
-
if config.All:
|
131 |
-
q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
|
132 |
-
if config.Near:
|
133 |
-
q += f' near:"{config.Near}"'
|
134 |
-
if config.Images:
|
135 |
-
q += " filter:images"
|
136 |
-
if config.Videos:
|
137 |
-
q += " filter:videos"
|
138 |
-
if config.Media:
|
139 |
-
q += " filter:media"
|
140 |
-
if config.Replies:
|
141 |
-
q += " filter:replies"
|
142 |
-
# although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
|
143 |
-
if config.Native_retweets:
|
144 |
-
q += " filter:nativeretweets"
|
145 |
-
if config.Min_likes:
|
146 |
-
q += f" min_faves:{config.Min_likes}"
|
147 |
-
if config.Min_retweets:
|
148 |
-
q += f" min_retweets:{config.Min_retweets}"
|
149 |
-
if config.Min_replies:
|
150 |
-
q += f" min_replies:{config.Min_replies}"
|
151 |
-
if config.Links == "include":
|
152 |
-
q += " filter:links"
|
153 |
-
elif config.Links == "exclude":
|
154 |
-
q += " exclude:links"
|
155 |
-
if config.Source:
|
156 |
-
q += f" source:\"{config.Source}\""
|
157 |
-
if config.Members_list:
|
158 |
-
q += f" list:{config.Members_list}"
|
159 |
-
if config.Filter_retweets:
|
160 |
-
q += f" exclude:nativeretweets exclude:retweets"
|
161 |
-
if config.Custom_query:
|
162 |
-
q = config.Custom_query
|
163 |
-
|
164 |
-
q = q.strip()
|
165 |
-
params.append(("q", q))
|
166 |
-
_serialQuery = _sanitizeQuery(url, params)
|
167 |
-
return url, params, _serialQuery
|
168 |
-
|
169 |
-
|
170 |
-
def SearchProfile(config, init=None):
|
171 |
-
logme.debug(__name__ + ':SearchProfile')
|
172 |
-
_url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
|
173 |
-
tweet_count = 100
|
174 |
-
variables = {
|
175 |
-
"userId": config.User_id,
|
176 |
-
"count": tweet_count,
|
177 |
-
"includePromotedContent": True,
|
178 |
-
"withCommunity": True,
|
179 |
-
"withSuperFollowsUserFields": True,
|
180 |
-
"withBirdwatchPivots": False,
|
181 |
-
"withDownvotePerspective": False,
|
182 |
-
"withReactionsMetadata": False,
|
183 |
-
"withReactionsPerspective": False,
|
184 |
-
"withSuperFollowsTweetFields": True,
|
185 |
-
"withVoice": True,
|
186 |
-
"withV2Timeline": False,
|
187 |
-
"__fs_interactive_text": False,
|
188 |
-
"__fs_dont_mention_me_view_api_enabled": False,
|
189 |
-
}
|
190 |
-
if type(init) == str:
|
191 |
-
variables['cursor'] = init
|
192 |
-
params = [('variables', json.dumps(variables, separators=(',',':')))]
|
193 |
-
|
194 |
-
_serialQuery = _sanitizeQuery(_url, params)
|
195 |
-
return _serialQuery, [], _serialQuery
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/user.py
DELETED
@@ -1,52 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
import logging as logme
|
3 |
-
|
4 |
-
|
5 |
-
class user:
|
6 |
-
type = "user"
|
7 |
-
|
8 |
-
def __init__(self):
|
9 |
-
pass
|
10 |
-
|
11 |
-
|
12 |
-
User_formats = {
|
13 |
-
'join_date': '%Y-%m-%d',
|
14 |
-
'join_time': '%H:%M:%S %Z'
|
15 |
-
}
|
16 |
-
|
17 |
-
|
18 |
-
# ur object must be a json from the endpoint https://api.twitter.com/graphql
|
19 |
-
def User(ur):
|
20 |
-
logme.debug(__name__ + ':User')
|
21 |
-
if 'data' not in ur and 'user' not in ur['data']:
|
22 |
-
msg = 'malformed json! cannot be parsed to get user data'
|
23 |
-
logme.fatal(msg)
|
24 |
-
raise KeyError(msg)
|
25 |
-
_usr = user()
|
26 |
-
_usr.id = ur['data']['user']['rest_id']
|
27 |
-
_usr.name = ur['data']['user']['legacy']['name']
|
28 |
-
_usr.username = ur['data']['user']['legacy']['screen_name']
|
29 |
-
_usr.bio = ur['data']['user']['legacy']['description']
|
30 |
-
_usr.location = ur['data']['user']['legacy']['location']
|
31 |
-
_usr.url = ur['data']['user']['legacy']['url']
|
32 |
-
# parsing date to user-friendly format
|
33 |
-
_dt = ur['data']['user']['legacy']['created_at']
|
34 |
-
_dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
|
35 |
-
# date is of the format year,
|
36 |
-
_usr.join_date = _dt.strftime(User_formats['join_date'])
|
37 |
-
_usr.join_time = _dt.strftime(User_formats['join_time'])
|
38 |
-
|
39 |
-
# :type `int`
|
40 |
-
_usr.tweets = int(ur['data']['user']['legacy']['statuses_count'])
|
41 |
-
_usr.following = int(ur['data']['user']['legacy']['friends_count'])
|
42 |
-
_usr.followers = int(ur['data']['user']['legacy']['followers_count'])
|
43 |
-
_usr.likes = int(ur['data']['user']['legacy']['favourites_count'])
|
44 |
-
_usr.media_count = int(ur['data']['user']['legacy']['media_count'])
|
45 |
-
|
46 |
-
_usr.is_private = ur['data']['user']['legacy']['protected']
|
47 |
-
_usr.is_verified = ur['data']['user']['legacy']['verified']
|
48 |
-
_usr.avatar = ur['data']['user']['legacy']['profile_image_url_https']
|
49 |
-
_usr.background_image = ur['data']['user']['legacy']['profile_banner_url']
|
50 |
-
# TODO : future implementation
|
51 |
-
# legacy_extended_profile is also available in some cases which can be used to get DOB of user
|
52 |
-
return _usr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/verbose.py
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
def Count(count, config):
|
2 |
-
msg = "[+] Finished: Successfully collected "
|
3 |
-
if config.Followers:
|
4 |
-
msg += f"all {count} users who follow @{config.Username}"
|
5 |
-
elif config.Following:
|
6 |
-
msg += f"all {count} users who @{config.Username} follows"
|
7 |
-
elif config.Favorites:
|
8 |
-
msg += f"{count} Tweets that @{config.Username} liked"
|
9 |
-
else:
|
10 |
-
msg += f"{count} Tweets_and_replies"
|
11 |
-
if config.Username:
|
12 |
-
msg += f" from @{config.Username}"
|
13 |
-
msg += "."
|
14 |
-
print(msg)
|
15 |
-
|
16 |
-
def Elastic(elasticsearch):
|
17 |
-
if elasticsearch:
|
18 |
-
print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/{twint-master/twitter_scraper.ipynb → twitter_scraper.ipynb}
RENAMED
File without changes
|
twitter_scraper/twint_master/elasticsearch/dashboard.json
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"_id": "e6d65380-bfe2-11e8-961a-d371b24d5d1d",
|
4 |
+
"_type": "dashboard",
|
5 |
+
"_source": {
|
6 |
+
"title": "Twint Dashboard",
|
7 |
+
"hits": 0,
|
8 |
+
"description": "",
|
9 |
+
"panelsJSON": "[{\"panelIndex\":\"1\",\"gridData\":{\"x\":0,\"y\":0,\"w\":40,\"h\":17,\"i\":\"1\"},\"embeddableConfig\":{},\"id\":\"d47421c0-bfd5-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"2\",\"gridData\":{\"x\":40,\"y\":6,\"w\":8,\"h\":11,\"i\":\"2\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"e2b89640-bfd4-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"3\",\"gridData\":{\"x\":0,\"y\":32,\"w\":20,\"h\":17,\"i\":\"3\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"8a8bb420-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"4\",\"gridData\":{\"x\":0,\"y\":17,\"w\":33,\"h\":15,\"i\":\"4\"},\"embeddableConfig\":{\"vis\":{\"legendOpen\":false}},\"id\":\"a8d3ee70-bfd9-11e8-8858-bbc566841533\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"6\",\"gridData\":{\"x\":40,\"y\":0,\"w\":8,\"h\":6,\"i\":\"6\"},\"embeddableConfig\":{},\"id\":\"37cd72e0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"7\",\"gridData\":{\"x\":33,\"y\":17,\"w\":15,\"h\":15,\"i\":\"7\"},\"embeddableConfig\":{},\"id\":\"149ecbc0-bfe4-11e8-961a-d371b24d5d1d\",\"type\":\"visualization\",\"version\":\"6.4.1\"},{\"panelIndex\":\"8\",\"gridData\":{\"x\":20,\"y\":32,\"w\":28,\"h\":17,\"i\":\"8\"},\"version\":\"6.3.2\",\"type\":\"visualization\",\"id\":\"b45ec590-c267-11e8-bcd4-3956fe930db7\",\"embeddableConfig\":{}}]",
|
10 |
+
"optionsJSON": "{\"darkTheme\":true,\"hidePanelTitles\":true,\"useMargins\":true}",
|
11 |
+
"version": 1,
|
12 |
+
"timeRestore": false,
|
13 |
+
"kibanaSavedObjectMeta": {
|
14 |
+
"searchSourceJSON": "{\"query\":{\"language\":\"lucene\",\"query\":\"\"},\"filter\":[],\"highlightAll\":true,\"version\":true}"
|
15 |
+
}
|
16 |
+
}
|
17 |
+
}
|
18 |
+
]
|
twitter_scraper/twint_master/elasticsearch/index-follow.json
ADDED
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PUT twintgraph
|
2 |
+
{
|
3 |
+
"mappings": {
|
4 |
+
"items": {
|
5 |
+
"properties": {
|
6 |
+
"user": {"type": "keyword"},
|
7 |
+
"follow": {"type": "keyword"},
|
8 |
+
"essid": {"type": "keyword"}
|
9 |
+
}
|
10 |
+
}
|
11 |
+
},
|
12 |
+
"settings": {
|
13 |
+
"number_of_shards": 1
|
14 |
+
}
|
15 |
+
}
|
twitter_scraper/twint_master/elasticsearch/index-tweets.json
ADDED
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PUT twinttweets
|
2 |
+
{
|
3 |
+
"mappings": {
|
4 |
+
"items": {
|
5 |
+
"properties": {
|
6 |
+
"id": {"type": "long"},
|
7 |
+
"conversation_id": {"type": "long"},
|
8 |
+
"created_at": {"type": "long"},
|
9 |
+
"date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
|
10 |
+
"timezone": {"type": "keyword"},
|
11 |
+
"place": {"type": "keyword"},
|
12 |
+
"location": {"type": "keyword"},
|
13 |
+
"tweet": {"type": "text"},
|
14 |
+
"hashtags": {"type": "keyword"},
|
15 |
+
"cashtags": {"type": "keyword"},
|
16 |
+
"user_id": {"type": "long"},
|
17 |
+
"user_id_str": {"type": "keyword"},
|
18 |
+
"username": {"type": "keyword"},
|
19 |
+
"name": {"type": "text"},
|
20 |
+
"profile_image_url": {"type": "text"},
|
21 |
+
"day": {"type": "integer"},
|
22 |
+
"hour": {"type": "integer"},
|
23 |
+
"link": {"type": "text"},
|
24 |
+
"retweet": {"type": "text"},
|
25 |
+
"essid": {"type": "keyword"},
|
26 |
+
"nlikes": {"type": "integer"},
|
27 |
+
"nreplies": {"type": "integer"},
|
28 |
+
"nretweets": {"type": "integer"},
|
29 |
+
"quote_url": {"type": "text"},
|
30 |
+
"video": {"type": "integer"},
|
31 |
+
"thumbnail": {"type": "text"},
|
32 |
+
"search": {"type": "text"},
|
33 |
+
"near": {"type": "text"},
|
34 |
+
"geo_near": {"type": "geo_point"},
|
35 |
+
"geo_tweet": {"type": "geo_point"},
|
36 |
+
"photos": {"type": "text"},
|
37 |
+
"mentions": {"type": "text"},
|
38 |
+
"translation": {"type": "text"},
|
39 |
+
"trans_src": {"type": "keyword"},
|
40 |
+
"trans_dev": {"type": "keyword"},
|
41 |
+
}
|
42 |
+
}
|
43 |
+
}
|
44 |
+
,
|
45 |
+
"settings": {
|
46 |
+
"number_of_shards": 1
|
47 |
+
}
|
48 |
+
}
|
twitter_scraper/twint_master/elasticsearch/index-user.json
ADDED
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
PUT twintuser
|
2 |
+
{
|
3 |
+
"mappings": {
|
4 |
+
"items": {
|
5 |
+
"properties": {
|
6 |
+
"id": {"type": "keyword"},
|
7 |
+
"name": {"type": "keyword"},
|
8 |
+
"username": {"type": "keyword"},
|
9 |
+
"bio": {"type": "text"},
|
10 |
+
"location": {"type": "keyword"},
|
11 |
+
"url": {"type": "text"},
|
12 |
+
"join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
|
13 |
+
"join_date": {"type": "date", "format": "yyyy-MM-dd"},
|
14 |
+
"join_time": {"type": "date", "format": "HH:mm:ss"},
|
15 |
+
"tweets": {"type": "integer"},
|
16 |
+
"following": {"type": "integer"},
|
17 |
+
"followers": {"type": "integer"},
|
18 |
+
"likes": {"type": "integer"},
|
19 |
+
"media": {"type": "integer"},
|
20 |
+
"private": {"type": "integer"},
|
21 |
+
"verified": {"type": "integer"},
|
22 |
+
"avatar": {"type": "text"},
|
23 |
+
"background_image": {"type": "text"},
|
24 |
+
"session": {"type": "keyword"},
|
25 |
+
"geo_user": {"type": "geo_point"}
|
26 |
+
}
|
27 |
+
}
|
28 |
+
}
|
29 |
+
,
|
30 |
+
"settings": {
|
31 |
+
"number_of_shards": 1
|
32 |
+
}
|
33 |
+
}
|
twitter_scraper/twint_master/elasticsearch/visualizations.json
ADDED
@@ -0,0 +1,100 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[
|
2 |
+
{
|
3 |
+
"_id": "d47421c0-bfd5-11e8-8858-bbc566841533",
|
4 |
+
"_type": "visualization",
|
5 |
+
"_source": {
|
6 |
+
"title": "Activity [twinttweets]",
|
7 |
+
"visState": "{\"title\":\"Activity [twinttweets]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"area\",\"mode\":\"stacked\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true,\"interpolate\":\"cardinal\"}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":true},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"date_histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"date\",\"interval\":\"auto\",\"customInterval\":\"2h\",\"min_doc_count\":1,\"extended_bounds\":{},\"customLabel\":\"Days\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"size\":5,\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"customLabel\":\"User ids\"}}]}",
|
8 |
+
"uiStateJSON": "{}",
|
9 |
+
"description": "",
|
10 |
+
"version": 1,
|
11 |
+
"kibanaSavedObjectMeta": {
|
12 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
|
13 |
+
}
|
14 |
+
}
|
15 |
+
},
|
16 |
+
{
|
17 |
+
"_id": "e2b89640-bfd4-11e8-8858-bbc566841533",
|
18 |
+
"_type": "visualization",
|
19 |
+
"_source": {
|
20 |
+
"title": "Activity - pie [twinttweets]",
|
21 |
+
"visState": "{\"aggs\":[{\"enabled\":true,\"id\":\"1\",\"params\":{},\"schema\":\"metric\",\"type\":\"count\"},{\"enabled\":true,\"id\":\"2\",\"params\":{\"field\":\"user_id\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"order\":\"desc\",\"orderBy\":\"1\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"size\":5},\"schema\":\"segment\",\"type\":\"terms\"}],\"params\":{\"addLegend\":true,\"addTooltip\":true,\"isDonut\":true,\"labels\":{\"last_level\":true,\"show\":false,\"truncate\":100,\"values\":true},\"legendPosition\":\"right\",\"type\":\"pie\"},\"title\":\"Activity - pie [twinttweets]\",\"type\":\"pie\"}",
|
22 |
+
"uiStateJSON": "{}",
|
23 |
+
"description": "",
|
24 |
+
"version": 1,
|
25 |
+
"kibanaSavedObjectMeta": {
|
26 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
|
27 |
+
}
|
28 |
+
}
|
29 |
+
},
|
30 |
+
{
|
31 |
+
"_id": "37cd72e0-bfe4-11e8-961a-d371b24d5d1d",
|
32 |
+
"_type": "visualization",
|
33 |
+
"_source": {
|
34 |
+
"title": "Tweets Count [twinttweet]",
|
35 |
+
"visState": "{\"title\":\"Tweets Count [twinttweet]\",\"type\":\"metric\",\"params\":{\"addTooltip\":true,\"addLegend\":false,\"type\":\"metric\",\"metric\":{\"percentageMode\":false,\"useRanges\":false,\"colorSchema\":\"Green to Red\",\"metricColorMode\":\"None\",\"colorsRange\":[{\"from\":0,\"to\":10000}],\"labels\":{\"show\":true},\"invertColors\":false,\"style\":{\"bgFill\":\"#000\",\"bgColor\":false,\"labelColor\":false,\"subText\":\"\",\"fontSize\":33}}},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}}]}",
|
36 |
+
"uiStateJSON": "{}",
|
37 |
+
"description": "",
|
38 |
+
"version": 1,
|
39 |
+
"kibanaSavedObjectMeta": {
|
40 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
|
41 |
+
}
|
42 |
+
}
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"_id": "149ecbc0-bfe4-11e8-961a-d371b24d5d1d",
|
46 |
+
"_type": "visualization",
|
47 |
+
"_source": {
|
48 |
+
"title": "Word Cloud [twinttweets]",
|
49 |
+
"visState": "{\"title\":\"Word Cloud [twinttweets]\",\"type\":\"tagcloud\",\"params\":{\"scale\":\"linear\",\"orientation\":\"single\",\"minFontSize\":10,\"maxFontSize\":50,\"showLabel\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"segment\",\"params\":{\"field\":\"username\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\"}}]}",
|
50 |
+
"uiStateJSON": "{}",
|
51 |
+
"description": "",
|
52 |
+
"version": 1,
|
53 |
+
"kibanaSavedObjectMeta": {
|
54 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
|
55 |
+
}
|
56 |
+
}
|
57 |
+
},
|
58 |
+
{
|
59 |
+
"_id": "a8d3ee70-bfd9-11e8-8858-bbc566841533",
|
60 |
+
"_type": "visualization",
|
61 |
+
"_source": {
|
62 |
+
"title": "Day-activity [twinttweet]",
|
63 |
+
"visState": "{\"title\":\"Day-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"addLegend\":true,\"addTimeMarker\":false,\"addTooltip\":true,\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"position\":\"bottom\",\"scale\":{\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{},\"type\":\"category\"}],\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-3\"},\"legendPosition\":\"right\",\"orderBucketsBySum\":false,\"seriesParams\":[{\"data\":{\"id\":\"1\",\"label\":\"Tweets\"},\"drawLinesBetweenPoints\":true,\"mode\":\"normal\",\"show\":\"true\",\"showCircles\":true,\"type\":\"histogram\",\"valueAxis\":\"ValueAxis-3\"}],\"times\":[],\"type\":\"histogram\",\"valueAxes\":[{\"id\":\"ValueAxis-3\",\"labels\":{\"filter\":false,\"rotate\":0,\"show\":true,\"truncate\":100},\"name\":\"LeftAxis-1\",\"position\":\"left\",\"scale\":{\"mode\":\"normal\",\"type\":\"linear\"},\"show\":true,\"style\":{},\"title\":{\"text\":\"Tweets\"},\"type\":\"value\"}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{\"min\":0,\"max\":23}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"asc\",\"orderBy\":\"_term\",\"customLabel\":\"\"}}]}",
|
64 |
+
"uiStateJSON": "{\"vis\":{\"legendOpen\":true}}",
|
65 |
+
"description": "",
|
66 |
+
"version": 1,
|
67 |
+
"kibanaSavedObjectMeta": {
|
68 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"language\":\"lucene\",\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\"},\"filter\":[]}"
|
69 |
+
}
|
70 |
+
}
|
71 |
+
},
|
72 |
+
{
|
73 |
+
"_id": "8a8bb420-bfd9-11e8-8858-bbc566841533",
|
74 |
+
"_type": "visualization",
|
75 |
+
"_source": {
|
76 |
+
"title": "Week-activity [twinttweet]",
|
77 |
+
"visState": "{\"title\":\"Week-activity [twinttweet]\",\"type\":\"histogram\",\"params\":{\"type\":\"histogram\",\"grid\":{\"categoryLines\":true,\"style\":{\"color\":\"#eee\"},\"valueAxis\":\"ValueAxis-1\"},\"categoryAxes\":[{\"id\":\"CategoryAxis-1\",\"type\":\"category\",\"position\":\"bottom\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\"},\"labels\":{\"show\":true,\"truncate\":100,\"rotate\":0},\"title\":{}}],\"valueAxes\":[{\"id\":\"ValueAxis-1\",\"name\":\"LeftAxis-1\",\"type\":\"value\",\"position\":\"left\",\"show\":true,\"style\":{},\"scale\":{\"type\":\"linear\",\"mode\":\"normal\"},\"labels\":{\"show\":true,\"rotate\":0,\"filter\":false,\"truncate\":100},\"title\":{\"text\":\"Tweets\"}}],\"seriesParams\":[{\"show\":\"true\",\"type\":\"histogram\",\"mode\":\"normal\",\"data\":{\"label\":\"Tweets\",\"id\":\"1\"},\"valueAxis\":\"ValueAxis-1\",\"drawLinesBetweenPoints\":true,\"showCircles\":true}],\"addTooltip\":true,\"addLegend\":true,\"legendPosition\":\"right\",\"times\":[],\"addTimeMarker\":false},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{\"customLabel\":\"Tweets\"}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":true,\"extended_bounds\":{},\"customLabel\":\"Days of the week\"}},{\"id\":\"3\",\"enabled\":true,\"type\":\"terms\",\"schema\":\"group\",\"params\":{\"field\":\"user_id\",\"otherBucket\":false,\"otherBucketLabel\":\"Other\",\"missingBucket\":false,\"missingBucketLabel\":\"Missing\",\"size\":10,\"order\":\"desc\",\"orderBy\":\"1\",\"customLabel\":\"\"}}]}",
|
78 |
+
"uiStateJSON": "{}",
|
79 |
+
"description": "",
|
80 |
+
"version": 1,
|
81 |
+
"kibanaSavedObjectMeta": {
|
82 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"query\":{\"query\":\"NOT _exists_:likes NOT _exists_:retweets NOT _exists_:replies\",\"language\":\"lucene\"},\"filter\":[]}"
|
83 |
+
}
|
84 |
+
}
|
85 |
+
},
|
86 |
+
{
|
87 |
+
"_id": "b45ec590-c267-11e8-bcd4-3956fe930db7",
|
88 |
+
"_type": "visualization",
|
89 |
+
"_source": {
|
90 |
+
"title": "Heat-map [twinttweets]",
|
91 |
+
"visState": "{\"title\":\"Heat-map [twinttweets]\",\"type\":\"heatmap\",\"params\":{\"type\":\"heatmap\",\"addTooltip\":true,\"addLegend\":true,\"enableHover\":true,\"legendPosition\":\"right\",\"times\":[],\"colorsNumber\":10,\"colorSchema\":\"Reds\",\"setColorRange\":false,\"colorsRange\":[{\"from\":0,\"to\":10},{\"from\":10,\"to\":100},{\"from\":100,\"to\":200},{\"from\":200,\"to\":500},{\"from\":500,\"to\":1000},{\"from\":1000,\"to\":2000},{\"from\":2000,\"to\":3000},{\"from\":3000,\"to\":4000},{\"from\":4000,\"to\":5000},{\"from\":7000,\"to\":null}],\"invertColors\":false,\"percentageMode\":false,\"valueAxes\":[{\"show\":false,\"id\":\"ValueAxis-1\",\"type\":\"value\",\"scale\":{\"type\":\"linear\",\"defaultYExtents\":true},\"labels\":{\"show\":false,\"rotate\":270,\"overwriteColor\":false,\"color\":\"#555\"}}]},\"aggs\":[{\"id\":\"1\",\"enabled\":true,\"type\":\"count\",\"schema\":\"metric\",\"params\":{}},{\"id\":\"2\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"segment\",\"params\":{\"field\":\"hour\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{}}},{\"id\":\"3\",\"enabled\":true,\"type\":\"histogram\",\"schema\":\"group\",\"params\":{\"field\":\"day\",\"interval\":1,\"min_doc_count\":false,\"extended_bounds\":{\"min\":0,\"max\":2}}}]}",
|
92 |
+
"uiStateJSON": "{\"vis\":{\"defaultColors\":{\"3 - 592\":\"rgb(255,245,240)\",\"592 - 1.180\":\"rgb(254,228,216)\",\"1.180 - 1.769\":\"rgb(253,202,181)\",\"1.769 - 2.357\":\"rgb(252,171,142)\",\"2.357 - 2.945\":\"rgb(252,138,106)\",\"2.945 - 3.534\":\"rgb(251,106,74)\",\"3.534 - 4.122\":\"rgb(241,68,50)\",\"4.122 - 4.711\":\"rgb(217,38,35)\",\"4.711 - 5.299\":\"rgb(188,20,26)\",\"5.299 - 5.887\":\"rgb(152,12,19)\"},\"colors\":{\"3 - 592\":\"#FCEACA\",\"592 - 1.180\":\"#F9E2D2\",\"1.180 - 1.769\":\"#F9BA8F\"}}}",
|
93 |
+
"description": "",
|
94 |
+
"version": 1,
|
95 |
+
"kibanaSavedObjectMeta": {
|
96 |
+
"searchSourceJSON": "{\"index\":\"755f4660-bfee-11e8-9911-5b8e1e9c87c6\",\"filter\":[],\"query\":{\"language\":\"lucene\",\"query\":\"\"}}"
|
97 |
+
}
|
98 |
+
}
|
99 |
+
}
|
100 |
+
]
|
twitter_scraper/twint_master/extracted-tweets.txt
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
'@annieloof Nej, jag håller med. Tänk mer som Mathias Andersson (SD). https://t.co/gSqQDz5N8z'
|
2 |
+
'Man kan ha synpunkter på en sådan lösning, men den är naturligtvis att föredra framför frigående våldsverkare som fortsätter misshandla sina offer i väntan på fängelse.'
|
3 |
+
'Är det ont om plats på anstalterna så får man sänka standarden rejält för att få rum med fler interner per kvadratmeter.'
|
4 |
+
'Döms man för brott, särskilt våldsbrott, ska man vara inlåst från det att domen faller tills straffet är avtjänat. Allt annat är vansinne.'
|
5 |
+
'Platsbrist? Jaha, vad spelar det för roll? \n\nDet gör mig förbannad och bestört att lösningen på problemet med överfulla fängelser verkar vara att dömda våldsbrottslingar får röra sig fritt i samhället istället för att sitta inlåsta. \n\nhttps://t.co/QDi9rM3kMC'
|
twitter_scraper/twint_master/requirements.txt
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
aiohttp
|
2 |
+
aiodns
|
3 |
+
beautifulsoup4
|
4 |
+
cchardet
|
5 |
+
dataclasses
|
6 |
+
elasticsearch
|
7 |
+
pysocks
|
8 |
+
pandas>=0.23.0
|
9 |
+
aiohttp_socks<=0.4.1
|
10 |
+
schedule
|
11 |
+
geopy
|
12 |
+
fake-useragent
|
13 |
+
googletransx
|