Spaces:
Runtime error
Runtime error
Merge pull request #30 from Demea9000/22-redo-twitter-scraper
Browse files- .idea/misc.xml +1 -1
- .idea/politweet.iml +1 -0
- requirements.txt +28 -0
- twitter-scraper/TwitterScraper.py +17 -0
- twitter-scraper/scrape.py +91 -0
- twitter-scraper/twint-master/.github/FUNDING.yml +0 -3
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md +0 -20
- twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +0 -17
- twitter-scraper/twint-master/.gitignore +0 -115
- twitter-scraper/twint-master/.travis.yml +0 -23
- twitter-scraper/twint-master/Dockerfile +0 -10
- twitter-scraper/twint-master/LICENSE +0 -21
- twitter-scraper/twint-master/MANIFEST.in +0 -1
- twitter-scraper/twint-master/README.md +0 -272
- twitter-scraper/twint-master/Untitled.ipynb +0 -282
- twitter-scraper/twint-master/automate.py +0 -65
- twitter-scraper/twint-master/elasticsearch/README.md +0 -5
- twitter-scraper/twint-master/scrape.py +0 -102
- twitter-scraper/twint-master/scrape__init__.py +0 -14
- twitter-scraper/twint-master/setup.py +0 -65
- twitter-scraper/twint-master/test.py +0 -92
- twitter-scraper/twint-master/twint/__init__.py +0 -32
- twitter-scraper/twint-master/twint/__version__.py +0 -3
- twitter-scraper/twint-master/twint/cli.py +0 -342
- twitter-scraper/twint-master/twint/config.py +0 -87
- twitter-scraper/twint-master/twint/datelock.py +0 -44
- twitter-scraper/twint-master/twint/feed.py +0 -145
- twitter-scraper/twint-master/twint/format.py +0 -91
- twitter-scraper/twint-master/twint/get.py +0 -298
- twitter-scraper/twint-master/twint/output.py +0 -241
- twitter-scraper/twint-master/twint/run.py +0 -412
- twitter-scraper/twint-master/twint/storage/__init__.py +0 -0
- twitter-scraper/twint-master/twint/storage/db.py +0 -297
- twitter-scraper/twint-master/twint/storage/elasticsearch.py +0 -364
- twitter-scraper/twint-master/twint/storage/panda.py +0 -196
- twitter-scraper/twint-master/twint/storage/write.py +0 -77
- twitter-scraper/twint-master/twint/storage/write_meta.py +0 -151
- twitter-scraper/twint-master/twint/token.py +0 -94
- twitter-scraper/twint-master/twint/tweet.py +0 -166
- twitter-scraper/twint-master/twint/url.py +0 -195
- twitter-scraper/twint-master/twint/user.py +0 -52
- twitter-scraper/twint-master/twint/verbose.py +0 -18
- twitter-scraper/twint-master/twitter_scraper.ipynb +0 -265
- twitter-scraper/twitter_scraper.ipynb +819 -0
.idea/misc.xml
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
-
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.
|
4 |
</project>
|
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (politweet)" project-jdk-type="Python SDK" />
|
4 |
</project>
|
.idea/politweet.iml
CHANGED
@@ -3,6 +3,7 @@
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
|
|
6 |
</content>
|
7 |
<orderEntry type="inheritedJdk" />
|
8 |
<orderEntry type="sourceFolder" forTests="false" />
|
|
|
3 |
<component name="NewModuleRootManager">
|
4 |
<content url="file://$MODULE_DIR$">
|
5 |
<excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
|
6 |
+
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
7 |
</content>
|
8 |
<orderEntry type="inheritedJdk" />
|
9 |
<orderEntry type="sourceFolder" forTests="false" />
|
requirements.txt
CHANGED
@@ -1,23 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
certifi==2022.6.15
|
|
|
2 |
charset-normalizer==2.1.0
|
3 |
cycler==0.11.0
|
|
|
|
|
|
|
4 |
et-xmlfile==1.1.0
|
|
|
5 |
fonttools==4.34.0
|
|
|
|
|
|
|
|
|
6 |
idna==3.3
|
7 |
kiwisolver==1.4.3
|
8 |
matplotlib==3.5.2
|
|
|
9 |
numpy==1.23.0
|
|
|
10 |
openai==0.20.0
|
11 |
openpyxl==3.0.10
|
12 |
packaging==21.3
|
13 |
pandas==1.4.3
|
14 |
pandas-stubs==1.4.3.220704
|
15 |
Pillow==9.2.0
|
|
|
|
|
16 |
pyparsing==3.0.9
|
|
|
17 |
python-dateutil==2.8.2
|
|
|
18 |
pytz==2022.1
|
19 |
regex==2022.6.2
|
20 |
requests==2.28.1
|
|
|
|
|
21 |
six==1.16.0
|
|
|
22 |
tqdm==4.64.0
|
|
|
23 |
urllib3==1.26.9
|
|
|
|
1 |
+
aiodns==3.0.0
|
2 |
+
aiohttp==3.8.1
|
3 |
+
aiohttp-socks==0.7.1
|
4 |
+
aiosignal==1.2.0
|
5 |
+
async-timeout==4.0.2
|
6 |
+
attrs==21.4.0
|
7 |
+
beautifulsoup4==4.11.1
|
8 |
+
cchardet==2.1.7
|
9 |
certifi==2022.6.15
|
10 |
+
cffi==1.15.1
|
11 |
charset-normalizer==2.1.0
|
12 |
cycler==0.11.0
|
13 |
+
dataclasses==0.6
|
14 |
+
elastic-transport==8.1.2
|
15 |
+
elasticsearch==8.3.1
|
16 |
et-xmlfile==1.1.0
|
17 |
+
fake-useragent==0.1.11
|
18 |
fonttools==4.34.0
|
19 |
+
frozenlist==1.3.0
|
20 |
+
geographiclib==1.52
|
21 |
+
geopy==2.2.0
|
22 |
+
googletransx==2.4.2
|
23 |
idna==3.3
|
24 |
kiwisolver==1.4.3
|
25 |
matplotlib==3.5.2
|
26 |
+
multidict==6.0.2
|
27 |
numpy==1.23.0
|
28 |
+
oauthlib==3.2.0
|
29 |
openai==0.20.0
|
30 |
openpyxl==3.0.10
|
31 |
packaging==21.3
|
32 |
pandas==1.4.3
|
33 |
pandas-stubs==1.4.3.220704
|
34 |
Pillow==9.2.0
|
35 |
+
pycares==4.2.1
|
36 |
+
pycparser==2.21
|
37 |
pyparsing==3.0.9
|
38 |
+
PySocks==1.7.1
|
39 |
python-dateutil==2.8.2
|
40 |
+
python-socks==2.0.3
|
41 |
pytz==2022.1
|
42 |
regex==2022.6.2
|
43 |
requests==2.28.1
|
44 |
+
requests-oauthlib==1.3.1
|
45 |
+
schedule==1.1.0
|
46 |
six==1.16.0
|
47 |
+
soupsieve==2.3.2.post1
|
48 |
tqdm==4.64.0
|
49 |
+
-e git+https://github.com/twintproject/twint.git@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
|
50 |
urllib3==1.26.9
|
51 |
+
yarl==1.7.2
|
twitter-scraper/TwitterScraper.py
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import twint
|
2 |
+
import datetime
|
3 |
+
|
4 |
+
c = twint.Config()
|
5 |
+
|
6 |
+
c.Search = ['Taylor Swift'] # topic
|
7 |
+
c.Limit = 500 # number of Tweets to scrape
|
8 |
+
c.Store_csv = True # store tweets in a csv file
|
9 |
+
c.Output = "taylor_swift_tweets.csv" # path to csv file
|
10 |
+
|
11 |
+
twint.run.Search(c)
|
12 |
+
|
13 |
+
import pandas as pd
|
14 |
+
|
15 |
+
df = pd.read_csv('taylor_swift_tweets.csv')
|
16 |
+
|
17 |
+
print(df.head())
|
twitter-scraper/scrape.py
ADDED
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import twint
|
2 |
+
from datetime import date
|
3 |
+
|
4 |
+
|
5 |
+
class TwitterScraper(object):
|
6 |
+
"""
|
7 |
+
This class is a twitter TwitterScraper called TwitterScraper. It takes the user as input and collects the user's tweets
|
8 |
+
from 'from_date' to 'to_date'. If 'from_date' and 'to_date' are not specified, it collects the number of tweets 'num_tweets' from today.
|
9 |
+
It outputs a dictionary with the tweet unique id and some other information.
|
10 |
+
input: user, from_date, to_date, num_tweets
|
11 |
+
output: dict
|
12 |
+
"""
|
13 |
+
def __init__(self, from_date="2006-07-01", to_date=str(date.today()), num_tweets=20):
|
14 |
+
self.from_date = from_date
|
15 |
+
self.to_date = to_date
|
16 |
+
self.num_tweets = num_tweets
|
17 |
+
self.conf = twint.Config()
|
18 |
+
|
19 |
+
def scrape_by_user(self, _user):
|
20 |
+
"""This method uses twint to extract tweets based on username"""
|
21 |
+
self.conf.Search = "from:@" + _user # is the search configuration is given in this format it searches after
|
22 |
+
# user_names.
|
23 |
+
return self.__get_tweets__from_twint__()
|
24 |
+
|
25 |
+
def scrape_by_string(self, _string: str):
|
26 |
+
"""This method uses twint to extract tweets based on string.
|
27 |
+
all extracted tweets have the specified word in _string parameter in it.
|
28 |
+
"""
|
29 |
+
self.conf.Search = _string # this tells twint configuration to search for string
|
30 |
+
return self.__get_tweets__from_twint__()
|
31 |
+
|
32 |
+
def scrape_by_user_and_string(self, _user: str, _string: str):
|
33 |
+
"""This method uses twint to extract tweets brased on string and username"""
|
34 |
+
self.conf.Username = _user
|
35 |
+
self.conf.Search = _string
|
36 |
+
return self.__get_tweets__from_twint__()
|
37 |
+
|
38 |
+
def get_only_tweets(self, tweet_and_replies_info):
|
39 |
+
tweet_and_replies = tweet_and_replies_info["tweet"]
|
40 |
+
"""
|
41 |
+
This functions input arg is a data frame (the output from scrape methords ) and removes...
|
42 |
+
all tweets starting with \"@\" which is indicator of a reply or retweet.
|
43 |
+
"""
|
44 |
+
indx_replies = []
|
45 |
+
for i in range(len(tweet_and_replies)):
|
46 |
+
if tweet_and_replies[i].startswith("@"):
|
47 |
+
indx_replies.append(i)
|
48 |
+
|
49 |
+
tweets_info = tweet_and_replies_info.drop(labels=indx_replies, axis=0)
|
50 |
+
# drop removes the columns which its index specified by
|
51 |
+
# indx_replies. axis=0 if we want to delete rows.
|
52 |
+
#print(len(tweets['tweet']), " of them are Tweets")
|
53 |
+
return tweets_info
|
54 |
+
|
55 |
+
def __get_tweets__from_twint__(self):
|
56 |
+
""" __get_tweets_from_twint__
|
57 |
+
tweet info is a dataframe with fallowing columns
|
58 |
+
Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
59 |
+
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
60 |
+
'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
|
61 |
+
'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
|
62 |
+
'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
|
63 |
+
'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
|
64 |
+
'trans_dest']
|
65 |
+
we just pick the relevant ones.
|
66 |
+
c is a twint.Config() object
|
67 |
+
we also configure twint output.
|
68 |
+
"""
|
69 |
+
self.conf.Pandas = True #
|
70 |
+
self.conf.Count = True #
|
71 |
+
self.conf.Limit = self.num_tweets # specifies how many tweet should be scraped
|
72 |
+
self.conf.Since = self.from_date
|
73 |
+
self.conf.Until = self.to_date
|
74 |
+
self.conf.Hide_output = True # Hides the output. If set to False it will prints tweets in the terminal window.
|
75 |
+
twint.run.Search(self.conf)
|
76 |
+
tweet_and_replies_inf = twint.output.panda.Tweets_df # here we say that output souldwe dataframe.
|
77 |
+
tweet_and_replies_inf = tweet_and_replies_inf[
|
78 |
+
["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
|
79 |
+
return tweet_and_replies_inf
|
80 |
+
# def __check_date_type(d1,d2): if (type(d1) or type(d2)) is not type("str"): # If the type of ite date input
|
81 |
+
# is not string it generates exception print("[!] Please make sure the date is a string in this format
|
82 |
+
# \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
|
83 |
+
# "-")))<2: print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ") raise EXCEPTION(
|
84 |
+
# "Incorrect date type Exception!")
|
85 |
+
if __name__ == "__main__":
|
86 |
+
sc = TwitterScraper(num_tweets=1002)
|
87 |
+
dc = sc.scrape_by_string("jimmieakesson")
|
88 |
+
print(dc.head())
|
89 |
+
print(dc.shape)
|
90 |
+
|
91 |
+
|
twitter-scraper/twint-master/.github/FUNDING.yml
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
# These are supported funding model platforms
|
2 |
-
patreon: twintproject
|
3 |
-
custom: paypal.me/noneprivacy
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md
DELETED
@@ -1,20 +0,0 @@
|
|
1 |
-
# Issue Template
|
2 |
-
Please use this template!
|
3 |
-
|
4 |
-
## Initial Check
|
5 |
-
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
6 |
-
|
7 |
-
>Make sure you've checked the following:
|
8 |
-
|
9 |
-
- [] Python version is 3.6 or later;
|
10 |
-
- [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
|
11 |
-
- [] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
|
12 |
-
|
13 |
-
## Command Ran
|
14 |
-
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
15 |
-
|
16 |
-
## Description of Issue
|
17 |
-
>Please use **as much detail as possible.**
|
18 |
-
|
19 |
-
## Environment Details
|
20 |
-
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md
DELETED
@@ -1,17 +0,0 @@
|
|
1 |
-
### Initial Check
|
2 |
-
> If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
|
3 |
-
|
4 |
-
>Make sure you've checked the following:
|
5 |
-
|
6 |
-
- [] Python version is 3.6;
|
7 |
-
- [] Using the latest version of Twint;
|
8 |
-
- [] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
|
9 |
-
|
10 |
-
### Command Ran
|
11 |
-
>Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
|
12 |
-
|
13 |
-
### Description of Issue
|
14 |
-
>Please use **as much detail as possible.**
|
15 |
-
|
16 |
-
### Environment Details
|
17 |
-
>Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.gitignore
DELETED
@@ -1,115 +0,0 @@
|
|
1 |
-
# Byte-compiled / optimized / DLL files
|
2 |
-
__pycache__/
|
3 |
-
*.py[cod]
|
4 |
-
*$py.class
|
5 |
-
tweets.db
|
6 |
-
# C extensions
|
7 |
-
*.so
|
8 |
-
|
9 |
-
config.ini
|
10 |
-
twint/storage/mysql.py
|
11 |
-
|
12 |
-
# Node Dependency directories
|
13 |
-
node_modules/
|
14 |
-
jspm_packages/
|
15 |
-
tests/
|
16 |
-
# Distribution / packaging
|
17 |
-
.Python
|
18 |
-
env/
|
19 |
-
build/
|
20 |
-
develop-eggs/
|
21 |
-
dist/
|
22 |
-
downloads/
|
23 |
-
eggs/
|
24 |
-
.eggs/
|
25 |
-
lib/
|
26 |
-
lib64/
|
27 |
-
parts/
|
28 |
-
sdist/
|
29 |
-
var/
|
30 |
-
wheels/
|
31 |
-
*.egg-info/
|
32 |
-
.installed.cfg
|
33 |
-
*.egg
|
34 |
-
|
35 |
-
# PyInstaller
|
36 |
-
# Usually these files are written by a python script from a template
|
37 |
-
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
38 |
-
*.manifest
|
39 |
-
*.spec
|
40 |
-
|
41 |
-
# Installer logs
|
42 |
-
pip-log.txt
|
43 |
-
pip-delete-this-directory.txt
|
44 |
-
|
45 |
-
# Unit test / coverage reports
|
46 |
-
htmlcov/
|
47 |
-
.tox/
|
48 |
-
.coverage
|
49 |
-
.coverage.*
|
50 |
-
.cache
|
51 |
-
nosetests.xml
|
52 |
-
coverage.xml
|
53 |
-
*.cover
|
54 |
-
.hypothesis/
|
55 |
-
|
56 |
-
# Translations
|
57 |
-
*.mo
|
58 |
-
*.pot
|
59 |
-
|
60 |
-
# Django stuff:
|
61 |
-
*.log
|
62 |
-
local_settings.py
|
63 |
-
|
64 |
-
# Flask stuff:
|
65 |
-
instance/
|
66 |
-
.webassets-cache
|
67 |
-
|
68 |
-
# Scrapy stuff:
|
69 |
-
.scrapy
|
70 |
-
|
71 |
-
# Sphinx documentation
|
72 |
-
docs/_build/
|
73 |
-
|
74 |
-
# PyBuilder
|
75 |
-
target/
|
76 |
-
|
77 |
-
# Jupyter Notebook
|
78 |
-
.ipynb_checkpoints
|
79 |
-
|
80 |
-
# pyenv
|
81 |
-
.python-version
|
82 |
-
|
83 |
-
# celery beat schedule file
|
84 |
-
celerybeat-schedule
|
85 |
-
|
86 |
-
# SageMath parsed files
|
87 |
-
*.sage.py
|
88 |
-
|
89 |
-
# dotenv
|
90 |
-
.env
|
91 |
-
|
92 |
-
# virtualenv
|
93 |
-
.venv
|
94 |
-
venv/
|
95 |
-
ENV/
|
96 |
-
|
97 |
-
# Spyder project settings
|
98 |
-
.spyderproject
|
99 |
-
.spyproject
|
100 |
-
|
101 |
-
# Rope project settings
|
102 |
-
.ropeproject
|
103 |
-
|
104 |
-
# mkdocs documentation
|
105 |
-
/site
|
106 |
-
|
107 |
-
# mypy
|
108 |
-
.mypy_cache/
|
109 |
-
|
110 |
-
# output
|
111 |
-
*.csv
|
112 |
-
*.json
|
113 |
-
*.txt
|
114 |
-
|
115 |
-
test_twint.py
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/.travis.yml
DELETED
@@ -1,23 +0,0 @@
|
|
1 |
-
dist: bionic
|
2 |
-
language: python
|
3 |
-
python:
|
4 |
-
- "3.6"
|
5 |
-
- "3.7"
|
6 |
-
- "3.8"
|
7 |
-
- "nightly"
|
8 |
-
matrix:
|
9 |
-
allow_failures:
|
10 |
-
- python: "nightly"
|
11 |
-
- python: "3.8"
|
12 |
-
install:
|
13 |
-
- pip install -r requirements.txt
|
14 |
-
script:
|
15 |
-
- python test.py
|
16 |
-
deploy:
|
17 |
-
provider: pypi
|
18 |
-
user: "codyzacharias"
|
19 |
-
password:
|
20 |
-
secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
|
21 |
-
on:
|
22 |
-
tags: true
|
23 |
-
python: "3.7"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/Dockerfile
DELETED
@@ -1,10 +0,0 @@
|
|
1 |
-
FROM python:3.6-buster
|
2 |
-
LABEL maintainer="codyzacharias@pm.me"
|
3 |
-
|
4 |
-
WORKDIR /root
|
5 |
-
|
6 |
-
RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
|
7 |
-
cd /root/twint && \
|
8 |
-
pip3 install . -r requirements.txt
|
9 |
-
|
10 |
-
CMD /bin/bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/LICENSE
DELETED
@@ -1,21 +0,0 @@
|
|
1 |
-
MIT License
|
2 |
-
|
3 |
-
Copyright (c) 2018 Cody Zacharias
|
4 |
-
|
5 |
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
6 |
-
of this software and associated documentation files (the "Software"), to deal
|
7 |
-
in the Software without restriction, including without limitation the rights
|
8 |
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
9 |
-
copies of the Software, and to permit persons to whom the Software is
|
10 |
-
furnished to do so, subject to the following conditions:
|
11 |
-
|
12 |
-
The above copyright notice and this permission notice shall be included in all
|
13 |
-
copies or substantial portions of the Software.
|
14 |
-
|
15 |
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
16 |
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
17 |
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
18 |
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
19 |
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
20 |
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
21 |
-
SOFTWARE.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/MANIFEST.in
DELETED
@@ -1 +0,0 @@
|
|
1 |
-
include README.md LICENSE
|
|
|
|
twitter-scraper/twint-master/README.md
DELETED
@@ -1,272 +0,0 @@
|
|
1 |
-
20220207.0
|
2 |
-
|
3 |
-
# About this fork
|
4 |
-
|
5 |
-
[This repository](https://github.com/minamotorin/twint) is the fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint) and for myself.
|
6 |
-
|
7 |
-
Modified by [minamotorin](https://github.com/minamotorin).
|
8 |
-
|
9 |
-
## Updates from twintproject/twint
|
10 |
-
|
11 |
-
### twint.token.RefreshTokenException: Could not find the Guest token in HTML
|
12 |
-
|
13 |
-
This problem doesn't happen recently.
|
14 |
-
|
15 |
-
#### Related
|
16 |
-
|
17 |
-
- [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
|
18 |
-
- [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
|
19 |
-
- [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
|
20 |
-
- [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
|
21 |
-
- [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
|
22 |
-
|
23 |
-
### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
|
24 |
-
|
25 |
-
The fix is **not complete**.
|
26 |
-
`twint.run.Profile` will work but `twint.run.db` will not.
|
27 |
-
This means [`test.py`](./test.py) causes an error.
|
28 |
-
|
29 |
-
I think this is because the fields of the result table are not exactly the same as the traditional ones.
|
30 |
-
|
31 |
-
#### Related
|
32 |
-
|
33 |
-
- [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
|
34 |
-
|
35 |
-
### [-] TWINT requires Python version 3.6+.
|
36 |
-
|
37 |
-
#### Related
|
38 |
-
|
39 |
-
- [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
|
40 |
-
- [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
|
41 |
-
- [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1346)
|
42 |
-
- [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
|
43 |
-
- [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
|
44 |
-
|
45 |
-
## References
|
46 |
-
|
47 |
-
- [snscrape](https://github.com/JustAnotherArchivist/snscrape)
|
48 |
-
- [gallery-dl](https://github.com/mikf/gallery-dl)
|
49 |
-
|
50 |
-
## License
|
51 |
-
|
52 |
-
This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
|
53 |
-
|
54 |
-
---
|
55 |
-
|
56 |
-
# TWINT - Twitter Intelligence Tool
|
57 |
-
![2](https://i.imgur.com/iaH3s7z.png)
|
58 |
-
![3](https://i.imgur.com/hVeCrqL.png)
|
59 |
-
|
60 |
-
[![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social)
|
61 |
-
|
62 |
-
>No authentication. No API. No limits.
|
63 |
-
|
64 |
-
Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
|
65 |
-
|
66 |
-
Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too.
|
67 |
-
|
68 |
-
Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
|
69 |
-
|
70 |
-
## tl;dr Benefits
|
71 |
-
Some of the benefits of using Twint vs Twitter API:
|
72 |
-
- Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only);
|
73 |
-
- Fast initial setup;
|
74 |
-
- Can be used anonymously and without Twitter sign up;
|
75 |
-
- **No rate limitations**.
|
76 |
-
|
77 |
-
## Limits imposed by Twitter
|
78 |
-
Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
|
79 |
-
|
80 |
-
## Requirements
|
81 |
-
- Python 3.6;
|
82 |
-
- aiohttp;
|
83 |
-
- aiodns;
|
84 |
-
- beautifulsoup4;
|
85 |
-
- cchardet;
|
86 |
-
- dataclasses
|
87 |
-
- elasticsearch;
|
88 |
-
- pysocks;
|
89 |
-
- pandas (>=0.23.0);
|
90 |
-
- aiohttp_socks;
|
91 |
-
- schedule;
|
92 |
-
- geopy;
|
93 |
-
- fake-useragent;
|
94 |
-
- py-googletransx.
|
95 |
-
|
96 |
-
## Installing
|
97 |
-
|
98 |
-
**Git:**
|
99 |
-
```bash
|
100 |
-
git clone --depth=1 https://github.com/twintproject/twint.git
|
101 |
-
cd twint
|
102 |
-
pip3 install . -r requirements.txt
|
103 |
-
```
|
104 |
-
|
105 |
-
**Pip:**
|
106 |
-
```bash
|
107 |
-
pip3 install twint
|
108 |
-
```
|
109 |
-
|
110 |
-
or
|
111 |
-
|
112 |
-
```bash
|
113 |
-
pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
|
114 |
-
```
|
115 |
-
|
116 |
-
**Pipenv**:
|
117 |
-
```bash
|
118 |
-
pipenv install git+https://github.com/twintproject/twint.git#egg=twint
|
119 |
-
```
|
120 |
-
|
121 |
-
### March 2, 2021 Update
|
122 |
-
|
123 |
-
**Added**: Dockerfile
|
124 |
-
|
125 |
-
Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
|
126 |
-
|
127 |
-
## CLI Basic Examples and Combos
|
128 |
-
A few simple examples to help you understand the basics:
|
129 |
-
|
130 |
-
- `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
|
131 |
-
- `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
|
132 |
-
- `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
|
133 |
-
- `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
|
134 |
-
- `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
|
135 |
-
- `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
|
136 |
-
- `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
|
137 |
-
- `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
|
138 |
-
- `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
|
139 |
-
- `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
|
140 |
-
- `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
|
141 |
-
- `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch
|
142 |
-
- `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
|
143 |
-
- `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
|
144 |
-
- `twint -u username --followers` - Scrape a Twitter user's followers.
|
145 |
-
- `twint -u username --following` - Scrape who a Twitter user follows.
|
146 |
-
- `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweet).
|
147 |
-
- `twint -u username --following --user-full` - Collect full user information a person follows
|
148 |
-
- `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**).
|
149 |
-
- `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
|
150 |
-
- `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
|
151 |
-
|
152 |
-
More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands)
|
153 |
-
|
154 |
-
## Module Example
|
155 |
-
|
156 |
-
Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
|
157 |
-
|
158 |
-
```python
|
159 |
-
import twint
|
160 |
-
|
161 |
-
# Configure
|
162 |
-
c = twint.Config()
|
163 |
-
c.Username = "realDonaldTrump"
|
164 |
-
c.Search = "great"
|
165 |
-
|
166 |
-
# Run
|
167 |
-
twint.run.Search(c)
|
168 |
-
```
|
169 |
-
> Output
|
170 |
-
|
171 |
-
`955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
|
172 |
-
|
173 |
-
```python
|
174 |
-
import twint
|
175 |
-
|
176 |
-
c = twint.Config()
|
177 |
-
|
178 |
-
c.Username = "noneprivacy"
|
179 |
-
c.Custom["tweet"] = ["id"]
|
180 |
-
c.Custom["user"] = ["bio"]
|
181 |
-
c.Limit = 10
|
182 |
-
c.Store_csv = True
|
183 |
-
c.Output = "none"
|
184 |
-
|
185 |
-
twint.run.Search(c)
|
186 |
-
```
|
187 |
-
|
188 |
-
## Storing Options
|
189 |
-
- Write to file;
|
190 |
-
- CSV;
|
191 |
-
- JSON;
|
192 |
-
- SQLite;
|
193 |
-
- Elasticsearch.
|
194 |
-
|
195 |
-
## Elasticsearch Setup
|
196 |
-
|
197 |
-
Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
|
198 |
-
|
199 |
-
## Graph Visualization
|
200 |
-
![graph](https://i.imgur.com/EEJqB8n.png)
|
201 |
-
|
202 |
-
[Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
|
203 |
-
|
204 |
-
We are developing a Twint Desktop App.
|
205 |
-
|
206 |
-
![4](https://i.imgur.com/DzcfIgL.png)
|
207 |
-
|
208 |
-
## FAQ
|
209 |
-
> I tried scraping tweets from a user, I know that they exist but I'm not getting them
|
210 |
-
|
211 |
-
Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow.
|
212 |
-
## More Examples
|
213 |
-
|
214 |
-
#### Followers/Following
|
215 |
-
|
216 |
-
> To get only follower usernames/following usernames
|
217 |
-
|
218 |
-
`twint -u username --followers`
|
219 |
-
|
220 |
-
`twint -u username --following`
|
221 |
-
|
222 |
-
> To get user info of followers/following users
|
223 |
-
|
224 |
-
`twint -u username --followers --user-full`
|
225 |
-
|
226 |
-
`twint -u username --following --user-full`
|
227 |
-
|
228 |
-
#### userlist
|
229 |
-
|
230 |
-
> To get only user info of user
|
231 |
-
|
232 |
-
`twint -u username --user-full`
|
233 |
-
|
234 |
-
> To get user info of users from a userlist
|
235 |
-
|
236 |
-
`twint --userlist inputlist --user-full`
|
237 |
-
|
238 |
-
|
239 |
-
#### tweet translation (experimental)
|
240 |
-
|
241 |
-
> To get 100 english tweets and translate them to italian
|
242 |
-
|
243 |
-
`twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
|
244 |
-
|
245 |
-
or
|
246 |
-
|
247 |
-
```python
|
248 |
-
import twint
|
249 |
-
|
250 |
-
c = twint.Config()
|
251 |
-
c.Username = "noneprivacy"
|
252 |
-
c.Limit = 100
|
253 |
-
c.Store_csv = True
|
254 |
-
c.Output = "none.csv"
|
255 |
-
c.Lang = "en"
|
256 |
-
c.Translate = True
|
257 |
-
c.TranslateDest = "it"
|
258 |
-
twint.run.Search(c)
|
259 |
-
```
|
260 |
-
|
261 |
-
Notes:
|
262 |
-
- [Google translate has some quotas](https://cloud.google.com/translate/quotas)
|
263 |
-
|
264 |
-
## Featured Blog Posts:
|
265 |
-
- [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
|
266 |
-
- [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
|
267 |
-
- [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
|
268 |
-
- [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
|
269 |
-
|
270 |
-
## Contact
|
271 |
-
|
272 |
-
If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/Untitled.ipynb
DELETED
@@ -1,282 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "code",
|
5 |
-
"execution_count": 67,
|
6 |
-
"metadata": {},
|
7 |
-
"outputs": [],
|
8 |
-
"source": [
|
9 |
-
"text= \"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär, 5. Invandring, 6. Integration \""
|
10 |
-
]
|
11 |
-
},
|
12 |
-
{
|
13 |
-
"cell_type": "code",
|
14 |
-
"execution_count": 17,
|
15 |
-
"metadata": {},
|
16 |
-
"outputs": [
|
17 |
-
{
|
18 |
-
"name": "stdout",
|
19 |
-
"output_type": "stream",
|
20 |
-
"text": [
|
21 |
-
"WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n",
|
22 |
-
"Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n",
|
23 |
-
"To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n",
|
24 |
-
"Requirement already satisfied: regex in /home/oxygen/snap/jupyter/common/lib/python3.7/site-packages (2022.6.2)\n"
|
25 |
-
]
|
26 |
-
}
|
27 |
-
],
|
28 |
-
"source": [
|
29 |
-
"!pip install regex\n"
|
30 |
-
]
|
31 |
-
},
|
32 |
-
{
|
33 |
-
"cell_type": "code",
|
34 |
-
"execution_count": 15,
|
35 |
-
"metadata": {},
|
36 |
-
"outputs": [
|
37 |
-
{
|
38 |
-
"data": {
|
39 |
-
"text/plain": [
|
40 |
-
"['0']"
|
41 |
-
]
|
42 |
-
},
|
43 |
-
"execution_count": 15,
|
44 |
-
"metadata": {},
|
45 |
-
"output_type": "execute_result"
|
46 |
-
}
|
47 |
-
],
|
48 |
-
"source": [
|
49 |
-
"re.findall(\"[0-9]+\", tl[0])"
|
50 |
-
]
|
51 |
-
},
|
52 |
-
{
|
53 |
-
"cell_type": "code",
|
54 |
-
"execution_count": 48,
|
55 |
-
"metadata": {},
|
56 |
-
"outputs": [
|
57 |
-
{
|
58 |
-
"data": {
|
59 |
-
"text/plain": [
|
60 |
-
"'0. Äldrefrågor'"
|
61 |
-
]
|
62 |
-
},
|
63 |
-
"execution_count": 48,
|
64 |
-
"metadata": {},
|
65 |
-
"output_type": "execute_result"
|
66 |
-
}
|
67 |
-
],
|
68 |
-
"source": [
|
69 |
-
"tl[0]"
|
70 |
-
]
|
71 |
-
},
|
72 |
-
{
|
73 |
-
"cell_type": "code",
|
74 |
-
"execution_count": 49,
|
75 |
-
"metadata": {},
|
76 |
-
"outputs": [
|
77 |
-
{
|
78 |
-
"data": {
|
79 |
-
"text/plain": [
|
80 |
-
"['0', ' Äldrefrågor']"
|
81 |
-
]
|
82 |
-
},
|
83 |
-
"execution_count": 49,
|
84 |
-
"metadata": {},
|
85 |
-
"output_type": "execute_result"
|
86 |
-
}
|
87 |
-
],
|
88 |
-
"source": [
|
89 |
-
"f=tl[0].split('.')\n",
|
90 |
-
"\n",
|
91 |
-
"f#int(f[0])"
|
92 |
-
]
|
93 |
-
},
|
94 |
-
{
|
95 |
-
"cell_type": "code",
|
96 |
-
"execution_count": 29,
|
97 |
-
"metadata": {},
|
98 |
-
"outputs": [
|
99 |
-
{
|
100 |
-
"ename": "NameError",
|
101 |
-
"evalue": "name 'str_topics_to_dict' is not defined",
|
102 |
-
"output_type": "error",
|
103 |
-
"traceback": [
|
104 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
105 |
-
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
|
106 |
-
"\u001b[0;32m<ipython-input-29-b05d9860dbcf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstr_topics_to_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
107 |
-
"\u001b[0;31mNameError\u001b[0m: name 'str_topics_to_dict' is not defined"
|
108 |
-
]
|
109 |
-
}
|
110 |
-
],
|
111 |
-
"source": []
|
112 |
-
},
|
113 |
-
{
|
114 |
-
"cell_type": "code",
|
115 |
-
"execution_count": 65,
|
116 |
-
"metadata": {},
|
117 |
-
"outputs": [],
|
118 |
-
"source": [
|
119 |
-
"\n",
|
120 |
-
"def str_topics_to_dict(topics):\n",
|
121 |
-
" topic_list=topics.split(\",\")\n",
|
122 |
-
" ind_topic_dict={}\n",
|
123 |
-
" for i inrange(len(topic_list)): \n",
|
124 |
-
" index_topic_list=\n",
|
125 |
-
" ind=index_topic_list[0]\n",
|
126 |
-
" just_topic=index_topic_list[1][1:]\n",
|
127 |
-
" ind_topic_dict[int(ind)]=just_topic\n",
|
128 |
-
" return ind_topic_dict"
|
129 |
-
]
|
130 |
-
},
|
131 |
-
{
|
132 |
-
"cell_type": "code",
|
133 |
-
"execution_count": 68,
|
134 |
-
"metadata": {},
|
135 |
-
"outputs": [
|
136 |
-
{
|
137 |
-
"data": {
|
138 |
-
"text/plain": [
|
139 |
-
"{0: 'Brottslighet',\n",
|
140 |
-
" 1: 'Miljö',\n",
|
141 |
-
" 2: 'Skola',\n",
|
142 |
-
" 3: 'Sjukvård',\n",
|
143 |
-
" 4: 'Militär',\n",
|
144 |
-
" 5: 'Invandring',\n",
|
145 |
-
" 6: 'Integration '}"
|
146 |
-
]
|
147 |
-
},
|
148 |
-
"execution_count": 68,
|
149 |
-
"metadata": {},
|
150 |
-
"output_type": "execute_result"
|
151 |
-
}
|
152 |
-
],
|
153 |
-
"source": [
|
154 |
-
"str_topics_to_dict(text)"
|
155 |
-
]
|
156 |
-
},
|
157 |
-
{
|
158 |
-
"cell_type": "code",
|
159 |
-
"execution_count": 109,
|
160 |
-
"metadata": {},
|
161 |
-
"outputs": [
|
162 |
-
{
|
163 |
-
"data": {
|
164 |
-
"text/plain": [
|
165 |
-
"' Brottslighet, Miljö, Skola, Sjukvård, Militär stöd, Invandring, Integration '"
|
166 |
-
]
|
167 |
-
},
|
168 |
-
"execution_count": 109,
|
169 |
-
"metadata": {},
|
170 |
-
"output_type": "execute_result"
|
171 |
-
}
|
172 |
-
],
|
173 |
-
"source": [
|
174 |
-
"\n",
|
175 |
-
"text=\"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär stöd, 5. Invandring, 6. Integration \"\n",
|
176 |
-
"text=re.sub(r\"(\\n+)\",\" \",text)\n",
|
177 |
-
"text=re.sub(\"(\\.)|\\d+\",\"\",text )\n",
|
178 |
-
"text"
|
179 |
-
]
|
180 |
-
},
|
181 |
-
{
|
182 |
-
"cell_type": "code",
|
183 |
-
"execution_count": 100,
|
184 |
-
"metadata": {},
|
185 |
-
"outputs": [
|
186 |
-
{
|
187 |
-
"data": {
|
188 |
-
"text/plain": [
|
189 |
-
"[' Brottslighet',\n",
|
190 |
-
" ' Miljö',\n",
|
191 |
-
" ' Skola',\n",
|
192 |
-
" ' Sjukvård',\n",
|
193 |
-
" ' Militär stöd',\n",
|
194 |
-
" ' Invandring',\n",
|
195 |
-
" ' Integration ']"
|
196 |
-
]
|
197 |
-
},
|
198 |
-
"execution_count": 100,
|
199 |
-
"metadata": {},
|
200 |
-
"output_type": "execute_result"
|
201 |
-
}
|
202 |
-
],
|
203 |
-
"source": [
|
204 |
-
"text.split(\",\")"
|
205 |
-
]
|
206 |
-
},
|
207 |
-
{
|
208 |
-
"cell_type": "code",
|
209 |
-
"execution_count": 116,
|
210 |
-
"metadata": {},
|
211 |
-
"outputs": [],
|
212 |
-
"source": [
|
213 |
-
"import regex as re \n",
|
214 |
-
"def str_topics_to_dict(topics):\n",
|
215 |
-
" text=re.sub(r\"(\\n+)\",\" \",topics)\n",
|
216 |
-
" text=re.sub(\"(\\.)|\\d+\",\"\",topics )\n",
|
217 |
-
" topics=re.sub(r\"(\\n+)|(\\.)|\\d+\",\"\",topics)\n",
|
218 |
-
" topic_list=topics.split(\",\")\n",
|
219 |
-
" ind_topic_dict={}\n",
|
220 |
-
" for i in range(len(topic_list)): \n",
|
221 |
-
" ind=i\n",
|
222 |
-
" just_topic=topic_list[i]\n",
|
223 |
-
" ind_topic_dict[ind]=just_topic\n",
|
224 |
-
" return ind_topic_dict"
|
225 |
-
]
|
226 |
-
},
|
227 |
-
{
|
228 |
-
"cell_type": "code",
|
229 |
-
"execution_count": 117,
|
230 |
-
"metadata": {},
|
231 |
-
"outputs": [
|
232 |
-
{
|
233 |
-
"data": {
|
234 |
-
"text/plain": [
|
235 |
-
"{0: ' Brottslighet',\n",
|
236 |
-
" 1: ' Miljö',\n",
|
237 |
-
" 2: ' Skola',\n",
|
238 |
-
" 3: ' Sjukvård',\n",
|
239 |
-
" 4: ' Militär stöd',\n",
|
240 |
-
" 5: ' Invandring',\n",
|
241 |
-
" 6: ' Integration '}"
|
242 |
-
]
|
243 |
-
},
|
244 |
-
"execution_count": 117,
|
245 |
-
"metadata": {},
|
246 |
-
"output_type": "execute_result"
|
247 |
-
}
|
248 |
-
],
|
249 |
-
"source": [
|
250 |
-
"str_topics_to_dict(text)"
|
251 |
-
]
|
252 |
-
},
|
253 |
-
{
|
254 |
-
"cell_type": "code",
|
255 |
-
"execution_count": null,
|
256 |
-
"metadata": {},
|
257 |
-
"outputs": [],
|
258 |
-
"source": []
|
259 |
-
}
|
260 |
-
],
|
261 |
-
"metadata": {
|
262 |
-
"kernelspec": {
|
263 |
-
"display_name": "Python 3",
|
264 |
-
"language": "python",
|
265 |
-
"name": "python3"
|
266 |
-
},
|
267 |
-
"language_info": {
|
268 |
-
"codemirror_mode": {
|
269 |
-
"name": "ipython",
|
270 |
-
"version": 3
|
271 |
-
},
|
272 |
-
"file_extension": ".py",
|
273 |
-
"mimetype": "text/x-python",
|
274 |
-
"name": "python",
|
275 |
-
"nbconvert_exporter": "python",
|
276 |
-
"pygments_lexer": "ipython3",
|
277 |
-
"version": "3.7.3"
|
278 |
-
}
|
279 |
-
},
|
280 |
-
"nbformat": 4,
|
281 |
-
"nbformat_minor": 2
|
282 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/automate.py
DELETED
@@ -1,65 +0,0 @@
|
|
1 |
-
import twint
|
2 |
-
import schedule
|
3 |
-
import time
|
4 |
-
|
5 |
-
# you can change the name of each "job" after "def" if you'd like.
|
6 |
-
def jobone():
|
7 |
-
print ("Fetching Tweets")
|
8 |
-
c = twint.Config()
|
9 |
-
# choose username (optional)
|
10 |
-
c.Username = "insert username here"
|
11 |
-
# choose search term (optional)
|
12 |
-
c.Search = "insert search term here"
|
13 |
-
# choose beginning time (narrow results)
|
14 |
-
c.Since = "2018-01-01"
|
15 |
-
# set limit on total tweets
|
16 |
-
c.Limit = 1000
|
17 |
-
# no idea, but makes the csv format properly
|
18 |
-
c.Store_csv = True
|
19 |
-
# format of the csv
|
20 |
-
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
|
21 |
-
# change the name of the csv file
|
22 |
-
c.Output = "filename.csv"
|
23 |
-
twint.run.Search(c)
|
24 |
-
|
25 |
-
def jobtwo():
|
26 |
-
print ("Fetching Tweets")
|
27 |
-
c = twint.Config()
|
28 |
-
# choose username (optional)
|
29 |
-
c.Username = "insert username here"
|
30 |
-
# choose search term (optional)
|
31 |
-
c.Search = "insert search term here"
|
32 |
-
# choose beginning time (narrow results)
|
33 |
-
c.Since = "2018-01-01"
|
34 |
-
# set limit on total tweets
|
35 |
-
c.Limit = 1000
|
36 |
-
# no idea, but makes the csv format properly
|
37 |
-
c.Store_csv = True
|
38 |
-
# format of the csv
|
39 |
-
c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
|
40 |
-
# change the name of the csv file
|
41 |
-
c.Output = "filename2.csv"
|
42 |
-
twint.run.Search(c)
|
43 |
-
|
44 |
-
# run once when you start the program
|
45 |
-
|
46 |
-
jobone()
|
47 |
-
jobtwo()
|
48 |
-
|
49 |
-
# run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable)
|
50 |
-
|
51 |
-
# schedule.every(1).minutes.do(jobone)
|
52 |
-
schedule.every().hour.do(jobone)
|
53 |
-
# schedule.every().day.at("10:30").do(jobone)
|
54 |
-
# schedule.every().monday.do(jobone)
|
55 |
-
# schedule.every().wednesday.at("13:15").do(jobone)
|
56 |
-
|
57 |
-
# schedule.every(1).minutes.do(jobtwo)
|
58 |
-
schedule.every().hour.do(jobtwo)
|
59 |
-
# schedule.every().day.at("10:30").do(jobtwo)
|
60 |
-
# schedule.every().monday.do(jobtwo)
|
61 |
-
# schedule.every().wednesday.at("13:15").do(jobtwo)
|
62 |
-
|
63 |
-
while True:
|
64 |
-
schedule.run_pending()
|
65 |
-
time.sleep(1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/elasticsearch/README.md
DELETED
@@ -1,5 +0,0 @@
|
|
1 |
-
# Elasticsearch How-To
|
2 |
-
|
3 |
-
![dashboard](https://i.imgur.com/BEbtdo5.png)
|
4 |
-
|
5 |
-
Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/scrape.py
DELETED
@@ -1,102 +0,0 @@
|
|
1 |
-
import sys
|
2 |
-
import io
|
3 |
-
import time
|
4 |
-
import asyncio
|
5 |
-
import os
|
6 |
-
from tkinter import EXCEPTION
|
7 |
-
from numpy import not_equal
|
8 |
-
loop = asyncio.get_event_loop()
|
9 |
-
loop.is_running()
|
10 |
-
import twint
|
11 |
-
import nest_asyncio
|
12 |
-
nest_asyncio.apply()
|
13 |
-
from datetime import date
|
14 |
-
class scraper:
|
15 |
-
def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10,u_or_s='s', acceptable_range=10):
|
16 |
-
|
17 |
-
if (type(from_date) or type("str")) is not type("str"):
|
18 |
-
print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
|
19 |
-
raise EXCEPTION("Incorrect date type Exception!")
|
20 |
-
|
21 |
-
time_out= time.time()+2*60
|
22 |
-
_dict={}
|
23 |
-
c=twint.Config()
|
24 |
-
if u_or_s.lower() =="u":
|
25 |
-
c.Search = "from:@"+search_str # topic
|
26 |
-
else:
|
27 |
-
c.Search = search_str # topic
|
28 |
-
c.Pandas = True
|
29 |
-
num_tweets_and_replies=num_tweets
|
30 |
-
c.Count=True
|
31 |
-
for j in range(1,5):
|
32 |
-
c.Limit = num_tweets_and_replies
|
33 |
-
c.Since = from_date
|
34 |
-
c.Until = to_date
|
35 |
-
c.Hide_output =True
|
36 |
-
old_stdout = sys.stdout
|
37 |
-
new_stdout = io.StringIO()
|
38 |
-
sys.stdout = new_stdout
|
39 |
-
twint.run.Search(c)
|
40 |
-
output = new_stdout.getvalue()
|
41 |
-
sys.stdout = old_stdout
|
42 |
-
print(output[0:-2])
|
43 |
-
tweet_info=twint.output.panda.Tweets_df
|
44 |
-
|
45 |
-
t_count=0
|
46 |
-
try:
|
47 |
-
_keys=tweet_info["id"]
|
48 |
-
#tweet infor is a dataframe with fallowing columns
|
49 |
-
'''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
|
50 |
-
'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
|
51 |
-
'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
|
52 |
-
'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
|
53 |
-
'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
|
54 |
-
'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
|
55 |
-
'trans_dest'],
|
56 |
-
dtype='object')'''
|
57 |
-
|
58 |
-
for i in range (len(_keys)):
|
59 |
-
if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
|
60 |
-
pass
|
61 |
-
else:
|
62 |
-
_dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
|
63 |
-
"date" :tweet_info["date"][i],
|
64 |
-
"nlikes": tweet_info["nlikes"][i],
|
65 |
-
"nreplies":tweet_info["nreplies"][i] ,
|
66 |
-
"nretweets": tweet_info["nretweets"][i],"topic":""}
|
67 |
-
if len(list(_dict.keys()))==num_tweets:
|
68 |
-
break
|
69 |
-
except:
|
70 |
-
pass
|
71 |
-
print(len(list(_dict.keys())), " of them are Tweets")
|
72 |
-
if (num_tweets-len(list(_dict.keys())))< acceptable_range:
|
73 |
-
return _dict
|
74 |
-
if len(list(_dict.keys())) < num_tweets:
|
75 |
-
num_tweets_and_replies= num_tweets_and_replies+100*3**j
|
76 |
-
else:
|
77 |
-
break
|
78 |
-
if time_out <time.time():
|
79 |
-
break
|
80 |
-
if output.startswith("[!] No more data!"):
|
81 |
-
break
|
82 |
-
return _dict
|
83 |
-
|
84 |
-
def string_search_user_tweets(user_name,search_str ,from_date="2006-07-01", to_date=str(date.today()), num_tweets=10):
|
85 |
-
c=twint.Config()
|
86 |
-
c.Username =user_name
|
87 |
-
c.Search = search_str # topic
|
88 |
-
c.Pandas = True
|
89 |
-
num_tweets_and_replies=num_tweets
|
90 |
-
c.Count=True
|
91 |
-
c.Limit = num_tweets_and_replies
|
92 |
-
c.Since = from_date
|
93 |
-
c.Until = to_date
|
94 |
-
c.Hide_output =True
|
95 |
-
twint.run.Search(c)
|
96 |
-
return twint.output.panda.Tweets_df
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/scrape__init__.py
DELETED
@@ -1,14 +0,0 @@
|
|
1 |
-
def scraper_libs():
|
2 |
-
import sys
|
3 |
-
import io
|
4 |
-
import time
|
5 |
-
import asyncio
|
6 |
-
import os
|
7 |
-
from tkinter import EXCEPTION
|
8 |
-
from numpy import not_equal
|
9 |
-
loop = asyncio.get_event_loop()
|
10 |
-
loop.is_running()
|
11 |
-
import twint
|
12 |
-
import nest_asyncio
|
13 |
-
nest_asyncio.apply()
|
14 |
-
from datetime import date
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/setup.py
DELETED
@@ -1,65 +0,0 @@
|
|
1 |
-
#!/usr/bin/python3
|
2 |
-
from setuptools import setup
|
3 |
-
import io
|
4 |
-
import os
|
5 |
-
|
6 |
-
# Package meta-data
|
7 |
-
NAME = 'twint'
|
8 |
-
DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
|
9 |
-
URL = 'https://github.com/twintproject/twint'
|
10 |
-
EMAIL = 'codyzacharias@pm.me'
|
11 |
-
AUTHOR = 'Cody Zacharias'
|
12 |
-
REQUIRES_PYTHON = '>=3.6.0'
|
13 |
-
VERSION = None
|
14 |
-
|
15 |
-
# Packages required
|
16 |
-
REQUIRED = [
|
17 |
-
'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
|
18 |
-
'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
|
19 |
-
'schedule', 'geopy', 'fake-useragent', 'googletransx'
|
20 |
-
]
|
21 |
-
|
22 |
-
here = os.path.abspath(os.path.dirname(__file__))
|
23 |
-
|
24 |
-
with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
|
25 |
-
long_description = '\n' + f.read()
|
26 |
-
|
27 |
-
# Load the package's __version__.py
|
28 |
-
about = {}
|
29 |
-
if not VERSION:
|
30 |
-
with open(os.path.join(here, NAME, '__version__.py')) as f:
|
31 |
-
exec(f.read(), about)
|
32 |
-
else:
|
33 |
-
about['__version__'] = VERSION
|
34 |
-
|
35 |
-
setup(
|
36 |
-
name=NAME,
|
37 |
-
version=about['__version__'],
|
38 |
-
description=DESCRIPTION,
|
39 |
-
long_description=long_description,
|
40 |
-
long_description_content_type="text/markdown",
|
41 |
-
author=AUTHOR,
|
42 |
-
author_email=EMAIL,
|
43 |
-
python_requires=REQUIRES_PYTHON,
|
44 |
-
url=URL,
|
45 |
-
packages=['twint', 'twint.storage'],
|
46 |
-
entry_points={
|
47 |
-
'console_scripts': [
|
48 |
-
'twint = twint.cli:run_as_command',
|
49 |
-
],
|
50 |
-
},
|
51 |
-
install_requires=REQUIRED,
|
52 |
-
dependency_links=[
|
53 |
-
'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
|
54 |
-
],
|
55 |
-
license='MIT',
|
56 |
-
classifiers=[
|
57 |
-
'License :: OSI Approved :: MIT License',
|
58 |
-
'Programming Language :: Python',
|
59 |
-
'Programming Language :: Python :: 3',
|
60 |
-
'Programming Language :: Python :: 3.6',
|
61 |
-
'Programming Language :: Python :: 3.7',
|
62 |
-
'Programming Language :: Python :: 3.8',
|
63 |
-
'Programming Language :: Python :: Implementation :: CPython',
|
64 |
-
],
|
65 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/test.py
DELETED
@@ -1,92 +0,0 @@
|
|
1 |
-
import twint
|
2 |
-
import os
|
3 |
-
|
4 |
-
'''
|
5 |
-
Test.py - Testing TWINT to make sure everything works.
|
6 |
-
'''
|
7 |
-
|
8 |
-
|
9 |
-
def test_reg(c, run):
|
10 |
-
print("[+] Beginning vanilla test in {}".format(str(run)))
|
11 |
-
run(c)
|
12 |
-
|
13 |
-
|
14 |
-
def test_db(c, run):
|
15 |
-
print("[+] Beginning DB test in {}".format(str(run)))
|
16 |
-
c.Database = "test_twint.db"
|
17 |
-
run(c)
|
18 |
-
|
19 |
-
|
20 |
-
def custom(c, run, _type):
|
21 |
-
print("[+] Beginning custom {} test in {}".format(_type, str(run)))
|
22 |
-
c.Custom['tweet'] = ["id", "username"]
|
23 |
-
c.Custom['user'] = ["id", "username"]
|
24 |
-
run(c)
|
25 |
-
|
26 |
-
|
27 |
-
def test_json(c, run):
|
28 |
-
c.Store_json = True
|
29 |
-
c.Output = "test_twint.json"
|
30 |
-
custom(c, run, "JSON")
|
31 |
-
print("[+] Beginning JSON test in {}".format(str(run)))
|
32 |
-
run(c)
|
33 |
-
|
34 |
-
|
35 |
-
def test_csv(c, run):
|
36 |
-
c.Store_csv = True
|
37 |
-
c.Output = "test_twint.csv"
|
38 |
-
custom(c, run, "CSV")
|
39 |
-
print("[+] Beginning CSV test in {}".format(str(run)))
|
40 |
-
run(c)
|
41 |
-
|
42 |
-
|
43 |
-
def main():
|
44 |
-
c = twint.Config()
|
45 |
-
c.Username = "verified"
|
46 |
-
c.Limit = 20
|
47 |
-
c.Store_object = True
|
48 |
-
|
49 |
-
# Separate objects are necessary.
|
50 |
-
|
51 |
-
f = twint.Config()
|
52 |
-
f.Username = "verified"
|
53 |
-
f.Limit = 20
|
54 |
-
f.Store_object = True
|
55 |
-
f.User_full = True
|
56 |
-
|
57 |
-
runs = [
|
58 |
-
twint.run.Profile, # this doesn't
|
59 |
-
twint.run.Search, # this works
|
60 |
-
twint.run.Following,
|
61 |
-
twint.run.Followers,
|
62 |
-
twint.run.Favorites,
|
63 |
-
]
|
64 |
-
|
65 |
-
tests = [test_reg, test_json, test_csv, test_db]
|
66 |
-
|
67 |
-
# Something breaks if we don't split these up
|
68 |
-
|
69 |
-
for run in runs[:3]:
|
70 |
-
if run == twint.run.Search:
|
71 |
-
c.Since = "2012-1-1 20:30:22"
|
72 |
-
c.Until = "2017-1-1"
|
73 |
-
else:
|
74 |
-
c.Since = ""
|
75 |
-
c.Until = ""
|
76 |
-
|
77 |
-
for test in tests:
|
78 |
-
test(c, run)
|
79 |
-
|
80 |
-
for run in runs[3:]:
|
81 |
-
for test in tests:
|
82 |
-
test(f, run)
|
83 |
-
|
84 |
-
files = ["test_twint.db", "test_twint.json", "test_twint.csv"]
|
85 |
-
for _file in files:
|
86 |
-
os.remove(_file)
|
87 |
-
|
88 |
-
print("[+] Testing complete!")
|
89 |
-
|
90 |
-
|
91 |
-
if __name__ == '__main__':
|
92 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/__init__.py
DELETED
@@ -1,32 +0,0 @@
|
|
1 |
-
'''
|
2 |
-
TWINT - Twitter Intelligence Tool (formerly known as Tweep).
|
3 |
-
|
4 |
-
See wiki on Github for in-depth details.
|
5 |
-
https://github.com/twintproject/twint/wiki
|
6 |
-
|
7 |
-
Licensed under MIT License
|
8 |
-
Copyright (c) 2018 Cody Zacharias
|
9 |
-
'''
|
10 |
-
import logging, os
|
11 |
-
|
12 |
-
from .config import Config
|
13 |
-
from .__version__ import __version__
|
14 |
-
from . import run
|
15 |
-
|
16 |
-
_levels = {
|
17 |
-
'info': logging.INFO,
|
18 |
-
'debug': logging.DEBUG
|
19 |
-
}
|
20 |
-
|
21 |
-
_level = os.getenv('TWINT_DEBUG', 'info')
|
22 |
-
_logLevel = _levels[_level]
|
23 |
-
|
24 |
-
if _level == "debug":
|
25 |
-
logger = logging.getLogger()
|
26 |
-
_output_fn = 'twint.log'
|
27 |
-
logger.setLevel(_logLevel)
|
28 |
-
formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
|
29 |
-
fileHandler = logging.FileHandler(_output_fn)
|
30 |
-
fileHandler.setLevel(_logLevel)
|
31 |
-
fileHandler.setFormatter(formatter)
|
32 |
-
logger.addHandler(fileHandler)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/__version__.py
DELETED
@@ -1,3 +0,0 @@
|
|
1 |
-
VERSION = (2, 1, 21)
|
2 |
-
|
3 |
-
__version__ = '.'.join(map(str, VERSION))
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/cli.py
DELETED
@@ -1,342 +0,0 @@
|
|
1 |
-
#!/usr/bin/env python3
|
2 |
-
'''
|
3 |
-
Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
|
4 |
-
|
5 |
-
See wiki on Github for in-depth details.
|
6 |
-
https://github.com/twintproject/twint/wiki
|
7 |
-
|
8 |
-
Licensed under MIT License
|
9 |
-
Copyright (c) 2018 The Twint Project
|
10 |
-
'''
|
11 |
-
import sys
|
12 |
-
import os
|
13 |
-
import argparse
|
14 |
-
|
15 |
-
from . import run
|
16 |
-
from . import config
|
17 |
-
from . import storage
|
18 |
-
|
19 |
-
|
20 |
-
def error(_error, message):
|
21 |
-
""" Print errors to stdout
|
22 |
-
"""
|
23 |
-
print("[-] {}: {}".format(_error, message))
|
24 |
-
sys.exit(0)
|
25 |
-
|
26 |
-
|
27 |
-
def check(args):
|
28 |
-
""" Error checking
|
29 |
-
"""
|
30 |
-
if args.username is not None or args.userlist or args.members_list:
|
31 |
-
if args.verified:
|
32 |
-
error("Contradicting Args",
|
33 |
-
"Please use --verified in combination with -s.")
|
34 |
-
if args.userid:
|
35 |
-
error("Contradicting Args",
|
36 |
-
"--userid and -u cannot be used together.")
|
37 |
-
if args.all:
|
38 |
-
error("Contradicting Args",
|
39 |
-
"--all and -u cannot be used together.")
|
40 |
-
elif args.search and args.timeline:
|
41 |
-
error("Contradicting Args",
|
42 |
-
"--s and --tl cannot be used together.")
|
43 |
-
elif args.timeline and not args.username:
|
44 |
-
error("Error", "-tl cannot be used without -u.")
|
45 |
-
elif args.search is None:
|
46 |
-
if args.custom_query is not None:
|
47 |
-
pass
|
48 |
-
elif (args.geo or args.near) is None and not (args.all or args.userid):
|
49 |
-
error("Error", "Please use at least -u, -s, -g or --near.")
|
50 |
-
elif args.all and args.userid:
|
51 |
-
error("Contradicting Args",
|
52 |
-
"--all and --userid cannot be used together")
|
53 |
-
if args.output is None:
|
54 |
-
if args.csv:
|
55 |
-
error("Error", "Please specify an output file (Example: -o file.csv).")
|
56 |
-
elif args.json:
|
57 |
-
error("Error", "Please specify an output file (Example: -o file.json).")
|
58 |
-
if args.backoff_exponent <= 0:
|
59 |
-
error("Error", "Please specifiy a positive value for backoff_exponent")
|
60 |
-
if args.min_wait_time < 0:
|
61 |
-
error("Error", "Please specifiy a non negative value for min_wait_time")
|
62 |
-
|
63 |
-
|
64 |
-
def loadUserList(ul, _type):
|
65 |
-
""" Concatenate users
|
66 |
-
"""
|
67 |
-
if os.path.exists(os.path.abspath(ul)):
|
68 |
-
userlist = open(os.path.abspath(ul), "r").read().splitlines()
|
69 |
-
else:
|
70 |
-
userlist = ul.split(",")
|
71 |
-
if _type == "search":
|
72 |
-
un = ""
|
73 |
-
for user in userlist:
|
74 |
-
un += "%20OR%20from%3A" + user
|
75 |
-
return un[15:]
|
76 |
-
return userlist
|
77 |
-
|
78 |
-
|
79 |
-
def initialize(args):
|
80 |
-
""" Set default values for config from args
|
81 |
-
"""
|
82 |
-
c = config.Config()
|
83 |
-
c.Username = args.username
|
84 |
-
c.User_id = args.userid
|
85 |
-
c.Search = args.search
|
86 |
-
c.Geo = args.geo
|
87 |
-
c.Location = args.location
|
88 |
-
c.Near = args.near
|
89 |
-
c.Lang = args.lang
|
90 |
-
c.Output = args.output
|
91 |
-
c.Elasticsearch = args.elasticsearch
|
92 |
-
c.Year = args.year
|
93 |
-
c.Since = args.since
|
94 |
-
c.Until = args.until
|
95 |
-
c.Email = args.email
|
96 |
-
c.Phone = args.phone
|
97 |
-
c.Verified = args.verified
|
98 |
-
c.Store_csv = args.csv
|
99 |
-
c.Tabs = args.tabs
|
100 |
-
c.Store_json = args.json
|
101 |
-
c.Show_hashtags = args.hashtags
|
102 |
-
c.Show_cashtags = args.cashtags
|
103 |
-
c.Limit = args.limit
|
104 |
-
c.Count = args.count
|
105 |
-
c.Stats = args.stats
|
106 |
-
c.Database = args.database
|
107 |
-
c.To = args.to
|
108 |
-
c.All = args.all
|
109 |
-
c.Essid = args.essid
|
110 |
-
c.Format = args.format
|
111 |
-
c.User_full = args.user_full
|
112 |
-
# c.Profile_full = args.profile_full
|
113 |
-
c.Pandas_type = args.pandas_type
|
114 |
-
c.Index_tweets = args.index_tweets
|
115 |
-
c.Index_follow = args.index_follow
|
116 |
-
c.Index_users = args.index_users
|
117 |
-
c.Debug = args.debug
|
118 |
-
c.Resume = args.resume
|
119 |
-
c.Images = args.images
|
120 |
-
c.Videos = args.videos
|
121 |
-
c.Media = args.media
|
122 |
-
c.Replies = args.replies
|
123 |
-
c.Pandas_clean = args.pandas_clean
|
124 |
-
c.Proxy_host = args.proxy_host
|
125 |
-
c.Proxy_port = args.proxy_port
|
126 |
-
c.Proxy_type = args.proxy_type
|
127 |
-
c.Tor_control_port = args.tor_control_port
|
128 |
-
c.Tor_control_password = args.tor_control_password
|
129 |
-
c.Retweets = args.retweets
|
130 |
-
c.Custom_query = args.custom_query
|
131 |
-
c.Popular_tweets = args.popular_tweets
|
132 |
-
c.Skip_certs = args.skip_certs
|
133 |
-
c.Hide_output = args.hide_output
|
134 |
-
c.Native_retweets = args.native_retweets
|
135 |
-
c.Min_likes = args.min_likes
|
136 |
-
c.Min_retweets = args.min_retweets
|
137 |
-
c.Min_replies = args.min_replies
|
138 |
-
c.Links = args.links
|
139 |
-
c.Source = args.source
|
140 |
-
c.Members_list = args.members_list
|
141 |
-
c.Filter_retweets = args.filter_retweets
|
142 |
-
c.Translate = args.translate
|
143 |
-
c.TranslateDest = args.translate_dest
|
144 |
-
c.Backoff_exponent = args.backoff_exponent
|
145 |
-
c.Min_wait_time = args.min_wait_time
|
146 |
-
return c
|
147 |
-
|
148 |
-
|
149 |
-
def options():
|
150 |
-
""" Parse arguments
|
151 |
-
"""
|
152 |
-
ap = argparse.ArgumentParser(prog="twint",
|
153 |
-
usage="python3 %(prog)s [options]",
|
154 |
-
description="TWINT - An Advanced Twitter Scraping Tool.")
|
155 |
-
ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
|
156 |
-
ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
|
157 |
-
ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
|
158 |
-
ap.add_argument("--near", help="Near a specified city.")
|
159 |
-
ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
|
160 |
-
ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
|
161 |
-
ap.add_argument("-o", "--output", help="Save output to a file.")
|
162 |
-
ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
|
163 |
-
ap.add_argument("--year", help="Filter Tweets before specified year.")
|
164 |
-
ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
|
165 |
-
metavar="DATE")
|
166 |
-
ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
|
167 |
-
metavar="DATE")
|
168 |
-
ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
|
169 |
-
ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
|
170 |
-
ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
|
171 |
-
action="store_true")
|
172 |
-
ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
|
173 |
-
ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
|
174 |
-
ap.add_argument("--json", help="Write as .json file", action="store_true")
|
175 |
-
ap.add_argument("--hashtags", help="Output hashtags in seperate column.", action="store_true")
|
176 |
-
ap.add_argument("--cashtags", help="Output cashtags in seperate column.", action="store_true")
|
177 |
-
ap.add_argument("--userid", help="Twitter user id.")
|
178 |
-
ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
|
179 |
-
ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
|
180 |
-
action="store_true")
|
181 |
-
ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
|
182 |
-
action="store_true")
|
183 |
-
ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
|
184 |
-
ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
|
185 |
-
ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
|
186 |
-
ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
|
187 |
-
ap.add_argument("--following", help="Scrape a person's follows", action="store_true")
|
188 |
-
ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
|
189 |
-
ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
|
190 |
-
ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
|
191 |
-
ap.add_argument("--proxy-port", help="The port of the proxy server.")
|
192 |
-
ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
|
193 |
-
ap.add_argument("--tor-control-password",
|
194 |
-
help="If proxy-host is set to tor, this is the password for the control port",
|
195 |
-
default="my_password")
|
196 |
-
ap.add_argument("--essid",
|
197 |
-
help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
|
198 |
-
nargs="?", default="")
|
199 |
-
ap.add_argument("--userlist", help="Userlist from list or file.")
|
200 |
-
ap.add_argument("--retweets",
|
201 |
-
help="Include user's Retweets (Warning: limited).",
|
202 |
-
action="store_true")
|
203 |
-
ap.add_argument("--format", help="Custom output format (See wiki for details).")
|
204 |
-
ap.add_argument("--user-full",
|
205 |
-
help="Collect all user information (Use with followers or following only).",
|
206 |
-
action="store_true")
|
207 |
-
# I am removing this this feature for the time being, because it is no longer required, default method will do this
|
208 |
-
# ap.add_argument("--profile-full",
|
209 |
-
# help="Slow, but effective method of collecting a user's Tweets and RT.",
|
210 |
-
# action="store_true")
|
211 |
-
ap.add_argument(
|
212 |
-
"-tl",
|
213 |
-
"--timeline",
|
214 |
-
help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
|
215 |
-
action="store_true",
|
216 |
-
)
|
217 |
-
ap.add_argument("--translate",
|
218 |
-
help="Get tweets translated by Google Translate.",
|
219 |
-
action="store_true")
|
220 |
-
ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
|
221 |
-
default="en")
|
222 |
-
ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
|
223 |
-
ap.add_argument("--pandas-type",
|
224 |
-
help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
|
225 |
-
ap.add_argument("-it", "--index-tweets",
|
226 |
-
help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
|
227 |
-
ap.add_argument("-if", "--index-follow",
|
228 |
-
help="Custom Elasticsearch Index name for Follows.",
|
229 |
-
nargs="?", default="twintgraph")
|
230 |
-
ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
|
231 |
-
nargs="?", default="twintuser")
|
232 |
-
ap.add_argument("--debug",
|
233 |
-
help="Store information in debug logs", action="store_true")
|
234 |
-
ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
|
235 |
-
ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
|
236 |
-
ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
|
237 |
-
ap.add_argument("--media",
|
238 |
-
help="Display Tweets with only images or videos.", action="store_true")
|
239 |
-
ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
|
240 |
-
ap.add_argument("-pc", "--pandas-clean",
|
241 |
-
help="Automatically clean Pandas dataframe at every scrape.")
|
242 |
-
ap.add_argument("-cq", "--custom-query", help="Custom search query.")
|
243 |
-
ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
|
244 |
-
action="store_true")
|
245 |
-
ap.add_argument("-sc", "--skip-certs", help="Skip certs verification, useful for SSC.", action="store_false")
|
246 |
-
ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true")
|
247 |
-
ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
|
248 |
-
ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
|
249 |
-
ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
|
250 |
-
ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
|
251 |
-
ap.add_argument("--links", help="Include or exclude tweets containing one o more links. If not specified" +
|
252 |
-
" you will get both tweets that might contain links or not.")
|
253 |
-
ap.add_argument("--source", help="Filter the tweets for specific source client.")
|
254 |
-
ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
|
255 |
-
ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
|
256 |
-
ap.add_argument("--backoff-exponent", help="Specify a exponent for the polynomial backoff in case of errors.",
|
257 |
-
type=float, default=3.0)
|
258 |
-
ap.add_argument("--min-wait-time", type=float, default=15,
|
259 |
-
help="specifiy a minimum wait time in case of scraping limit error. This value will be adjusted by twint if the value provided does not satisfy the limits constraints")
|
260 |
-
args = ap.parse_args()
|
261 |
-
|
262 |
-
return args
|
263 |
-
|
264 |
-
|
265 |
-
def main():
|
266 |
-
""" Main
|
267 |
-
"""
|
268 |
-
args = options()
|
269 |
-
check(args)
|
270 |
-
|
271 |
-
if args.pandas_clean:
|
272 |
-
storage.panda.clean()
|
273 |
-
|
274 |
-
c = initialize(args)
|
275 |
-
|
276 |
-
if args.userlist:
|
277 |
-
c.Query = loadUserList(args.userlist, "search")
|
278 |
-
|
279 |
-
if args.pandas_clean:
|
280 |
-
storage.panda.clean()
|
281 |
-
|
282 |
-
if args.favorites:
|
283 |
-
if args.userlist:
|
284 |
-
_userlist = loadUserList(args.userlist, "favorites")
|
285 |
-
for _user in _userlist:
|
286 |
-
args.username = _user
|
287 |
-
c = initialize(args)
|
288 |
-
run.Favorites(c)
|
289 |
-
else:
|
290 |
-
run.Favorites(c)
|
291 |
-
elif args.following:
|
292 |
-
if args.userlist:
|
293 |
-
_userlist = loadUserList(args.userlist, "following")
|
294 |
-
for _user in _userlist:
|
295 |
-
args.username = _user
|
296 |
-
c = initialize(args)
|
297 |
-
run.Following(c)
|
298 |
-
else:
|
299 |
-
run.Following(c)
|
300 |
-
elif args.followers:
|
301 |
-
if args.userlist:
|
302 |
-
_userlist = loadUserList(args.userlist, "followers")
|
303 |
-
for _user in _userlist:
|
304 |
-
args.username = _user
|
305 |
-
c = initialize(args)
|
306 |
-
run.Followers(c)
|
307 |
-
else:
|
308 |
-
run.Followers(c)
|
309 |
-
elif args.retweets: # or args.profile_full:
|
310 |
-
if args.userlist:
|
311 |
-
_userlist = loadUserList(args.userlist, "profile")
|
312 |
-
for _user in _userlist:
|
313 |
-
args.username = _user
|
314 |
-
c = initialize(args)
|
315 |
-
run.Profile(c)
|
316 |
-
else:
|
317 |
-
run.Profile(c)
|
318 |
-
elif args.user_full:
|
319 |
-
if args.userlist:
|
320 |
-
_userlist = loadUserList(args.userlist, "userlist")
|
321 |
-
for _user in _userlist:
|
322 |
-
args.username = _user
|
323 |
-
c = initialize(args)
|
324 |
-
run.Lookup(c)
|
325 |
-
else:
|
326 |
-
run.Lookup(c)
|
327 |
-
elif args.timeline:
|
328 |
-
run.Profile(c)
|
329 |
-
else:
|
330 |
-
run.Search(c)
|
331 |
-
|
332 |
-
|
333 |
-
def run_as_command():
|
334 |
-
if(sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor < 6)):
|
335 |
-
print("[-] TWINT requires Python version 3.6+.")
|
336 |
-
sys.exit(0)
|
337 |
-
|
338 |
-
main()
|
339 |
-
|
340 |
-
|
341 |
-
if __name__ == '__main__':
|
342 |
-
main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/config.py
DELETED
@@ -1,87 +0,0 @@
|
|
1 |
-
from dataclasses import dataclass
|
2 |
-
from typing import Optional
|
3 |
-
|
4 |
-
@dataclass
|
5 |
-
class Config:
|
6 |
-
Username: Optional[str] = None
|
7 |
-
User_id: Optional[str] = None
|
8 |
-
Search: Optional[str] = None
|
9 |
-
Lookup: bool = False
|
10 |
-
Geo: str = ""
|
11 |
-
Location: bool = False
|
12 |
-
Near: str = None
|
13 |
-
Lang: Optional[str] = None
|
14 |
-
Output: Optional[str] = None
|
15 |
-
Elasticsearch: object = None
|
16 |
-
Year: Optional[int] = None
|
17 |
-
Since: Optional[str] = None
|
18 |
-
Until: Optional[str] = None
|
19 |
-
Email: Optional[str] = None
|
20 |
-
Phone: Optional[str] = None
|
21 |
-
Verified: bool = False
|
22 |
-
Store_csv: bool = False
|
23 |
-
Store_json: bool = False
|
24 |
-
Custom = {"tweet": None, "user": None, "username": None}
|
25 |
-
Show_hashtags: bool = False
|
26 |
-
Show_cashtags: bool = False
|
27 |
-
Limit: Optional[int] = None
|
28 |
-
Count: Optional[int] = None
|
29 |
-
Stats: bool = False
|
30 |
-
Database: object = None
|
31 |
-
To: str = None
|
32 |
-
All = None
|
33 |
-
Debug: bool = False
|
34 |
-
Format = None
|
35 |
-
Essid: str = ""
|
36 |
-
Profile: bool = False
|
37 |
-
Followers: bool = False
|
38 |
-
Following: bool = False
|
39 |
-
Favorites: bool = False
|
40 |
-
TwitterSearch: bool = False
|
41 |
-
User_full: bool = False
|
42 |
-
# Profile_full: bool = False
|
43 |
-
Store_object: bool = False
|
44 |
-
Store_object_tweets_list: list = None
|
45 |
-
Store_object_users_list: list = None
|
46 |
-
Store_object_follow_list: list = None
|
47 |
-
Pandas_type: type = None
|
48 |
-
Pandas: bool = False
|
49 |
-
Index_tweets: str = "twinttweets"
|
50 |
-
Index_follow: str = "twintgraph"
|
51 |
-
Index_users: str = "twintuser"
|
52 |
-
Retries_count: int = 10
|
53 |
-
Resume: object = None
|
54 |
-
Images: bool = False
|
55 |
-
Videos: bool = False
|
56 |
-
Media: bool = False
|
57 |
-
Replies: bool = False
|
58 |
-
Pandas_clean: bool = True
|
59 |
-
Lowercase: bool = True
|
60 |
-
Pandas_au: bool = True
|
61 |
-
Proxy_host: str = ""
|
62 |
-
Proxy_port: int = 0
|
63 |
-
Proxy_type: object = None
|
64 |
-
Tor_control_port: int = 9051
|
65 |
-
Tor_control_password: str = None
|
66 |
-
Retweets: bool = False
|
67 |
-
Query: str = None
|
68 |
-
Hide_output: bool = False
|
69 |
-
Custom_query: str = ""
|
70 |
-
Popular_tweets: bool = False
|
71 |
-
Skip_certs: bool = False
|
72 |
-
Native_retweets: bool = False
|
73 |
-
Min_likes: int = 0
|
74 |
-
Min_retweets: int = 0
|
75 |
-
Min_replies: int = 0
|
76 |
-
Links: Optional[str] = None
|
77 |
-
Source: Optional[str] = None
|
78 |
-
Members_list: Optional[str] = None
|
79 |
-
Filter_retweets: bool = False
|
80 |
-
Translate: bool = False
|
81 |
-
TranslateSrc: str = "en"
|
82 |
-
TranslateDest: str = "en"
|
83 |
-
Backoff_exponent: float = 3.0
|
84 |
-
Min_wait_time: int = 0
|
85 |
-
Bearer_token: str = None
|
86 |
-
Guest_token: str = None
|
87 |
-
deleted: list = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/datelock.py
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
|
3 |
-
import logging as logme
|
4 |
-
|
5 |
-
from .tweet import utc_to_local
|
6 |
-
|
7 |
-
|
8 |
-
class Datelock:
|
9 |
-
until = None
|
10 |
-
since = None
|
11 |
-
_since_def_user = None
|
12 |
-
|
13 |
-
|
14 |
-
def convertToDateTime(string):
|
15 |
-
dateTimeList = string.split()
|
16 |
-
ListLength = len(dateTimeList)
|
17 |
-
if ListLength == 2:
|
18 |
-
return string
|
19 |
-
if ListLength == 1:
|
20 |
-
return string + " 00:00:00"
|
21 |
-
else:
|
22 |
-
return ""
|
23 |
-
|
24 |
-
|
25 |
-
def Set(Until, Since):
|
26 |
-
logme.debug(__name__+':Set')
|
27 |
-
d = Datelock()
|
28 |
-
|
29 |
-
if Until:
|
30 |
-
d.until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
|
31 |
-
d.until = utc_to_local(d.until)
|
32 |
-
else:
|
33 |
-
d.until = datetime.datetime.today()
|
34 |
-
|
35 |
-
if Since:
|
36 |
-
d.since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S")
|
37 |
-
d.since = utc_to_local(d.since)
|
38 |
-
d._since_def_user = True
|
39 |
-
else:
|
40 |
-
d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S")
|
41 |
-
d.since = utc_to_local(d.since)
|
42 |
-
d._since_def_user = False
|
43 |
-
|
44 |
-
return d
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/feed.py
DELETED
@@ -1,145 +0,0 @@
|
|
1 |
-
import time
|
2 |
-
from datetime import datetime
|
3 |
-
|
4 |
-
from bs4 import BeautifulSoup
|
5 |
-
from re import findall
|
6 |
-
from json import loads
|
7 |
-
|
8 |
-
import logging as logme
|
9 |
-
|
10 |
-
from .tweet import utc_to_local, Tweet_formats
|
11 |
-
|
12 |
-
|
13 |
-
class NoMoreTweetsException(Exception):
|
14 |
-
def __init__(self, msg):
|
15 |
-
super().__init__(msg)
|
16 |
-
|
17 |
-
|
18 |
-
def Follow(response):
|
19 |
-
logme.debug(__name__ + ':Follow')
|
20 |
-
soup = BeautifulSoup(response, "html.parser")
|
21 |
-
follow = soup.find_all("td", "info fifty screenname")
|
22 |
-
cursor = soup.find_all("div", "w-button-more")
|
23 |
-
try:
|
24 |
-
cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
|
25 |
-
except IndexError:
|
26 |
-
logme.critical(__name__ + ':Follow:IndexError')
|
27 |
-
|
28 |
-
return follow, cursor
|
29 |
-
|
30 |
-
|
31 |
-
# TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future
|
32 |
-
def Mobile(response):
|
33 |
-
logme.debug(__name__ + ':Mobile')
|
34 |
-
soup = BeautifulSoup(response, "html.parser")
|
35 |
-
tweets = soup.find_all("span", "metadata")
|
36 |
-
max_id = soup.find_all("div", "w-button-more")
|
37 |
-
try:
|
38 |
-
max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
|
39 |
-
except Exception as e:
|
40 |
-
logme.critical(__name__ + ':Mobile:' + str(e))
|
41 |
-
|
42 |
-
return tweets, max_id
|
43 |
-
|
44 |
-
|
45 |
-
def MobileFav(response):
|
46 |
-
soup = BeautifulSoup(response, "html.parser")
|
47 |
-
tweets = soup.find_all("table", "tweet")
|
48 |
-
max_id = soup.find_all("div", "w-button-more")
|
49 |
-
try:
|
50 |
-
max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
|
51 |
-
except Exception as e:
|
52 |
-
print(str(e) + " [x] feed.MobileFav")
|
53 |
-
|
54 |
-
return tweets, max_id
|
55 |
-
|
56 |
-
|
57 |
-
def _get_cursor(response):
|
58 |
-
if isinstance(response, dict): # case 1
|
59 |
-
try:
|
60 |
-
next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
|
61 |
-
'operation']['cursor']['value']
|
62 |
-
except KeyError:
|
63 |
-
# this is needed because after the first request location of cursor is changed
|
64 |
-
next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
|
65 |
-
'cursor']['value']
|
66 |
-
else: # case 2
|
67 |
-
next_cursor = response[-1]['content']['value']
|
68 |
-
return next_cursor
|
69 |
-
|
70 |
-
|
71 |
-
def Json(response):
|
72 |
-
logme.debug(__name__ + ':Json')
|
73 |
-
json_response = loads(response)
|
74 |
-
html = json_response["items_html"]
|
75 |
-
soup = BeautifulSoup(html, "html.parser")
|
76 |
-
feed = soup.find_all("div", "tweet")
|
77 |
-
return feed, json_response["min_position"]
|
78 |
-
|
79 |
-
|
80 |
-
def parse_tweets(config, response):
|
81 |
-
logme.debug(__name__ + ':parse_tweets')
|
82 |
-
response = loads(response)
|
83 |
-
feed = []
|
84 |
-
if 'globalObjects' in response:
|
85 |
-
if len(response['globalObjects']['tweets']) == 0:
|
86 |
-
msg = 'No more data!'
|
87 |
-
raise NoMoreTweetsException(msg)
|
88 |
-
for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
|
89 |
-
# this will handle the cases when the timeline entry is a tweet
|
90 |
-
if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
|
91 |
-
timeline_entry['entryId'].startswith('tweet-')):
|
92 |
-
if 'tweet' in timeline_entry['content']['item']['content']:
|
93 |
-
_id = timeline_entry['content']['item']['content']['tweet']['id']
|
94 |
-
# skip the ads
|
95 |
-
if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
|
96 |
-
continue
|
97 |
-
elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
|
98 |
-
timeline_entry['content']['item']['content']['tombstone']:
|
99 |
-
_id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
|
100 |
-
else:
|
101 |
-
_id = None
|
102 |
-
if _id is None:
|
103 |
-
raise ValueError('Unable to find ID of tweet in timeline.')
|
104 |
-
try:
|
105 |
-
temp_obj = response['globalObjects']['tweets'][_id]
|
106 |
-
except KeyError:
|
107 |
-
logme.info('encountered a deleted tweet with id {}'.format(_id))
|
108 |
-
|
109 |
-
config.deleted.append(_id)
|
110 |
-
continue
|
111 |
-
temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
|
112 |
-
if 'retweeted_status_id_str' in temp_obj:
|
113 |
-
rt_id = temp_obj['retweeted_status_id_str']
|
114 |
-
_dt = response['globalObjects']['tweets'][rt_id]['created_at']
|
115 |
-
_dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
|
116 |
-
_dt = utc_to_local(_dt)
|
117 |
-
_dt = str(_dt.strftime(Tweet_formats['datetime']))
|
118 |
-
temp_obj['retweet_data'] = {
|
119 |
-
'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
|
120 |
-
'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
|
121 |
-
'retweet_id': rt_id,
|
122 |
-
'retweet_date': _dt,
|
123 |
-
}
|
124 |
-
feed.append(temp_obj)
|
125 |
-
next_cursor = _get_cursor(response) # case 1
|
126 |
-
else:
|
127 |
-
response = response['data']['user']['result']['timeline']
|
128 |
-
entries = response['timeline']['instructions']
|
129 |
-
for e in entries:
|
130 |
-
if e.get('entries'):
|
131 |
-
entries = e['entries']
|
132 |
-
break
|
133 |
-
if len(entries) == 2:
|
134 |
-
msg = 'No more data!'
|
135 |
-
raise NoMoreTweetsException(msg)
|
136 |
-
for timeline_entry in entries:
|
137 |
-
if timeline_entry['content'].get('itemContent'):
|
138 |
-
try:
|
139 |
-
temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
|
140 |
-
temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result']['core']['user_results']['result']['legacy']
|
141 |
-
feed.append(temp_obj)
|
142 |
-
except KeyError: # doubtful
|
143 |
-
next
|
144 |
-
next_cursor = _get_cursor(entries) # case 2
|
145 |
-
return feed, next_cursor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/format.py
DELETED
@@ -1,91 +0,0 @@
|
|
1 |
-
import logging as logme
|
2 |
-
|
3 |
-
def Tweet(config, t):
|
4 |
-
if config.Format:
|
5 |
-
logme.debug(__name__+':Tweet:Format')
|
6 |
-
output = config.Format.replace("{id}", t.id_str)
|
7 |
-
output = output.replace("{conversation_id}", t.conversation_id)
|
8 |
-
output = output.replace("{date}", t.datestamp)
|
9 |
-
output = output.replace("{time}", t.timestamp)
|
10 |
-
output = output.replace("{user_id}", t.user_id_str)
|
11 |
-
output = output.replace("{username}", t.username)
|
12 |
-
output = output.replace("{name}", t.name)
|
13 |
-
output = output.replace("{place}", t.place)
|
14 |
-
output = output.replace("{timezone}", t.timezone)
|
15 |
-
output = output.replace("{urls}", ",".join(t.urls))
|
16 |
-
output = output.replace("{photos}", ",".join(t.photos))
|
17 |
-
output = output.replace("{video}", str(t.video))
|
18 |
-
output = output.replace("{thumbnail}", t.thumbnail)
|
19 |
-
output = output.replace("{tweet}", t.tweet)
|
20 |
-
output = output.replace("{language}", t.lang)
|
21 |
-
output = output.replace("{hashtags}", ",".join(t.hashtags))
|
22 |
-
output = output.replace("{cashtags}", ",".join(t.cashtags))
|
23 |
-
output = output.replace("{replies}", t.replies_count)
|
24 |
-
output = output.replace("{retweets}", t.retweets_count)
|
25 |
-
output = output.replace("{likes}", t.likes_count)
|
26 |
-
output = output.replace("{link}", t.link)
|
27 |
-
output = output.replace("{is_retweet}", str(t.retweet))
|
28 |
-
output = output.replace("{user_rt_id}", str(t.user_rt_id))
|
29 |
-
output = output.replace("{quote_url}", t.quote_url)
|
30 |
-
output = output.replace("{near}", t.near)
|
31 |
-
output = output.replace("{geo}", t.geo)
|
32 |
-
output = output.replace("{mentions}", ",".join(t.mentions))
|
33 |
-
output = output.replace("{translate}", t.translate)
|
34 |
-
output = output.replace("{trans_src}", t.trans_src)
|
35 |
-
output = output.replace("{trans_dest}", t.trans_dest)
|
36 |
-
else:
|
37 |
-
logme.debug(__name__+':Tweet:notFormat')
|
38 |
-
output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
|
39 |
-
|
40 |
-
# TODO: someone who is familiar with this code, needs to take a look at what this is <also see tweet.py>
|
41 |
-
# if t.retweet:
|
42 |
-
# output += "RT "
|
43 |
-
|
44 |
-
output += f"<{t.username}> {t.tweet}"
|
45 |
-
|
46 |
-
if config.Show_hashtags:
|
47 |
-
hashtags = ",".join(t.hashtags)
|
48 |
-
output += f" {hashtags}"
|
49 |
-
if config.Show_cashtags:
|
50 |
-
cashtags = ",".join(t.cashtags)
|
51 |
-
output += f" {cashtags}"
|
52 |
-
if config.Stats:
|
53 |
-
output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
|
54 |
-
if config.Translate:
|
55 |
-
output += f" {t.translate} {t.trans_src} {t.trans_dest}"
|
56 |
-
return output
|
57 |
-
|
58 |
-
def User(_format, u):
|
59 |
-
if _format:
|
60 |
-
logme.debug(__name__+':User:Format')
|
61 |
-
output = _format.replace("{id}", str(u.id))
|
62 |
-
output = output.replace("{name}", u.name)
|
63 |
-
output = output.replace("{username}", u.username)
|
64 |
-
output = output.replace("{bio}", u.bio)
|
65 |
-
output = output.replace("{location}", u.location)
|
66 |
-
output = output.replace("{url}", u.url)
|
67 |
-
output = output.replace("{join_date}", u.join_date)
|
68 |
-
output = output.replace("{join_time}", u.join_time)
|
69 |
-
output = output.replace("{tweets}", str(u.tweets))
|
70 |
-
output = output.replace("{following}", str(u.following))
|
71 |
-
output = output.replace("{followers}", str(u.followers))
|
72 |
-
output = output.replace("{likes}", str(u.likes))
|
73 |
-
output = output.replace("{media}", str(u.media_count))
|
74 |
-
output = output.replace("{private}", str(u.is_private))
|
75 |
-
output = output.replace("{verified}", str(u.is_verified))
|
76 |
-
output = output.replace("{avatar}", u.avatar)
|
77 |
-
if u.background_image:
|
78 |
-
output = output.replace("{background_image}", u.background_image)
|
79 |
-
else:
|
80 |
-
output = output.replace("{background_image}", "")
|
81 |
-
else:
|
82 |
-
logme.debug(__name__+':User:notFormat')
|
83 |
-
output = f"{u.id} | {u.name} | @{u.username} | Private: "
|
84 |
-
output += f"{u.is_private} | Verified: {u.is_verified} |"
|
85 |
-
output += f" Bio: {u.bio} | Location: {u.location} | Url: "
|
86 |
-
output += f"{u.url} | Joined: {u.join_date} {u.join_time} "
|
87 |
-
output += f"| Tweets: {u.tweets} | Following: {u.following}"
|
88 |
-
output += f" | Followers: {u.followers} | Likes: {u.likes} "
|
89 |
-
output += f"| Media: {u.media_count} | Avatar: {u.avatar}"
|
90 |
-
|
91 |
-
return output
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/get.py
DELETED
@@ -1,298 +0,0 @@
|
|
1 |
-
from async_timeout import timeout
|
2 |
-
from datetime import datetime
|
3 |
-
from bs4 import BeautifulSoup
|
4 |
-
import sys
|
5 |
-
import socket
|
6 |
-
import aiohttp
|
7 |
-
from fake_useragent import UserAgent
|
8 |
-
import asyncio
|
9 |
-
import concurrent.futures
|
10 |
-
import random
|
11 |
-
from json import loads, dumps
|
12 |
-
from aiohttp_socks import ProxyConnector, ProxyType
|
13 |
-
from urllib.parse import quote
|
14 |
-
import time
|
15 |
-
|
16 |
-
from . import url
|
17 |
-
from .output import Tweets, Users
|
18 |
-
from .token import TokenExpiryException
|
19 |
-
|
20 |
-
import logging as logme
|
21 |
-
|
22 |
-
httpproxy = None
|
23 |
-
|
24 |
-
user_agent_list = [
|
25 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
26 |
-
# ' Chrome/60.0.3112.113 Safari/537.36',
|
27 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
28 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
29 |
-
# 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
30 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
31 |
-
# 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
32 |
-
# ' Chrome/60.0.3112.90 Safari/537.36',
|
33 |
-
# 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
34 |
-
# ' Chrome/44.0.2403.157 Safari/537.36',
|
35 |
-
# 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
36 |
-
# ' Chrome/60.0.3112.113 Safari/537.36',
|
37 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
38 |
-
# ' Chrome/57.0.2987.133 Safari/537.36',
|
39 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
40 |
-
# ' Chrome/57.0.2987.133 Safari/537.36',
|
41 |
-
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
42 |
-
# ' Chrome/55.0.2883.87 Safari/537.36',
|
43 |
-
# 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
|
44 |
-
# ' Chrome/55.0.2883.87 Safari/537.36',
|
45 |
-
|
46 |
-
'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
|
47 |
-
'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
48 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
|
49 |
-
'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
|
50 |
-
'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
51 |
-
'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
52 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
|
53 |
-
'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
|
54 |
-
'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
|
55 |
-
'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
|
56 |
-
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
|
57 |
-
'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
|
58 |
-
'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
|
59 |
-
'CLR 3.5.30729)',
|
60 |
-
]
|
61 |
-
|
62 |
-
|
63 |
-
# function to convert python `dict` to json and then encode it to be passed in the url as a parameter
|
64 |
-
# some urls require this format
|
65 |
-
def dict_to_url(dct):
|
66 |
-
return quote(dumps(dct))
|
67 |
-
|
68 |
-
|
69 |
-
def get_connector(config):
|
70 |
-
logme.debug(__name__ + ':get_connector')
|
71 |
-
_connector = None
|
72 |
-
if config.Proxy_host:
|
73 |
-
if config.Proxy_host.lower() == "tor":
|
74 |
-
_connector = ProxyConnector(
|
75 |
-
host='127.0.0.1',
|
76 |
-
port=9050,
|
77 |
-
rdns=True)
|
78 |
-
elif config.Proxy_port and config.Proxy_type:
|
79 |
-
if config.Proxy_type.lower() == "socks5":
|
80 |
-
_type = ProxyType.SOCKS5
|
81 |
-
elif config.Proxy_type.lower() == "socks4":
|
82 |
-
_type = ProxyType.SOCKS4
|
83 |
-
elif config.Proxy_type.lower() == "http":
|
84 |
-
global httpproxy
|
85 |
-
httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
|
86 |
-
return _connector
|
87 |
-
else:
|
88 |
-
logme.critical("get_connector:proxy-type-error")
|
89 |
-
print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
|
90 |
-
sys.exit(1)
|
91 |
-
_connector = ProxyConnector(
|
92 |
-
proxy_type=_type,
|
93 |
-
host=config.Proxy_host,
|
94 |
-
port=config.Proxy_port,
|
95 |
-
rdns=True)
|
96 |
-
else:
|
97 |
-
logme.critical(__name__ + ':get_connector:proxy-port-type-error')
|
98 |
-
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
|
99 |
-
sys.exit(1)
|
100 |
-
else:
|
101 |
-
if config.Proxy_port or config.Proxy_type:
|
102 |
-
logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
|
103 |
-
print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
|
104 |
-
sys.exit(1)
|
105 |
-
|
106 |
-
return _connector
|
107 |
-
|
108 |
-
|
109 |
-
async def RequestUrl(config, init):
|
110 |
-
logme.debug(__name__ + ':RequestUrl')
|
111 |
-
_connector = get_connector(config)
|
112 |
-
_serialQuery = ""
|
113 |
-
params = []
|
114 |
-
_url = ""
|
115 |
-
_headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
|
116 |
-
|
117 |
-
# TODO : do this later
|
118 |
-
if config.Profile:
|
119 |
-
logme.debug(__name__ + ':RequestUrl:Profile')
|
120 |
-
_url, params, _serialQuery = url.SearchProfile(config, init)
|
121 |
-
elif config.TwitterSearch:
|
122 |
-
logme.debug(__name__ + ':RequestUrl:TwitterSearch')
|
123 |
-
_url, params, _serialQuery = await url.Search(config, init)
|
124 |
-
else:
|
125 |
-
if config.Following:
|
126 |
-
logme.debug(__name__ + ':RequestUrl:Following')
|
127 |
-
_url = await url.Following(config.Username, init)
|
128 |
-
elif config.Followers:
|
129 |
-
logme.debug(__name__ + ':RequestUrl:Followers')
|
130 |
-
_url = await url.Followers(config.Username, init)
|
131 |
-
else:
|
132 |
-
logme.debug(__name__ + ':RequestUrl:Favorites')
|
133 |
-
_url = await url.Favorites(config.Username, init)
|
134 |
-
_serialQuery = _url
|
135 |
-
|
136 |
-
response = await Request(_url, params=params, connector=_connector, headers=_headers)
|
137 |
-
|
138 |
-
if config.Debug:
|
139 |
-
print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))
|
140 |
-
|
141 |
-
return response
|
142 |
-
|
143 |
-
|
144 |
-
def ForceNewTorIdentity(config):
|
145 |
-
logme.debug(__name__ + ':ForceNewTorIdentity')
|
146 |
-
try:
|
147 |
-
tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
|
148 |
-
tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
|
149 |
-
response = tor_c.recv(1024)
|
150 |
-
if response != b'250 OK\r\n250 OK\r\n':
|
151 |
-
sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
|
152 |
-
logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
|
153 |
-
except Exception as e:
|
154 |
-
logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
|
155 |
-
sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
|
156 |
-
sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
|
157 |
-
|
158 |
-
|
159 |
-
async def Request(_url, connector=None, params=None, headers=None):
|
160 |
-
logme.debug(__name__ + ':Request:Connector')
|
161 |
-
async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
|
162 |
-
return await Response(session, _url, params)
|
163 |
-
|
164 |
-
|
165 |
-
async def Response(session, _url, params=None):
|
166 |
-
logme.debug(__name__ + ':Response')
|
167 |
-
retries = 5
|
168 |
-
wait = 10 # No basis, maybe work with 0
|
169 |
-
for attempt in range(retries + 1):
|
170 |
-
try:
|
171 |
-
with timeout(120):
|
172 |
-
async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
|
173 |
-
resp = await response.text()
|
174 |
-
if response.status == 429: # 429 implies Too many requests i.e. Rate Limit Exceeded
|
175 |
-
raise TokenExpiryException(loads(resp)['errors'][0]['message'])
|
176 |
-
return resp
|
177 |
-
except aiohttp.client_exceptions.ClientConnectorError as exc:
|
178 |
-
if attempt < retries:
|
179 |
-
retrying = ', retrying'
|
180 |
-
level = logme.WARNING
|
181 |
-
else:
|
182 |
-
retrying = ''
|
183 |
-
level = logme.ERROR
|
184 |
-
logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
|
185 |
-
if attempt < retries:
|
186 |
-
time.sleep(wait)
|
187 |
-
else:
|
188 |
-
logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
|
189 |
-
raise TokenExpiryException(f'{exc!r}')
|
190 |
-
|
191 |
-
|
192 |
-
async def RandomUserAgent(wa=None):
|
193 |
-
logme.debug(__name__ + ':RandomUserAgent')
|
194 |
-
try:
|
195 |
-
if wa:
|
196 |
-
return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
|
197 |
-
return UserAgent(verify_ssl=False, use_cache_server=False).random
|
198 |
-
except:
|
199 |
-
return random.choice(user_agent_list)
|
200 |
-
|
201 |
-
|
202 |
-
async def Username(_id, bearer_token, guest_token):
|
203 |
-
logme.debug(__name__ + ':Username')
|
204 |
-
_dct = {'userId': _id, 'withHighlightedLabel': False}
|
205 |
-
_url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
|
206 |
-
_headers = {
|
207 |
-
'authorization': bearer_token,
|
208 |
-
'x-guest-token': guest_token,
|
209 |
-
}
|
210 |
-
r = await Request(_url, headers=_headers)
|
211 |
-
j_r = loads(r)
|
212 |
-
username = j_r['data']['user']['legacy']['screen_name']
|
213 |
-
return username
|
214 |
-
|
215 |
-
|
216 |
-
async def Tweet(url, config, conn):
|
217 |
-
logme.debug(__name__ + ':Tweet')
|
218 |
-
try:
|
219 |
-
response = await Request(url)
|
220 |
-
soup = BeautifulSoup(response, "html.parser")
|
221 |
-
tweets = soup.find_all("div", "tweet")
|
222 |
-
await Tweets(tweets, config, conn, url)
|
223 |
-
except Exception as e:
|
224 |
-
logme.critical(__name__ + ':Tweet:' + str(e))
|
225 |
-
|
226 |
-
|
227 |
-
async def User(username, config, conn, user_id=False):
|
228 |
-
logme.debug(__name__ + ':User')
|
229 |
-
_dct = {'screen_name': username, 'withHighlightedLabel': False}
|
230 |
-
_url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
|
231 |
-
.format(dict_to_url(_dct))
|
232 |
-
_headers = {
|
233 |
-
'authorization': config.Bearer_token,
|
234 |
-
'x-guest-token': config.Guest_token,
|
235 |
-
}
|
236 |
-
try:
|
237 |
-
response = await Request(_url, headers=_headers)
|
238 |
-
j_r = loads(response)
|
239 |
-
if user_id:
|
240 |
-
try:
|
241 |
-
_id = j_r['data']['user']['rest_id']
|
242 |
-
return _id
|
243 |
-
except KeyError as e:
|
244 |
-
logme.critical(__name__ + ':User:' + str(e))
|
245 |
-
return
|
246 |
-
await Users(j_r, config, conn)
|
247 |
-
except Exception as e:
|
248 |
-
logme.critical(__name__ + ':User:' + str(e))
|
249 |
-
raise
|
250 |
-
|
251 |
-
|
252 |
-
def Limit(Limit, count):
|
253 |
-
logme.debug(__name__ + ':Limit')
|
254 |
-
if Limit is not None and count >= int(Limit):
|
255 |
-
return True
|
256 |
-
|
257 |
-
|
258 |
-
async def Multi(feed, config, conn):
|
259 |
-
logme.debug(__name__ + ':Multi')
|
260 |
-
count = 0
|
261 |
-
try:
|
262 |
-
with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
|
263 |
-
loop = asyncio.get_event_loop()
|
264 |
-
futures = []
|
265 |
-
for tweet in feed:
|
266 |
-
count += 1
|
267 |
-
if config.Favorites or config.Profile_full:
|
268 |
-
logme.debug(__name__ + ':Multi:Favorites-profileFull')
|
269 |
-
link = tweet.find("a")["href"]
|
270 |
-
url = f"https://twitter.com{link}&lang=en"
|
271 |
-
elif config.User_full:
|
272 |
-
logme.debug(__name__ + ':Multi:userFull')
|
273 |
-
username = tweet.find("a")["name"]
|
274 |
-
url = f"http://twitter.com/{username}?lang=en"
|
275 |
-
else:
|
276 |
-
logme.debug(__name__ + ':Multi:else-url')
|
277 |
-
link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
|
278 |
-
url = f"https://twitter.com{link}?lang=en"
|
279 |
-
|
280 |
-
if config.User_full:
|
281 |
-
logme.debug(__name__ + ':Multi:user-full-Run')
|
282 |
-
futures.append(loop.run_in_executor(executor, await User(url,
|
283 |
-
config, conn)))
|
284 |
-
else:
|
285 |
-
logme.debug(__name__ + ':Multi:notUser-full-Run')
|
286 |
-
futures.append(loop.run_in_executor(executor, await Tweet(url,
|
287 |
-
config, conn)))
|
288 |
-
logme.debug(__name__ + ':Multi:asyncioGather')
|
289 |
-
await asyncio.gather(*futures)
|
290 |
-
except Exception as e:
|
291 |
-
# TODO: fix error not error
|
292 |
-
# print(str(e) + " [x] get.Multi")
|
293 |
-
# will return "'NoneType' object is not callable"
|
294 |
-
# but still works
|
295 |
-
# logme.critical(__name__+':Multi:' + str(e))
|
296 |
-
pass
|
297 |
-
|
298 |
-
return count
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/output.py
DELETED
@@ -1,241 +0,0 @@
|
|
1 |
-
from datetime import datetime
|
2 |
-
|
3 |
-
from . import format, get
|
4 |
-
from .tweet import Tweet
|
5 |
-
from .user import User
|
6 |
-
from .storage import db, elasticsearch, write, panda
|
7 |
-
|
8 |
-
import logging as logme
|
9 |
-
|
10 |
-
follows_list = []
|
11 |
-
tweets_list = []
|
12 |
-
users_list = []
|
13 |
-
|
14 |
-
author_list = {''}
|
15 |
-
author_list.pop()
|
16 |
-
|
17 |
-
# used by Pandas
|
18 |
-
_follows_object = {}
|
19 |
-
|
20 |
-
|
21 |
-
def _formatDateTime(datetimestamp):
|
22 |
-
try:
|
23 |
-
return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
-
except ValueError:
|
25 |
-
return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
|
26 |
-
|
27 |
-
|
28 |
-
def _clean_follow_list():
|
29 |
-
logme.debug(__name__ + ':clean_follow_list')
|
30 |
-
global _follows_object
|
31 |
-
_follows_object = {}
|
32 |
-
|
33 |
-
|
34 |
-
def clean_lists():
|
35 |
-
logme.debug(__name__ + ':clean_lists')
|
36 |
-
global follows_list
|
37 |
-
global tweets_list
|
38 |
-
global users_list
|
39 |
-
follows_list = []
|
40 |
-
tweets_list = []
|
41 |
-
users_list = []
|
42 |
-
|
43 |
-
|
44 |
-
def datecheck(datetimestamp, config):
|
45 |
-
logme.debug(__name__ + ':datecheck')
|
46 |
-
if config.Since:
|
47 |
-
logme.debug(__name__ + ':datecheck:SinceTrue')
|
48 |
-
|
49 |
-
d = _formatDateTime(datetimestamp)
|
50 |
-
s = _formatDateTime(config.Since)
|
51 |
-
|
52 |
-
if d < s:
|
53 |
-
return False
|
54 |
-
if config.Until:
|
55 |
-
logme.debug(__name__ + ':datecheck:UntilTrue')
|
56 |
-
|
57 |
-
d = _formatDateTime(datetimestamp)
|
58 |
-
s = _formatDateTime(config.Until)
|
59 |
-
|
60 |
-
if d > s:
|
61 |
-
return False
|
62 |
-
logme.debug(__name__ + ':datecheck:dateRangeFalse')
|
63 |
-
return True
|
64 |
-
|
65 |
-
|
66 |
-
# TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
|
67 |
-
# `tweets` list along with the other tweets
|
68 |
-
def is_tweet(tw):
|
69 |
-
try:
|
70 |
-
tw["data-item-id"]
|
71 |
-
logme.debug(__name__ + ':is_tweet:True')
|
72 |
-
return True
|
73 |
-
except:
|
74 |
-
logme.critical(__name__ + ':is_tweet:False')
|
75 |
-
return False
|
76 |
-
|
77 |
-
|
78 |
-
def _output(obj, output, config, **extra):
|
79 |
-
logme.debug(__name__ + ':_output')
|
80 |
-
if config.Lowercase:
|
81 |
-
if isinstance(obj, str):
|
82 |
-
logme.debug(__name__ + ':_output:Lowercase:username')
|
83 |
-
obj = obj.lower()
|
84 |
-
elif obj.__class__.__name__ == "user":
|
85 |
-
logme.debug(__name__ + ':_output:Lowercase:user')
|
86 |
-
pass
|
87 |
-
elif obj.__class__.__name__ == "tweet":
|
88 |
-
logme.debug(__name__ + ':_output:Lowercase:tweet')
|
89 |
-
obj.username = obj.username.lower()
|
90 |
-
author_list.update({obj.username})
|
91 |
-
for dct in obj.mentions:
|
92 |
-
for key, val in dct.items():
|
93 |
-
dct[key] = val.lower()
|
94 |
-
for i in range(len(obj.hashtags)):
|
95 |
-
obj.hashtags[i] = obj.hashtags[i].lower()
|
96 |
-
for i in range(len(obj.cashtags)):
|
97 |
-
obj.cashtags[i] = obj.cashtags[i].lower()
|
98 |
-
else:
|
99 |
-
logme.info('_output:Lowercase:hiddenTweetFound')
|
100 |
-
print("[x] Hidden tweet found, account suspended due to violation of TOS")
|
101 |
-
return
|
102 |
-
if config.Output != None:
|
103 |
-
if config.Store_csv:
|
104 |
-
try:
|
105 |
-
write.Csv(obj, config)
|
106 |
-
logme.debug(__name__ + ':_output:CSV')
|
107 |
-
except Exception as e:
|
108 |
-
logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
|
109 |
-
print(str(e) + " [x] output._output")
|
110 |
-
elif config.Store_json:
|
111 |
-
write.Json(obj, config)
|
112 |
-
logme.debug(__name__ + ':_output:JSON')
|
113 |
-
else:
|
114 |
-
write.Text(output, config.Output)
|
115 |
-
logme.debug(__name__ + ':_output:Text')
|
116 |
-
|
117 |
-
if config.Elasticsearch:
|
118 |
-
logme.debug(__name__ + ':_output:Elasticsearch')
|
119 |
-
print("", end=".", flush=True)
|
120 |
-
else:
|
121 |
-
if not config.Hide_output:
|
122 |
-
try:
|
123 |
-
print(output.replace('\n', ' '))
|
124 |
-
except UnicodeEncodeError:
|
125 |
-
logme.critical(__name__ + ':_output:UnicodeEncodeError')
|
126 |
-
print("unicode error [x] output._output")
|
127 |
-
|
128 |
-
|
129 |
-
async def checkData(tweet, config, conn):
|
130 |
-
logme.debug(__name__ + ':checkData')
|
131 |
-
tweet = Tweet(tweet, config)
|
132 |
-
if not tweet.datestamp:
|
133 |
-
logme.critical(__name__ + ':checkData:hiddenTweetFound')
|
134 |
-
print("[x] Hidden tweet found, account suspended due to violation of TOS")
|
135 |
-
return
|
136 |
-
if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
|
137 |
-
output = format.Tweet(config, tweet)
|
138 |
-
if config.Database:
|
139 |
-
logme.debug(__name__ + ':checkData:Database')
|
140 |
-
db.tweets(conn, tweet, config)
|
141 |
-
if config.Pandas:
|
142 |
-
logme.debug(__name__ + ':checkData:Pandas')
|
143 |
-
panda.update(tweet, config)
|
144 |
-
if config.Store_object:
|
145 |
-
logme.debug(__name__ + ':checkData:Store_object')
|
146 |
-
if hasattr(config.Store_object_tweets_list, 'append'):
|
147 |
-
config.Store_object_tweets_list.append(tweet)
|
148 |
-
else:
|
149 |
-
tweets_list.append(tweet)
|
150 |
-
if config.Elasticsearch:
|
151 |
-
logme.debug(__name__ + ':checkData:Elasticsearch')
|
152 |
-
elasticsearch.Tweet(tweet, config)
|
153 |
-
_output(tweet, output, config)
|
154 |
-
# else:
|
155 |
-
# logme.critical(__name__+':checkData:copyrightedTweet')
|
156 |
-
|
157 |
-
|
158 |
-
async def Tweets(tweets, config, conn):
|
159 |
-
logme.debug(__name__ + ':Tweets')
|
160 |
-
if config.Favorites or config.Location:
|
161 |
-
logme.debug(__name__ + ':Tweets:fav+full+loc')
|
162 |
-
for tw in tweets:
|
163 |
-
await checkData(tw, config, conn)
|
164 |
-
elif config.TwitterSearch or config.Profile:
|
165 |
-
logme.debug(__name__ + ':Tweets:TwitterSearch')
|
166 |
-
await checkData(tweets, config, conn)
|
167 |
-
else:
|
168 |
-
logme.debug(__name__ + ':Tweets:else')
|
169 |
-
if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
|
170 |
-
await checkData(tweets, config, conn)
|
171 |
-
|
172 |
-
|
173 |
-
async def Users(u, config, conn):
|
174 |
-
logme.debug(__name__ + ':User')
|
175 |
-
global users_list
|
176 |
-
|
177 |
-
user = User(u)
|
178 |
-
output = format.User(config.Format, user)
|
179 |
-
|
180 |
-
if config.Database:
|
181 |
-
logme.debug(__name__ + ':User:Database')
|
182 |
-
db.user(conn, config, user)
|
183 |
-
|
184 |
-
if config.Elasticsearch:
|
185 |
-
logme.debug(__name__ + ':User:Elasticsearch')
|
186 |
-
_save_date = user.join_date
|
187 |
-
_save_time = user.join_time
|
188 |
-
user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
|
189 |
-
user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
|
190 |
-
elasticsearch.UserProfile(user, config)
|
191 |
-
user.join_date = _save_date
|
192 |
-
user.join_time = _save_time
|
193 |
-
|
194 |
-
if config.Store_object:
|
195 |
-
logme.debug(__name__ + ':User:Store_object')
|
196 |
-
|
197 |
-
if hasattr(config.Store_object_follow_list, 'append'):
|
198 |
-
config.Store_object_follow_list.append(user)
|
199 |
-
elif hasattr(config.Store_object_users_list, 'append'):
|
200 |
-
config.Store_object_users_list.append(user)
|
201 |
-
else:
|
202 |
-
users_list.append(user) # twint.user.user
|
203 |
-
|
204 |
-
if config.Pandas:
|
205 |
-
logme.debug(__name__ + ':User:Pandas+user')
|
206 |
-
panda.update(user, config)
|
207 |
-
|
208 |
-
_output(user, output, config)
|
209 |
-
|
210 |
-
|
211 |
-
async def Username(username, config, conn):
|
212 |
-
logme.debug(__name__ + ':Username')
|
213 |
-
global _follows_object
|
214 |
-
global follows_list
|
215 |
-
follow_var = config.Following * "following" + config.Followers * "followers"
|
216 |
-
|
217 |
-
if config.Database:
|
218 |
-
logme.debug(__name__ + ':Username:Database')
|
219 |
-
db.follow(conn, config.Username, config.Followers, username)
|
220 |
-
|
221 |
-
if config.Elasticsearch:
|
222 |
-
logme.debug(__name__ + ':Username:Elasticsearch')
|
223 |
-
elasticsearch.Follow(username, config)
|
224 |
-
|
225 |
-
if config.Store_object:
|
226 |
-
if hasattr(config.Store_object_follow_list, 'append'):
|
227 |
-
config.Store_object_follow_list.append(username)
|
228 |
-
else:
|
229 |
-
follows_list.append(username) # twint.user.user
|
230 |
-
|
231 |
-
if config.Pandas:
|
232 |
-
logme.debug(__name__ + ':Username:object+pandas')
|
233 |
-
try:
|
234 |
-
_ = _follows_object[config.Username][follow_var]
|
235 |
-
except KeyError:
|
236 |
-
_follows_object.update({config.Username: {follow_var: []}})
|
237 |
-
_follows_object[config.Username][follow_var].append(username)
|
238 |
-
if config.Pandas_au:
|
239 |
-
logme.debug(__name__ + ':Username:object+pandas+au')
|
240 |
-
panda.update(_follows_object[config.Username], config)
|
241 |
-
_output(username, username, config)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/run.py
DELETED
@@ -1,412 +0,0 @@
|
|
1 |
-
import sys, os, datetime
|
2 |
-
from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
|
3 |
-
|
4 |
-
from . import datelock, feed, get, output, verbose, storage
|
5 |
-
from .token import TokenExpiryException
|
6 |
-
from . import token
|
7 |
-
from .storage import db
|
8 |
-
from .feed import NoMoreTweetsException
|
9 |
-
|
10 |
-
import logging as logme
|
11 |
-
|
12 |
-
import time
|
13 |
-
|
14 |
-
bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
|
15 |
-
'%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
|
16 |
-
|
17 |
-
|
18 |
-
class Twint:
|
19 |
-
def __init__(self, config):
|
20 |
-
logme.debug(__name__ + ':Twint:__init__')
|
21 |
-
if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
|
22 |
-
logme.debug(__name__ + ':Twint:__init__:Resume')
|
23 |
-
self.init = self.get_resume(config.Resume)
|
24 |
-
else:
|
25 |
-
self.init = -1
|
26 |
-
|
27 |
-
config.deleted = []
|
28 |
-
self.feed: list = [-1]
|
29 |
-
self.count = 0
|
30 |
-
self.user_agent = ""
|
31 |
-
self.config = config
|
32 |
-
self.config.Bearer_token = bearer
|
33 |
-
# TODO might have to make some adjustments for it to work with multi-treading
|
34 |
-
# USAGE : to get a new guest token simply do `self.token.refresh()`
|
35 |
-
self.token = token.Token(config)
|
36 |
-
self.token.refresh()
|
37 |
-
self.conn = db.Conn(config.Database)
|
38 |
-
self.d = datelock.Set(self.config.Until, self.config.Since)
|
39 |
-
verbose.Elastic(config.Elasticsearch)
|
40 |
-
|
41 |
-
if self.config.Store_object:
|
42 |
-
logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
|
43 |
-
output._clean_follow_list()
|
44 |
-
|
45 |
-
if self.config.Pandas_clean:
|
46 |
-
logme.debug(__name__ + ':Twint:__init__:pandas_clean')
|
47 |
-
storage.panda.clean()
|
48 |
-
|
49 |
-
def get_resume(self, resumeFile):
|
50 |
-
if not os.path.exists(resumeFile):
|
51 |
-
return '-1'
|
52 |
-
with open(resumeFile, 'r') as rFile:
|
53 |
-
_init = rFile.readlines()[-1].strip('\n')
|
54 |
-
return _init
|
55 |
-
|
56 |
-
async def Feed(self):
|
57 |
-
logme.debug(__name__ + ':Twint:Feed')
|
58 |
-
consecutive_errors_count = 0
|
59 |
-
while True:
|
60 |
-
# this will receive a JSON string, parse it into a `dict` and do the required stuff
|
61 |
-
try:
|
62 |
-
response = await get.RequestUrl(self.config, self.init)
|
63 |
-
except TokenExpiryException as e:
|
64 |
-
logme.debug(__name__ + 'Twint:Feed:' + str(e))
|
65 |
-
self.token.refresh()
|
66 |
-
response = await get.RequestUrl(self.config, self.init)
|
67 |
-
|
68 |
-
if self.config.Debug:
|
69 |
-
print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
|
70 |
-
|
71 |
-
self.feed = []
|
72 |
-
try:
|
73 |
-
if self.config.Favorites:
|
74 |
-
self.feed, self.init = feed.MobileFav(response)
|
75 |
-
favorite_err_cnt = 0
|
76 |
-
if len(self.feed) == 0 and len(self.init) == 0:
|
77 |
-
while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
|
78 |
-
self.user_agent = await get.RandomUserAgent(wa=False)
|
79 |
-
response = await get.RequestUrl(self.config, self.init,
|
80 |
-
headers=[("User-Agent", self.user_agent)])
|
81 |
-
self.feed, self.init = feed.MobileFav(response)
|
82 |
-
favorite_err_cnt += 1
|
83 |
-
time.sleep(1)
|
84 |
-
if favorite_err_cnt == 5:
|
85 |
-
print("Favorite page could not be fetched")
|
86 |
-
if not self.count % 40:
|
87 |
-
time.sleep(5)
|
88 |
-
elif self.config.Followers or self.config.Following:
|
89 |
-
self.feed, self.init = feed.Follow(response)
|
90 |
-
if not self.count % 40:
|
91 |
-
time.sleep(5)
|
92 |
-
elif self.config.Profile or self.config.TwitterSearch:
|
93 |
-
try:
|
94 |
-
self.feed, self.init = feed.parse_tweets(self.config, response)
|
95 |
-
except NoMoreTweetsException as e:
|
96 |
-
logme.debug(__name__ + ':Twint:Feed:' + str(e))
|
97 |
-
print('[!] ' + str(e) + ' Scraping will stop now.')
|
98 |
-
print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
|
99 |
-
break
|
100 |
-
break
|
101 |
-
except TimeoutError as e:
|
102 |
-
if self.config.Proxy_host.lower() == "tor":
|
103 |
-
print("[?] Timed out, changing Tor identity...")
|
104 |
-
if self.config.Tor_control_password is None:
|
105 |
-
logme.critical(__name__ + ':Twint:Feed:tor-password')
|
106 |
-
sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
|
107 |
-
sys.stderr.write(
|
108 |
-
"Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
|
109 |
-
"-controller-interface-directly\r\n")
|
110 |
-
break
|
111 |
-
else:
|
112 |
-
get.ForceNewTorIdentity(self.config)
|
113 |
-
continue
|
114 |
-
else:
|
115 |
-
logme.critical(__name__ + ':Twint:Feed:' + str(e))
|
116 |
-
print(str(e))
|
117 |
-
break
|
118 |
-
except Exception as e:
|
119 |
-
if self.config.Profile or self.config.Favorites:
|
120 |
-
print("[!] Twitter does not return more data, scrape stops here.")
|
121 |
-
break
|
122 |
-
|
123 |
-
logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
|
124 |
-
# Sometimes Twitter says there is no data. But it's a lie.
|
125 |
-
# raise
|
126 |
-
consecutive_errors_count += 1
|
127 |
-
if consecutive_errors_count < self.config.Retries_count:
|
128 |
-
# skip to the next iteration if wait time does not satisfy limit constraints
|
129 |
-
delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)
|
130 |
-
|
131 |
-
# if the delay is less than users set min wait time then replace delay
|
132 |
-
if self.config.Min_wait_time > delay:
|
133 |
-
delay = self.config.Min_wait_time
|
134 |
-
|
135 |
-
sys.stderr.write('sleeping for {} secs\n'.format(delay))
|
136 |
-
time.sleep(delay)
|
137 |
-
self.user_agent = await get.RandomUserAgent(wa=True)
|
138 |
-
continue
|
139 |
-
logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
|
140 |
-
sys.stderr.write(str(e) + " [x] run.Feed")
|
141 |
-
sys.stderr.write(
|
142 |
-
"[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
|
143 |
-
"we will investigate it!")
|
144 |
-
break
|
145 |
-
if self.config.Resume:
|
146 |
-
print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
|
147 |
-
|
148 |
-
async def follow(self):
|
149 |
-
await self.Feed()
|
150 |
-
if self.config.User_full:
|
151 |
-
logme.debug(__name__ + ':Twint:follow:userFull')
|
152 |
-
self.count += await get.Multi(self.feed, self.config, self.conn)
|
153 |
-
else:
|
154 |
-
logme.debug(__name__ + ':Twint:follow:notUserFull')
|
155 |
-
for user in self.feed:
|
156 |
-
self.count += 1
|
157 |
-
username = user.find("a")["name"]
|
158 |
-
await output.Username(username, self.config, self.conn)
|
159 |
-
|
160 |
-
async def favorite(self):
|
161 |
-
logme.debug(__name__ + ':Twint:favorite')
|
162 |
-
await self.Feed()
|
163 |
-
favorited_tweets_list = []
|
164 |
-
for tweet in self.feed:
|
165 |
-
tweet_dict = {}
|
166 |
-
self.count += 1
|
167 |
-
try:
|
168 |
-
tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
|
169 |
-
t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
|
170 |
-
tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
|
171 |
-
tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
|
172 |
-
'')
|
173 |
-
tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
|
174 |
-
date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
|
175 |
-
# test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
|
176 |
-
# date_str = test_dates[3]
|
177 |
-
if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"): # 25m 1h
|
178 |
-
dateu = str(datetime.date.today())
|
179 |
-
tweet_dict['date'] = dateu
|
180 |
-
elif ',' in date_str: # Aug 21, 2019
|
181 |
-
sp = date_str.replace(',', '').split(' ')
|
182 |
-
date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
|
183 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
184 |
-
tweet_dict['date'] = dateu
|
185 |
-
elif len(date_str.split(' ')) == 3: # 28 Jun 19
|
186 |
-
sp = date_str.split(' ')
|
187 |
-
if len(sp[2]) == 2:
|
188 |
-
sp[2] = '20' + sp[2]
|
189 |
-
date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
|
190 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
191 |
-
tweet_dict['date'] = dateu
|
192 |
-
else: # Aug 21
|
193 |
-
sp = date_str.split(' ')
|
194 |
-
date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
|
195 |
-
dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
|
196 |
-
tweet_dict['date'] = dateu
|
197 |
-
|
198 |
-
favorited_tweets_list.append(tweet_dict)
|
199 |
-
|
200 |
-
except Exception as e:
|
201 |
-
logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
|
202 |
-
print("shit: ", date_str, " ", str(e))
|
203 |
-
|
204 |
-
try:
|
205 |
-
self.config.favorited_tweets_list += favorited_tweets_list
|
206 |
-
except AttributeError:
|
207 |
-
self.config.favorited_tweets_list = favorited_tweets_list
|
208 |
-
|
209 |
-
async def profile(self):
|
210 |
-
await self.Feed()
|
211 |
-
logme.debug(__name__ + ':Twint:profile')
|
212 |
-
for tweet in self.feed:
|
213 |
-
self.count += 1
|
214 |
-
await output.Tweets(tweet, self.config, self.conn)
|
215 |
-
|
216 |
-
async def tweets(self):
|
217 |
-
await self.Feed()
|
218 |
-
# TODO : need to take care of this later
|
219 |
-
if self.config.Location:
|
220 |
-
logme.debug(__name__ + ':Twint:tweets:location')
|
221 |
-
self.count += await get.Multi(self.feed, self.config, self.conn)
|
222 |
-
else:
|
223 |
-
logme.debug(__name__ + ':Twint:tweets:notLocation')
|
224 |
-
for tweet in self.feed:
|
225 |
-
self.count += 1
|
226 |
-
await output.Tweets(tweet, self.config, self.conn)
|
227 |
-
|
228 |
-
async def main(self, callback=None):
|
229 |
-
|
230 |
-
task = ensure_future(self.run()) # Might be changed to create_task in 3.7+.
|
231 |
-
|
232 |
-
if callback:
|
233 |
-
task.add_done_callback(callback)
|
234 |
-
|
235 |
-
await task
|
236 |
-
|
237 |
-
async def run(self):
    """Main scrape loop: resolve the target account, then page until done.

    Dispatches to follow()/favorite()/profile()/tweets() depending on which
    config mode flag is set, stopping when the feed empties or the configured
    Limit is reached.
    """
    # A different user agent is used for search vs. profile endpoints.
    if self.config.TwitterSearch:
        self.user_agent = await get.RandomUserAgent(wa=True)
    else:
        self.user_agent = await get.RandomUserAgent()

    # Resolve username <-> user id so both are always available downstream.
    if self.config.User_id is not None and self.config.Username is None:
        logme.debug(__name__ + ':Twint:main:user_id')
        self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                  self.config.Guest_token)

    if self.config.Username is not None and self.config.User_id is None:
        logme.debug(__name__ + ':Twint:main:username')

        self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
        if self.config.User_id is None:
            raise ValueError("Cannot find twitter account with name = " + self.config.Username)

    # TODO : will need to modify it to work with the new endpoints
    if self.config.TwitterSearch and self.config.Since and self.config.Until:
        logme.debug(__name__ + ':Twint:main:search+since+until')
        # Walk the datelock window; Since/Until are re-serialized each pass.
        while self.d.since < self.d.until:
            self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
            self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
            if len(self.feed) > 0:
                await self.tweets()
            else:
                logme.debug(__name__ + ':Twint:main:gettingNewTweets')
                break

            if get.Limit(self.config.Limit, self.count):
                break
    elif self.config.Lookup:
        await self.Lookup()
    else:
        logme.debug(__name__ + ':Twint:main:not-search+since+until')
        while True:
            if len(self.feed) > 0:
                # Exactly one mode flag is expected to be True per session.
                if self.config.Followers or self.config.Following:
                    logme.debug(__name__ + ':Twint:main:follow')
                    await self.follow()
                elif self.config.Favorites:
                    logme.debug(__name__ + ':Twint:main:favorites')
                    await self.favorite()
                elif self.config.Profile:
                    logme.debug(__name__ + ':Twint:main:profile')
                    await self.profile()
                elif self.config.TwitterSearch:
                    logme.debug(__name__ + ':Twint:main:twitter-search')
                    await self.tweets()
            else:
                logme.debug(__name__ + ':Twint:main:no-more-tweets')
                break

            # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
            if get.Limit(self.config.Limit, self.count):
                logme.debug(__name__ + ':Twint:main:reachedLimit')
                break

    if self.config.Count:
        verbose.Count(self.count, self.config)
|
299 |
-
async def Lookup(self):
    """Resolve the configured account and store its full profile.

    If only a User_id is configured, the username is fetched first.
    Any exception is logged and re-raised.
    """
    logme.debug(__name__ + ':Twint:Lookup')

    try:
        if self.config.User_id is not None and self.config.Username is None:
            logme.debug(__name__ + ':Twint:Lookup:user_id')
            self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
                                                      self.config.Guest_token)
        await get.User(self.config.Username, self.config, db.Conn(self.config.Database))

    except Exception as e:
        logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
        raise
-
|
314 |
-
def run(config, callback=None):
    """Entry point: ensure an asyncio event loop exists, then run a Twint session.

    A fresh loop is created when the current thread has none ("no current
    event loop" RuntimeError); any other failure is logged and re-raised.
    """
    logme.debug(__name__ + ':run')
    try:
        get_event_loop()
    except RuntimeError as e:
        if "no current event loop" in str(e):
            set_event_loop(new_event_loop())
        else:
            logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
            raise
    except Exception as e:
        logme.exception(
            __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
        raise

    get_event_loop().run_until_complete(Twint(config).main(callback))
|
332 |
-
def Favorites(config):
    """Scrape the tweets liked by config.Username.

    Resets the other mutually-exclusive mode flags before running.
    """
    logme.debug(__name__ + ':Favorites')
    config.Favorites = True
    config.Following = False
    config.Followers = False
    config.Profile = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
-
|
344 |
-
def Followers(config):
    """Scrape the accounts following config.Username.

    Resets the other mode flags, then optionally materializes pandas frames
    and clears the in-memory follow list.
    """
    logme.debug(__name__ + ':Followers')
    config.Followers = True
    config.Following = False
    config.Profile = False
    config.Favorites = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("followers")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
360 |
-
|
361 |
-
def Following(config):
    """Scrape the accounts that config.Username follows.

    Resets the other mode flags, then optionally materializes pandas frames
    and clears the in-memory follow list.
    """
    logme.debug(__name__ + ':Following')
    config.Following = True
    config.Followers = False
    config.Profile = False
    config.Favorites = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("following")
        if config.User_full:
            storage.panda._autoget("user")
    if config.Pandas_clean and not config.Store_object:
        # storage.panda.clean()
        output._clean_follow_list()
-
|
378 |
-
def Lookup(config):
    """Look up and store the profile of the configured account.

    Resets the mutually-exclusive mode flags so only Lookup runs, then
    optionally materializes the pandas "user" frame.
    """
    logme.debug(__name__ + ':Lookup')
    config.Profile = False
    config.Lookup = True
    config.Favorites = False
    # Bugfix: was `config.FOllowing = False`, which created a junk attribute
    # and left a previously-set config.Following flag enabled, so a Lookup
    # after a Following run could dispatch to the follow scraper instead.
    config.Following = False
    config.Followers = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("user")
-
|
391 |
-
def Profile(config):
    """Scrape the profile timeline of config.Username.

    Resets the other mutually-exclusive mode flags before running.
    """
    logme.debug(__name__ + ':Profile')
    config.Profile = True
    config.Favorites = False
    config.Following = False
    config.Followers = False
    config.TwitterSearch = False
    run(config)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
-
|
403 |
-
def Search(config, callback=None):
    """Run a Twitter search with the configured query.

    callback is forwarded to run() and attached to the session task.
    """
    logme.debug(__name__ + ':Search')
    config.TwitterSearch = True
    config.Favorites = False
    config.Following = False
    config.Followers = False
    config.Profile = False
    run(config, callback)
    if config.Pandas_au:
        storage.panda._autoget("tweet")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/__init__.py
DELETED
File without changes
|
twitter-scraper/twint-master/twint/storage/db.py
DELETED
@@ -1,297 +0,0 @@
|
|
1 |
-
import sqlite3
|
2 |
-
import sys
|
3 |
-
import time
|
4 |
-
import hashlib
|
5 |
-
|
6 |
-
from datetime import datetime
|
7 |
-
|
8 |
-
def Conn(database):
    """Return an open sqlite3 connection for `database`, or "" when unset.

    Note: init() returns an error *string* on failure; in that case the
    error is printed and the process exits with status 1.
    """
    if database:
        print("[+] Inserting into Database: " + str(database))
        conn = init(database)
        if isinstance(conn, str):  # error
            print(conn)
            sys.exit(1)
    else:
        conn = ""

    return conn
|
20 |
-
def init(db):
    """Open (creating if needed) the sqlite schema used by twint.

    Creates the users/tweets/retweets/replies/favorites/followers/following
    tables plus the screen-name follow tables. Returns the open connection,
    or the exception message as a string on failure (checked by Conn()).
    """
    try:
        conn = sqlite3.connect(db)
        cursor = conn.cursor()

        # Profile snapshots; (id, hex_dig) key allows multiple snapshots per
        # user, one row per distinct profile hash.
        table_users = """
            CREATE TABLE IF NOT EXISTS
                users(
                    id integer not null,
                    id_str text not null,
                    name text,
                    username text not null,
                    bio text,
                    location text,
                    url text,
                    join_date text not null,
                    join_time text not null,
                    tweets integer,
                    following integer,
                    followers integer,
                    likes integer,
                    media integer,
                    private integer not null,
                    verified integer not null,
                    profile_image_url text not null,
                    background_image text,
                    hex_dig text not null,
                    time_update integer not null,
                    CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
                );
            """
        cursor.execute(table_users)

        table_tweets = """
            CREATE TABLE IF NOT EXISTS
                tweets (
                    id integer not null,
                    id_str text not null,
                    tweet text default '',
                    language text default '',
                    conversation_id text not null,
                    created_at integer not null,
                    date text not null,
                    time text not null,
                    timezone text not null,
                    place text default '',
                    replies_count integer,
                    likes_count integer,
                    retweets_count integer,
                    user_id integer not null,
                    user_id_str text not null,
                    screen_name text not null,
                    name text default '',
                    link text,
                    mentions text,
                    hashtags text,
                    cashtags text,
                    urls text,
                    photos text,
                    thumbnail text,
                    quote_url text,
                    video integer,
                    geo text,
                    near text,
                    source text,
                    time_update integer not null,
                    `translate` text default '',
                    trans_src text default '',
                    trans_dest text default '',
                    PRIMARY KEY (id)
                );
            """
        cursor.execute(table_tweets)

        table_retweets = """
            CREATE TABLE IF NOT EXISTS
                retweets(
                    user_id integer not null,
                    username text not null,
                    tweet_id integer not null,
                    retweet_id integer not null,
                    retweet_date integer,
                    CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id),
                    CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_retweets)

        table_reply_to = """
            CREATE TABLE IF NOT EXISTS
                replies(
                    tweet_id integer not null,
                    user_id integer not null,
                    username text not null,
                    CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_reply_to)

        table_favorites = """
            CREATE TABLE IF NOT EXISTS
                favorites(
                    user_id integer not null,
                    tweet_id integer not null,
                    CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
                    CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
                    CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
                );
            """
        cursor.execute(table_favorites)

        table_followers = """
            CREATE TABLE IF NOT EXISTS
                followers (
                    id integer not null,
                    follower_id integer not null,
                    CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
                    CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                    CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id)
                );
            """
        cursor.execute(table_followers)

        table_following = """
            CREATE TABLE IF NOT EXISTS
                following (
                    id integer not null,
                    following_id integer not null,
                    CONSTRAINT following_pk PRIMARY KEY (id, following_id),
                    CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
                    CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id)
                );
            """
        cursor.execute(table_following)

        # Screen-name variants of the follow tables (no id resolution needed).
        table_followers_names = """
            CREATE TABLE IF NOT EXISTS
                followers_names (
                    user text not null,
                    time_update integer not null,
                    follower text not null,
                    PRIMARY KEY (user, follower)
                );
            """
        cursor.execute(table_followers_names)

        table_following_names = """
            CREATE TABLE IF NOT EXISTS
                following_names (
                    user text not null,
                    time_update integer not null,
                    follows text not null,
                    PRIMARY KEY (user, follows)
                );
            """
        cursor.execute(table_following_names)

        return conn
    except Exception as e:
        return str(e)
|
183 |
-
def fTable(Followers):
    """Return the screen-name follow table for the requested direction.

    True -> "followers_names", False -> "following_names".
    """
    return "followers_names" if Followers else "following_names"
|
191 |
-
def uTable(Followers):
    """Return the user-id follow table for the requested direction.

    True -> "followers", False -> "following".
    """
    return "followers" if Followers else "following"
|
199 |
-
def follow(conn, Username, Followers, User):
    """Insert one screen-name follow edge; duplicate edges are ignored.

    Direction (followers vs. following table) is picked by the Followers flag.
    time_update is stored as epoch milliseconds.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        entry = (User, time_ms, Username,)
        table = fTable(Followers)
        query = f"INSERT INTO {table} VALUES(?,?,?)"
        cursor.execute(query, entry)
        conn.commit()
    except sqlite3.IntegrityError:
        pass  # edge already recorded; primary key rejected the duplicate
211 |
-
def get_hash_id(conn, id):
    """Return the stored profile hash (hex_dig) for a user id, or -1 if absent."""
    cursor = conn.cursor()
    cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
    resultset = cursor.fetchall()
    return resultset[0][0] if resultset else -1
|
217 |
-
def user(conn, config, User):
    """Store a user snapshot (only when the profile changed) plus follow edges.

    A sha256 over the snapshot fields detects profile changes between runs;
    an unchanged profile is skipped. When scraping followers/following, the
    (config.User_id, User.id) edge is also recorded. Duplicates are ignored.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        # Field order must match the users table column order.
        user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url,User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image]

        hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest()
        entry = tuple(user) + (hex_dig,time_ms,)
        old_hash = get_hash_id(conn, User.id)

        if old_hash == -1 or old_hash != hex_dig:
            # New user, or the profile changed since the last stored snapshot.
            query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
            cursor.execute(query, entry)
        else:
            pass  # unchanged profile: keep the existing snapshot

        if config.Followers or config.Following:
            table = uTable(config.Followers)
            query = f"INSERT INTO {table} VALUES(?,?)"
            cursor.execute(query, (config.User_id, int(User.id)))

        conn.commit()
    except sqlite3.IntegrityError:
        pass  # row (or edge) already present
242 |
-
def tweets(conn, Tweet, config):
    """Insert one tweet row plus related favorites/retweets/replies rows.

    The entry tuple order must match the 33 columns of the tweets table.
    Duplicate inserts (same primary key) are silently ignored.
    """
    try:
        time_ms = round(time.time()*1000)
        cursor = conn.cursor()
        entry = (Tweet.id,
                 Tweet.id_str,
                 Tweet.tweet,
                 Tweet.lang,
                 Tweet.conversation_id,
                 Tweet.datetime,
                 Tweet.datestamp,
                 Tweet.timestamp,
                 Tweet.timezone,
                 Tweet.place,
                 Tweet.replies_count,
                 Tweet.likes_count,
                 Tweet.retweets_count,
                 Tweet.user_id,
                 Tweet.user_id_str,
                 Tweet.username,
                 Tweet.name,
                 Tweet.link,
                 ",".join(Tweet.mentions),
                 ",".join(Tweet.hashtags),
                 ",".join(Tweet.cashtags),
                 ",".join(Tweet.urls),
                 ",".join(Tweet.photos),
                 Tweet.thumbnail,
                 Tweet.quote_url,
                 Tweet.video,
                 Tweet.geo,
                 Tweet.near,
                 Tweet.source,
                 time_ms,
                 Tweet.translate,
                 Tweet.trans_src,
                 Tweet.trans_dest)
        cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)

        if config.Favorites:
            # Favorites mode: link the tweet to the account whose likes we scrape.
            query = 'INSERT INTO favorites VALUES(?,?)'
            cursor.execute(query, (config.User_id, Tweet.id))

        if Tweet.retweet:
            query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
            # retweet_date is stored as a POSIX timestamp.
            _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
            cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))

        if Tweet.reply_to:
            for reply in Tweet.reply_to:
                query = 'INSERT INTO replies VALUES(?,?,?)'
                cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))

        conn.commit()
    except sqlite3.IntegrityError:
        pass  # tweet already stored
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/elasticsearch.py
DELETED
@@ -1,364 +0,0 @@
|
|
1 |
-
## TODO - Fix Weekday situation
|
2 |
-
from elasticsearch import Elasticsearch, helpers
|
3 |
-
from geopy.geocoders import Nominatim
|
4 |
-
from datetime import datetime
|
5 |
-
import contextlib
|
6 |
-
import sys
|
7 |
-
|
8 |
-
_index_tweet_status = False
|
9 |
-
_index_follow_status = False
|
10 |
-
_index_user_status = False
|
11 |
-
_is_near_def = False
|
12 |
-
_is_location_def = False
|
13 |
-
_near = {}
|
14 |
-
_location = {}
|
15 |
-
|
16 |
-
geolocator = Nominatim(user_agent="twint-1.2")
|
17 |
-
|
18 |
-
class RecycleObject(object):
    """Write-sink that discards everything; swapped in for sys.stdout by nostdout()."""
    def write(self, junk): pass
    def flush(self): pass
|
22 |
-
def getLocation(place, **options):
    """Geocode `place` with Nominatim.

    With near=True or location=True, the coordinates are stored in the
    module globals `_near` / `_location` and True is returned. Otherwise
    the {"lat", "lon"} dict is returned directly, or {} when geocoding
    found nothing.
    """
    location = geolocator.geocode(place,timeout=1000)
    if location:
        if options.get("near"):
            global _near
            _near = {"lat": location.latitude, "lon": location.longitude}
            return True
        elif options.get("location"):
            global _location
            _location = {"lat": location.latitude, "lon": location.longitude}
            return True
        return {"lat": location.latitude, "lon": location.longitude}
    else:
        return {}
|
37 |
-
def handleIndexResponse(response):
    """Interpret an Elasticsearch index-creation response.

    Returns True when the index is usable (already existed — status 400
    with ignore=400 — or was created with shards acknowledged), else False.
    """
    try:
        if response["status"] == 400:
            # 400 with ignore=400 means the index already exists: fine.
            return True
    except KeyError:
        pass
    if response["acknowledged"]:
        print("[+] Index \"" + response["index"] + "\" created!")
    else:
        print("[x] error index creation :: storage.elasticsearch.handleIndexCreation")
    if response["shards_acknowledged"]:
        print("[+] Shards acknowledged, everything is ready to be used!")
        return True
    else:
        print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation")
        return False
|
54 |
-
def createIndex(config, instance, **scope):
    """Create the tweet/follow/user index (scope="tweet"|"follow"|"user").

    Each branch defines the mapping + settings for its index and creates it
    with ignore=400 (already-exists is OK). Returns True on success/exists,
    False for an unknown scope or creation failure.
    """
    if scope.get("scope") == "tweet":
        tweets_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "long"},
                    "conversation_id": {"type": "long"},
                    "created_at": {"type": "text"},
                    "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "timezone": {"type": "keyword"},
                    "place": {"type": "keyword"},
                    "location": {"type": "keyword"},
                    "tweet": {"type": "text"},
                    "lang": {"type": "keyword"},
                    "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "user_id_str": {"type": "keyword"},
                    "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "name": {"type": "text"},
                    "profile_image_url": {"type": "text"},
                    "day": {"type": "integer"},
                    "hour": {"type": "integer"},
                    "link": {"type": "text"},
                    "retweet": {"type": "text"},
                    "essid": {"type": "keyword"},
                    "nlikes": {"type": "integer"},
                    "nreplies": {"type": "integer"},
                    "nretweets": {"type": "integer"},
                    "quote_url": {"type": "text"},
                    "video": {"type":"integer"},
                    "thumbnail": {"type":"text"},
                    "search": {"type": "text"},
                    "near": {"type": "text"},
                    "geo_near": {"type": "geo_point"},
                    "geo_tweet": {"type": "geo_point"},
                    "photos": {"type": "text"},
                    "user_rt_id": {"type": "keyword"},
                    "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
                    "source": {"type": "keyword"},
                    "user_rt": {"type": "keyword"},
                    "retweet_id": {"type": "keyword"},
                    "reply_to": {
                        "type": "nested",
                        "properties": {
                            "user_id": {"type": "keyword"},
                            "username": {"type": "keyword"}
                        }
                    },
                    "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
                    "urls": {"type": "keyword"},
                    "translate": {"type": "text"},
                    "trans_src": {"type": "keyword"},
                    "trans_dest": {"type": "keyword"},
                }
            },
            "settings": {
                "number_of_shards": 1,
                "analysis": {
                    "normalizer": {
                        # Case/accent-insensitive keyword matching for tags and names.
                        "hashtag_normalizer": {
                            "type": "custom",
                            "char_filter": [],
                            "filter": ["lowercase", "asciifolding"]
                        }
                    }
                }
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "follow":
        follow_body = {
            "mappings": {
                "properties": {
                    "user": {"type": "keyword"},
                    "follow": {"type": "keyword"},
                    "essid": {"type": "keyword"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
        return handleIndexResponse(resp)
    elif scope.get("scope") == "user":
        user_body = {
            "mappings": {
                "properties": {
                    "id": {"type": "keyword"},
                    "name": {"type": "keyword"},
                    "username": {"type": "keyword"},
                    "bio": {"type": "text"},
                    "location": {"type": "keyword"},
                    "url": {"type": "text"},
                    "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
                    "tweets": {"type": "integer"},
                    "following": {"type": "integer"},
                    "followers": {"type": "integer"},
                    "likes": {"type": "integer"},
                    "media": {"type": "integer"},
                    "private": {"type": "integer"},
                    "verified": {"type": "integer"},
                    "avatar": {"type": "text"},
                    "background_image": {"type": "text"},
                    "session": {"type": "keyword"},
                    "geo_user": {"type": "geo_point"}
                }
            },
            "settings": {
                "number_of_shards": 1
            }
        }
        with nostdout():
            resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
        return handleIndexResponse(resp)
    else:
        print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
        return False
|
176 |
-
@contextlib.contextmanager
def nostdout():
    """Temporarily replace sys.stdout with a discarding sink.

    Restoration happens in a finally clause, so stdout is put back even
    when the with-body raises (the original leaked the replacement on
    exceptions, silencing all subsequent output).
    """
    savestdout = sys.stdout
    sys.stdout = RecycleObject()
    try:
        yield
    finally:
        sys.stdout = savestdout
|
183 |
-
def weekday(day):
    """Map an English weekday name to its 1-based number (Monday=1 .. Sunday=7).

    Raises KeyError for anything that is not a capitalized weekday name.
    """
    names = ("Monday", "Tuesday", "Wednesday", "Thursday",
             "Friday", "Saturday", "Sunday")
    lookup = {name: num for num, name in enumerate(names, start=1)}
    return lookup[day]
|
196 |
-
def Tweet(Tweet, config):
    """Index a single tweet document into the configured tweets index.

    Builds the bulk-action document (base fields plus optional retweet,
    reply, media, mention, url, geo and translation fields), lazily creates
    the index on first use, and bulk-inserts the single action.
    """
    global _index_tweet_status
    global _is_near_def
    date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")

    actions = []

    # Tweet objects from some feeds lack a .retweet attribute entirely.
    try:
        retweet = Tweet.retweet
    except AttributeError:
        retweet = None

    dt = f"{Tweet.datestamp} {Tweet.timestamp}"

    j_data = {
        "_index": config.Index_tweets,
        "_id": str(Tweet.id) + "_raw_" + config.Essid,
        "_source": {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": Tweet.datetime,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": date_obj.weekday(),
            "hour": date_obj.hour,
            "link": Tweet.link,
            "retweet": retweet,
            "essid": config.Essid,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "video": Tweet.video,
            "search": str(config.Search),
            "near": config.Near
        }
    }
    if retweet is not None:
        j_data["_source"].update({"user_rt_id": Tweet.user_rt_id})
        j_data["_source"].update({"user_rt": Tweet.user_rt})
        j_data["_source"].update({"retweet_id": Tweet.retweet_id})
        j_data["_source"].update({"retweet_date": Tweet.retweet_date})
    if Tweet.reply_to:
        j_data["_source"].update({"reply_to": Tweet.reply_to})
    if Tweet.photos:
        _photos = []
        for photo in Tweet.photos:
            _photos.append(photo)
        j_data["_source"].update({"photos": _photos})
    if Tweet.thumbnail:
        j_data["_source"].update({"thumbnail": Tweet.thumbnail})
    if Tweet.mentions:
        _mentions = []
        for mention in Tweet.mentions:
            _mentions.append(mention)
        j_data["_source"].update({"mentions": _mentions})
    if Tweet.urls:
        _urls = []
        for url in Tweet.urls:
            _urls.append(url)
        j_data["_source"].update({"urls": _urls})
    if config.Near or config.Geo:
        # Geocode the search origin only once per session (cached in _near).
        if not _is_near_def:
            __geo = ""
            __near = ""
            if config.Geo:
                __geo = config.Geo
            if config.Near:
                __near = config.Near
            _is_near_def = getLocation(__near + __geo, near=True)
        if _near:
            j_data["_source"].update({"geo_near": _near})
    if Tweet.place:
        _t_place = getLocation(Tweet.place)
        if _t_place:
            j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)})
    if Tweet.source:
        # Bugfix: was `Tweet.Source` (capital S), which raised AttributeError
        # for every tweet that carried a source field; the attribute used
        # everywhere else in this module is lowercase `source`.
        j_data["_source"].update({"source": Tweet.source})
    if config.Translate:
        j_data["_source"].update({"translate": Tweet.translate})
        j_data["_source"].update({"trans_src": Tweet.trans_src})
        j_data["_source"].update({"trans_dest": Tweet.trans_dest})

    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_tweet_status:
        _index_tweet_status = createIndex(config, es, scope="tweet")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
296 |
-
def Follow(user, config):
    """Index one follow edge; direction depends on config.Following.

    Following mode: config.Username follows `user`.
    Followers mode: `user` follows config.Username.
    """
    global _index_follow_status
    actions = []

    if config.Following:
        _user = config.Username
        _follow = user
    else:
        _user = user
        _follow = config.Username
    j_data = {
        "_index": config.Index_follow,
        "_id": _user + "_" + _follow + "_" + config.Essid,
        "_source": {
            "user": _user,
            "follow": _follow,
            "essid": config.Essid
        }
    }
    actions.append(j_data)

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_follow_status:
        # Lazily create the follow index on first use.
        _index_follow_status = createIndex(config, es, scope="follow")
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
324 |
-
def UserProfile(user, config):
    """Index a scraped user-profile document into Elasticsearch."""
    global _index_user_status
    global _is_location_def

    doc = {
        "_index": config.Index_users,
        "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
        "_source": {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "location": user.location,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "tweets": user.tweets,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": user.background_image,
            "session": config.Essid
        }
    }
    if config.Location:
        if not _is_location_def:
            _is_location_def = getLocation(user.location, location=True)
        # NOTE(review): `_location` is not defined in this function; it is
        # presumably a module-level global populated elsewhere — confirm.
        if _location:
            doc["_source"].update({"geo_user": _location})
    actions = [doc]

    es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
    if not _index_user_status:
        _index_user_status = createIndex(config, es, scope="user")
    # Silence the chatter emitted by the bulk helper.
    with nostdout():
        helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
    actions = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/panda.py
DELETED
@@ -1,196 +0,0 @@
|
|
1 |
-
import datetime, pandas as pd, warnings
from time import strftime, localtime
from twint.tweet import Tweet_formats

# Lazily built DataFrames; populated by _autoget() from the row buffers.
Tweets_df = None
Follow_df = None
User_df = None

# Per-type row buffers that update() appends to and _autoget() flushes.
_object_blocks = {
    "tweet": [],
    "user": [],
    "following": [],
    "followers": []
}

# Weekday name -> 1..7 numbering used for the tweet "day" column.
weekdays = {
    "Monday": 1,
    "Tuesday": 2,
    "Wednesday": 3,
    "Thursday": 4,
    "Friday": 5,
    "Saturday": 6,
    "Sunday": 7,
}

# Type of the object most recently passed to update().
_type = ""
|
27 |
-
|
28 |
-
def _concat(df, _type):
    """Merge the buffered rows of *_type* into *df*.

    Returns a fresh DataFrame when *df* is None, otherwise the
    concatenation of *df* with the buffered rows.
    """
    block_df = pd.DataFrame(_object_blocks[_type])
    if df is None:
        return block_df
    return pd.concat([df, block_df], sort=True)
|
35 |
-
|
36 |
-
def _autoget(_type):
    """Flush the buffered rows of *_type* into the matching module DataFrame."""
    global Tweets_df
    global Follow_df
    global User_df

    if _type == "tweet":
        Tweets_df = _concat(Tweets_df, _type)
    elif _type in ("followers", "following"):
        Follow_df = _concat(Follow_df, _type)
    elif _type == "user":
        User_df = _concat(User_df, _type)
    else:
        # Bug fix: the original called an undefined `error(...)` helper here,
        # which raised NameError instead of reporting the bad type.
        print("[x] Wrong type of object passed")
|
49 |
-
|
50 |
-
|
51 |
-
def update(object, config):
    """Buffer one scraped object (tweet / user / follow dict) as a row.

    The row is appended to ``_object_blocks`` under the type inferred from
    the object's class.  The parameter keeps its original name ``object``
    (shadowing the builtin) to preserve the call signature.
    """
    global _type

    # Infer the row type; plain dicts carry follower/following lists.
    if object.__class__.__name__ == "tweet":
        _type = "tweet"
    elif object.__class__.__name__ == "user":
        _type = "user"
    elif object.__class__.__name__ == "dict":
        _type = config.Following*"following" + config.Followers*"followers"

    if _type == "tweet":
        Tweet = object
        # Epoch milliseconds for the "created_at" column.
        datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
        day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
        dt = f"{object.datestamp} {object.timestamp}"
        _data = {
            "id": str(Tweet.id),
            "conversation_id": Tweet.conversation_id,
            "created_at": datetime_ms,
            "date": dt,
            "timezone": Tweet.timezone,
            "place": Tweet.place,
            "tweet": Tweet.tweet,
            "language": Tweet.lang,
            "hashtags": Tweet.hashtags,
            "cashtags": Tweet.cashtags,
            "user_id": Tweet.user_id,
            "user_id_str": Tweet.user_id_str,
            "username": Tweet.username,
            "name": Tweet.name,
            "day": day,
            "hour": strftime("%H", localtime(datetime_ms/1000)),
            "link": Tweet.link,
            "urls": Tweet.urls,
            "photos": Tweet.photos,
            "video": Tweet.video,
            "thumbnail": Tweet.thumbnail,
            "retweet": Tweet.retweet,
            "nlikes": int(Tweet.likes_count),
            "nreplies": int(Tweet.replies_count),
            "nretweets": int(Tweet.retweets_count),
            "quote_url": Tweet.quote_url,
            "search": str(config.Search),
            "near": Tweet.near,
            "geo": Tweet.geo,
            "source": Tweet.source,
            "user_rt_id": Tweet.user_rt_id,
            "user_rt": Tweet.user_rt,
            "retweet_id": Tweet.retweet_id,
            "reply_to": Tweet.reply_to,
            "retweet_date": Tweet.retweet_date,
            "translate": Tweet.translate,
            "trans_src": Tweet.trans_src,
            "trans_dest": Tweet.trans_dest
        }
        _object_blocks[_type].append(_data)
    elif _type == "user":
        user = object
        try:
            background_image = user.background_image
        except AttributeError:
            # Bug fix: the original bare `except:` also swallowed
            # KeyboardInterrupt/SystemExit; only a missing attribute
            # is expected here.
            background_image = ""
        _data = {
            "id": user.id,
            "name": user.name,
            "username": user.username,
            "bio": user.bio,
            "url": user.url,
            "join_datetime": user.join_date + " " + user.join_time,
            "join_date": user.join_date,
            "join_time": user.join_time,
            "tweets": user.tweets,
            "location": user.location,
            "following": user.following,
            "followers": user.followers,
            "likes": user.likes,
            "media": user.media_count,
            "private": user.is_private,
            "verified": user.is_verified,
            "avatar": user.avatar,
            "background_image": background_image,
        }
        _object_blocks[_type].append(_data)
    elif _type == "followers" or _type == "following":
        _data = {
            config.Following*"following" + config.Followers*"followers" :
                {config.Username: object[_type]}
        }
        _object_blocks[_type] = _data
    else:
        print("Wrong type of object passed!")
|
147 |
-
|
148 |
-
|
149 |
-
def clean():
    """Reset every row buffer and drop the module-level DataFrames."""
    global Tweets_df
    global Follow_df
    global User_df
    for key in ("tweet", "following", "followers", "user"):
        _object_blocks[key].clear()
    Tweets_df = None
    Follow_df = None
    User_df = None
|
160 |
-
|
161 |
-
def save(_filename, _dataframe, **options):
    """Persist *_dataframe* under *_filename* as HDF5 (default) or Pickle.

    Options: ``dataname`` (HDF5 key, default "twint") and ``type``
    (unset for HDF5, or "Pickle").
    """
    _dataname = options.get("dataname") or "twint"

    fmt = options.get("type")
    if not fmt:
        # pandas emits performance warnings for object columns; suppress them.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _store = pd.HDFStore(_filename + ".h5")
            _store[_dataname] = _dataframe
            _store.close()
    elif fmt == "Pickle":
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            _dataframe.to_pickle(_filename + ".pkl")
    else:
        print("""Please specify: filename, DataFrame, DataFrame name and type
              (HDF5, default, or Pickle)""")
|
180 |
-
|
181 |
-
def read(_filename, **options):
    """Load a DataFrame previously written by :func:`save`.

    Options: ``dataname`` (HDF5 key, default "twint") and ``type``
    (unset for HDF5, or "Pickle").
    """
    _dataname = options.get("dataname") or "twint"

    fmt = options.get("type")
    if not fmt:
        # Bug fix: the original never closed the HDFStore (resource leak);
        # close it once the DataFrame has been pulled out.
        _store = pd.HDFStore(_filename + ".h5")
        try:
            return _store[_dataname]
        finally:
            _store.close()
    elif fmt == "Pickle":
        return pd.read_pickle(_filename + ".pkl")
    else:
        print("""Please specify: DataFrame, DataFrame name (twint as default),
              filename and type (HDF5, default, or Pickle""")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/write.py
DELETED
@@ -1,77 +0,0 @@
|
|
1 |
-
from . import write_meta as meta
|
2 |
-
import csv
|
3 |
-
import json
|
4 |
-
import os
|
5 |
-
|
6 |
-
def outputExt(objType, fType):
    """Return the default output suffix for *objType*, e.g. ``/tweets.csv``."""
    # Bare strings represent usernames, so pluralise them as such.
    if objType == "str":
        objType = "username"
    return f"/{objType}s.{fType}"
|
12 |
-
|
13 |
-
def addExt(base, objType, fType):
    """Append the conventional file name to *base* when it lacks an extension.

    A dot-free *base* is treated as a directory: it is created if missing
    and the default ``/<objType>s.<fType>`` name is appended.
    """
    if len(base.split('.')) == 1:
        createDirIfMissing(base)
        base += outputExt(objType, fType)
    return base
|
19 |
-
|
20 |
-
def Text(entry, f):
    """Append *entry* to text file *f* as one line (newlines flattened)."""
    # Bug fix: the original opened the file inside print() and never closed
    # it, leaking the handle until garbage collection.
    with open(f, "a", encoding="utf-8") as out:
        print(entry.replace('\n', ' '), file=out)
|
22 |
-
|
23 |
-
def Type(config):
    """Map the scrape configuration to the object type being written."""
    if config.User_full:
        return "user"
    if config.Followers or config.Following:
        return "username"
    return "tweet"
|
32 |
-
|
33 |
-
def struct(obj, custom, _type):
    """Resolve the output fieldnames and the row dict for *obj*.

    When *custom* lists fields, only those are extracted; otherwise the
    full default field set for *_type* is used.
    """
    if custom:
        fieldnames = custom
        # Perf fix: the original rebuilt the entire meta.Data(...) dict once
        # per requested field; build it a single time instead.
        data = meta.Data(obj, _type)
        row = {field: data[field] for field in fieldnames}
    else:
        fieldnames = meta.Fieldnames(_type)
        row = meta.Data(obj, _type)
    return fieldnames, row
|
44 |
-
|
45 |
-
def createDirIfMissing(dirname):
    """Create *dirname* (and any parents) if it does not already exist."""
    # exist_ok avoids the check-then-create race of the original
    # os.path.exists() guard.
    os.makedirs(dirname, exist_ok=True)
|
48 |
-
|
49 |
-
def Csv(obj, config):
    """Append *obj* as one row to the configured CSV output file."""
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"
    fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)

    base = addExt(config.Output, _obj_type, "csv")
    dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'

    # First write creates the file and emits the header row.
    if not os.path.exists(base):
        with open(base, "w", newline='', encoding="utf-8") as csv_file:
            csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect).writeheader()

    with open(base, "a", newline='', encoding="utf-8") as csv_file:
        csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect).writerow(row)
|
66 |
-
|
67 |
-
def Json(obj, config):
    """Append *obj* as one JSON line to the configured output file."""
    _obj_type = obj.__class__.__name__
    if _obj_type == "str":
        _obj_type = "username"
    _, data = struct(obj, config.Custom[_obj_type], _obj_type)

    base = addExt(config.Output, _obj_type, "json")

    with open(base, "a", newline='', encoding="utf-8") as json_file:
        json.dump(data, json_file, ensure_ascii=False)
        json_file.write("\n")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/storage/write_meta.py
DELETED
@@ -1,151 +0,0 @@
|
|
1 |
-
def tweetData(t):
    """Build the flat output-row dict for tweet object *t*.

    Count fields are coerced to int; the id is coerced from its string form.
    """
    return {
        "id": int(t.id),
        "conversation_id": t.conversation_id,
        "created_at": t.datetime,
        "date": t.datestamp,
        "time": t.timestamp,
        "timezone": t.timezone,
        "user_id": t.user_id,
        "username": t.username,
        "name": t.name,
        "place": t.place,
        "tweet": t.tweet,
        "language": t.lang,
        "mentions": t.mentions,
        "urls": t.urls,
        "photos": t.photos,
        "replies_count": int(t.replies_count),
        "retweets_count": int(t.retweets_count),
        "likes_count": int(t.likes_count),
        "hashtags": t.hashtags,
        "cashtags": t.cashtags,
        "link": t.link,
        "retweet": t.retweet,
        "quote_url": t.quote_url,
        "video": t.video,
        "thumbnail": t.thumbnail,
        "near": t.near,
        "geo": t.geo,
        "source": t.source,
        "user_rt_id": t.user_rt_id,
        "user_rt": t.user_rt,
        "retweet_id": t.retweet_id,
        "reply_to": t.reply_to,
        "retweet_date": t.retweet_date,
        "translate": t.translate,
        "trans_src": t.trans_src,
        "trans_dest": t.trans_dest,
    }
|
41 |
-
|
42 |
-
def tweetFieldnames():
    """Return the ordered column names for tweet rows (matches tweetData)."""
    return [
        "id", "conversation_id", "created_at", "date", "time",
        "timezone", "user_id", "username", "name", "place",
        "tweet", "language", "mentions", "urls", "photos",
        "replies_count", "retweets_count", "likes_count", "hashtags", "cashtags",
        "link", "retweet", "quote_url", "video", "thumbnail",
        "near", "geo", "source", "user_rt_id", "user_rt",
        "retweet_id", "reply_to", "retweet_date", "translate", "trans_src",
        "trans_dest",
    ]
|
82 |
-
|
83 |
-
def userData(u):
    """Build the flat output-row dict for user object *u*.

    Numeric profile counters are coerced to int.
    """
    return {
        "id": int(u.id),
        "name": u.name,
        "username": u.username,
        "bio": u.bio,
        "location": u.location,
        "url": u.url,
        "join_date": u.join_date,
        "join_time": u.join_time,
        "tweets": int(u.tweets),
        "following": int(u.following),
        "followers": int(u.followers),
        "likes": int(u.likes),
        "media": int(u.media_count),
        "private": u.is_private,
        "verified": u.is_verified,
        "profile_image_url": u.avatar,
        "background_image": u.background_image,
    }
|
104 |
-
|
105 |
-
def userFieldnames():
    """Return the ordered column names for user rows (matches userData)."""
    return [
        "id", "name", "username", "bio", "location",
        "url", "join_date", "join_time", "tweets", "following",
        "followers", "likes", "media", "private", "verified",
        "profile_image_url", "background_image",
    ]
|
126 |
-
|
127 |
-
def usernameData(u):
    """Wrap a bare username string *u* as a single-column row dict."""
    return {"username": u}
|
129 |
-
|
130 |
-
def usernameFieldnames():
    """Return the single column name used for username rows."""
    return ["username"]
|
132 |
-
|
133 |
-
def Data(obj, _type):
    """Dispatch *obj* to the row-builder for *_type* (default: tweet)."""
    if _type == "user":
        return userData(obj)
    if _type == "username":
        return usernameData(obj)
    return tweetData(obj)
|
142 |
-
|
143 |
-
def Fieldnames(_type):
    """Dispatch to the fieldname list for *_type* (default: tweet)."""
    if _type == "user":
        return userFieldnames()
    if _type == "username":
        return usernameFieldnames()
    return tweetFieldnames()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/token.py
DELETED
@@ -1,94 +0,0 @@
|
|
1 |
-
import re
|
2 |
-
import time
|
3 |
-
|
4 |
-
import requests
|
5 |
-
import logging as logme
|
6 |
-
|
7 |
-
|
8 |
-
class TokenExpiryException(Exception):
    """Signals that the current guest token is no longer accepted."""

    def __init__(self, msg):
        super().__init__(msg)
|
11 |
-
|
12 |
-
|
13 |
-
class RefreshTokenException(Exception):
    """Signals that a new guest token could not be obtained."""

    def __init__(self, msg):
        super().__init__(msg)
|
16 |
-
|
17 |
-
|
18 |
-
class Token:
    """Fetches the Twitter guest token needed for unauthenticated API calls.

    First tries to scrape the token from the twitter.com HTML; if that
    fails, falls back to the guest/activate API endpoint.
    """

    def __init__(self, config):
        self._session = requests.Session()
        self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
        self.config = config
        self._retries = 5    # retries after the first attempt
        self._timeout = 10   # seconds per HTTP request
        self.url = 'https://twitter.com'

    def _request(self):
        """GET self.url with retries and exponential back-off.

        Raises RefreshTokenException after all attempts fail, clearing
        any stale guest token from the config.
        """
        for attempt in range(self._retries + 1):
            # The request is newly prepared on each retry because of
            # potential cookie updates.
            req = self._session.prepare_request(requests.Request('GET', self.url))
            logme.debug(f'Retrieving {req.url}')
            try:
                r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            except requests.exceptions.RequestException as exc:
                if attempt < self._retries:
                    retrying = ', retrying'
                    level = logme.WARNING
                else:
                    retrying = ''
                    level = logme.ERROR
                logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
            else:
                success, msg = (True, None)
                msg = f': {msg}' if msg else ''
                if success:
                    logme.debug(f'{req.url} retrieved successfully{msg}')
                    return r
            if attempt < self._retries:
                # TODO : might wanna tweak this back-off timer
                sleep_time = 2.0 * 2 ** attempt
                logme.info(f'Waiting {sleep_time:.0f} seconds')
                time.sleep(sleep_time)
        else:
            msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
            logme.fatal(msg)
            self.config.Guest_token = None
            raise RefreshTokenException(msg)

    def refresh(self):
        """Obtain a fresh guest token and store it on self.config."""
        logme.debug('Retrieving guest token')
        res = self._request()
        match = re.search(r'\("gt=(\d+);', res.text)
        if match:
            logme.debug('Found guest token in HTML')
            self.config.Guest_token = str(match.group(1))
        else:
            # Fall back to the activation API with browser-like headers.
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
                'authority': 'api.twitter.com',
                'content-length': '0',
                'authorization': self.config.Bearer_token,
                'x-twitter-client-language': 'en',
                'x-csrf-token': res.cookies.get("ct0"),
                'x-twitter-active-user': 'yes',
                'content-type': 'application/x-www-form-urlencoded',
                'accept': '*/*',
                'sec-gpc': '1',
                'origin': 'https://twitter.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://twitter.com/',
                'accept-language': 'en-US',
            }
            self._session.headers.update(headers)
            req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
            res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
            if 'guest_token' in res.json():
                logme.debug('Found guest token in JSON')
                self.config.Guest_token = res.json()['guest_token']
            else:
                self.config.Guest_token = None
                raise RefreshTokenException('Could not find the Guest token in HTML')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/tweet.py
DELETED
@@ -1,166 +0,0 @@
|
|
1 |
-
from time import strftime, localtime
|
2 |
-
from datetime import datetime, timezone
|
3 |
-
|
4 |
-
import logging as logme
|
5 |
-
from googletransx import Translator
|
6 |
-
# ref.
|
7 |
-
# - https://github.com/x0rzkov/py-googletrans#basic-usage
|
8 |
-
translator = Translator()
|
9 |
-
|
10 |
-
|
11 |
-
class tweet:
    """Bare attribute container for a single scraped tweet.

    Attributes are assigned dynamically by the Tweet() factory.
    """
    # Discriminator used by the output layer.
    type = "tweet"

    def __init__(self):
        pass
|
18 |
-
|
19 |
-
|
20 |
-
def utc_to_local(utc_dt):
    """Interpret naive *utc_dt* as UTC and convert it to the local timezone."""
    return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None)
|
22 |
-
|
23 |
-
|
24 |
-
# strftime patterns for the tweet's datetime / datestamp / timestamp fields.
Tweet_formats = {
    'datetime': '%Y-%m-%d %H:%M:%S %Z',
    'datestamp': '%Y-%m-%d',
    'timestamp': '%H:%M:%S'
}
|
29 |
-
|
30 |
-
|
31 |
-
def _get_mentions(tw):
|
32 |
-
"""Extract mentions from tweet
|
33 |
-
"""
|
34 |
-
logme.debug(__name__ + ':get_mentions')
|
35 |
-
try:
|
36 |
-
mentions = [
|
37 |
-
{
|
38 |
-
'screen_name': _mention['screen_name'],
|
39 |
-
'name': _mention['name'],
|
40 |
-
'id': _mention['id_str'],
|
41 |
-
} for _mention in tw['entities']['user_mentions']
|
42 |
-
if tw['display_text_range'][0] < _mention['indices'][0]
|
43 |
-
]
|
44 |
-
except KeyError:
|
45 |
-
mentions = []
|
46 |
-
return mentions
|
47 |
-
|
48 |
-
|
49 |
-
def _get_reply_to(tw):
|
50 |
-
try:
|
51 |
-
reply_to = [
|
52 |
-
{
|
53 |
-
'screen_name': _mention['screen_name'],
|
54 |
-
'name': _mention['name'],
|
55 |
-
'id': _mention['id_str'],
|
56 |
-
} for _mention in tw['entities']['user_mentions']
|
57 |
-
if tw['display_text_range'][0] > _mention['indices'][1]
|
58 |
-
]
|
59 |
-
except KeyError:
|
60 |
-
reply_to = []
|
61 |
-
return reply_to
|
62 |
-
|
63 |
-
|
64 |
-
def getText(tw):
    """Return the tweet text with links spaced out and newlines flattened."""
    logme.debug(__name__ + ':getText')
    # Insert a space before links so they are not glued to preceding text.
    return (tw['full_text']
            .replace("http", " http")
            .replace("pic.twitter", " pic.twitter")
            .replace("\n", " "))
|
74 |
-
|
75 |
-
|
76 |
-
def Tweet(tw, config):
    """Build a ``tweet`` object from the raw API dict *tw*.

    Missing optional keys (media, urls, retweet data, ...) degrade to
    empty defaults instead of raising.
    """
    logme.debug(__name__ + ':Tweet')
    t = tweet()
    t.id = int(tw['id_str'])
    t.id_str = tw["id_str"]
    t.conversation_id = tw["conversation_id_str"]

    # Parse Twitter's creation timestamp into local, user-friendly forms.
    _dt = tw['created_at']
    _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
    _dt = utc_to_local(_dt)
    t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
    t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
    t.timestamp = _dt.strftime(Tweet_formats['timestamp'])

    t.user_id = int(tw["user_id_str"])
    t.user_id_str = tw["user_id_str"]
    t.username = tw["user_data"]['screen_name']
    t.name = tw["user_data"]['name']
    t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
    t.timezone = strftime("%z", localtime())
    t.mentions = _get_mentions(tw)
    t.reply_to = _get_reply_to(tw)
    try:
        t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
    except KeyError:
        t.urls = []
    try:
        t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
                    _img['expanded_url'].find('/photo/') != -1]
    except KeyError:
        t.photos = []
    try:
        t.video = 1 if len(tw['extended_entities']['media']) else 0
    except KeyError:
        t.video = 0
    try:
        t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
    except KeyError:
        t.thumbnail = ''
    t.tweet = getText(tw)
    t.lang = tw['lang']
    try:
        t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
    except KeyError:
        t.hashtags = []
    try:
        t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
    except KeyError:
        t.cashtags = []
    t.replies_count = tw['reply_count']
    t.retweets_count = tw['retweet_count']
    t.likes_count = tw['favorite_count']
    t.link = f"https://twitter.com/{t.username}/status/{t.id}"
    try:
        if 'user_rt_id' in tw['retweet_data']:
            t.retweet = True
            t.retweet_id = tw['retweet_data']['retweet_id']
            t.retweet_date = tw['retweet_data']['retweet_date']
            t.user_rt = tw['retweet_data']['user_rt']
            t.user_rt_id = tw['retweet_data']['user_rt_id']
    except KeyError:
        t.retweet = False
        t.retweet_id = ''
        t.retweet_date = ''
        t.user_rt = ''
        t.user_rt_id = ''
    try:
        t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
    except KeyError:
        # means that the quoted tweet have been deleted
        t.quote_url = 0
    t.near = config.Near if config.Near else ""
    t.geo = config.Geo if config.Geo else ""
    t.source = config.Source if config.Source else ""
    t.translate = ''
    t.trans_src = ''
    t.trans_dest = ''
    if config.Translate:
        try:
            ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
            t.translate = ts.text
            t.trans_src = ts.src
            t.trans_dest = ts.dest
        # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
        except ValueError as e:
            logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
            raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
    return t
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/url.py
DELETED
@@ -1,195 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
import json
|
3 |
-
from sys import platform
|
4 |
-
import logging as logme
|
5 |
-
from urllib.parse import urlencode
|
6 |
-
from urllib.parse import quote
|
7 |
-
|
8 |
-
mobile = "https://mobile.twitter.com"
|
9 |
-
base = "https://api.twitter.com/2/search/adaptive.json"
|
10 |
-
|
11 |
-
|
12 |
-
def _sanitizeQuery(_url, params):
|
13 |
-
_serialQuery = ""
|
14 |
-
_serialQuery = urlencode(params, quote_via=quote)
|
15 |
-
_serialQuery = _url + "?" + _serialQuery
|
16 |
-
return _serialQuery
|
17 |
-
|
18 |
-
|
19 |
-
def _formatDate(date):
|
20 |
-
if "win" in platform:
|
21 |
-
return f'\"{date.split()[0]}\"'
|
22 |
-
try:
|
23 |
-
return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
|
24 |
-
except ValueError:
|
25 |
-
return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
|
26 |
-
|
27 |
-
|
28 |
-
async def Favorites(username, init):
    """Build the mobile-site URL for a user's favorites page."""
    logme.debug(__name__ + ':Favorites')
    url = f"https://mobile.twitter.com/{username}/favorites?lang=en"
    # '-1' denotes the first page (no pagination token yet).
    if init != '-1':
        url += f"&max_id={init}"
    return url
|
36 |
-
|
37 |
-
|
38 |
-
async def Followers(username, init):
    """Build the mobile-site URL for a user's followers page."""
    logme.debug(__name__ + ':Followers')
    url = f"https://mobile.twitter.com/{username}/followers?lang=en"
    # '-1' denotes the first page (no pagination cursor yet).
    if init != '-1':
        url += f"&cursor={init}"
    return url
|
46 |
-
|
47 |
-
|
48 |
-
async def Following(username, init):
    """Build the mobile-site URL for a user's following page."""
    logme.debug(__name__ + ':Following')
    url = f"https://mobile.twitter.com/{username}/following?lang=en"
    # '-1' denotes the first page (no pagination cursor yet).
    if init != '-1':
        url += f"&cursor={init}"
    return url
|
56 |
-
|
57 |
-
|
58 |
-
async def MobileProfile(username, init):
    """Build the mobile-site URL for a user's profile timeline."""
    logme.debug(__name__ + ':MobileProfile')
    url = f"https://mobile.twitter.com/{username}?lang=en"
    # '-1' denotes the first page (no pagination token yet).
    if init != '-1':
        url += f"&max_id={init}"
    return url
|
66 |
-
|
67 |
-
|
68 |
-
async def Search(config, init):
|
69 |
-
logme.debug(__name__ + ':Search')
|
70 |
-
url = base
|
71 |
-
tweet_count = 100 if not config.Limit else config.Limit
|
72 |
-
q = ""
|
73 |
-
params = [
|
74 |
-
# ('include_blocking', '1'),
|
75 |
-
# ('include_blocked_by', '1'),
|
76 |
-
# ('include_followed_by', '1'),
|
77 |
-
# ('include_want_retweets', '1'),
|
78 |
-
# ('include_mute_edge', '1'),
|
79 |
-
# ('include_can_dm', '1'),
|
80 |
-
('include_can_media_tag', '1'),
|
81 |
-
# ('skip_status', '1'),
|
82 |
-
# ('include_cards', '1'),
|
83 |
-
('include_ext_alt_text', 'true'),
|
84 |
-
('include_quote_count', 'true'),
|
85 |
-
('include_reply_count', '1'),
|
86 |
-
('tweet_mode', 'extended'),
|
87 |
-
('include_entities', 'true'),
|
88 |
-
('include_user_entities', 'true'),
|
89 |
-
('include_ext_media_availability', 'true'),
|
90 |
-
('send_error_codes', 'true'),
|
91 |
-
('simple_quoted_tweet', 'true'),
|
92 |
-
('count', tweet_count),
|
93 |
-
('query_source', 'typed_query'),
|
94 |
-
# ('pc', '1'),
|
95 |
-
('cursor', str(init)),
|
96 |
-
('spelling_corrections', '1'),
|
97 |
-
('ext', 'mediaStats%2ChighlightedLabel'),
|
98 |
-
('tweet_search_mode', 'live'), # this can be handled better, maybe take an argument and set it then
|
99 |
-
]
|
100 |
-
if not config.Popular_tweets:
|
101 |
-
params.append(('f', 'tweets'))
|
102 |
-
if config.Lang:
|
103 |
-
params.append(("l", config.Lang))
|
104 |
-
params.append(("lang", "en"))
|
105 |
-
if config.Query:
|
106 |
-
q += f" from:{config.Query}"
|
107 |
-
if config.Username:
|
108 |
-
q += f" from:{config.Username}"
|
109 |
-
if config.Geo:
|
110 |
-
config.Geo = config.Geo.replace(" ", "")
|
111 |
-
q += f" geocode:{config.Geo}"
|
112 |
-
if config.Search:
|
113 |
-
|
114 |
-
q += f" {config.Search}"
|
115 |
-
if config.Year:
|
116 |
-
q += f" until:{config.Year}-1-1"
|
117 |
-
if config.Since:
|
118 |
-
q += f" since:{_formatDate(config.Since)}"
|
119 |
-
if config.Until:
|
120 |
-
q += f" until:{_formatDate(config.Until)}"
|
121 |
-
if config.Email:
|
122 |
-
q += ' "mail" OR "email" OR'
|
123 |
-
q += ' "gmail" OR "e-mail"'
|
124 |
-
if config.Phone:
|
125 |
-
q += ' "phone" OR "call me" OR "text me"'
|
126 |
-
if config.Verified:
|
127 |
-
q += " filter:verified"
|
128 |
-
if config.To:
|
129 |
-
q += f" to:{config.To}"
|
130 |
-
if config.All:
|
131 |
-
q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
|
132 |
-
if config.Near:
|
133 |
-
q += f' near:"{config.Near}"'
|
134 |
-
if config.Images:
|
135 |
-
q += " filter:images"
|
136 |
-
if config.Videos:
|
137 |
-
q += " filter:videos"
|
138 |
-
if config.Media:
|
139 |
-
q += " filter:media"
|
140 |
-
if config.Replies:
|
141 |
-
q += " filter:replies"
|
142 |
-
# although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
|
143 |
-
if config.Native_retweets:
|
144 |
-
q += " filter:nativeretweets"
|
145 |
-
if config.Min_likes:
|
146 |
-
q += f" min_faves:{config.Min_likes}"
|
147 |
-
if config.Min_retweets:
|
148 |
-
q += f" min_retweets:{config.Min_retweets}"
|
149 |
-
if config.Min_replies:
|
150 |
-
q += f" min_replies:{config.Min_replies}"
|
151 |
-
if config.Links == "include":
|
152 |
-
q += " filter:links"
|
153 |
-
elif config.Links == "exclude":
|
154 |
-
q += " exclude:links"
|
155 |
-
if config.Source:
|
156 |
-
q += f" source:\"{config.Source}\""
|
157 |
-
if config.Members_list:
|
158 |
-
q += f" list:{config.Members_list}"
|
159 |
-
if config.Filter_retweets:
|
160 |
-
q += f" exclude:nativeretweets exclude:retweets"
|
161 |
-
if config.Custom_query:
|
162 |
-
q = config.Custom_query
|
163 |
-
|
164 |
-
q = q.strip()
|
165 |
-
params.append(("q", q))
|
166 |
-
_serialQuery = _sanitizeQuery(url, params)
|
167 |
-
return url, params, _serialQuery
|
168 |
-
|
169 |
-
|
170 |
-
def SearchProfile(config, init=None):
|
171 |
-
logme.debug(__name__ + ':SearchProfile')
|
172 |
-
_url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
|
173 |
-
tweet_count = 100
|
174 |
-
variables = {
|
175 |
-
"userId": config.User_id,
|
176 |
-
"count": tweet_count,
|
177 |
-
"includePromotedContent": True,
|
178 |
-
"withCommunity": True,
|
179 |
-
"withSuperFollowsUserFields": True,
|
180 |
-
"withBirdwatchPivots": False,
|
181 |
-
"withDownvotePerspective": False,
|
182 |
-
"withReactionsMetadata": False,
|
183 |
-
"withReactionsPerspective": False,
|
184 |
-
"withSuperFollowsTweetFields": True,
|
185 |
-
"withVoice": True,
|
186 |
-
"withV2Timeline": False,
|
187 |
-
"__fs_interactive_text": False,
|
188 |
-
"__fs_dont_mention_me_view_api_enabled": False,
|
189 |
-
}
|
190 |
-
if type(init) == str:
|
191 |
-
variables['cursor'] = init
|
192 |
-
params = [('variables', json.dumps(variables, separators=(',',':')))]
|
193 |
-
|
194 |
-
_serialQuery = _sanitizeQuery(_url, params)
|
195 |
-
return _serialQuery, [], _serialQuery
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/user.py
DELETED
@@ -1,52 +0,0 @@
|
|
1 |
-
import datetime
|
2 |
-
import logging as logme
|
3 |
-
|
4 |
-
|
5 |
-
class user:
|
6 |
-
type = "user"
|
7 |
-
|
8 |
-
def __init__(self):
|
9 |
-
pass
|
10 |
-
|
11 |
-
|
12 |
-
User_formats = {
|
13 |
-
'join_date': '%Y-%m-%d',
|
14 |
-
'join_time': '%H:%M:%S %Z'
|
15 |
-
}
|
16 |
-
|
17 |
-
|
18 |
-
# ur object must be a json from the endpoint https://api.twitter.com/graphql
|
19 |
-
def User(ur):
|
20 |
-
logme.debug(__name__ + ':User')
|
21 |
-
if 'data' not in ur and 'user' not in ur['data']:
|
22 |
-
msg = 'malformed json! cannot be parsed to get user data'
|
23 |
-
logme.fatal(msg)
|
24 |
-
raise KeyError(msg)
|
25 |
-
_usr = user()
|
26 |
-
_usr.id = ur['data']['user']['rest_id']
|
27 |
-
_usr.name = ur['data']['user']['legacy']['name']
|
28 |
-
_usr.username = ur['data']['user']['legacy']['screen_name']
|
29 |
-
_usr.bio = ur['data']['user']['legacy']['description']
|
30 |
-
_usr.location = ur['data']['user']['legacy']['location']
|
31 |
-
_usr.url = ur['data']['user']['legacy']['url']
|
32 |
-
# parsing date to user-friendly format
|
33 |
-
_dt = ur['data']['user']['legacy']['created_at']
|
34 |
-
_dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
|
35 |
-
# date is of the format year,
|
36 |
-
_usr.join_date = _dt.strftime(User_formats['join_date'])
|
37 |
-
_usr.join_time = _dt.strftime(User_formats['join_time'])
|
38 |
-
|
39 |
-
# :type `int`
|
40 |
-
_usr.tweets = int(ur['data']['user']['legacy']['statuses_count'])
|
41 |
-
_usr.following = int(ur['data']['user']['legacy']['friends_count'])
|
42 |
-
_usr.followers = int(ur['data']['user']['legacy']['followers_count'])
|
43 |
-
_usr.likes = int(ur['data']['user']['legacy']['favourites_count'])
|
44 |
-
_usr.media_count = int(ur['data']['user']['legacy']['media_count'])
|
45 |
-
|
46 |
-
_usr.is_private = ur['data']['user']['legacy']['protected']
|
47 |
-
_usr.is_verified = ur['data']['user']['legacy']['verified']
|
48 |
-
_usr.avatar = ur['data']['user']['legacy']['profile_image_url_https']
|
49 |
-
_usr.background_image = ur['data']['user']['legacy']['profile_banner_url']
|
50 |
-
# TODO : future implementation
|
51 |
-
# legacy_extended_profile is also available in some cases which can be used to get DOB of user
|
52 |
-
return _usr
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twint/verbose.py
DELETED
@@ -1,18 +0,0 @@
|
|
1 |
-
def Count(count, config):
|
2 |
-
msg = "[+] Finished: Successfully collected "
|
3 |
-
if config.Followers:
|
4 |
-
msg += f"all {count} users who follow @{config.Username}"
|
5 |
-
elif config.Following:
|
6 |
-
msg += f"all {count} users who @{config.Username} follows"
|
7 |
-
elif config.Favorites:
|
8 |
-
msg += f"{count} Tweets that @{config.Username} liked"
|
9 |
-
else:
|
10 |
-
msg += f"{count} Tweets_and_replies"
|
11 |
-
if config.Username:
|
12 |
-
msg += f" from @{config.Username}"
|
13 |
-
msg += "."
|
14 |
-
print(msg)
|
15 |
-
|
16 |
-
def Elastic(elasticsearch):
|
17 |
-
if elasticsearch:
|
18 |
-
print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twint-master/twitter_scraper.ipynb
DELETED
@@ -1,265 +0,0 @@
|
|
1 |
-
{
|
2 |
-
"cells": [
|
3 |
-
{
|
4 |
-
"cell_type": "markdown",
|
5 |
-
"id": "a5361789",
|
6 |
-
"metadata": {},
|
7 |
-
"source": [
|
8 |
-
"## Have to install these packages \n"
|
9 |
-
]
|
10 |
-
},
|
11 |
-
{
|
12 |
-
"cell_type": "code",
|
13 |
-
"execution_count": null,
|
14 |
-
"id": "c9021300",
|
15 |
-
"metadata": {
|
16 |
-
"scrolled": true
|
17 |
-
},
|
18 |
-
"outputs": [],
|
19 |
-
"source": [
|
20 |
-
"%%capture \n",
|
21 |
-
"!pip3 install Twint \n"
|
22 |
-
]
|
23 |
-
},
|
24 |
-
{
|
25 |
-
"cell_type": "markdown",
|
26 |
-
"id": "5c857dbf",
|
27 |
-
"metadata": {},
|
28 |
-
"source": [
|
29 |
-
"## Nessessary Imports"
|
30 |
-
]
|
31 |
-
},
|
32 |
-
{
|
33 |
-
"cell_type": "code",
|
34 |
-
"execution_count": null,
|
35 |
-
"id": "1413ab2b",
|
36 |
-
"metadata": {},
|
37 |
-
"outputs": [],
|
38 |
-
"source": [
|
39 |
-
"# import asyncio\n",
|
40 |
-
"# import os\n",
|
41 |
-
"# loop = asyncio.get_event_loop()\n",
|
42 |
-
"# loop.is_running()\n",
|
43 |
-
"# import twint\n",
|
44 |
-
"# import nest_asyncio\n",
|
45 |
-
"# nest_asyncio.apply()"
|
46 |
-
]
|
47 |
-
},
|
48 |
-
{
|
49 |
-
"cell_type": "code",
|
50 |
-
"execution_count": null,
|
51 |
-
"id": "d38514f3",
|
52 |
-
"metadata": {},
|
53 |
-
"outputs": [],
|
54 |
-
"source": [
|
55 |
-
"import scrape\n"
|
56 |
-
]
|
57 |
-
},
|
58 |
-
{
|
59 |
-
"cell_type": "code",
|
60 |
-
"execution_count": null,
|
61 |
-
"id": "a7912a91",
|
62 |
-
"metadata": {},
|
63 |
-
"outputs": [],
|
64 |
-
"source": [
|
65 |
-
"from_date=\"2022-6-10 10:30:22\"\n",
|
66 |
-
"to_date= \"2022-6-30\"\n",
|
67 |
-
"num_tweets = 20\n",
|
68 |
-
"_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
|
69 |
-
]
|
70 |
-
},
|
71 |
-
{
|
72 |
-
"cell_type": "code",
|
73 |
-
"execution_count": null,
|
74 |
-
"id": "48d50b46",
|
75 |
-
"metadata": {},
|
76 |
-
"outputs": [],
|
77 |
-
"source": [
|
78 |
-
"tweets= _data.keys()\n",
|
79 |
-
"for i in tweets:\n",
|
80 |
-
" _data[i][\"tweet\"]\n",
|
81 |
-
" print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
|
82 |
-
]
|
83 |
-
},
|
84 |
-
{
|
85 |
-
"cell_type": "code",
|
86 |
-
"execution_count": null,
|
87 |
-
"id": "72cabcb5",
|
88 |
-
"metadata": {},
|
89 |
-
"outputs": [],
|
90 |
-
"source": [
|
91 |
-
"from_date=\"2022-6-10 10:30:22\"\n",
|
92 |
-
"to_date= \"2022-6-30\"\n",
|
93 |
-
"num_tweets = 20\n",
|
94 |
-
"_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
|
95 |
-
]
|
96 |
-
},
|
97 |
-
{
|
98 |
-
"cell_type": "code",
|
99 |
-
"execution_count": null,
|
100 |
-
"id": "549e4fb3",
|
101 |
-
"metadata": {},
|
102 |
-
"outputs": [],
|
103 |
-
"source": [
|
104 |
-
"tweets= _data[\"tweet\"]\n",
|
105 |
-
"for i in tweets:\n",
|
106 |
-
" print(i, \"\\n\", \"__________________________________________________________\")"
|
107 |
-
]
|
108 |
-
},
|
109 |
-
{
|
110 |
-
"cell_type": "code",
|
111 |
-
"execution_count": 3,
|
112 |
-
"id": "733dd44a",
|
113 |
-
"metadata": {},
|
114 |
-
"outputs": [
|
115 |
-
{
|
116 |
-
"name": "stdout",
|
117 |
-
"output_type": "stream",
|
118 |
-
"text": [
|
119 |
-
"Defaulting to user installation because normal site-packages is not writeable\n",
|
120 |
-
"Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
|
121 |
-
"Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
|
122 |
-
"Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
|
123 |
-
"Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
|
124 |
-
"Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
|
125 |
-
"Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
|
126 |
-
]
|
127 |
-
}
|
128 |
-
],
|
129 |
-
"source": [
|
130 |
-
"#%pip install -q snscrape==0.3.4\n",
|
131 |
-
"!pip3 install snscrape\n",
|
132 |
-
"#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
|
133 |
-
]
|
134 |
-
},
|
135 |
-
{
|
136 |
-
"cell_type": "code",
|
137 |
-
"execution_count": 14,
|
138 |
-
"id": "0d16422c",
|
139 |
-
"metadata": {},
|
140 |
-
"outputs": [
|
141 |
-
{
|
142 |
-
"name": "stdout",
|
143 |
-
"output_type": "stream",
|
144 |
-
"text": [
|
145 |
-
"Note: you may need to restart the kernel to use updated packages.\n"
|
146 |
-
]
|
147 |
-
}
|
148 |
-
],
|
149 |
-
"source": [
|
150 |
-
"%pip install -q snscrape==0.3.4\n",
|
151 |
-
"from datetime import date\n",
|
152 |
-
"import os\n",
|
153 |
-
"import pandas as pd\n",
|
154 |
-
"\n",
|
155 |
-
"\n",
|
156 |
-
"def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
|
157 |
-
" if u_or_s.lower() =='u':\n",
|
158 |
-
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
|
159 |
-
" else:\n",
|
160 |
-
" extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
|
161 |
-
" \n",
|
162 |
-
" os.system(extracted_tweets)\n",
|
163 |
-
" if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
|
164 |
-
" print('No Tweets found')\n",
|
165 |
-
" else:\n",
|
166 |
-
" df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
|
167 |
-
" data_list=[]\n",
|
168 |
-
" for row in df['content'].iteritems():\n",
|
169 |
-
" temp= str(row[0])+str(row[1])\n",
|
170 |
-
" temp= temp.replace(\"\\'\",\"\")\n",
|
171 |
-
" data_list.append(temp)\n",
|
172 |
-
" return data_list\n",
|
173 |
-
"\n"
|
174 |
-
]
|
175 |
-
},
|
176 |
-
{
|
177 |
-
"cell_type": "code",
|
178 |
-
"execution_count": 12,
|
179 |
-
"id": "8e2adb35",
|
180 |
-
"metadata": {},
|
181 |
-
"outputs": [
|
182 |
-
{
|
183 |
-
"name": "stdout",
|
184 |
-
"output_type": "stream",
|
185 |
-
"text": [
|
186 |
-
"No Tweets found\n"
|
187 |
-
]
|
188 |
-
},
|
189 |
-
{
|
190 |
-
"name": "stderr",
|
191 |
-
"output_type": "stream",
|
192 |
-
"text": [
|
193 |
-
"Traceback (most recent call last):\n",
|
194 |
-
" File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
|
195 |
-
" sys.exit(main())\n",
|
196 |
-
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
|
197 |
-
" args = parse_args()\n",
|
198 |
-
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
|
199 |
-
" import snscrape.modules\n",
|
200 |
-
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
|
201 |
-
" _import_modules()\n",
|
202 |
-
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
|
203 |
-
" module = importlib.import_module(moduleName)\n",
|
204 |
-
" File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
|
205 |
-
" return _bootstrap._gcd_import(name[level:], package, level)\n",
|
206 |
-
" File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
|
207 |
-
" class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
|
208 |
-
" File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
|
209 |
-
" raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
|
210 |
-
"TypeError: Multiple inheritance with NamedTuple is not supported\n"
|
211 |
-
]
|
212 |
-
},
|
213 |
-
{
|
214 |
-
"ename": "UnboundLocalError",
|
215 |
-
"evalue": "local variable 'df' referenced before assignment",
|
216 |
-
"output_type": "error",
|
217 |
-
"traceback": [
|
218 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
219 |
-
"\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
|
220 |
-
"\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
221 |
-
"\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
|
222 |
-
"\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
|
223 |
-
]
|
224 |
-
}
|
225 |
-
],
|
226 |
-
"source": [
|
227 |
-
"d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
|
228 |
-
]
|
229 |
-
},
|
230 |
-
{
|
231 |
-
"cell_type": "code",
|
232 |
-
"execution_count": null,
|
233 |
-
"id": "a2c837f4",
|
234 |
-
"metadata": {},
|
235 |
-
"outputs": [],
|
236 |
-
"source": []
|
237 |
-
}
|
238 |
-
],
|
239 |
-
"metadata": {
|
240 |
-
"kernelspec": {
|
241 |
-
"display_name": "Python 3.10.4 64-bit",
|
242 |
-
"language": "python",
|
243 |
-
"name": "python3"
|
244 |
-
},
|
245 |
-
"language_info": {
|
246 |
-
"codemirror_mode": {
|
247 |
-
"name": "ipython",
|
248 |
-
"version": 3
|
249 |
-
},
|
250 |
-
"file_extension": ".py",
|
251 |
-
"mimetype": "text/x-python",
|
252 |
-
"name": "python",
|
253 |
-
"nbconvert_exporter": "python",
|
254 |
-
"pygments_lexer": "ipython3",
|
255 |
-
"version": "3.10.4"
|
256 |
-
},
|
257 |
-
"vscode": {
|
258 |
-
"interpreter": {
|
259 |
-
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
260 |
-
}
|
261 |
-
}
|
262 |
-
},
|
263 |
-
"nbformat": 4,
|
264 |
-
"nbformat_minor": 5
|
265 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
twitter-scraper/twitter_scraper.ipynb
ADDED
@@ -0,0 +1,819 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "a5361789",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"## Have to install these packages \n"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "code",
|
13 |
+
"execution_count": 2,
|
14 |
+
"id": "c9021300",
|
15 |
+
"metadata": {
|
16 |
+
"scrolled": true
|
17 |
+
},
|
18 |
+
"outputs": [],
|
19 |
+
"source": [
|
20 |
+
"%%capture \n",
|
21 |
+
"!pip3 install Twint \n",
|
22 |
+
"\n"
|
23 |
+
]
|
24 |
+
},
|
25 |
+
{
|
26 |
+
"cell_type": "markdown",
|
27 |
+
"id": "5c857dbf",
|
28 |
+
"metadata": {},
|
29 |
+
"source": [
|
30 |
+
"## Nessessary Imports"
|
31 |
+
]
|
32 |
+
},
|
33 |
+
{
|
34 |
+
"cell_type": "code",
|
35 |
+
"execution_count": 3,
|
36 |
+
"id": "1413ab2b",
|
37 |
+
"metadata": {},
|
38 |
+
"outputs": [],
|
39 |
+
"source": [
|
40 |
+
"import asyncio\n",
|
41 |
+
"import os\n",
|
42 |
+
"loop = asyncio.get_event_loop()\n",
|
43 |
+
"loop.is_running()\n",
|
44 |
+
"import twint\n",
|
45 |
+
"import nest_asyncio\n",
|
46 |
+
"nest_asyncio.apply()"
|
47 |
+
]
|
48 |
+
},
|
49 |
+
{
|
50 |
+
"cell_type": "code",
|
51 |
+
"execution_count": 4,
|
52 |
+
"id": "d38514f3",
|
53 |
+
"metadata": {},
|
54 |
+
"outputs": [],
|
55 |
+
"source": [
|
56 |
+
"import scrape\n",
|
57 |
+
"sc= scrape.TwitterScraper(num_tweets=10)\n"
|
58 |
+
]
|
59 |
+
},
|
60 |
+
{
|
61 |
+
"cell_type": "code",
|
62 |
+
"execution_count": 5,
|
63 |
+
"id": "d37e5cbf",
|
64 |
+
"metadata": {},
|
65 |
+
"outputs": [
|
66 |
+
{
|
67 |
+
"name": "stdout",
|
68 |
+
"output_type": "stream",
|
69 |
+
"text": [
|
70 |
+
"[+] Finished: Successfully collected 20 Tweets.\n"
|
71 |
+
]
|
72 |
+
},
|
73 |
+
{
|
74 |
+
"data": {
|
75 |
+
"text/html": [
|
76 |
+
"<div>\n",
|
77 |
+
"<style scoped>\n",
|
78 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
79 |
+
" vertical-align: middle;\n",
|
80 |
+
" }\n",
|
81 |
+
"\n",
|
82 |
+
" .dataframe tbody tr th {\n",
|
83 |
+
" vertical-align: top;\n",
|
84 |
+
" }\n",
|
85 |
+
"\n",
|
86 |
+
" .dataframe thead th {\n",
|
87 |
+
" text-align: right;\n",
|
88 |
+
" }\n",
|
89 |
+
"</style>\n",
|
90 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
91 |
+
" <thead>\n",
|
92 |
+
" <tr style=\"text-align: right;\">\n",
|
93 |
+
" <th></th>\n",
|
94 |
+
" <th>id</th>\n",
|
95 |
+
" <th>tweet</th>\n",
|
96 |
+
" <th>date</th>\n",
|
97 |
+
" <th>user_id</th>\n",
|
98 |
+
" <th>username</th>\n",
|
99 |
+
" <th>urls</th>\n",
|
100 |
+
" <th>nlikes</th>\n",
|
101 |
+
" <th>nreplies</th>\n",
|
102 |
+
" <th>nretweets</th>\n",
|
103 |
+
" </tr>\n",
|
104 |
+
" </thead>\n",
|
105 |
+
" <tbody>\n",
|
106 |
+
" <tr>\n",
|
107 |
+
" <th>0</th>\n",
|
108 |
+
" <td>1545194541006950400</td>\n",
|
109 |
+
" <td>kim sever benim gibi sevmeyecekler bıraktığın ...</td>\n",
|
110 |
+
" <td>2022-07-08 01:54:21</td>\n",
|
111 |
+
" <td>1396065566117466113</td>\n",
|
112 |
+
" <td>heja4r</td>\n",
|
113 |
+
" <td>[]</td>\n",
|
114 |
+
" <td>1</td>\n",
|
115 |
+
" <td>0</td>\n",
|
116 |
+
" <td>0</td>\n",
|
117 |
+
" </tr>\n",
|
118 |
+
" <tr>\n",
|
119 |
+
" <th>1</th>\n",
|
120 |
+
" <td>1545192735354806274</td>\n",
|
121 |
+
" <td>Kelimeler,albayım,bazı anlamalara gelmiyor..</td>\n",
|
122 |
+
" <td>2022-07-08 01:47:11</td>\n",
|
123 |
+
" <td>1481604485118140425</td>\n",
|
124 |
+
" <td>Theguapo6</td>\n",
|
125 |
+
" <td>[]</td>\n",
|
126 |
+
" <td>1</td>\n",
|
127 |
+
" <td>0</td>\n",
|
128 |
+
" <td>0</td>\n",
|
129 |
+
" </tr>\n",
|
130 |
+
" <tr>\n",
|
131 |
+
" <th>2</th>\n",
|
132 |
+
" <td>1545190168533008385</td>\n",
|
133 |
+
" <td>@shikan213 ptdr ? y’a aucune racisme à quel mo...</td>\n",
|
134 |
+
" <td>2022-07-08 01:36:59</td>\n",
|
135 |
+
" <td>1476042813741617155</td>\n",
|
136 |
+
" <td>srndz213__</td>\n",
|
137 |
+
" <td>[]</td>\n",
|
138 |
+
" <td>0</td>\n",
|
139 |
+
" <td>1</td>\n",
|
140 |
+
" <td>0</td>\n",
|
141 |
+
" </tr>\n",
|
142 |
+
" <tr>\n",
|
143 |
+
" <th>3</th>\n",
|
144 |
+
" <td>1545190106910171136</td>\n",
|
145 |
+
" <td>@guzzeida Men gud du har presterat så mkt bätt...</td>\n",
|
146 |
+
" <td>2022-07-08 01:36:44</td>\n",
|
147 |
+
" <td>34343541</td>\n",
|
148 |
+
" <td>lisaxamanda</td>\n",
|
149 |
+
" <td>[]</td>\n",
|
150 |
+
" <td>1</td>\n",
|
151 |
+
" <td>0</td>\n",
|
152 |
+
" <td>0</td>\n",
|
153 |
+
" </tr>\n",
|
154 |
+
" <tr>\n",
|
155 |
+
" <th>4</th>\n",
|
156 |
+
" <td>1545190096042860544</td>\n",
|
157 |
+
" <td>Heja, heja, heja Slovensko</td>\n",
|
158 |
+
" <td>2022-07-08 01:36:41</td>\n",
|
159 |
+
" <td>3158344237</td>\n",
|
160 |
+
" <td>ian_10_19</td>\n",
|
161 |
+
" <td>[]</td>\n",
|
162 |
+
" <td>0</td>\n",
|
163 |
+
" <td>0</td>\n",
|
164 |
+
" <td>0</td>\n",
|
165 |
+
" </tr>\n",
|
166 |
+
" </tbody>\n",
|
167 |
+
"</table>\n",
|
168 |
+
"</div>"
|
169 |
+
],
|
170 |
+
"text/plain": [
|
171 |
+
" id tweet \\\n",
|
172 |
+
"0 1545194541006950400 kim sever benim gibi sevmeyecekler bıraktığın ... \n",
|
173 |
+
"1 1545192735354806274 Kelimeler,albayım,bazı anlamalara gelmiyor.. \n",
|
174 |
+
"2 1545190168533008385 @shikan213 ptdr ? y’a aucune racisme à quel mo... \n",
|
175 |
+
"3 1545190106910171136 @guzzeida Men gud du har presterat så mkt bätt... \n",
|
176 |
+
"4 1545190096042860544 Heja, heja, heja Slovensko \n",
|
177 |
+
"\n",
|
178 |
+
" date user_id username urls nlikes \\\n",
|
179 |
+
"0 2022-07-08 01:54:21 1396065566117466113 heja4r [] 1 \n",
|
180 |
+
"1 2022-07-08 01:47:11 1481604485118140425 Theguapo6 [] 1 \n",
|
181 |
+
"2 2022-07-08 01:36:59 1476042813741617155 srndz213__ [] 0 \n",
|
182 |
+
"3 2022-07-08 01:36:44 34343541 lisaxamanda [] 1 \n",
|
183 |
+
"4 2022-07-08 01:36:41 3158344237 ian_10_19 [] 0 \n",
|
184 |
+
"\n",
|
185 |
+
" nreplies nretweets \n",
|
186 |
+
"0 0 0 \n",
|
187 |
+
"1 0 0 \n",
|
188 |
+
"2 1 0 \n",
|
189 |
+
"3 0 0 \n",
|
190 |
+
"4 0 0 "
|
191 |
+
]
|
192 |
+
},
|
193 |
+
"execution_count": 5,
|
194 |
+
"metadata": {},
|
195 |
+
"output_type": "execute_result"
|
196 |
+
}
|
197 |
+
],
|
198 |
+
"source": [
|
199 |
+
"string_tr_info=sc.scrape_by_string(\"heja\")\n",
|
200 |
+
"string_tr_info.head()\n"
|
201 |
+
]
|
202 |
+
},
|
203 |
+
{
|
204 |
+
"cell_type": "code",
|
205 |
+
"execution_count": 6,
|
206 |
+
"id": "902170ad",
|
207 |
+
"metadata": {},
|
208 |
+
"outputs": [
|
209 |
+
{
|
210 |
+
"data": {
|
211 |
+
"text/html": [
|
212 |
+
"<div>\n",
|
213 |
+
"<style scoped>\n",
|
214 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
215 |
+
" vertical-align: middle;\n",
|
216 |
+
" }\n",
|
217 |
+
"\n",
|
218 |
+
" .dataframe tbody tr th {\n",
|
219 |
+
" vertical-align: top;\n",
|
220 |
+
" }\n",
|
221 |
+
"\n",
|
222 |
+
" .dataframe thead th {\n",
|
223 |
+
" text-align: right;\n",
|
224 |
+
" }\n",
|
225 |
+
"</style>\n",
|
226 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
227 |
+
" <thead>\n",
|
228 |
+
" <tr style=\"text-align: right;\">\n",
|
229 |
+
" <th></th>\n",
|
230 |
+
" <th>id</th>\n",
|
231 |
+
" <th>tweet</th>\n",
|
232 |
+
" <th>date</th>\n",
|
233 |
+
" <th>user_id</th>\n",
|
234 |
+
" <th>username</th>\n",
|
235 |
+
" <th>urls</th>\n",
|
236 |
+
" <th>nlikes</th>\n",
|
237 |
+
" <th>nreplies</th>\n",
|
238 |
+
" <th>nretweets</th>\n",
|
239 |
+
" </tr>\n",
|
240 |
+
" </thead>\n",
|
241 |
+
" <tbody>\n",
|
242 |
+
" <tr>\n",
|
243 |
+
" <th>0</th>\n",
|
244 |
+
" <td>1545194541006950400</td>\n",
|
245 |
+
" <td>kim sever benim gibi sevmeyecekler bıraktığın ...</td>\n",
|
246 |
+
" <td>2022-07-08 01:54:21</td>\n",
|
247 |
+
" <td>1396065566117466113</td>\n",
|
248 |
+
" <td>heja4r</td>\n",
|
249 |
+
" <td>[]</td>\n",
|
250 |
+
" <td>1</td>\n",
|
251 |
+
" <td>0</td>\n",
|
252 |
+
" <td>0</td>\n",
|
253 |
+
" </tr>\n",
|
254 |
+
" <tr>\n",
|
255 |
+
" <th>1</th>\n",
|
256 |
+
" <td>1545192735354806274</td>\n",
|
257 |
+
" <td>Kelimeler,albayım,bazı anlamalara gelmiyor..</td>\n",
|
258 |
+
" <td>2022-07-08 01:47:11</td>\n",
|
259 |
+
" <td>1481604485118140425</td>\n",
|
260 |
+
" <td>Theguapo6</td>\n",
|
261 |
+
" <td>[]</td>\n",
|
262 |
+
" <td>1</td>\n",
|
263 |
+
" <td>0</td>\n",
|
264 |
+
" <td>0</td>\n",
|
265 |
+
" </tr>\n",
|
266 |
+
" <tr>\n",
|
267 |
+
" <th>4</th>\n",
|
268 |
+
" <td>1545190096042860544</td>\n",
|
269 |
+
" <td>Heja, heja, heja Slovensko</td>\n",
|
270 |
+
" <td>2022-07-08 01:36:41</td>\n",
|
271 |
+
" <td>3158344237</td>\n",
|
272 |
+
" <td>ian_10_19</td>\n",
|
273 |
+
" <td>[]</td>\n",
|
274 |
+
" <td>0</td>\n",
|
275 |
+
" <td>0</td>\n",
|
276 |
+
" <td>0</td>\n",
|
277 |
+
" </tr>\n",
|
278 |
+
" <tr>\n",
|
279 |
+
" <th>6</th>\n",
|
280 |
+
" <td>1545189783747436545</td>\n",
|
281 |
+
" <td>Beni sorarsan dardayım..</td>\n",
|
282 |
+
" <td>2022-07-08 01:35:27</td>\n",
|
283 |
+
" <td>1481604485118140425</td>\n",
|
284 |
+
" <td>Theguapo6</td>\n",
|
285 |
+
" <td>[]</td>\n",
|
286 |
+
" <td>2</td>\n",
|
287 |
+
" <td>0</td>\n",
|
288 |
+
" <td>0</td>\n",
|
289 |
+
" </tr>\n",
|
290 |
+
" <tr>\n",
|
291 |
+
" <th>12</th>\n",
|
292 |
+
" <td>1545186234623991813</td>\n",
|
293 |
+
" <td>Heja strandhäll. Vilket jävla block mongo</td>\n",
|
294 |
+
" <td>2022-07-08 01:21:21</td>\n",
|
295 |
+
" <td>1160537136250195968</td>\n",
|
296 |
+
" <td>Siggydunn</td>\n",
|
297 |
+
" <td>[]</td>\n",
|
298 |
+
" <td>0</td>\n",
|
299 |
+
" <td>0</td>\n",
|
300 |
+
" <td>0</td>\n",
|
301 |
+
" </tr>\n",
|
302 |
+
" </tbody>\n",
|
303 |
+
"</table>\n",
|
304 |
+
"</div>"
|
305 |
+
],
|
306 |
+
"text/plain": [
|
307 |
+
" id tweet \\\n",
|
308 |
+
"0 1545194541006950400 kim sever benim gibi sevmeyecekler bıraktığın ... \n",
|
309 |
+
"1 1545192735354806274 Kelimeler,albayım,bazı anlamalara gelmiyor.. \n",
|
310 |
+
"4 1545190096042860544 Heja, heja, heja Slovensko \n",
|
311 |
+
"6 1545189783747436545 Beni sorarsan dardayım.. \n",
|
312 |
+
"12 1545186234623991813 Heja strandhäll. Vilket jävla block mongo \n",
|
313 |
+
"\n",
|
314 |
+
" date user_id username urls nlikes \\\n",
|
315 |
+
"0 2022-07-08 01:54:21 1396065566117466113 heja4r [] 1 \n",
|
316 |
+
"1 2022-07-08 01:47:11 1481604485118140425 Theguapo6 [] 1 \n",
|
317 |
+
"4 2022-07-08 01:36:41 3158344237 ian_10_19 [] 0 \n",
|
318 |
+
"6 2022-07-08 01:35:27 1481604485118140425 Theguapo6 [] 2 \n",
|
319 |
+
"12 2022-07-08 01:21:21 1160537136250195968 Siggydunn [] 0 \n",
|
320 |
+
"\n",
|
321 |
+
" nreplies nretweets \n",
|
322 |
+
"0 0 0 \n",
|
323 |
+
"1 0 0 \n",
|
324 |
+
"4 0 0 \n",
|
325 |
+
"6 0 0 \n",
|
326 |
+
"12 0 0 "
|
327 |
+
]
|
328 |
+
},
|
329 |
+
"execution_count": 6,
|
330 |
+
"metadata": {},
|
331 |
+
"output_type": "execute_result"
|
332 |
+
}
|
333 |
+
],
|
334 |
+
"source": [
|
335 |
+
"string_t_info=sc.get_only_tweets(tr_info)\n",
|
336 |
+
"string_t_info.head()"
|
337 |
+
]
|
338 |
+
},
|
339 |
+
{
|
340 |
+
"cell_type": "code",
|
341 |
+
"execution_count": 7,
|
342 |
+
"id": "a7912a91",
|
343 |
+
"metadata": {},
|
344 |
+
"outputs": [
|
345 |
+
{
|
346 |
+
"name": "stdout",
|
347 |
+
"output_type": "stream",
|
348 |
+
"text": [
|
349 |
+
"[+] Finished: Successfully collected 20 Tweets.\n"
|
350 |
+
]
|
351 |
+
},
|
352 |
+
{
|
353 |
+
"data": {
|
354 |
+
"text/html": [
|
355 |
+
"<div>\n",
|
356 |
+
"<style scoped>\n",
|
357 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
358 |
+
" vertical-align: middle;\n",
|
359 |
+
" }\n",
|
360 |
+
"\n",
|
361 |
+
" .dataframe tbody tr th {\n",
|
362 |
+
" vertical-align: top;\n",
|
363 |
+
" }\n",
|
364 |
+
"\n",
|
365 |
+
" .dataframe thead th {\n",
|
366 |
+
" text-align: right;\n",
|
367 |
+
" }\n",
|
368 |
+
"</style>\n",
|
369 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
370 |
+
" <thead>\n",
|
371 |
+
" <tr style=\"text-align: right;\">\n",
|
372 |
+
" <th></th>\n",
|
373 |
+
" <th>id</th>\n",
|
374 |
+
" <th>tweet</th>\n",
|
375 |
+
" <th>date</th>\n",
|
376 |
+
" <th>user_id</th>\n",
|
377 |
+
" <th>username</th>\n",
|
378 |
+
" <th>urls</th>\n",
|
379 |
+
" <th>nlikes</th>\n",
|
380 |
+
" <th>nreplies</th>\n",
|
381 |
+
" <th>nretweets</th>\n",
|
382 |
+
" </tr>\n",
|
383 |
+
" </thead>\n",
|
384 |
+
" <tbody>\n",
|
385 |
+
" <tr>\n",
|
386 |
+
" <th>0</th>\n",
|
387 |
+
" <td>1544748873767424001</td>\n",
|
388 |
+
" <td>Fruktansvärt att nås av beskedet att kvinnan s...</td>\n",
|
389 |
+
" <td>2022-07-06 20:23:26</td>\n",
|
390 |
+
" <td>95972673</td>\n",
|
391 |
+
" <td>jimmieakesson</td>\n",
|
392 |
+
" <td>[]</td>\n",
|
393 |
+
" <td>3397</td>\n",
|
394 |
+
" <td>167</td>\n",
|
395 |
+
" <td>140</td>\n",
|
396 |
+
" </tr>\n",
|
397 |
+
" <tr>\n",
|
398 |
+
" <th>1</th>\n",
|
399 |
+
" <td>1538948369611210764</td>\n",
|
400 |
+
" <td>@annieloof Nej, jag håller med. Tänk mer som M...</td>\n",
|
401 |
+
" <td>2022-06-20 20:14:18</td>\n",
|
402 |
+
" <td>95972673</td>\n",
|
403 |
+
" <td>jimmieakesson</td>\n",
|
404 |
+
" <td>[]</td>\n",
|
405 |
+
" <td>1513</td>\n",
|
406 |
+
" <td>89</td>\n",
|
407 |
+
" <td>115</td>\n",
|
408 |
+
" </tr>\n",
|
409 |
+
" <tr>\n",
|
410 |
+
" <th>2</th>\n",
|
411 |
+
" <td>1537770920621879297</td>\n",
|
412 |
+
" <td>Man kan ha synpunkter på en sådan lösning, men...</td>\n",
|
413 |
+
" <td>2022-06-17 14:15:32</td>\n",
|
414 |
+
" <td>95972673</td>\n",
|
415 |
+
" <td>jimmieakesson</td>\n",
|
416 |
+
" <td>[]</td>\n",
|
417 |
+
" <td>694</td>\n",
|
418 |
+
" <td>17</td>\n",
|
419 |
+
" <td>41</td>\n",
|
420 |
+
" </tr>\n",
|
421 |
+
" <tr>\n",
|
422 |
+
" <th>3</th>\n",
|
423 |
+
" <td>1537770809225273344</td>\n",
|
424 |
+
" <td>Är det ont om plats på anstalterna så får man ...</td>\n",
|
425 |
+
" <td>2022-06-17 14:15:05</td>\n",
|
426 |
+
" <td>95972673</td>\n",
|
427 |
+
" <td>jimmieakesson</td>\n",
|
428 |
+
" <td>[]</td>\n",
|
429 |
+
" <td>810</td>\n",
|
430 |
+
" <td>26</td>\n",
|
431 |
+
" <td>57</td>\n",
|
432 |
+
" </tr>\n",
|
433 |
+
" <tr>\n",
|
434 |
+
" <th>4</th>\n",
|
435 |
+
" <td>1537770713368735744</td>\n",
|
436 |
+
" <td>Döms man för brott, särskilt våldsbrott, ska m...</td>\n",
|
437 |
+
" <td>2022-06-17 14:14:43</td>\n",
|
438 |
+
" <td>95972673</td>\n",
|
439 |
+
" <td>jimmieakesson</td>\n",
|
440 |
+
" <td>[]</td>\n",
|
441 |
+
" <td>1020</td>\n",
|
442 |
+
" <td>26</td>\n",
|
443 |
+
" <td>86</td>\n",
|
444 |
+
" </tr>\n",
|
445 |
+
" </tbody>\n",
|
446 |
+
"</table>\n",
|
447 |
+
"</div>"
|
448 |
+
],
|
449 |
+
"text/plain": [
|
450 |
+
" id tweet \\\n",
|
451 |
+
"0 1544748873767424001 Fruktansvärt att nås av beskedet att kvinnan s... \n",
|
452 |
+
"1 1538948369611210764 @annieloof Nej, jag håller med. Tänk mer som M... \n",
|
453 |
+
"2 1537770920621879297 Man kan ha synpunkter på en sådan lösning, men... \n",
|
454 |
+
"3 1537770809225273344 Är det ont om plats på anstalterna så får man ... \n",
|
455 |
+
"4 1537770713368735744 Döms man för brott, särskilt våldsbrott, ska m... \n",
|
456 |
+
"\n",
|
457 |
+
" date user_id username urls nlikes nreplies \\\n",
|
458 |
+
"0 2022-07-06 20:23:26 95972673 jimmieakesson [] 3397 167 \n",
|
459 |
+
"1 2022-06-20 20:14:18 95972673 jimmieakesson [] 1513 89 \n",
|
460 |
+
"2 2022-06-17 14:15:32 95972673 jimmieakesson [] 694 17 \n",
|
461 |
+
"3 2022-06-17 14:15:05 95972673 jimmieakesson [] 810 26 \n",
|
462 |
+
"4 2022-06-17 14:14:43 95972673 jimmieakesson [] 1020 26 \n",
|
463 |
+
"\n",
|
464 |
+
" nretweets \n",
|
465 |
+
"0 140 \n",
|
466 |
+
"1 115 \n",
|
467 |
+
"2 41 \n",
|
468 |
+
"3 57 \n",
|
469 |
+
"4 86 "
|
470 |
+
]
|
471 |
+
},
|
472 |
+
"execution_count": 7,
|
473 |
+
"metadata": {},
|
474 |
+
"output_type": "execute_result"
|
475 |
+
}
|
476 |
+
],
|
477 |
+
"source": [
|
478 |
+
"user__tr_info=sc.scrape_by_user(\"jimmieakesson\")\n",
|
479 |
+
"df.head()"
|
480 |
+
]
|
481 |
+
},
|
482 |
+
{
|
483 |
+
"cell_type": "code",
|
484 |
+
"execution_count": null,
|
485 |
+
"id": "7db69757",
|
486 |
+
"metadata": {},
|
487 |
+
"outputs": [],
|
488 |
+
"source": [
|
489 |
+
"user__t_info=sc.get_only_tweets(tr_info)\n",
|
490 |
+
"user__t_info.head()"
|
491 |
+
]
|
492 |
+
},
|
493 |
+
{
|
494 |
+
"cell_type": "code",
|
495 |
+
"execution_count": 8,
|
496 |
+
"id": "9d6b1bdf",
|
497 |
+
"metadata": {},
|
498 |
+
"outputs": [
|
499 |
+
{
|
500 |
+
"name": "stdout",
|
501 |
+
"output_type": "stream",
|
502 |
+
"text": [
|
503 |
+
"[+] Finished: Successfully collected 16 Tweets from @jimmieakesson.\n"
|
504 |
+
]
|
505 |
+
},
|
506 |
+
{
|
507 |
+
"data": {
|
508 |
+
"text/html": [
|
509 |
+
"<div>\n",
|
510 |
+
"<style scoped>\n",
|
511 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
512 |
+
" vertical-align: middle;\n",
|
513 |
+
" }\n",
|
514 |
+
"\n",
|
515 |
+
" .dataframe tbody tr th {\n",
|
516 |
+
" vertical-align: top;\n",
|
517 |
+
" }\n",
|
518 |
+
"\n",
|
519 |
+
" .dataframe thead th {\n",
|
520 |
+
" text-align: right;\n",
|
521 |
+
" }\n",
|
522 |
+
"</style>\n",
|
523 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
524 |
+
" <thead>\n",
|
525 |
+
" <tr style=\"text-align: right;\">\n",
|
526 |
+
" <th></th>\n",
|
527 |
+
" <th>id</th>\n",
|
528 |
+
" <th>tweet</th>\n",
|
529 |
+
" <th>date</th>\n",
|
530 |
+
" <th>user_id</th>\n",
|
531 |
+
" <th>username</th>\n",
|
532 |
+
" <th>urls</th>\n",
|
533 |
+
" <th>nlikes</th>\n",
|
534 |
+
" <th>nreplies</th>\n",
|
535 |
+
" <th>nretweets</th>\n",
|
536 |
+
" </tr>\n",
|
537 |
+
" </thead>\n",
|
538 |
+
" <tbody>\n",
|
539 |
+
" <tr>\n",
|
540 |
+
" <th>0</th>\n",
|
541 |
+
" <td>1363067834260201475</td>\n",
|
542 |
+
" <td>Utan massiv, asylrelaterad invandring från frä...</td>\n",
|
543 |
+
" <td>2021-02-20 11:07:50</td>\n",
|
544 |
+
" <td>95972673</td>\n",
|
545 |
+
" <td>jimmieakesson</td>\n",
|
546 |
+
" <td>[]</td>\n",
|
547 |
+
" <td>1277</td>\n",
|
548 |
+
" <td>22</td>\n",
|
549 |
+
" <td>105</td>\n",
|
550 |
+
" </tr>\n",
|
551 |
+
" <tr>\n",
|
552 |
+
" <th>1</th>\n",
|
553 |
+
" <td>1363067613660778496</td>\n",
|
554 |
+
" <td>Många vänsterliberaler tycks ha reagerat på de...</td>\n",
|
555 |
+
" <td>2021-02-20 11:06:58</td>\n",
|
556 |
+
" <td>95972673</td>\n",
|
557 |
+
" <td>jimmieakesson</td>\n",
|
558 |
+
" <td>[]</td>\n",
|
559 |
+
" <td>625</td>\n",
|
560 |
+
" <td>9</td>\n",
|
561 |
+
" <td>68</td>\n",
|
562 |
+
" </tr>\n",
|
563 |
+
" <tr>\n",
|
564 |
+
" <th>2</th>\n",
|
565 |
+
" <td>1363067558409158656</td>\n",
|
566 |
+
" <td>Jag förstår — uppriktigt — inte den närmast hy...</td>\n",
|
567 |
+
" <td>2021-02-20 11:06:45</td>\n",
|
568 |
+
" <td>95972673</td>\n",
|
569 |
+
" <td>jimmieakesson</td>\n",
|
570 |
+
" <td>[]</td>\n",
|
571 |
+
" <td>2458</td>\n",
|
572 |
+
" <td>199</td>\n",
|
573 |
+
" <td>336</td>\n",
|
574 |
+
" </tr>\n",
|
575 |
+
" <tr>\n",
|
576 |
+
" <th>3</th>\n",
|
577 |
+
" <td>1362748777552113670</td>\n",
|
578 |
+
" <td>Invandring av hundratusentals människor från f...</td>\n",
|
579 |
+
" <td>2021-02-19 14:00:01</td>\n",
|
580 |
+
" <td>95972673</td>\n",
|
581 |
+
" <td>jimmieakesson</td>\n",
|
582 |
+
" <td>[]</td>\n",
|
583 |
+
" <td>1334</td>\n",
|
584 |
+
" <td>55</td>\n",
|
585 |
+
" <td>101</td>\n",
|
586 |
+
" </tr>\n",
|
587 |
+
" <tr>\n",
|
588 |
+
" <th>4</th>\n",
|
589 |
+
" <td>1362409505557012490</td>\n",
|
590 |
+
" <td>Vårt land behöver ett totalstopp för all asyl-...</td>\n",
|
591 |
+
" <td>2021-02-18 15:31:53</td>\n",
|
592 |
+
" <td>95972673</td>\n",
|
593 |
+
" <td>jimmieakesson</td>\n",
|
594 |
+
" <td>[]</td>\n",
|
595 |
+
" <td>3044</td>\n",
|
596 |
+
" <td>268</td>\n",
|
597 |
+
" <td>404</td>\n",
|
598 |
+
" </tr>\n",
|
599 |
+
" </tbody>\n",
|
600 |
+
"</table>\n",
|
601 |
+
"</div>"
|
602 |
+
],
|
603 |
+
"text/plain": [
|
604 |
+
" id tweet \\\n",
|
605 |
+
"0 1363067834260201475 Utan massiv, asylrelaterad invandring från frä... \n",
|
606 |
+
"1 1363067613660778496 Många vänsterliberaler tycks ha reagerat på de... \n",
|
607 |
+
"2 1363067558409158656 Jag förstår — uppriktigt — inte den närmast hy... \n",
|
608 |
+
"3 1362748777552113670 Invandring av hundratusentals människor från f... \n",
|
609 |
+
"4 1362409505557012490 Vårt land behöver ett totalstopp för all asyl-... \n",
|
610 |
+
"\n",
|
611 |
+
" date user_id username urls nlikes nreplies \\\n",
|
612 |
+
"0 2021-02-20 11:07:50 95972673 jimmieakesson [] 1277 22 \n",
|
613 |
+
"1 2021-02-20 11:06:58 95972673 jimmieakesson [] 625 9 \n",
|
614 |
+
"2 2021-02-20 11:06:45 95972673 jimmieakesson [] 2458 199 \n",
|
615 |
+
"3 2021-02-19 14:00:01 95972673 jimmieakesson [] 1334 55 \n",
|
616 |
+
"4 2021-02-18 15:31:53 95972673 jimmieakesson [] 3044 268 \n",
|
617 |
+
"\n",
|
618 |
+
" nretweets \n",
|
619 |
+
"0 105 \n",
|
620 |
+
"1 68 \n",
|
621 |
+
"2 336 \n",
|
622 |
+
"3 101 \n",
|
623 |
+
"4 404 "
|
624 |
+
]
|
625 |
+
},
|
626 |
+
"execution_count": 8,
|
627 |
+
"metadata": {},
|
628 |
+
"output_type": "execute_result"
|
629 |
+
}
|
630 |
+
],
|
631 |
+
"source": [
|
632 |
+
"user__string_tr_info=sc.scrape_by_user_and_string(\"jimmieakesson\",\"invandring\")\n",
|
633 |
+
"user__string_tr_info.head()\n"
|
634 |
+
]
|
635 |
+
},
|
636 |
+
{
|
637 |
+
"cell_type": "code",
|
638 |
+
"execution_count": 9,
|
639 |
+
"id": "a1aede79",
|
640 |
+
"metadata": {},
|
641 |
+
"outputs": [
|
642 |
+
{
|
643 |
+
"data": {
|
644 |
+
"text/html": [
|
645 |
+
"<div>\n",
|
646 |
+
"<style scoped>\n",
|
647 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
648 |
+
" vertical-align: middle;\n",
|
649 |
+
" }\n",
|
650 |
+
"\n",
|
651 |
+
" .dataframe tbody tr th {\n",
|
652 |
+
" vertical-align: top;\n",
|
653 |
+
" }\n",
|
654 |
+
"\n",
|
655 |
+
" .dataframe thead th {\n",
|
656 |
+
" text-align: right;\n",
|
657 |
+
" }\n",
|
658 |
+
"</style>\n",
|
659 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
660 |
+
" <thead>\n",
|
661 |
+
" <tr style=\"text-align: right;\">\n",
|
662 |
+
" <th></th>\n",
|
663 |
+
" <th>id</th>\n",
|
664 |
+
" <th>tweet</th>\n",
|
665 |
+
" <th>date</th>\n",
|
666 |
+
" <th>user_id</th>\n",
|
667 |
+
" <th>username</th>\n",
|
668 |
+
" <th>urls</th>\n",
|
669 |
+
" <th>nlikes</th>\n",
|
670 |
+
" <th>nreplies</th>\n",
|
671 |
+
" <th>nretweets</th>\n",
|
672 |
+
" </tr>\n",
|
673 |
+
" </thead>\n",
|
674 |
+
" <tbody>\n",
|
675 |
+
" <tr>\n",
|
676 |
+
" <th>0</th>\n",
|
677 |
+
" <td>1363067834260201475</td>\n",
|
678 |
+
" <td>Utan massiv, asylrelaterad invandring från frä...</td>\n",
|
679 |
+
" <td>2021-02-20 11:07:50</td>\n",
|
680 |
+
" <td>95972673</td>\n",
|
681 |
+
" <td>jimmieakesson</td>\n",
|
682 |
+
" <td>[]</td>\n",
|
683 |
+
" <td>1277</td>\n",
|
684 |
+
" <td>22</td>\n",
|
685 |
+
" <td>105</td>\n",
|
686 |
+
" </tr>\n",
|
687 |
+
" <tr>\n",
|
688 |
+
" <th>1</th>\n",
|
689 |
+
" <td>1363067613660778496</td>\n",
|
690 |
+
" <td>Många vänsterliberaler tycks ha reagerat på de...</td>\n",
|
691 |
+
" <td>2021-02-20 11:06:58</td>\n",
|
692 |
+
" <td>95972673</td>\n",
|
693 |
+
" <td>jimmieakesson</td>\n",
|
694 |
+
" <td>[]</td>\n",
|
695 |
+
" <td>625</td>\n",
|
696 |
+
" <td>9</td>\n",
|
697 |
+
" <td>68</td>\n",
|
698 |
+
" </tr>\n",
|
699 |
+
" <tr>\n",
|
700 |
+
" <th>2</th>\n",
|
701 |
+
" <td>1363067558409158656</td>\n",
|
702 |
+
" <td>Jag förstår — uppriktigt — inte den närmast hy...</td>\n",
|
703 |
+
" <td>2021-02-20 11:06:45</td>\n",
|
704 |
+
" <td>95972673</td>\n",
|
705 |
+
" <td>jimmieakesson</td>\n",
|
706 |
+
" <td>[]</td>\n",
|
707 |
+
" <td>2458</td>\n",
|
708 |
+
" <td>199</td>\n",
|
709 |
+
" <td>336</td>\n",
|
710 |
+
" </tr>\n",
|
711 |
+
" <tr>\n",
|
712 |
+
" <th>3</th>\n",
|
713 |
+
" <td>1362748777552113670</td>\n",
|
714 |
+
" <td>Invandring av hundratusentals människor från f...</td>\n",
|
715 |
+
" <td>2021-02-19 14:00:01</td>\n",
|
716 |
+
" <td>95972673</td>\n",
|
717 |
+
" <td>jimmieakesson</td>\n",
|
718 |
+
" <td>[]</td>\n",
|
719 |
+
" <td>1334</td>\n",
|
720 |
+
" <td>55</td>\n",
|
721 |
+
" <td>101</td>\n",
|
722 |
+
" </tr>\n",
|
723 |
+
" <tr>\n",
|
724 |
+
" <th>4</th>\n",
|
725 |
+
" <td>1362409505557012490</td>\n",
|
726 |
+
" <td>Vårt land behöver ett totalstopp för all asyl-...</td>\n",
|
727 |
+
" <td>2021-02-18 15:31:53</td>\n",
|
728 |
+
" <td>95972673</td>\n",
|
729 |
+
" <td>jimmieakesson</td>\n",
|
730 |
+
" <td>[]</td>\n",
|
731 |
+
" <td>3044</td>\n",
|
732 |
+
" <td>268</td>\n",
|
733 |
+
" <td>404</td>\n",
|
734 |
+
" </tr>\n",
|
735 |
+
" </tbody>\n",
|
736 |
+
"</table>\n",
|
737 |
+
"</div>"
|
738 |
+
],
|
739 |
+
"text/plain": [
|
740 |
+
" id tweet \\\n",
|
741 |
+
"0 1363067834260201475 Utan massiv, asylrelaterad invandring från frä... \n",
|
742 |
+
"1 1363067613660778496 Många vänsterliberaler tycks ha reagerat på de... \n",
|
743 |
+
"2 1363067558409158656 Jag förstår — uppriktigt — inte den närmast hy... \n",
|
744 |
+
"3 1362748777552113670 Invandring av hundratusentals människor från f... \n",
|
745 |
+
"4 1362409505557012490 Vårt land behöver ett totalstopp för all asyl-... \n",
|
746 |
+
"\n",
|
747 |
+
" date user_id username urls nlikes nreplies \\\n",
|
748 |
+
"0 2021-02-20 11:07:50 95972673 jimmieakesson [] 1277 22 \n",
|
749 |
+
"1 2021-02-20 11:06:58 95972673 jimmieakesson [] 625 9 \n",
|
750 |
+
"2 2021-02-20 11:06:45 95972673 jimmieakesson [] 2458 199 \n",
|
751 |
+
"3 2021-02-19 14:00:01 95972673 jimmieakesson [] 1334 55 \n",
|
752 |
+
"4 2021-02-18 15:31:53 95972673 jimmieakesson [] 3044 268 \n",
|
753 |
+
"\n",
|
754 |
+
" nretweets \n",
|
755 |
+
"0 105 \n",
|
756 |
+
"1 68 \n",
|
757 |
+
"2 336 \n",
|
758 |
+
"3 101 \n",
|
759 |
+
"4 404 "
|
760 |
+
]
|
761 |
+
},
|
762 |
+
"execution_count": 9,
|
763 |
+
"metadata": {},
|
764 |
+
"output_type": "execute_result"
|
765 |
+
}
|
766 |
+
],
|
767 |
+
"source": [
|
768 |
+
"user__string_t_info = sc.get_only_tweets(user__string_tr_info)\n",
|
769 |
+
"user__string_t_info.head()"
|
770 |
+
]
|
771 |
+
},
|
772 |
+
{
|
773 |
+
"cell_type": "code",
|
774 |
+
"execution_count": null,
|
775 |
+
"id": "48d50b46",
|
776 |
+
"metadata": {},
|
777 |
+
"outputs": [],
|
778 |
+
"source": [
|
779 |
+
"tweets= df[\"tweet\"]\n",
|
780 |
+
"for tweet in tweets:\n",
|
781 |
+
" print(tweet, \"\\n\", \"__________________________________________________________\")"
|
782 |
+
]
|
783 |
+
},
|
784 |
+
{
|
785 |
+
"cell_type": "code",
|
786 |
+
"execution_count": null,
|
787 |
+
"id": "530c26e2",
|
788 |
+
"metadata": {},
|
789 |
+
"outputs": [],
|
790 |
+
"source": []
|
791 |
+
}
|
792 |
+
],
|
793 |
+
"metadata": {
|
794 |
+
"kernelspec": {
|
795 |
+
"display_name": "Python 3.10.4 64-bit",
|
796 |
+
"language": "python",
|
797 |
+
"name": "python3"
|
798 |
+
},
|
799 |
+
"language_info": {
|
800 |
+
"codemirror_mode": {
|
801 |
+
"name": "ipython",
|
802 |
+
"version": 3
|
803 |
+
},
|
804 |
+
"file_extension": ".py",
|
805 |
+
"mimetype": "text/x-python",
|
806 |
+
"name": "python",
|
807 |
+
"nbconvert_exporter": "python",
|
808 |
+
"pygments_lexer": "ipython3",
|
809 |
+
"version": "3.10.4"
|
810 |
+
},
|
811 |
+
"vscode": {
|
812 |
+
"interpreter": {
|
813 |
+
"hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
|
814 |
+
}
|
815 |
+
}
|
816 |
+
},
|
817 |
+
"nbformat": 4,
|
818 |
+
"nbformat_minor": 5
|
819 |
+
}
|