Oresti Theodoridis committed
Commit dcd24fe
2 Parent(s): 9de6496 ba518ab

Merge pull request #30 from Demea9000/22-redo-twitter-scraper

Files changed (44)
  1. .idea/misc.xml +1 -1
  2. .idea/politweet.iml +1 -0
  3. requirements.txt +28 -0
  4. twitter-scraper/TwitterScraper.py +17 -0
  5. twitter-scraper/scrape.py +91 -0
  6. twitter-scraper/twint-master/.github/FUNDING.yml +0 -3
  7. twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md +0 -20
  8. twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md +0 -17
  9. twitter-scraper/twint-master/.gitignore +0 -115
  10. twitter-scraper/twint-master/.travis.yml +0 -23
  11. twitter-scraper/twint-master/Dockerfile +0 -10
  12. twitter-scraper/twint-master/LICENSE +0 -21
  13. twitter-scraper/twint-master/MANIFEST.in +0 -1
  14. twitter-scraper/twint-master/README.md +0 -272
  15. twitter-scraper/twint-master/Untitled.ipynb +0 -282
  16. twitter-scraper/twint-master/automate.py +0 -65
  17. twitter-scraper/twint-master/elasticsearch/README.md +0 -5
  18. twitter-scraper/twint-master/scrape.py +0 -102
  19. twitter-scraper/twint-master/scrape__init__.py +0 -14
  20. twitter-scraper/twint-master/setup.py +0 -65
  21. twitter-scraper/twint-master/test.py +0 -92
  22. twitter-scraper/twint-master/twint/__init__.py +0 -32
  23. twitter-scraper/twint-master/twint/__version__.py +0 -3
  24. twitter-scraper/twint-master/twint/cli.py +0 -342
  25. twitter-scraper/twint-master/twint/config.py +0 -87
  26. twitter-scraper/twint-master/twint/datelock.py +0 -44
  27. twitter-scraper/twint-master/twint/feed.py +0 -145
  28. twitter-scraper/twint-master/twint/format.py +0 -91
  29. twitter-scraper/twint-master/twint/get.py +0 -298
  30. twitter-scraper/twint-master/twint/output.py +0 -241
  31. twitter-scraper/twint-master/twint/run.py +0 -412
  32. twitter-scraper/twint-master/twint/storage/__init__.py +0 -0
  33. twitter-scraper/twint-master/twint/storage/db.py +0 -297
  34. twitter-scraper/twint-master/twint/storage/elasticsearch.py +0 -364
  35. twitter-scraper/twint-master/twint/storage/panda.py +0 -196
  36. twitter-scraper/twint-master/twint/storage/write.py +0 -77
  37. twitter-scraper/twint-master/twint/storage/write_meta.py +0 -151
  38. twitter-scraper/twint-master/twint/token.py +0 -94
  39. twitter-scraper/twint-master/twint/tweet.py +0 -166
  40. twitter-scraper/twint-master/twint/url.py +0 -195
  41. twitter-scraper/twint-master/twint/user.py +0 -52
  42. twitter-scraper/twint-master/twint/verbose.py +0 -18
  43. twitter-scraper/twint-master/twitter_scraper.ipynb +0 -265
  44. twitter-scraper/twitter_scraper.ipynb +819 -0
.idea/misc.xml CHANGED
@@ -1,4 +1,4 @@
  <?xml version="1.0" encoding="UTF-8"?>
  <project version="4">
- <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.9 (politweet)" project-jdk-type="Python SDK" />
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (politweet)" project-jdk-type="Python SDK" />
  </project>
.idea/politweet.iml CHANGED
@@ -3,6 +3,7 @@
  <component name="NewModuleRootManager">
  <content url="file://$MODULE_DIR$">
  <excludeFolder url="file://$MODULE_DIR$/politweet-environment" />
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
  </content>
  <orderEntry type="inheritedJdk" />
  <orderEntry type="sourceFolder" forTests="false" />
requirements.txt CHANGED
@@ -1,23 +1,51 @@
+ aiodns==3.0.0
+ aiohttp==3.8.1
+ aiohttp-socks==0.7.1
+ aiosignal==1.2.0
+ async-timeout==4.0.2
+ attrs==21.4.0
+ beautifulsoup4==4.11.1
+ cchardet==2.1.7
  certifi==2022.6.15
+ cffi==1.15.1
  charset-normalizer==2.1.0
  cycler==0.11.0
+ dataclasses==0.6
+ elastic-transport==8.1.2
+ elasticsearch==8.3.1
  et-xmlfile==1.1.0
+ fake-useragent==0.1.11
  fonttools==4.34.0
+ frozenlist==1.3.0
+ geographiclib==1.52
+ geopy==2.2.0
+ googletransx==2.4.2
  idna==3.3
  kiwisolver==1.4.3
  matplotlib==3.5.2
+ multidict==6.0.2
  numpy==1.23.0
+ oauthlib==3.2.0
  openai==0.20.0
  openpyxl==3.0.10
  packaging==21.3
  pandas==1.4.3
  pandas-stubs==1.4.3.220704
  Pillow==9.2.0
+ pycares==4.2.1
+ pycparser==2.21
  pyparsing==3.0.9
+ PySocks==1.7.1
  python-dateutil==2.8.2
+ python-socks==2.0.3
  pytz==2022.1
  regex==2022.6.2
  requests==2.28.1
+ requests-oauthlib==1.3.1
+ schedule==1.1.0
  six==1.16.0
+ soupsieve==2.3.2.post1
  tqdm==4.64.0
+ -e git+https://github.com/twintproject/twint.git@e7c8a0c764f6879188e5c21e25fb6f1f856a7221#egg=twint
  urllib3==1.26.9
+ yarl==1.7.2
twitter-scraper/TwitterScraper.py ADDED
@@ -0,0 +1,17 @@
+ import twint
+ import datetime
+
+ c = twint.Config()
+
+ c.Search = ['Taylor Swift']  # topic
+ c.Limit = 500  # number of Tweets to scrape
+ c.Store_csv = True  # store tweets in a csv file
+ c.Output = "taylor_swift_tweets.csv"  # path to csv file
+
+ twint.run.Search(c)
+
+ import pandas as pd
+
+ df = pd.read_csv('taylor_swift_tweets.csv')
+
+ print(df.head())
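Note: upstream twint examples set `Config.Search` to a plain query string, so passing the list `['Taylor Swift']` may not build the intended query, and the `datetime` import above is unused. A minimal sketch of the same script with a string query (same filename and limit as the diff; this assumes the upstream twint API, not behaviour introduced by this PR):

```python
import twint
import pandas as pd

c = twint.Config()
c.Search = "Taylor Swift"             # query string rather than a list (assumed upstream convention)
c.Limit = 500                         # number of Tweets to scrape; twint fetches in increments of 20
c.Store_csv = True                    # store tweets in a csv file
c.Output = "taylor_swift_tweets.csv"  # path to csv file

twint.run.Search(c)

# Read the scraped tweets back for a quick sanity check.
df = pd.read_csv("taylor_swift_tweets.csv")
print(df.head())
```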
twitter-scraper/scrape.py ADDED
@@ -0,0 +1,91 @@
+ import twint
+ from datetime import date
+
+
+ class TwitterScraper(object):
+     """
+     This class is a Twitter scraper called TwitterScraper. It takes a user as input (via the scrape
+     methods) and collects the user's tweets from 'from_date' to 'to_date'. If 'from_date' and 'to_date'
+     are not specified, it collects 'num_tweets' tweets counting back from today.
+     It outputs a pandas DataFrame with each tweet's unique id and some other information.
+     input: user, from_date, to_date, num_tweets
+     output: DataFrame
+     """
+
+     def __init__(self, from_date="2006-07-01", to_date=str(date.today()), num_tweets=20):
+         self.from_date = from_date
+         self.to_date = to_date
+         self.num_tweets = num_tweets
+         self.conf = twint.Config()
+
+     def scrape_by_user(self, _user):
+         """This method uses twint to extract tweets based on username."""
+         self.conf.Search = "from:@" + _user  # when the search configuration is given in this format,
+         # twint searches by username.
+         return self.__get_tweets__from_twint__()
+
+     def scrape_by_string(self, _string: str):
+         """This method uses twint to extract tweets based on a string.
+         All extracted tweets contain the word specified in the _string parameter.
+         """
+         self.conf.Search = _string  # this tells the twint configuration to search for a string
+         return self.__get_tweets__from_twint__()
+
+     def scrape_by_user_and_string(self, _user: str, _string: str):
+         """This method uses twint to extract tweets based on string and username."""
+         self.conf.Username = _user
+         self.conf.Search = _string
+         return self.__get_tweets__from_twint__()
+
+     def get_only_tweets(self, tweet_and_replies_info):
+         tweet_and_replies = tweet_and_replies_info["tweet"]
+         """
+         This function's input argument is a data frame (the output from the scrape methods). It removes
+         all tweets starting with "@", which is an indicator of a reply or retweet.
+         """
+         indx_replies = []
+         for i in range(len(tweet_and_replies)):
+             if tweet_and_replies[i].startswith("@"):
+                 indx_replies.append(i)
+
+         tweets_info = tweet_and_replies_info.drop(labels=indx_replies, axis=0)
+         # drop removes the rows whose indices are specified by
+         # indx_replies; axis=0 means rows are deleted.
+         # print(len(tweets['tweet']), " of them are Tweets")
+         return tweets_info
+
+     def __get_tweets__from_twint__(self):
+         """__get_tweets_from_twint__
+         The tweet info is a dataframe with the following columns:
+         Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
+                'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
+                'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
+                'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
+                'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
+                'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
+                'trans_dest'])
+         We just pick the relevant ones.
+         self.conf is a twint.Config() object; we also configure twint's output here.
+         """
+         self.conf.Pandas = True  # store results in a pandas dataframe
+         self.conf.Count = True  # report the number of tweets fetched
+         self.conf.Limit = self.num_tweets  # specifies how many tweets should be scraped
+         self.conf.Since = self.from_date
+         self.conf.Until = self.to_date
+         self.conf.Hide_output = True  # hides the output; if set to False, tweets are printed in the terminal window
+         twint.run.Search(self.conf)
+         tweet_and_replies_inf = twint.output.panda.Tweets_df  # the twint output as a dataframe
+         tweet_and_replies_inf = tweet_and_replies_inf[
+             ["id", "tweet", "date", "user_id", "username", "urls", 'nlikes', 'nreplies', 'nretweets']]
+         return tweet_and_replies_inf
+     # def __check_date_type(d1, d2): if (type(d1) or type(d2)) is not type("str"):  # if the type of the date input
+     #     is not a string, it raises an exception: print("[!] Please make sure the date is a string in this format
+     #     \"yyyy-mm-dd\" ") raise EXCEPTION("Incorrect date type Exception!") elif (len(d1.split("-")) or len(d2.split(
+     #     "-"))) < 2: print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ") raise EXCEPTION(
+     #     "Incorrect date type Exception!")
+
+
+ if __name__ == "__main__":
+     sc = TwitterScraper(num_tweets=1002)
+     dc = sc.scrape_by_string("jimmieakesson")
+     print(dc.head())
+     print(dc.shape)
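A minimal usage sketch of the new class (the username, date range, and column selection below are illustrative only, and assume `TwitterScraper` is importable from `twitter-scraper/scrape.py`):

```python
from datetime import date

from scrape import TwitterScraper  # hypothetical import path; adjust to your layout

# Collect up to 100 tweets from 2022 onward for one account.
scraper = TwitterScraper(from_date="2022-01-01", to_date=str(date.today()), num_tweets=100)
tweets_and_replies = scraper.scrape_by_user("jimmieakesson")

# Drop replies/retweets, i.e. rows whose text starts with "@".
tweets_only = scraper.get_only_tweets(tweets_and_replies)
print(tweets_only[["date", "tweet", "nlikes"]].head())
```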
twitter-scraper/twint-master/.github/FUNDING.yml DELETED
@@ -1,3 +0,0 @@
- # These are supported funding model platforms
- patreon: twintproject
- custom: paypal.me/noneprivacy
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE.md DELETED
@@ -1,20 +0,0 @@
- # Issue Template
- Please use this template!
-
- ## Initial Check
- > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
-
- >Make sure you've checked the following:
-
- - [] Python version is 3.6 or later;
- - [] Updated Twint with `pip3 install --user --upgrade -e git+https://github.com/minamotorin/twint.git@origin/master#egg=twint`;
- - [] I have searched the issues and there are no duplicates of this issue/question/request (please link to related issues of twintproject/twint for reference).
-
- ## Command Ran
- >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
-
- ## Description of Issue
- >Please use **as much detail as possible.**
-
- ## Environment Details
- >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
twitter-scraper/twint-master/.github/ISSUE_TEMPLATE/ISSUE_TEMPLATE.md DELETED
@@ -1,17 +0,0 @@
- ### Initial Check
- > If the issue is a request please specify that it is a request in the title (Example: [REQUEST] more features). If this is a question regarding 'twint' please specify that it's a question in the title (Example: [QUESTION] What is x?). Please **only** submit issues related to 'twint'. Thanks.
-
- >Make sure you've checked the following:
-
- - [] Python version is 3.6;
- - [] Using the latest version of Twint;
- - [] Updated Twint with `pip3 install --upgrade -e git+https://github.com/twintproject/twint.git@origin/master#egg=twint`;
-
- ### Command Ran
- >Please provide the _exact_ command ran including the username/search/code so I may reproduce the issue.
-
- ### Description of Issue
- >Please use **as much detail as possible.**
-
- ### Environment Details
- >Using Windows, Linux? What OS version? Running this in Anaconda? Jupyter Notebook? Terminal?
twitter-scraper/twint-master/.gitignore DELETED
@@ -1,115 +0,0 @@
- # Byte-compiled / optimized / DLL files
- __pycache__/
- *.py[cod]
- *$py.class
- tweets.db
- # C extensions
- *.so
-
- config.ini
- twint/storage/mysql.py
-
- # Node Dependency directories
- node_modules/
- jspm_packages/
- tests/
- # Distribution / packaging
- .Python
- env/
- build/
- develop-eggs/
- dist/
- downloads/
- eggs/
- .eggs/
- lib/
- lib64/
- parts/
- sdist/
- var/
- wheels/
- *.egg-info/
- .installed.cfg
- *.egg
-
- # PyInstaller
- # Usually these files are written by a python script from a template
- # before PyInstaller builds the exe, so as to inject date/other infos into it.
- *.manifest
- *.spec
-
- # Installer logs
- pip-log.txt
- pip-delete-this-directory.txt
-
- # Unit test / coverage reports
- htmlcov/
- .tox/
- .coverage
- .coverage.*
- .cache
- nosetests.xml
- coverage.xml
- *.cover
- .hypothesis/
-
- # Translations
- *.mo
- *.pot
-
- # Django stuff:
- *.log
- local_settings.py
-
- # Flask stuff:
- instance/
- .webassets-cache
-
- # Scrapy stuff:
- .scrapy
-
- # Sphinx documentation
- docs/_build/
-
- # PyBuilder
- target/
-
- # Jupyter Notebook
- .ipynb_checkpoints
-
- # pyenv
- .python-version
-
- # celery beat schedule file
- celerybeat-schedule
-
- # SageMath parsed files
- *.sage.py
-
- # dotenv
- .env
-
- # virtualenv
- .venv
- venv/
- ENV/
-
- # Spyder project settings
- .spyderproject
- .spyproject
-
- # Rope project settings
- .ropeproject
-
- # mkdocs documentation
- /site
-
- # mypy
- .mypy_cache/
-
- # output
- *.csv
- *.json
- *.txt
-
- test_twint.py
twitter-scraper/twint-master/.travis.yml DELETED
@@ -1,23 +0,0 @@
- dist: bionic
- language: python
- python:
-   - "3.6"
-   - "3.7"
-   - "3.8"
-   - "nightly"
- matrix:
-   allow_failures:
-     - python: "nightly"
-     - python: "3.8"
- install:
-   - pip install -r requirements.txt
- script:
-   - python test.py
- deploy:
-   provider: pypi
-   user: "codyzacharias"
-   password:
-     secure: sWWvx50F7KJBtf8z2njc+Q31WIAHiQs4zKEiGD4/7xrshw55H5z+WnqZ9VIP83qm9yKefoRKp7WnaJeXZ3ulZSLn64ue45lqFozWMyGvelRPOKvZi9XPMqBA7+qllR/GseTHSGC3G5EGxac6UEI3irYe3mZXxfjpxNOXVti8rJ2xX8TiJM0AVKRrdDiAstOhMMkXkB7fYXMQALwEp8UoW/UbjbeqsKueXydjStaESNP/QzRFZ3/tuNu+3HMz/olniLUhUWcF/xDbJVpXuaRMUalgqe+BTbDdtUVt/s/GKtpg5GAzJyhQphiCM/huihedUIKSoI+6A8PTzuxrLhB5BMi9pcllED02v7w1enpu5L2l5cRDgQJSOpkxkA5Eese8nxKOOq0KzwDQa3JByrRor8R4yz+p5s4u2r0Rs2A9fkjQYwd/uWBSEIRF4K9WZoniiikahwXq070DMRgV7HbovKSjo5NK5F8j+psrtqPF+OHN2aVfWxbGnezrOOkmzuTHhWZVj3pPSpQU1WFWHo9fPo4I6YstR4q6XjNNjrpY3ojSlv0ThMbUem7zhHTRkRsSA2SpPfqw5E3Jf7vaiQb4M5zkBVqxuq4tXb14GJ26tGD8tel8u8b+ccpkAE9xf+QavP8UHz4PbBhqgFX5TbV/H++cdsICyoZnT35yiaDOELM=
-   on:
-     tags: true
-     python: "3.7"
twitter-scraper/twint-master/Dockerfile DELETED
@@ -1,10 +0,0 @@
- FROM python:3.6-buster
- LABEL maintainer="codyzacharias@pm.me"
-
- WORKDIR /root
-
- RUN git clone --depth=1 https://github.com/twintproject/twint.git && \
-     cd /root/twint && \
-     pip3 install . -r requirements.txt
-
- CMD /bin/bash
twitter-scraper/twint-master/LICENSE DELETED
@@ -1,21 +0,0 @@
- MIT License
-
- Copyright (c) 2018 Cody Zacharias
-
- Permission is hereby granted, free of charge, to any person obtaining a copy
- of this software and associated documentation files (the "Software"), to deal
- in the Software without restriction, including without limitation the rights
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- copies of the Software, and to permit persons to whom the Software is
- furnished to do so, subject to the following conditions:
-
- The above copyright notice and this permission notice shall be included in all
- copies or substantial portions of the Software.
-
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- SOFTWARE.
twitter-scraper/twint-master/MANIFEST.in DELETED
@@ -1 +0,0 @@
- include README.md LICENSE
twitter-scraper/twint-master/README.md DELETED
@@ -1,272 +0,0 @@
- 20220207.0
-
- # About this fork
-
- [This repository](https://github.com/minamotorin/twint) is the fork of [https://github.com/twintproject/twint](https://github.com/twintproject/twint) and for myself.
-
- Modified by [minamotorin](https://github.com/minamotorin).
-
- ## Updates from twintproject/twint
-
- ### twint.token.RefreshTokenException: Could not find the Guest token in HTML
-
- This problem doesn't happen recently.
-
- #### Related
-
- - [twintproject/twint#1320](https://github.com/twintproject/twint/issues/1320)
- - [twintproject/twint#1322](https://github.com/twintproject/twint/pull/1322)
- - [twintproject/twint#1328](https://github.com/twintproject/twint/pull/1328)
- - [twintproject/twint#1061](https://github.com/twintproject/twint/issues/1061)
- - [twintproject/twint#1114](https://github.com/twintproject/twint/issues/1114)
-
- ### json.decoder.JSONDecodeError: Expecting value: line 1 column 1 (char 0)
-
- The fix is **not complete**.
- `twint.run.Profile` will work but `twint.run.db` will not.
- This means [`test.py`](./test.py) causes an error.
-
- I think this is because the fields of the result table are not exactly the same as the traditional ones.
-
- #### Related
-
- - [twintproject/twint#1335](https://github.com/twintproject/twint/issues/1335)
-
- ### [-] TWINT requires Python version 3.6+.
-
- #### Related
-
- - [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1344)
- - [twintproject/twint#1345](https://github.com/twintproject/twint/pull/1345)
- - [twintproject/twint#1344](https://github.com/twintproject/twint/issues/1346)
- - [twintproject/twint#1309](https://github.com/twintproject/twint/pull/1309)
- - [twintproject/twint#1313](https://github.com/twintproject/twint/issues/1313)
-
- ## References
-
- - [snscrape](https://github.com/JustAnotherArchivist/snscrape)
- - [gallery-dl](https://github.com/mikf/gallery-dl)
-
- ## License
-
- This repository is also under the [MIT License](https://opensource.org/licenses/mit-license.php).
-
- ---
-
- # TWINT - Twitter Intelligence Tool
- ![2](https://i.imgur.com/iaH3s7z.png)
- ![3](https://i.imgur.com/hVeCrqL.png)
-
- [![PyPI](https://img.shields.io/pypi/v/twint.svg)](https://pypi.org/project/twint/) [![Build Status](https://travis-ci.org/twintproject/twint.svg?branch=master)](https://travis-ci.org/twintproject/twint) [![Python 3.6|3.7|3.8](https://img.shields.io/badge/Python-3.6%2F3.7%2F3.8-blue.svg)](https://www.python.org/download/releases/3.0/) [![GitHub license](https://img.shields.io/github/license/haccer/tweep.svg)](https://github.com/haccer/tweep/blob/master/LICENSE) [![Downloads](https://pepy.tech/badge/twint)](https://pepy.tech/project/twint) [![Downloads](https://pepy.tech/badge/twint/week)](https://pepy.tech/project/twint/week) [![Patreon](https://img.shields.io/endpoint.svg?url=https:%2F%2Fshieldsio-patreon.herokuapp.com%2Ftwintproject)](https://www.patreon.com/twintproject) ![](https://img.shields.io/twitter/follow/noneprivacy.svg?label=Follow&style=social)
-
- >No authentication. No API. No limits.
-
- Twint is an advanced Twitter scraping tool written in Python that allows for scraping Tweets from Twitter profiles **without** using Twitter's API.
-
- Twint utilizes Twitter's search operators to let you scrape Tweets from specific users, scrape Tweets relating to certain topics, hashtags & trends, or sort out *sensitive* information from Tweets like e-mail and phone numbers. I find this very useful, and you can get really creative with it too.
-
- Twint also makes special queries to Twitter allowing you to also scrape a Twitter user's followers, Tweets a user has liked, and who they follow **without** any authentication, API, Selenium, or browser emulation.
-
- ## tl;dr Benefits
- Some of the benefits of using Twint vs Twitter API:
- - Can fetch almost __all__ Tweets (Twitter API limits to last 3200 Tweets only);
- - Fast initial setup;
- - Can be used anonymously and without Twitter sign up;
- - **No rate limitations**.
-
- ## Limits imposed by Twitter
- Twitter limits scrolls while browsing the user timeline. This means that with `.Profile` or with `.Favorites` you will be able to get ~3200 tweets.
-
- ## Requirements
- - Python 3.6;
- - aiohttp;
- - aiodns;
- - beautifulsoup4;
- - cchardet;
- - dataclasses
- - elasticsearch;
- - pysocks;
- - pandas (>=0.23.0);
- - aiohttp_socks;
- - schedule;
- - geopy;
- - fake-useragent;
- - py-googletransx.
-
- ## Installing
-
- **Git:**
- ```bash
- git clone --depth=1 https://github.com/twintproject/twint.git
- cd twint
- pip3 install . -r requirements.txt
- ```
-
- **Pip:**
- ```bash
- pip3 install twint
- ```
-
- or
-
- ```bash
- pip3 install --user --upgrade git+https://github.com/twintproject/twint.git@origin/master#egg=twint
- ```
-
- **Pipenv**:
- ```bash
- pipenv install git+https://github.com/twintproject/twint.git#egg=twint
- ```
-
- ### March 2, 2021 Update
-
- **Added**: Dockerfile
-
- Noticed a lot of people are having issues installing (including me). Please use the Dockerfile temporarily while I look into them.
-
- ## CLI Basic Examples and Combos
- A few simple examples to help you understand the basics:
-
- - `twint -u username` - Scrape all the Tweets of a *user* (doesn't include **retweets** but includes **replies**).
- - `twint -u username -s pineapple` - Scrape all Tweets from the *user*'s timeline containing _pineapple_.
- - `twint -s pineapple` - Collect every Tweet containing *pineapple* from everyone's Tweets.
- - `twint -u username --year 2014` - Collect Tweets that were tweeted **before** 2014.
- - `twint -u username --since "2015-12-20 20:30:15"` - Collect Tweets that were tweeted since 2015-12-20 20:30:15.
- - `twint -u username --since 2015-12-20` - Collect Tweets that were tweeted since 2015-12-20 00:00:00.
- - `twint -u username -o file.txt` - Scrape Tweets and save to file.txt.
- - `twint -u username -o file.csv --csv` - Scrape Tweets and save as a csv file.
- - `twint -u username --email --phone` - Show Tweets that might have phone numbers or email addresses.
- - `twint -s "Donald Trump" --verified` - Display Tweets by verified users that Tweeted about Donald Trump.
- - `twint -g="48.880048,2.385939,1km" -o file.csv --csv` - Scrape Tweets from a radius of 1km around a place in Paris and export them to a csv file.
- - `twint -u username -es localhost:9200` - Output Tweets to Elasticsearch
- - `twint -u username -o file.json --json` - Scrape Tweets and save as a json file.
- - `twint -u username --database tweets.db` - Save Tweets to a SQLite database.
- - `twint -u username --followers` - Scrape a Twitter user's followers.
- - `twint -u username --following` - Scrape who a Twitter user follows.
- - `twint -u username --favorites` - Collect all the Tweets a user has favorited (gathers ~3200 tweet).
- - `twint -u username --following --user-full` - Collect full user information a person follows
- - `twint -u username --timeline` - Use an effective method to gather Tweets from a user's profile (Gathers ~3200 Tweets, including **retweets** & **replies**).
- - `twint -u username --retweets` - Use a quick method to gather the last 900 Tweets (that includes retweets) from a user's profile.
- - `twint -u username --resume resume_file.txt` - Resume a search starting from the last saved scroll-id.
-
- More detail about the commands and options are located in the [wiki](https://github.com/twintproject/twint/wiki/Commands)
-
- ## Module Example
-
- Twint can now be used as a module and supports custom formatting. **More details are located in the [wiki](https://github.com/twintproject/twint/wiki/Module)**
-
- ```python
- import twint
-
- # Configure
- c = twint.Config()
- c.Username = "realDonaldTrump"
- c.Search = "great"
-
- # Run
- twint.run.Search(c)
- ```
- > Output
-
- `955511208597184512 2018-01-22 18:43:19 GMT <now> pineapples are the best fruit`
-
- ```python
- import twint
-
- c = twint.Config()
-
- c.Username = "noneprivacy"
- c.Custom["tweet"] = ["id"]
- c.Custom["user"] = ["bio"]
- c.Limit = 10
- c.Store_csv = True
- c.Output = "none"
-
- twint.run.Search(c)
- ```
-
- ## Storing Options
- - Write to file;
- - CSV;
- - JSON;
- - SQLite;
- - Elasticsearch.
-
- ## Elasticsearch Setup
-
- Details on setting up Elasticsearch with Twint is located in the [wiki](https://github.com/twintproject/twint/wiki/Elasticsearch).
-
- ## Graph Visualization
- ![graph](https://i.imgur.com/EEJqB8n.png)
-
- [Graph](https://github.com/twintproject/twint/wiki/Graph) details are also located in the [wiki](https://github.com/twintproject/twint/wiki/Graph).
-
- We are developing a Twint Desktop App.
-
- ![4](https://i.imgur.com/DzcfIgL.png)
-
- ## FAQ
- > I tried scraping tweets from a user, I know that they exist but I'm not getting them
-
- Twitter can shadow-ban accounts, which means that their tweets will not be available via search. To solve this, pass `--profile-full` if you are using Twint via CLI or, if are using Twint as module, add `config.Profile_full = True`. Please note that this process will be quite slow.
- ## More Examples
-
- #### Followers/Following
-
- > To get only follower usernames/following usernames
-
- `twint -u username --followers`
-
- `twint -u username --following`
-
- > To get user info of followers/following users
-
- `twint -u username --followers --user-full`
-
- `twint -u username --following --user-full`
-
- #### userlist
-
- > To get only user info of user
-
- `twint -u username --user-full`
-
- > To get user info of users from a userlist
-
- `twint --userlist inputlist --user-full`
-
-
- #### tweet translation (experimental)
-
- > To get 100 english tweets and translate them to italian
-
- `twint -u noneprivacy --csv --output none.csv --lang en --translate --translate-dest it --limit 100`
-
- or
-
- ```python
- import twint
-
- c = twint.Config()
- c.Username = "noneprivacy"
- c.Limit = 100
- c.Store_csv = True
- c.Output = "none.csv"
- c.Lang = "en"
- c.Translate = True
- c.TranslateDest = "it"
- twint.run.Search(c)
- ```
-
- Notes:
- - [Google translate has some quotas](https://cloud.google.com/translate/quotas)
-
- ## Featured Blog Posts:
- - [How to use Twint as an OSINT tool](https://pielco11.ovh/posts/twint-osint/)
- - [Basic tutorial made by Null Byte](https://null-byte.wonderhowto.com/how-to/mine-twitter-for-targeted-information-with-twint-0193853/)
- - [Analyzing Tweets with NLP in minutes with Spark, Optimus and Twint](https://towardsdatascience.com/analyzing-tweets-with-nlp-in-minutes-with-spark-optimus-and-twint-a0c96084995f)
- - [Loading tweets into Kafka and Neo4j](https://markhneedham.com/blog/2019/05/29/loading-tweets-twint-kafka-neo4j/)
-
- ## Contact
-
- If you have any question, want to join in discussions, or need extra help, you are welcome to join our Twint focused channel at [OSINT team](https://osint.team)
twitter-scraper/twint-master/Untitled.ipynb DELETED
@@ -1,282 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "code",
-    "execution_count": 67,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "text= \"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär, 5. Invandring, 6. Integration \""
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 17,
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "WARNING: pip is being invoked by an old script wrapper. This will fail in a future version of pip.\n",
-       "Please see https://github.com/pypa/pip/issues/5599 for advice on fixing the underlying issue.\n",
-       "To avoid this problem you can invoke Python with '-m pip' instead of running pip directly.\n",
-       "Requirement already satisfied: regex in /home/oxygen/snap/jupyter/common/lib/python3.7/site-packages (2022.6.2)\n"
-      ]
-     }
-    ],
-    "source": [
-     "!pip install regex\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 15,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "['0']"
-       ]
-      },
-      "execution_count": 15,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "re.findall(\"[0-9]+\", tl[0])"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 48,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "'0. Äldrefrågor'"
-       ]
-      },
-      "execution_count": 48,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "tl[0]"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 49,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "['0', ' Äldrefrågor']"
-       ]
-      },
-      "execution_count": 49,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "f=tl[0].split('.')\n",
-     "\n",
-     "f#int(f[0])"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 29,
-    "metadata": {},
-    "outputs": [
-     {
-      "ename": "NameError",
-      "evalue": "name 'str_topics_to_dict' is not defined",
-      "output_type": "error",
-      "traceback": [
-       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-       "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
-       "\u001b[0;32m<ipython-input-29-b05d9860dbcf>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mstr_topics_to_dict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtext\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-       "\u001b[0;31mNameError\u001b[0m: name 'str_topics_to_dict' is not defined"
-      ]
-     }
-    ],
-    "source": []
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 65,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "\n",
-     "def str_topics_to_dict(topics):\n",
-     " topic_list=topics.split(\",\")\n",
-     " ind_topic_dict={}\n",
-     " for i inrange(len(topic_list)): \n",
-     " index_topic_list=\n",
-     " ind=index_topic_list[0]\n",
-     " just_topic=index_topic_list[1][1:]\n",
-     " ind_topic_dict[int(ind)]=just_topic\n",
-     " return ind_topic_dict"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 68,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "{0: 'Brottslighet',\n",
-        " 1: 'Miljö',\n",
-        " 2: 'Skola',\n",
-        " 3: 'Sjukvård',\n",
-        " 4: 'Militär',\n",
-        " 5: 'Invandring',\n",
-        " 6: 'Integration '}"
-       ]
-      },
-      "execution_count": 68,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "str_topics_to_dict(text)"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 109,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "' Brottslighet, Miljö, Skola, Sjukvård, Militär stöd, Invandring, Integration '"
-       ]
-      },
-      "execution_count": 109,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "\n",
-     "text=\"\\n\\n0. Brottslighet, 1. Miljö, 2. Skola, 3. Sjukvård, 4. Militär stöd, 5. Invandring, 6. Integration \"\n",
-     "text=re.sub(r\"(\\n+)\",\" \",text)\n",
-     "text=re.sub(\"(\\.)|\\d+\",\"\",text )\n",
-     "text"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 100,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "[' Brottslighet',\n",
-        " ' Miljö',\n",
-        " ' Skola',\n",
-        " ' Sjukvård',\n",
-        " ' Militär stöd',\n",
-        " ' Invandring',\n",
-        " ' Integration ']"
-       ]
-      },
-      "execution_count": 100,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "text.split(\",\")"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 116,
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import regex as re \n",
-     "def str_topics_to_dict(topics):\n",
-     " text=re.sub(r\"(\\n+)\",\" \",topics)\n",
-     " text=re.sub(\"(\\.)|\\d+\",\"\",topics )\n",
-     " topics=re.sub(r\"(\\n+)|(\\.)|\\d+\",\"\",topics)\n",
-     " topic_list=topics.split(\",\")\n",
-     " ind_topic_dict={}\n",
-     " for i in range(len(topic_list)): \n",
-     " ind=i\n",
-     " just_topic=topic_list[i]\n",
-     " ind_topic_dict[ind]=just_topic\n",
-     " return ind_topic_dict"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 117,
-    "metadata": {},
-    "outputs": [
-     {
-      "data": {
-       "text/plain": [
-        "{0: ' Brottslighet',\n",
-        " 1: ' Miljö',\n",
-        " 2: ' Skola',\n",
-        " 3: ' Sjukvård',\n",
-        " 4: ' Militär stöd',\n",
-        " 5: ' Invandring',\n",
-        " 6: ' Integration '}"
-       ]
-      },
-      "execution_count": 117,
-      "metadata": {},
-      "output_type": "execute_result"
-     }
-    ],
-    "source": [
-     "str_topics_to_dict(text)"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "Python 3",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.7.3"
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 2
- }
twitter-scraper/twint-master/automate.py DELETED
@@ -1,65 +0,0 @@
- import twint
- import schedule
- import time
-
- # you can change the name of each "job" after "def" if you'd like.
- def jobone():
-     print ("Fetching Tweets")
-     c = twint.Config()
-     # choose username (optional)
-     c.Username = "insert username here"
-     # choose search term (optional)
-     c.Search = "insert search term here"
-     # choose beginning time (narrow results)
-     c.Since = "2018-01-01"
-     # set limit on total tweets
-     c.Limit = 1000
-     # no idea, but makes the csv format properly
-     c.Store_csv = True
-     # format of the csv
-     c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
-     # change the name of the csv file
-     c.Output = "filename.csv"
-     twint.run.Search(c)
-
- def jobtwo():
-     print ("Fetching Tweets")
-     c = twint.Config()
-     # choose username (optional)
-     c.Username = "insert username here"
-     # choose search term (optional)
-     c.Search = "insert search term here"
-     # choose beginning time (narrow results)
-     c.Since = "2018-01-01"
-     # set limit on total tweets
-     c.Limit = 1000
-     # no idea, but makes the csv format properly
-     c.Store_csv = True
-     # format of the csv
-     c.Custom = ["date", "time", "username", "tweet", "link", "likes", "retweets", "replies", "mentions", "hashtags"]
-     # change the name of the csv file
-     c.Output = "filename2.csv"
-     twint.run.Search(c)
-
- # run once when you start the program
-
- jobone()
- jobtwo()
-
- # run every minute(s), hour, day at, day of the week, day of the week and time. Use "#" to block out which ones you don't want to use. Remove it to active. Also, replace "jobone" and "jobtwo" with your new function names (if applicable)
-
- # schedule.every(1).minutes.do(jobone)
- schedule.every().hour.do(jobone)
- # schedule.every().day.at("10:30").do(jobone)
- # schedule.every().monday.do(jobone)
- # schedule.every().wednesday.at("13:15").do(jobone)
-
- # schedule.every(1).minutes.do(jobtwo)
- schedule.every().hour.do(jobtwo)
- # schedule.every().day.at("10:30").do(jobtwo)
- # schedule.every().monday.do(jobtwo)
- # schedule.every().wednesday.at("13:15").do(jobtwo)
-
- while True:
-     schedule.run_pending()
-     time.sleep(1)
twitter-scraper/twint-master/elasticsearch/README.md DELETED
@@ -1,5 +0,0 @@
- # Elasticsearch How-To
-
- ![dashboard](https://i.imgur.com/BEbtdo5.png)
-
- Please read the Wiki [here](https://github.com/twintproject/twint/wiki/Elasticsearch)
twitter-scraper/twint-master/scrape.py DELETED
@@ -1,102 +0,0 @@
- import sys
- import io
- import time
- import asyncio
- import os
- from tkinter import EXCEPTION
- from numpy import not_equal
- loop = asyncio.get_event_loop()
- loop.is_running()
- import twint
- import nest_asyncio
- nest_asyncio.apply()
- from datetime import date
- class scraper:
-     def get_tweets(search_str, from_date="2006-07-01", to_date=str(date.today()), num_tweets=10,u_or_s='s', acceptable_range=10):
-
-         if (type(from_date) or type("str")) is not type("str"):
-             print("[!] Please make sure the date is a string in this format \"yyyy-mm-dd\" ")
-             raise EXCEPTION("Incorrect date type Exception!")
-
-         time_out= time.time()+2*60
-         _dict={}
-         c=twint.Config()
-         if u_or_s.lower() =="u":
-             c.Search = "from:@"+search_str # topic
-         else:
-             c.Search = search_str # topic
-         c.Pandas = True
-         num_tweets_and_replies=num_tweets
-         c.Count=True
-         for j in range(1,5):
-             c.Limit = num_tweets_and_replies
-             c.Since = from_date
-             c.Until = to_date
-             c.Hide_output =True
-             old_stdout = sys.stdout
-             new_stdout = io.StringIO()
-             sys.stdout = new_stdout
-             twint.run.Search(c)
-             output = new_stdout.getvalue()
-             sys.stdout = old_stdout
-             print(output[0:-2])
-             tweet_info=twint.output.panda.Tweets_df
-
-             t_count=0
-             try:
-                 _keys=tweet_info["id"]
-                 #tweet infor is a dataframe with fallowing columns
-                 '''Index(['id', 'conversation_id', 'created_at', 'date', 'timezone', 'place',
-                 'tweet', 'language', 'hashtags', 'cashtags', 'user_id', 'user_id_str',
-                 'username', 'name', 'day', 'hour', 'link', 'urls', 'photos', 'video',
-                 'thumbnail', 'retweet', 'nlikes', 'nreplies', 'nretweets', 'quote_url',
-                 'search', 'near', 'geo', 'source', 'user_rt_id', 'user_rt',
-                 'retweet_id', 'reply_to', 'retweet_date', 'translate', 'trans_src',
-                 'trans_dest'],
-                 dtype='object')'''
-
-                 for i in range (len(_keys)):
-                     if _keys[i] in _dict.keys() or tweet_info["tweet"][i].startswith("@"):
-                         pass
-                     else:
-                         _dict[int(_keys[i])] = {"tweet": tweet_info["tweet"][i],
-                                                 "date" :tweet_info["date"][i],
-                                                 "nlikes": tweet_info["nlikes"][i],
-                                                 "nreplies":tweet_info["nreplies"][i] ,
-                                                 "nretweets": tweet_info["nretweets"][i],"topic":""}
-                     if len(list(_dict.keys()))==num_tweets:
-                         break
-             except:
-                 pass
-             print(len(list(_dict.keys())), " of them are Tweets")
-             if (num_tweets-len(list(_dict.keys())))< acceptable_range:
-                 return _dict
-             if len(list(_dict.keys())) < num_tweets:
-                 num_tweets_and_replies= num_tweets_and_replies+100*3**j
-             else:
-                 break
-             if time_out <time.time():
-                 break
-             if output.startswith("[!] No more data!"):
-                 break
-         return _dict
-
-     def string_search_user_tweets(user_name,search_str ,from_date="2006-07-01", to_date=str(date.today()), num_tweets=10):
-         c=twint.Config()
-         c.Username =user_name
-         c.Search = search_str # topic
-         c.Pandas = True
-         num_tweets_and_replies=num_tweets
-         c.Count=True
-         c.Limit = num_tweets_and_replies
-         c.Since = from_date
-         c.Until = to_date
-         c.Hide_output =True
-         twint.run.Search(c)
-         return twint.output.panda.Tweets_df
twitter-scraper/twint-master/scrape__init__.py DELETED
@@ -1,14 +0,0 @@
- def scraper_libs():
-     import sys
-     import io
-     import time
-     import asyncio
-     import os
-     from tkinter import EXCEPTION
-     from numpy import not_equal
-     loop = asyncio.get_event_loop()
-     loop.is_running()
-     import twint
-     import nest_asyncio
-     nest_asyncio.apply()
-     from datetime import date
twitter-scraper/twint-master/setup.py DELETED
@@ -1,65 +0,0 @@
- #!/usr/bin/python3
- from setuptools import setup
- import io
- import os
-
- # Package meta-data
- NAME = 'twint'
- DESCRIPTION = 'An advanced Twitter scraping & OSINT tool.'
- URL = 'https://github.com/twintproject/twint'
- EMAIL = 'codyzacharias@pm.me'
- AUTHOR = 'Cody Zacharias'
- REQUIRES_PYTHON = '>=3.6.0'
- VERSION = None
-
- # Packages required
- REQUIRED = [
-     'aiohttp', 'aiodns', 'beautifulsoup4', 'cchardet', 'dataclasses',
-     'elasticsearch', 'pysocks', 'pandas', 'aiohttp_socks',
-     'schedule', 'geopy', 'fake-useragent', 'googletransx'
- ]
-
- here = os.path.abspath(os.path.dirname(__file__))
-
- with io.open(os.path.join(here, 'README.md'), encoding='utf-8') as f:
-     long_description = '\n' + f.read()
-
- # Load the package's __version__.py
- about = {}
- if not VERSION:
-     with open(os.path.join(here, NAME, '__version__.py')) as f:
-         exec(f.read(), about)
- else:
-     about['__version__'] = VERSION
-
- setup(
-     name=NAME,
-     version=about['__version__'],
-     description=DESCRIPTION,
-     long_description=long_description,
-     long_description_content_type="text/markdown",
-     author=AUTHOR,
-     author_email=EMAIL,
-     python_requires=REQUIRES_PYTHON,
-     url=URL,
-     packages=['twint', 'twint.storage'],
-     entry_points={
-         'console_scripts': [
-             'twint = twint.cli:run_as_command',
-         ],
-     },
-     install_requires=REQUIRED,
-     dependency_links=[
-         'git+https://github.com/x0rzkov/py-googletrans#egg=googletrans'
-     ],
-     license='MIT',
-     classifiers=[
-         'License :: OSI Approved :: MIT License',
-         'Programming Language :: Python',
-         'Programming Language :: Python :: 3',
-         'Programming Language :: Python :: 3.6',
-         'Programming Language :: Python :: 3.7',
-         'Programming Language :: Python :: 3.8',
-         'Programming Language :: Python :: Implementation :: CPython',
-     ],
- )
twitter-scraper/twint-master/test.py DELETED
@@ -1,92 +0,0 @@
- import twint
- import os
-
- '''
- Test.py - Testing TWINT to make sure everything works.
- '''
-
-
- def test_reg(c, run):
-     print("[+] Beginning vanilla test in {}".format(str(run)))
-     run(c)
-
-
- def test_db(c, run):
-     print("[+] Beginning DB test in {}".format(str(run)))
-     c.Database = "test_twint.db"
-     run(c)
-
-
- def custom(c, run, _type):
-     print("[+] Beginning custom {} test in {}".format(_type, str(run)))
-     c.Custom['tweet'] = ["id", "username"]
-     c.Custom['user'] = ["id", "username"]
-     run(c)
-
-
- def test_json(c, run):
-     c.Store_json = True
-     c.Output = "test_twint.json"
-     custom(c, run, "JSON")
-     print("[+] Beginning JSON test in {}".format(str(run)))
-     run(c)
-
-
- def test_csv(c, run):
-     c.Store_csv = True
-     c.Output = "test_twint.csv"
-     custom(c, run, "CSV")
-     print("[+] Beginning CSV test in {}".format(str(run)))
-     run(c)
-
-
- def main():
-     c = twint.Config()
-     c.Username = "verified"
-     c.Limit = 20
-     c.Store_object = True
-
-     # Separate objects are necessary.
-
-     f = twint.Config()
-     f.Username = "verified"
-     f.Limit = 20
-     f.Store_object = True
-     f.User_full = True
-
-     runs = [
-         twint.run.Profile,  # this doesn't
-         twint.run.Search,  # this works
-         twint.run.Following,
-         twint.run.Followers,
-         twint.run.Favorites,
-     ]
-
-     tests = [test_reg, test_json, test_csv, test_db]
-
-     # Something breaks if we don't split these up
-
-     for run in runs[:3]:
-         if run == twint.run.Search:
-             c.Since = "2012-1-1 20:30:22"
-             c.Until = "2017-1-1"
-         else:
-             c.Since = ""
-             c.Until = ""
-
-         for test in tests:
-             test(c, run)
-
-     for run in runs[3:]:
-         for test in tests:
-             test(f, run)
-
-     files = ["test_twint.db", "test_twint.json", "test_twint.csv"]
-     for _file in files:
-         os.remove(_file)
-
-     print("[+] Testing complete!")
-
-
- if __name__ == '__main__':
-     main()
twitter-scraper/twint-master/twint/__init__.py DELETED
@@ -1,32 +0,0 @@
- '''
- TWINT - Twitter Intelligence Tool (formerly known as Tweep).
-
- See wiki on Github for in-depth details.
- https://github.com/twintproject/twint/wiki
-
- Licensed under MIT License
- Copyright (c) 2018 Cody Zacharias
- '''
- import logging, os
-
- from .config import Config
- from .__version__ import __version__
- from . import run
-
- _levels = {
-     'info': logging.INFO,
-     'debug': logging.DEBUG
- }
-
- _level = os.getenv('TWINT_DEBUG', 'info')
- _logLevel = _levels[_level]
-
- if _level == "debug":
-     logger = logging.getLogger()
-     _output_fn = 'twint.log'
-     logger.setLevel(_logLevel)
-     formatter = logging.Formatter('%(levelname)s:%(asctime)s:%(name)s:%(message)s')
-     fileHandler = logging.FileHandler(_output_fn)
-     fileHandler.setLevel(_logLevel)
-     fileHandler.setFormatter(formatter)
-     logger.addHandler(fileHandler)
twitter-scraper/twint-master/twint/__version__.py DELETED
@@ -1,3 +0,0 @@
- VERSION = (2, 1, 21)
-
- __version__ = '.'.join(map(str, VERSION))
twitter-scraper/twint-master/twint/cli.py DELETED
@@ -1,342 +0,0 @@
1
- #!/usr/bin/env python3
2
- '''
3
- Twint.py - Twitter Intelligence Tool (formerly known as Tweep).
4
-
5
- See wiki on Github for in-depth details.
6
- https://github.com/twintproject/twint/wiki
7
-
8
- Licensed under MIT License
9
- Copyright (c) 2018 The Twint Project
10
- '''
11
- import sys
12
- import os
13
- import argparse
14
-
15
- from . import run
16
- from . import config
17
- from . import storage
18
-
19
-
20
- def error(_error, message):
21
- """ Print errors to stdout
22
- """
23
- print("[-] {}: {}".format(_error, message))
24
- sys.exit(0)
25
-
26
-
27
- def check(args):
28
- """ Error checking
29
- """
30
- if args.username is not None or args.userlist or args.members_list:
31
- if args.verified:
32
- error("Contradicting Args",
33
- "Please use --verified in combination with -s.")
34
- if args.userid:
35
- error("Contradicting Args",
36
- "--userid and -u cannot be used together.")
37
- if args.all:
38
- error("Contradicting Args",
39
- "--all and -u cannot be used together.")
40
- elif args.search and args.timeline:
41
- error("Contradicting Args",
42
- "--s and --tl cannot be used together.")
43
- elif args.timeline and not args.username:
44
- error("Error", "-tl cannot be used without -u.")
45
- elif args.search is None:
46
- if args.custom_query is not None:
47
- pass
48
- elif (args.geo or args.near) is None and not (args.all or args.userid):
49
- error("Error", "Please use at least -u, -s, -g or --near.")
50
- elif args.all and args.userid:
51
- error("Contradicting Args",
52
- "--all and --userid cannot be used together")
53
- if args.output is None:
54
- if args.csv:
55
- error("Error", "Please specify an output file (Example: -o file.csv).")
56
- elif args.json:
57
- error("Error", "Please specify an output file (Example: -o file.json).")
58
- if args.backoff_exponent <= 0:
59
- error("Error", "Please specifiy a positive value for backoff_exponent")
60
- if args.min_wait_time < 0:
61
- error("Error", "Please specifiy a non negative value for min_wait_time")
62
-
63
-
64
- def loadUserList(ul, _type):
65
- """ Concatenate users
66
- """
67
- if os.path.exists(os.path.abspath(ul)):
68
- userlist = open(os.path.abspath(ul), "r").read().splitlines()
69
- else:
70
- userlist = ul.split(",")
71
- if _type == "search":
72
- un = ""
73
- for user in userlist:
74
- un += "%20OR%20from%3A" + user
75
- return un[15:]
76
- return userlist
77
-
78
-
79
- def initialize(args):
80
- """ Set default values for config from args
81
- """
82
- c = config.Config()
83
- c.Username = args.username
84
- c.User_id = args.userid
85
- c.Search = args.search
86
- c.Geo = args.geo
87
- c.Location = args.location
88
- c.Near = args.near
89
- c.Lang = args.lang
90
- c.Output = args.output
91
- c.Elasticsearch = args.elasticsearch
92
- c.Year = args.year
93
- c.Since = args.since
94
- c.Until = args.until
95
- c.Email = args.email
96
- c.Phone = args.phone
97
- c.Verified = args.verified
98
- c.Store_csv = args.csv
99
- c.Tabs = args.tabs
100
- c.Store_json = args.json
101
- c.Show_hashtags = args.hashtags
102
- c.Show_cashtags = args.cashtags
103
- c.Limit = args.limit
104
- c.Count = args.count
105
- c.Stats = args.stats
106
- c.Database = args.database
107
- c.To = args.to
108
- c.All = args.all
109
- c.Essid = args.essid
110
- c.Format = args.format
111
- c.User_full = args.user_full
112
- # c.Profile_full = args.profile_full
113
- c.Pandas_type = args.pandas_type
114
- c.Index_tweets = args.index_tweets
115
- c.Index_follow = args.index_follow
116
- c.Index_users = args.index_users
117
- c.Debug = args.debug
118
- c.Resume = args.resume
119
- c.Images = args.images
120
- c.Videos = args.videos
121
- c.Media = args.media
122
- c.Replies = args.replies
123
- c.Pandas_clean = args.pandas_clean
124
- c.Proxy_host = args.proxy_host
125
- c.Proxy_port = args.proxy_port
126
- c.Proxy_type = args.proxy_type
127
- c.Tor_control_port = args.tor_control_port
128
- c.Tor_control_password = args.tor_control_password
129
- c.Retweets = args.retweets
130
- c.Custom_query = args.custom_query
131
- c.Popular_tweets = args.popular_tweets
132
- c.Skip_certs = args.skip_certs
133
- c.Hide_output = args.hide_output
134
- c.Native_retweets = args.native_retweets
135
- c.Min_likes = args.min_likes
136
- c.Min_retweets = args.min_retweets
137
- c.Min_replies = args.min_replies
138
- c.Links = args.links
139
- c.Source = args.source
140
- c.Members_list = args.members_list
141
- c.Filter_retweets = args.filter_retweets
142
- c.Translate = args.translate
143
- c.TranslateDest = args.translate_dest
144
- c.Backoff_exponent = args.backoff_exponent
145
- c.Min_wait_time = args.min_wait_time
146
- return c
147
-
148
-
149
- def options():
150
- """ Parse arguments
151
- """
152
- ap = argparse.ArgumentParser(prog="twint",
153
- usage="python3 %(prog)s [options]",
154
- description="TWINT - An Advanced Twitter Scraping Tool.")
155
- ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
156
- ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
157
- ap.add_argument("-g", "--geo", help="Search for geocoded Tweets.")
158
- ap.add_argument("--near", help="Near a specified city.")
159
- ap.add_argument("--location", help="Show user's location (Experimental).", action="store_true")
160
- ap.add_argument("-l", "--lang", help="Search for Tweets in a specific language.")
161
- ap.add_argument("-o", "--output", help="Save output to a file.")
162
- ap.add_argument("-es", "--elasticsearch", help="Index to Elasticsearch.")
163
- ap.add_argument("--year", help="Filter Tweets before specified year.")
164
- ap.add_argument("--since", help="Filter Tweets sent since date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
165
- metavar="DATE")
166
- ap.add_argument("--until", help="Filter Tweets sent until date (Example: \"2017-12-27 20:30:15\" or 2017-12-27).",
167
- metavar="DATE")
168
- ap.add_argument("--email", help="Filter Tweets that might have email addresses", action="store_true")
169
- ap.add_argument("--phone", help="Filter Tweets that might have phone numbers", action="store_true")
170
- ap.add_argument("--verified", help="Display Tweets only from verified users (Use with -s).",
171
- action="store_true")
172
- ap.add_argument("--csv", help="Write as .csv file.", action="store_true")
173
- ap.add_argument("--tabs", help="Separate CSV fields with tab characters, not commas.", action="store_true")
174
- ap.add_argument("--json", help="Write as .json file", action="store_true")
175
- ap.add_argument("--hashtags", help="Output hashtags in separate column.", action="store_true")
176
- ap.add_argument("--cashtags", help="Output cashtags in separate column.", action="store_true")
177
- ap.add_argument("--userid", help="Twitter user id.")
178
- ap.add_argument("--limit", help="Number of Tweets to pull (Increments of 20).")
179
- ap.add_argument("--count", help="Display number of Tweets scraped at the end of session.",
180
- action="store_true")
181
- ap.add_argument("--stats", help="Show number of replies, retweets, and likes.",
182
- action="store_true")
183
- ap.add_argument("-db", "--database", help="Store Tweets in a sqlite3 database.")
184
- ap.add_argument("--to", help="Search Tweets to a user.", metavar="USERNAME")
185
- ap.add_argument("--all", help="Search all Tweets associated with a user.", metavar="USERNAME")
186
- ap.add_argument("--followers", help="Scrape a person's followers.", action="store_true")
187
- ap.add_argument("--following", help="Scrape the users a person follows.", action="store_true")
188
- ap.add_argument("--favorites", help="Scrape Tweets a user has liked.", action="store_true")
189
- ap.add_argument("--proxy-type", help="Socks5, HTTP, etc.")
190
- ap.add_argument("--proxy-host", help="Proxy hostname or IP.")
191
- ap.add_argument("--proxy-port", help="The port of the proxy server.")
192
- ap.add_argument("--tor-control-port", help="If proxy-host is set to tor, this is the control port", default=9051)
193
- ap.add_argument("--tor-control-password",
194
- help="If proxy-host is set to tor, this is the password for the control port",
195
- default="my_password")
196
- ap.add_argument("--essid",
197
- help="Elasticsearch Session ID, use this to differentiate scraping sessions.",
198
- nargs="?", default="")
199
- ap.add_argument("--userlist", help="Userlist from list or file.")
200
- ap.add_argument("--retweets",
201
- help="Include user's Retweets (Warning: limited).",
202
- action="store_true")
203
- ap.add_argument("--format", help="Custom output format (See wiki for details).")
204
- ap.add_argument("--user-full",
205
- help="Collect all user information (Use with followers or following only).",
206
- action="store_true")
207
- # I am removing this feature for the time being because it is no longer required; the default method will do this
208
- # ap.add_argument("--profile-full",
209
- # help="Slow, but effective method of collecting a user's Tweets and RT.",
210
- # action="store_true")
211
- ap.add_argument(
212
- "-tl",
213
- "--timeline",
214
- help="Collects every tweet from a User's Timeline. (Tweets, RTs & Replies)",
215
- action="store_true",
216
- )
217
- ap.add_argument("--translate",
218
- help="Get tweets translated by Google Translate.",
219
- action="store_true")
220
- ap.add_argument("--translate-dest", help="Translate tweet to language (ISO2).",
221
- default="en")
222
- ap.add_argument("--store-pandas", help="Save Tweets in a DataFrame (Pandas) file.")
223
- ap.add_argument("--pandas-type",
224
- help="Specify HDF5 or Pickle (HDF5 as default)", nargs="?", default="HDF5")
225
- ap.add_argument("-it", "--index-tweets",
226
- help="Custom Elasticsearch Index name for Tweets.", nargs="?", default="twinttweets")
227
- ap.add_argument("-if", "--index-follow",
228
- help="Custom Elasticsearch Index name for Follows.",
229
- nargs="?", default="twintgraph")
230
- ap.add_argument("-iu", "--index-users", help="Custom Elasticsearch Index name for Users.",
231
- nargs="?", default="twintuser")
232
- ap.add_argument("--debug",
233
- help="Store information in debug logs", action="store_true")
234
- ap.add_argument("--resume", help="Resume from Tweet ID.", metavar="TWEET_ID")
235
- ap.add_argument("--videos", help="Display only Tweets with videos.", action="store_true")
236
- ap.add_argument("--images", help="Display only Tweets with images.", action="store_true")
237
- ap.add_argument("--media",
238
- help="Display Tweets with only images or videos.", action="store_true")
239
- ap.add_argument("--replies", help="Display replies to a subject.", action="store_true")
240
- ap.add_argument("-pc", "--pandas-clean",
241
- help="Automatically clean Pandas dataframe at every scrape.")
242
- ap.add_argument("-cq", "--custom-query", help="Custom search query.")
243
- ap.add_argument("-pt", "--popular-tweets", help="Scrape popular tweets instead of recent ones.",
244
- action="store_true")
245
- ap.add_argument("-sc", "--skip-certs", help="Skip certificate verification, useful for self-signed certificates (SSC).", action="store_false")
246
- ap.add_argument("-ho", "--hide-output", help="Hide output, no tweets will be displayed.", action="store_true")
247
- ap.add_argument("-nr", "--native-retweets", help="Filter the results for retweets only.", action="store_true")
248
- ap.add_argument("--min-likes", help="Filter the tweets by minimum number of likes.")
249
- ap.add_argument("--min-retweets", help="Filter the tweets by minimum number of retweets.")
250
- ap.add_argument("--min-replies", help="Filter the tweets by minimum number of replies.")
251
- ap.add_argument("--links", help="Include or exclude tweets containing one or more links. If not specified" +
252
- " you will get tweets both with and without links.")
253
- ap.add_argument("--source", help="Filter the tweets for specific source client.")
254
- ap.add_argument("--members-list", help="Filter the tweets sent by users in a given list.")
255
- ap.add_argument("-fr", "--filter-retweets", help="Exclude retweets from the results.", action="store_true")
256
- ap.add_argument("--backoff-exponent", help="Specify an exponent for the polynomial backoff in case of errors.",
257
- type=float, default=3.0)
258
- ap.add_argument("--min-wait-time", type=float, default=15,
259
- help="Specify a minimum wait time in case of a scraping-limit error. Twint will adjust this value if it does not satisfy the limit constraints.")
260
- args = ap.parse_args()
261
-
262
- return args
263
-
264
-
265
- def main():
266
- """ Main
267
- """
268
- args = options()
269
- check(args)
270
-
271
- if args.pandas_clean:
272
- storage.panda.clean()
273
-
274
- c = initialize(args)
275
-
276
- if args.userlist:
277
- c.Query = loadUserList(args.userlist, "search")
278
-
279
- if args.pandas_clean:
280
- storage.panda.clean()
281
-
282
- if args.favorites:
283
- if args.userlist:
284
- _userlist = loadUserList(args.userlist, "favorites")
285
- for _user in _userlist:
286
- args.username = _user
287
- c = initialize(args)
288
- run.Favorites(c)
289
- else:
290
- run.Favorites(c)
291
- elif args.following:
292
- if args.userlist:
293
- _userlist = loadUserList(args.userlist, "following")
294
- for _user in _userlist:
295
- args.username = _user
296
- c = initialize(args)
297
- run.Following(c)
298
- else:
299
- run.Following(c)
300
- elif args.followers:
301
- if args.userlist:
302
- _userlist = loadUserList(args.userlist, "followers")
303
- for _user in _userlist:
304
- args.username = _user
305
- c = initialize(args)
306
- run.Followers(c)
307
- else:
308
- run.Followers(c)
309
- elif args.retweets: # or args.profile_full:
310
- if args.userlist:
311
- _userlist = loadUserList(args.userlist, "profile")
312
- for _user in _userlist:
313
- args.username = _user
314
- c = initialize(args)
315
- run.Profile(c)
316
- else:
317
- run.Profile(c)
318
- elif args.user_full:
319
- if args.userlist:
320
- _userlist = loadUserList(args.userlist, "userlist")
321
- for _user in _userlist:
322
- args.username = _user
323
- c = initialize(args)
324
- run.Lookup(c)
325
- else:
326
- run.Lookup(c)
327
- elif args.timeline:
328
- run.Profile(c)
329
- else:
330
- run.Search(c)
331
-
332
-
333
- def run_as_command():
334
- if(sys.version_info.major < 3 or (sys.version_info.major == 3 and sys.version_info.minor < 6)):
335
- print("[-] TWINT requires Python version 3.6+.")
336
- sys.exit(0)
337
-
338
- main()
339
-
340
-
341
- if __name__ == '__main__':
342
- main()
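
Note on the block above: the deleted cli.py follows a common argparse pattern: options() declares the flags, initialize() copies each parsed argument onto a config object, and main() dispatches to a run.* entry point. A minimal, self-contained sketch of that pattern (the Config fields here are illustrative stand-ins, not twint's full set):

# Sketch of the argparse -> config dispatch pattern used by the deleted
# cli.py. `Config` is a stand-in dataclass, not twint's own.
import argparse
from dataclasses import dataclass
from typing import Optional

@dataclass
class Config:
    Username: Optional[str] = None
    Search: Optional[str] = None
    Limit: Optional[int] = None

def options() -> argparse.Namespace:
    ap = argparse.ArgumentParser(prog="twint", usage="python3 %(prog)s [options]")
    ap.add_argument("-u", "--username", help="User's Tweets you want to scrape.")
    ap.add_argument("-s", "--search", help="Search for Tweets containing this word or phrase.")
    ap.add_argument("--limit", type=int, help="Number of Tweets to pull.")
    return ap.parse_args()

def initialize(args: argparse.Namespace) -> Config:
    # Copy each parsed argument onto the matching config attribute,
    # exactly as the deleted initialize() did field by field.
    return Config(Username=args.username, Search=args.search, Limit=args.limit)

if __name__ == "__main__":
    print(initialize(options()))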
 
twitter-scraper/twint-master/twint/config.py DELETED
@@ -1,87 +0,0 @@
1
- from dataclasses import dataclass
2
- from typing import Optional
3
-
4
- @dataclass
5
- class Config:
6
- Username: Optional[str] = None
7
- User_id: Optional[str] = None
8
- Search: Optional[str] = None
9
- Lookup: bool = False
10
- Geo: str = ""
11
- Location: bool = False
12
- Near: str = None
13
- Lang: Optional[str] = None
14
- Output: Optional[str] = None
15
- Elasticsearch: object = None
16
- Year: Optional[int] = None
17
- Since: Optional[str] = None
18
- Until: Optional[str] = None
19
- Email: Optional[str] = None
20
- Phone: Optional[str] = None
21
- Verified: bool = False
22
- Store_csv: bool = False
23
- Store_json: bool = False
24
- Custom = {"tweet": None, "user": None, "username": None}
25
- Show_hashtags: bool = False
26
- Show_cashtags: bool = False
27
- Limit: Optional[int] = None
28
- Count: Optional[int] = None
29
- Stats: bool = False
30
- Database: object = None
31
- To: str = None
32
- All = None
33
- Debug: bool = False
34
- Format = None
35
- Essid: str = ""
36
- Profile: bool = False
37
- Followers: bool = False
38
- Following: bool = False
39
- Favorites: bool = False
40
- TwitterSearch: bool = False
41
- User_full: bool = False
42
- # Profile_full: bool = False
43
- Store_object: bool = False
44
- Store_object_tweets_list: list = None
45
- Store_object_users_list: list = None
46
- Store_object_follow_list: list = None
47
- Pandas_type: type = None
48
- Pandas: bool = False
49
- Index_tweets: str = "twinttweets"
50
- Index_follow: str = "twintgraph"
51
- Index_users: str = "twintuser"
52
- Retries_count: int = 10
53
- Resume: object = None
54
- Images: bool = False
55
- Videos: bool = False
56
- Media: bool = False
57
- Replies: bool = False
58
- Pandas_clean: bool = True
59
- Lowercase: bool = True
60
- Pandas_au: bool = True
61
- Proxy_host: str = ""
62
- Proxy_port: int = 0
63
- Proxy_type: object = None
64
- Tor_control_port: int = 9051
65
- Tor_control_password: str = None
66
- Retweets: bool = False
67
- Query: str = None
68
- Hide_output: bool = False
69
- Custom_query: str = ""
70
- Popular_tweets: bool = False
71
- Skip_certs: bool = False
72
- Native_retweets: bool = False
73
- Min_likes: int = 0
74
- Min_retweets: int = 0
75
- Min_replies: int = 0
76
- Links: Optional[str] = None
77
- Source: Optional[str] = None
78
- Members_list: Optional[str] = None
79
- Filter_retweets: bool = False
80
- Translate: bool = False
81
- TranslateSrc: str = "en"
82
- TranslateDest: str = "en"
83
- Backoff_exponent: float = 3.0
84
- Min_wait_time: int = 0
85
- Bearer_token: str = None
86
- Guest_token: str = None
87
- deleted: list = None
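
A detail worth flagging in the deleted config.py above: fields such as Custom carry no type annotation, so the dataclass machinery ignores them and they become class attributes shared by every instance. The sketch below illustrates the pitfall and the usual field(default_factory=...) remedy; it demonstrates Python dataclass semantics and is not twint code:

# Why an un-annotated mutable default on a dataclass is risky.
from dataclasses import dataclass, field

@dataclass
class Shared:
    Custom = {"tweet": None, "user": None}  # no annotation: one dict shared by all instances

@dataclass
class PerInstance:
    # default_factory builds a fresh dict for every instance instead.
    Custom: dict = field(default_factory=lambda: {"tweet": None, "user": None})

a, b = Shared(), Shared()
a.Custom["tweet"] = "changed"
assert b.Custom["tweet"] == "changed"   # the mutation leaks across instances

c, d = PerInstance(), PerInstance()
c.Custom["tweet"] = "changed"
assert d.Custom["tweet"] is None        # each instance owns its own dict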
 
twitter-scraper/twint-master/twint/datelock.py DELETED
@@ -1,44 +0,0 @@
1
- import datetime
2
-
3
- import logging as logme
4
-
5
- from .tweet import utc_to_local
6
-
7
-
8
- class Datelock:
9
- until = None
10
- since = None
11
- _since_def_user = None
12
-
13
-
14
- def convertToDateTime(string):
15
- dateTimeList = string.split()
16
- ListLength = len(dateTimeList)
17
- if ListLength == 2:
18
- return string
19
- if ListLength == 1:
20
- return string + " 00:00:00"
21
- else:
22
- return ""
23
-
24
-
25
- def Set(Until, Since):
26
- logme.debug(__name__+':Set')
27
- d = Datelock()
28
-
29
- if Until:
30
- d.until = datetime.datetime.strptime(convertToDateTime(Until), "%Y-%m-%d %H:%M:%S")
31
- d.until = utc_to_local(d.until)
32
- else:
33
- d.until = datetime.datetime.today()
34
-
35
- if Since:
36
- d.since = datetime.datetime.strptime(convertToDateTime(Since), "%Y-%m-%d %H:%M:%S")
37
- d.since = utc_to_local(d.since)
38
- d._since_def_user = True
39
- else:
40
- d.since = datetime.datetime.strptime("2006-03-21 00:00:00", "%Y-%m-%d %H:%M:%S")
41
- d.since = utc_to_local(d.since)
42
- d._since_def_user = False
43
-
44
- return d
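
The deleted datelock.py normalizes a bare date to a full timestamp before handing it to strptime, so a single format string covers both accepted --since/--until forms. A small stand-alone sketch of that normalization:

# Pad a bare "YYYY-MM-DD" to "YYYY-MM-DD 00:00:00" so one strptime
# format handles both inputs, as convertToDateTime does above.
import datetime

def convert_to_datetime(string: str) -> str:
    parts = string.split()
    if len(parts) == 2:        # already "date time"
        return string
    if len(parts) == 1:        # date only: assume midnight
        return string + " 00:00:00"
    return ""

for raw in ("2017-12-27", "2017-12-27 20:30:15"):
    dt = datetime.datetime.strptime(convert_to_datetime(raw), "%Y-%m-%d %H:%M:%S")
    print(raw, "->", dt)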
 
twitter-scraper/twint-master/twint/feed.py DELETED
@@ -1,145 +0,0 @@
1
- import time
2
- from datetime import datetime
3
-
4
- from bs4 import BeautifulSoup
5
- from re import findall
6
- from json import loads
7
-
8
- import logging as logme
9
-
10
- from .tweet import utc_to_local, Tweet_formats
11
-
12
-
13
- class NoMoreTweetsException(Exception):
14
- def __init__(self, msg):
15
- super().__init__(msg)
16
-
17
-
18
- def Follow(response):
19
- logme.debug(__name__ + ':Follow')
20
- soup = BeautifulSoup(response, "html.parser")
21
- follow = soup.find_all("td", "info fifty screenname")
22
- cursor = soup.find_all("div", "w-button-more")
23
- try:
24
- cursor = findall(r'cursor=(.*?)">', str(cursor))[0]
25
- except IndexError:
26
- logme.critical(__name__ + ':Follow:IndexError')
27
-
28
- return follow, cursor
29
-
30
-
31
- # TODO: this won't be used by --profile-full anymore. if it isn't used anywhere else, perhaps remove this in future
32
- def Mobile(response):
33
- logme.debug(__name__ + ':Mobile')
34
- soup = BeautifulSoup(response, "html.parser")
35
- tweets = soup.find_all("span", "metadata")
36
- max_id = soup.find_all("div", "w-button-more")
37
- try:
38
- max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
39
- except Exception as e:
40
- logme.critical(__name__ + ':Mobile:' + str(e))
41
-
42
- return tweets, max_id
43
-
44
-
45
- def MobileFav(response):
46
- soup = BeautifulSoup(response, "html.parser")
47
- tweets = soup.find_all("table", "tweet")
48
- max_id = soup.find_all("div", "w-button-more")
49
- try:
50
- max_id = findall(r'max_id=(.*?)">', str(max_id))[0]
51
- except Exception as e:
52
- print(str(e) + " [x] feed.MobileFav")
53
-
54
- return tweets, max_id
55
-
56
-
57
- def _get_cursor(response):
58
- if isinstance(response, dict): # case 1
59
- try:
60
- next_cursor = response['timeline']['instructions'][0]['addEntries']['entries'][-1]['content'][
61
- 'operation']['cursor']['value']
62
- except KeyError:
63
- # this is needed because after the first request location of cursor is changed
64
- next_cursor = response['timeline']['instructions'][-1]['replaceEntry']['entry']['content']['operation'][
65
- 'cursor']['value']
66
- else: # case 2
67
- next_cursor = response[-1]['content']['value']
68
- return next_cursor
69
-
70
-
71
- def Json(response):
72
- logme.debug(__name__ + ':Json')
73
- json_response = loads(response)
74
- html = json_response["items_html"]
75
- soup = BeautifulSoup(html, "html.parser")
76
- feed = soup.find_all("div", "tweet")
77
- return feed, json_response["min_position"]
78
-
79
-
80
- def parse_tweets(config, response):
81
- logme.debug(__name__ + ':parse_tweets')
82
- response = loads(response)
83
- feed = []
84
- if 'globalObjects' in response:
85
- if len(response['globalObjects']['tweets']) == 0:
86
- msg = 'No more data!'
87
- raise NoMoreTweetsException(msg)
88
- for timeline_entry in response['timeline']['instructions'][0]['addEntries']['entries']:
89
- # this will handle the cases when the timeline entry is a tweet
90
- if (config.TwitterSearch or config.Profile) and (timeline_entry['entryId'].startswith('sq-I-t-') or
91
- timeline_entry['entryId'].startswith('tweet-')):
92
- if 'tweet' in timeline_entry['content']['item']['content']:
93
- _id = timeline_entry['content']['item']['content']['tweet']['id']
94
- # skip the ads
95
- if 'promotedMetadata' in timeline_entry['content']['item']['content']['tweet']:
96
- continue
97
- elif 'tombstone' in timeline_entry['content']['item']['content'] and 'tweet' in \
98
- timeline_entry['content']['item']['content']['tombstone']:
99
- _id = timeline_entry['content']['item']['content']['tombstone']['tweet']['id']
100
- else:
101
- _id = None
102
- if _id is None:
103
- raise ValueError('Unable to find ID of tweet in timeline.')
104
- try:
105
- temp_obj = response['globalObjects']['tweets'][_id]
106
- except KeyError:
107
- logme.info('encountered a deleted tweet with id {}'.format(_id))
108
-
109
- config.deleted.append(_id)
110
- continue
111
- temp_obj['user_data'] = response['globalObjects']['users'][temp_obj['user_id_str']]
112
- if 'retweeted_status_id_str' in temp_obj:
113
- rt_id = temp_obj['retweeted_status_id_str']
114
- _dt = response['globalObjects']['tweets'][rt_id]['created_at']
115
- _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
116
- _dt = utc_to_local(_dt)
117
- _dt = str(_dt.strftime(Tweet_formats['datetime']))
118
- temp_obj['retweet_data'] = {
119
- 'user_rt_id': response['globalObjects']['tweets'][rt_id]['user_id_str'],
120
- 'user_rt': response['globalObjects']['tweets'][rt_id]['full_text'],
121
- 'retweet_id': rt_id,
122
- 'retweet_date': _dt,
123
- }
124
- feed.append(temp_obj)
125
- next_cursor = _get_cursor(response) # case 1
126
- else:
127
- response = response['data']['user']['result']['timeline']
128
- entries = response['timeline']['instructions']
129
- for e in entries:
130
- if e.get('entries'):
131
- entries = e['entries']
132
- break
133
- if len(entries) == 2:
134
- msg = 'No more data!'
135
- raise NoMoreTweetsException(msg)
136
- for timeline_entry in entries:
137
- if timeline_entry['content'].get('itemContent'):
138
- try:
139
- temp_obj = timeline_entry['content']['itemContent']['tweet_results']['result']['legacy']
140
- temp_obj['user_data'] = timeline_entry['content']['itemContent']['tweet_results']['result']['core']['user_results']['result']['legacy']
141
- feed.append(temp_obj)
142
- except KeyError: # doubtful
143
- continue
144
- next_cursor = _get_cursor(entries) # case 2
145
- return feed, next_cursor
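
_get_cursor in the deleted feed.py has to cope with Twitter moving the pagination cursor between the first response and subsequent ones, which is why it falls back on KeyError. A self-contained sketch of that fallback pattern (the payloads are hypothetical stand-ins for Twitter's JSON, trimmed to the keys the lookup touches):

def get_cursor(response: dict) -> str:
    # Try the location used by the first response...
    try:
        entries = response["timeline"]["instructions"][0]["addEntries"]["entries"]
        return entries[-1]["content"]["operation"]["cursor"]["value"]
    except KeyError:
        # ...after the first request the cursor moves into a replaceEntry block.
        return response["timeline"]["instructions"][-1]["replaceEntry"][
            "entry"]["content"]["operation"]["cursor"]["value"]

first = {"timeline": {"instructions": [{"addEntries": {"entries": [
    {"content": {"operation": {"cursor": {"value": "cursor-1"}}}}]}}]}}
later = {"timeline": {"instructions": [{"replaceEntry": {"entry": {"content": {
    "operation": {"cursor": {"value": "cursor-2"}}}}}}]}}
print(get_cursor(first), get_cursor(later))  # cursor-1 cursor-2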
 
twitter-scraper/twint-master/twint/format.py DELETED
@@ -1,91 +0,0 @@
1
- import logging as logme
2
-
3
- def Tweet(config, t):
4
- if config.Format:
5
- logme.debug(__name__+':Tweet:Format')
6
- output = config.Format.replace("{id}", t.id_str)
7
- output = output.replace("{conversation_id}", t.conversation_id)
8
- output = output.replace("{date}", t.datestamp)
9
- output = output.replace("{time}", t.timestamp)
10
- output = output.replace("{user_id}", t.user_id_str)
11
- output = output.replace("{username}", t.username)
12
- output = output.replace("{name}", t.name)
13
- output = output.replace("{place}", t.place)
14
- output = output.replace("{timezone}", t.timezone)
15
- output = output.replace("{urls}", ",".join(t.urls))
16
- output = output.replace("{photos}", ",".join(t.photos))
17
- output = output.replace("{video}", str(t.video))
18
- output = output.replace("{thumbnail}", t.thumbnail)
19
- output = output.replace("{tweet}", t.tweet)
20
- output = output.replace("{language}", t.lang)
21
- output = output.replace("{hashtags}", ",".join(t.hashtags))
22
- output = output.replace("{cashtags}", ",".join(t.cashtags))
23
- output = output.replace("{replies}", t.replies_count)
24
- output = output.replace("{retweets}", t.retweets_count)
25
- output = output.replace("{likes}", t.likes_count)
26
- output = output.replace("{link}", t.link)
27
- output = output.replace("{is_retweet}", str(t.retweet))
28
- output = output.replace("{user_rt_id}", str(t.user_rt_id))
29
- output = output.replace("{quote_url}", t.quote_url)
30
- output = output.replace("{near}", t.near)
31
- output = output.replace("{geo}", t.geo)
32
- output = output.replace("{mentions}", ",".join(t.mentions))
33
- output = output.replace("{translate}", t.translate)
34
- output = output.replace("{trans_src}", t.trans_src)
35
- output = output.replace("{trans_dest}", t.trans_dest)
36
- else:
37
- logme.debug(__name__+':Tweet:notFormat')
38
- output = f"{t.id_str} {t.datestamp} {t.timestamp} {t.timezone} "
39
-
40
- # TODO: someone who is familiar with this code, needs to take a look at what this is <also see tweet.py>
41
- # if t.retweet:
42
- # output += "RT "
43
-
44
- output += f"<{t.username}> {t.tweet}"
45
-
46
- if config.Show_hashtags:
47
- hashtags = ",".join(t.hashtags)
48
- output += f" {hashtags}"
49
- if config.Show_cashtags:
50
- cashtags = ",".join(t.cashtags)
51
- output += f" {cashtags}"
52
- if config.Stats:
53
- output += f" | {t.replies_count} replies {t.retweets_count} retweets {t.likes_count} likes"
54
- if config.Translate:
55
- output += f" {t.translate} {t.trans_src} {t.trans_dest}"
56
- return output
57
-
58
- def User(_format, u):
59
- if _format:
60
- logme.debug(__name__+':User:Format')
61
- output = _format.replace("{id}", str(u.id))
62
- output = output.replace("{name}", u.name)
63
- output = output.replace("{username}", u.username)
64
- output = output.replace("{bio}", u.bio)
65
- output = output.replace("{location}", u.location)
66
- output = output.replace("{url}", u.url)
67
- output = output.replace("{join_date}", u.join_date)
68
- output = output.replace("{join_time}", u.join_time)
69
- output = output.replace("{tweets}", str(u.tweets))
70
- output = output.replace("{following}", str(u.following))
71
- output = output.replace("{followers}", str(u.followers))
72
- output = output.replace("{likes}", str(u.likes))
73
- output = output.replace("{media}", str(u.media_count))
74
- output = output.replace("{private}", str(u.is_private))
75
- output = output.replace("{verified}", str(u.is_verified))
76
- output = output.replace("{avatar}", u.avatar)
77
- if u.background_image:
78
- output = output.replace("{background_image}", u.background_image)
79
- else:
80
- output = output.replace("{background_image}", "")
81
- else:
82
- logme.debug(__name__+':User:notFormat')
83
- output = f"{u.id} | {u.name} | @{u.username} | Private: "
84
- output += f"{u.is_private} | Verified: {u.is_verified} |"
85
- output += f" Bio: {u.bio} | Location: {u.location} | Url: "
86
- output += f"{u.url} | Joined: {u.join_date} {u.join_time} "
87
- output += f"| Tweets: {u.tweets} | Following: {u.following}"
88
- output += f" | Followers: {u.followers} | Likes: {u.likes} "
89
- output += f"| Media: {u.media_count} | Avatar: {u.avatar}"
90
-
91
- return output
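
format.Tweet above fills the user-supplied template with one str.replace call per field. A compact equivalent that loops over a mapping instead (same {placeholder} convention; the field set is illustrative):

def render(template: str, fields: dict) -> str:
    # Substitute each {key} placeholder with its value, like the
    # .replace() chain in the deleted module but data-driven.
    out = template
    for key, value in fields.items():
        out = out.replace("{" + key + "}", str(value))
    return out

tweet = {"id": 123, "username": "example_user", "tweet": "hello world"}
print(render("{id} <{username}> {tweet}", tweet))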
 
twitter-scraper/twint-master/twint/get.py DELETED
@@ -1,298 +0,0 @@
1
- from async_timeout import timeout
2
- from datetime import datetime
3
- from bs4 import BeautifulSoup
4
- import sys
5
- import socket
6
- import aiohttp
7
- from fake_useragent import UserAgent
8
- import asyncio
9
- import concurrent.futures
10
- import random
11
- from json import loads, dumps
12
- from aiohttp_socks import ProxyConnector, ProxyType
13
- from urllib.parse import quote
14
- import time
15
-
16
- from . import url
17
- from .output import Tweets, Users
18
- from .token import TokenExpiryException
19
-
20
- import logging as logme
21
-
22
- httpproxy = None
23
-
24
- user_agent_list = [
25
- # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
26
- # ' Chrome/60.0.3112.113 Safari/537.36',
27
- # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
28
- # ' Chrome/60.0.3112.90 Safari/537.36',
29
- # 'Mozilla/5.0 (Windows NT 5.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
30
- # ' Chrome/60.0.3112.90 Safari/537.36',
31
- # 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
32
- # ' Chrome/60.0.3112.90 Safari/537.36',
33
- # 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko)'
34
- # ' Chrome/44.0.2403.157 Safari/537.36',
35
- # 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
36
- # ' Chrome/60.0.3112.113 Safari/537.36',
37
- # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
38
- # ' Chrome/57.0.2987.133 Safari/537.36',
39
- # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
40
- # ' Chrome/57.0.2987.133 Safari/537.36',
41
- # 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
42
- # ' Chrome/55.0.2883.87 Safari/537.36',
43
- # 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)'
44
- # ' Chrome/55.0.2883.87 Safari/537.36',
45
-
46
- 'Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)',
47
- 'Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko',
48
- 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)',
49
- 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
50
- 'Mozilla/5.0 (Windows NT 6.2; WOW64; Trident/7.0; rv:11.0) like Gecko',
51
- 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
52
- 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.0; Trident/5.0)',
53
- 'Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7.0; rv:11.0) like Gecko',
54
- 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)',
55
- 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; Trident/7.0; rv:11.0) like Gecko',
56
- 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; WOW64; Trident/6.0)',
57
- 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.1; Trident/6.0)',
58
- 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0; .NET CLR 2.0.50727; .NET CLR 3.0.4506.2152; .NET '
59
- 'CLR 3.5.30729)',
60
- ]
61
-
62
-
63
- # function to convert python `dict` to json and then encode it to be passed in the url as a parameter
64
- # some urls require this format
65
- def dict_to_url(dct):
66
- return quote(dumps(dct))
67
-
68
-
69
- def get_connector(config):
70
- logme.debug(__name__ + ':get_connector')
71
- _connector = None
72
- if config.Proxy_host:
73
- if config.Proxy_host.lower() == "tor":
74
- _connector = ProxyConnector(
75
- host='127.0.0.1',
76
- port=9050,
77
- rdns=True)
78
- elif config.Proxy_port and config.Proxy_type:
79
- if config.Proxy_type.lower() == "socks5":
80
- _type = ProxyType.SOCKS5
81
- elif config.Proxy_type.lower() == "socks4":
82
- _type = ProxyType.SOCKS4
83
- elif config.Proxy_type.lower() == "http":
84
- global httpproxy
85
- httpproxy = "http://" + config.Proxy_host + ":" + str(config.Proxy_port)
86
- return _connector
87
- else:
88
- logme.critical("get_connector:proxy-type-error")
89
- print("Error: Proxy types allowed are: http, socks5 and socks4. No https.")
90
- sys.exit(1)
91
- _connector = ProxyConnector(
92
- proxy_type=_type,
93
- host=config.Proxy_host,
94
- port=config.Proxy_port,
95
- rdns=True)
96
- else:
97
- logme.critical(__name__ + ':get_connector:proxy-port-type-error')
98
- print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
99
- sys.exit(1)
100
- else:
101
- if config.Proxy_port or config.Proxy_type:
102
- logme.critical(__name__ + ':get_connector:proxy-host-arg-error')
103
- print("Error: Please specify --proxy-host, --proxy-port, and --proxy-type")
104
- sys.exit(1)
105
-
106
- return _connector
107
-
108
-
109
- async def RequestUrl(config, init):
110
- logme.debug(__name__ + ':RequestUrl')
111
- _connector = get_connector(config)
112
- _serialQuery = ""
113
- params = []
114
- _url = ""
115
- _headers = [("authorization", config.Bearer_token), ("x-guest-token", config.Guest_token)]
116
-
117
- # TODO : do this later
118
- if config.Profile:
119
- logme.debug(__name__ + ':RequestUrl:Profile')
120
- _url, params, _serialQuery = url.SearchProfile(config, init)
121
- elif config.TwitterSearch:
122
- logme.debug(__name__ + ':RequestUrl:TwitterSearch')
123
- _url, params, _serialQuery = await url.Search(config, init)
124
- else:
125
- if config.Following:
126
- logme.debug(__name__ + ':RequestUrl:Following')
127
- _url = await url.Following(config.Username, init)
128
- elif config.Followers:
129
- logme.debug(__name__ + ':RequestUrl:Followers')
130
- _url = await url.Followers(config.Username, init)
131
- else:
132
- logme.debug(__name__ + ':RequestUrl:Favorites')
133
- _url = await url.Favorites(config.Username, init)
134
- _serialQuery = _url
135
-
136
- response = await Request(_url, params=params, connector=_connector, headers=_headers)
137
-
138
- if config.Debug:
139
- print(_serialQuery, file=open("twint-request_urls.log", "a", encoding="utf-8"))
140
-
141
- return response
142
-
143
-
144
- def ForceNewTorIdentity(config):
145
- logme.debug(__name__ + ':ForceNewTorIdentity')
146
- try:
147
- tor_c = socket.create_connection(('127.0.0.1', config.Tor_control_port))
148
- tor_c.send('AUTHENTICATE "{}"\r\nSIGNAL NEWNYM\r\n'.format(config.Tor_control_password).encode())
149
- response = tor_c.recv(1024)
150
- if response != b'250 OK\r\n250 OK\r\n':
151
- sys.stderr.write('Unexpected response from Tor control port: {}\n'.format(response))
152
- logme.critical(__name__ + ':ForceNewTorIdentity:unexpectedResponse')
153
- except Exception as e:
154
- logme.debug(__name__ + ':ForceNewTorIdentity:errorConnectingTor')
155
- sys.stderr.write('Error connecting to Tor control port: {}\n'.format(repr(e)))
156
- sys.stderr.write('If you want to rotate Tor ports automatically - enable Tor control port\n')
157
-
158
-
159
- async def Request(_url, connector=None, params=None, headers=None):
160
- logme.debug(__name__ + ':Request:Connector')
161
- async with aiohttp.ClientSession(connector=connector, headers=headers) as session:
162
- return await Response(session, _url, params)
163
-
164
-
165
- async def Response(session, _url, params=None):
166
- logme.debug(__name__ + ':Response')
167
- retries = 5
168
- wait = 10 # No basis, maybe work with 0
169
- for attempt in range(retries + 1):
170
- try:
171
- with timeout(120):
172
- async with session.get(_url, ssl=True, params=params, proxy=httpproxy) as response:
173
- resp = await response.text()
174
- if response.status == 429: # 429 implies Too many requests i.e. Rate Limit Exceeded
175
- raise TokenExpiryException(loads(resp)['errors'][0]['message'])
176
- return resp
177
- except aiohttp.client_exceptions.ClientConnectorError as exc:
178
- if attempt < retries:
179
- retrying = ', retrying'
180
- level = logme.WARNING
181
- else:
182
- retrying = ''
183
- level = logme.ERROR
184
- logme.log(level, f'Error retrieving {_url}: {exc!r}{retrying}')
185
- if attempt < retries:
186
- time.sleep(wait)
187
- else:
188
- logme.fatal(f'{retries + 1} requests to {_url} failed, giving up.')
189
- raise TokenExpiryException(f'{exc!r}')
190
-
191
-
192
- async def RandomUserAgent(wa=None):
193
- logme.debug(__name__ + ':RandomUserAgent')
194
- try:
195
- if wa:
196
- return "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36"
197
- return UserAgent(verify_ssl=False, use_cache_server=False).random
198
- except:
199
- return random.choice(user_agent_list)
200
-
201
-
202
- async def Username(_id, bearer_token, guest_token):
203
- logme.debug(__name__ + ':Username')
204
- _dct = {'userId': _id, 'withHighlightedLabel': False}
205
- _url = "https://api.twitter.com/graphql/B9FuNQVmyx32rdbIPEZKag/UserByRestId?variables={}".format(dict_to_url(_dct))
206
- _headers = {
207
- 'authorization': bearer_token,
208
- 'x-guest-token': guest_token,
209
- }
210
- r = await Request(_url, headers=_headers)
211
- j_r = loads(r)
212
- username = j_r['data']['user']['legacy']['screen_name']
213
- return username
214
-
215
-
216
- async def Tweet(url, config, conn):
217
- logme.debug(__name__ + ':Tweet')
218
- try:
219
- response = await Request(url)
220
- soup = BeautifulSoup(response, "html.parser")
221
- tweets = soup.find_all("div", "tweet")
222
- await Tweets(tweets, config, conn, url)
223
- except Exception as e:
224
- logme.critical(__name__ + ':Tweet:' + str(e))
225
-
226
-
227
- async def User(username, config, conn, user_id=False):
228
- logme.debug(__name__ + ':User')
229
- _dct = {'screen_name': username, 'withHighlightedLabel': False}
230
- _url = 'https://api.twitter.com/graphql/jMaTS-_Ea8vh9rpKggJbCQ/UserByScreenName?variables={}'\
231
- .format(dict_to_url(_dct))
232
- _headers = {
233
- 'authorization': config.Bearer_token,
234
- 'x-guest-token': config.Guest_token,
235
- }
236
- try:
237
- response = await Request(_url, headers=_headers)
238
- j_r = loads(response)
239
- if user_id:
240
- try:
241
- _id = j_r['data']['user']['rest_id']
242
- return _id
243
- except KeyError as e:
244
- logme.critical(__name__ + ':User:' + str(e))
245
- return
246
- await Users(j_r, config, conn)
247
- except Exception as e:
248
- logme.critical(__name__ + ':User:' + str(e))
249
- raise
250
-
251
-
252
- def Limit(Limit, count):
253
- logme.debug(__name__ + ':Limit')
254
- if Limit is not None and count >= int(Limit):
255
- return True
256
-
257
-
258
- async def Multi(feed, config, conn):
259
- logme.debug(__name__ + ':Multi')
260
- count = 0
261
- try:
262
- with concurrent.futures.ThreadPoolExecutor(max_workers=20) as executor:
263
- loop = asyncio.get_event_loop()
264
- futures = []
265
- for tweet in feed:
266
- count += 1
267
- if config.Favorites or config.Profile_full:
268
- logme.debug(__name__ + ':Multi:Favorites-profileFull')
269
- link = tweet.find("a")["href"]
270
- url = f"https://twitter.com{link}&lang=en"
271
- elif config.User_full:
272
- logme.debug(__name__ + ':Multi:userFull')
273
- username = tweet.find("a")["name"]
274
- url = f"http://twitter.com/{username}?lang=en"
275
- else:
276
- logme.debug(__name__ + ':Multi:else-url')
277
- link = tweet.find("a", "tweet-timestamp js-permalink js-nav js-tooltip")["href"]
278
- url = f"https://twitter.com{link}?lang=en"
279
-
280
- if config.User_full:
281
- logme.debug(__name__ + ':Multi:user-full-Run')
282
- futures.append(loop.run_in_executor(executor, await User(url,
283
- config, conn)))
284
- else:
285
- logme.debug(__name__ + ':Multi:notUser-full-Run')
286
- futures.append(loop.run_in_executor(executor, await Tweet(url,
287
- config, conn)))
288
- logme.debug(__name__ + ':Multi:asyncioGather')
289
- await asyncio.gather(*futures)
290
- except Exception as e:
291
- # TODO: fix error not error
292
- # print(str(e) + " [x] get.Multi")
293
- # will return "'NoneType' object is not callable"
294
- # but still works
295
- # logme.critical(__name__+':Multi:' + str(e))
296
- pass
297
-
298
- return count
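
get.Response above retries failed requests with a fixed wait between attempts. A minimal sketch of that retry loop, using asyncio.sleep rather than the blocking time.sleep the original used inside its coroutine (the URL is an example, not an endpoint twint calls):

import asyncio
import aiohttp

async def fetch(url: str, retries: int = 5, wait: float = 10.0) -> str:
    async with aiohttp.ClientSession() as session:
        for attempt in range(retries + 1):
            try:
                async with session.get(url) as response:
                    return await response.text()
            except aiohttp.ClientConnectorError:
                if attempt == retries:
                    raise                  # give up after the final attempt
                await asyncio.sleep(wait)  # back off, then retry

if __name__ == "__main__":
    print(asyncio.run(fetch("https://example.com"))[:80])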
 
twitter-scraper/twint-master/twint/output.py DELETED
@@ -1,241 +0,0 @@
1
- from datetime import datetime
2
-
3
- from . import format, get
4
- from .tweet import Tweet
5
- from .user import User
6
- from .storage import db, elasticsearch, write, panda
7
-
8
- import logging as logme
9
-
10
- follows_list = []
11
- tweets_list = []
12
- users_list = []
13
-
14
- author_list = {''}
15
- author_list.pop()
16
-
17
- # used by Pandas
18
- _follows_object = {}
19
-
20
-
21
- def _formatDateTime(datetimestamp):
22
- try:
23
- return int(datetime.strptime(datetimestamp, "%Y-%m-%d %H:%M:%S").timestamp())
24
- except ValueError:
25
- return int(datetime.strptime(datetimestamp, "%Y-%m-%d").timestamp())
26
-
27
-
28
- def _clean_follow_list():
29
- logme.debug(__name__ + ':clean_follow_list')
30
- global _follows_object
31
- _follows_object = {}
32
-
33
-
34
- def clean_lists():
35
- logme.debug(__name__ + ':clean_lists')
36
- global follows_list
37
- global tweets_list
38
- global users_list
39
- follows_list = []
40
- tweets_list = []
41
- users_list = []
42
-
43
-
44
- def datecheck(datetimestamp, config):
45
- logme.debug(__name__ + ':datecheck')
46
- if config.Since:
47
- logme.debug(__name__ + ':datecheck:SinceTrue')
48
-
49
- d = _formatDateTime(datetimestamp)
50
- s = _formatDateTime(config.Since)
51
-
52
- if d < s:
53
- return False
54
- if config.Until:
55
- logme.debug(__name__ + ':datecheck:UntilTrue')
56
-
57
- d = _formatDateTime(datetimestamp)
58
- s = _formatDateTime(config.Until)
59
-
60
- if d > s:
61
- return False
62
- logme.debug(__name__ + ':datecheck:dateRangeFalse')
63
- return True
64
-
65
-
66
- # TODO In this method we need to delete the quoted tweets, because twitter also sends the quoted tweets in the
67
- # `tweets` list along with the other tweets
68
- def is_tweet(tw):
69
- try:
70
- tw["data-item-id"]
71
- logme.debug(__name__ + ':is_tweet:True')
72
- return True
73
- except:
74
- logme.critical(__name__ + ':is_tweet:False')
75
- return False
76
-
77
-
78
- def _output(obj, output, config, **extra):
79
- logme.debug(__name__ + ':_output')
80
- if config.Lowercase:
81
- if isinstance(obj, str):
82
- logme.debug(__name__ + ':_output:Lowercase:username')
83
- obj = obj.lower()
84
- elif obj.__class__.__name__ == "user":
85
- logme.debug(__name__ + ':_output:Lowercase:user')
86
- pass
87
- elif obj.__class__.__name__ == "tweet":
88
- logme.debug(__name__ + ':_output:Lowercase:tweet')
89
- obj.username = obj.username.lower()
90
- author_list.update({obj.username})
91
- for dct in obj.mentions:
92
- for key, val in dct.items():
93
- dct[key] = val.lower()
94
- for i in range(len(obj.hashtags)):
95
- obj.hashtags[i] = obj.hashtags[i].lower()
96
- for i in range(len(obj.cashtags)):
97
- obj.cashtags[i] = obj.cashtags[i].lower()
98
- else:
99
- logme.info('_output:Lowercase:hiddenTweetFound')
100
- print("[x] Hidden tweet found, account suspended due to violation of TOS")
101
- return
102
- if config.Output != None:
103
- if config.Store_csv:
104
- try:
105
- write.Csv(obj, config)
106
- logme.debug(__name__ + ':_output:CSV')
107
- except Exception as e:
108
- logme.critical(__name__ + ':_output:CSV:Error:' + str(e))
109
- print(str(e) + " [x] output._output")
110
- elif config.Store_json:
111
- write.Json(obj, config)
112
- logme.debug(__name__ + ':_output:JSON')
113
- else:
114
- write.Text(output, config.Output)
115
- logme.debug(__name__ + ':_output:Text')
116
-
117
- if config.Elasticsearch:
118
- logme.debug(__name__ + ':_output:Elasticsearch')
119
- print("", end=".", flush=True)
120
- else:
121
- if not config.Hide_output:
122
- try:
123
- print(output.replace('\n', ' '))
124
- except UnicodeEncodeError:
125
- logme.critical(__name__ + ':_output:UnicodeEncodeError')
126
- print("unicode error [x] output._output")
127
-
128
-
129
- async def checkData(tweet, config, conn):
130
- logme.debug(__name__ + ':checkData')
131
- tweet = Tweet(tweet, config)
132
- if not tweet.datestamp:
133
- logme.critical(__name__ + ':checkData:hiddenTweetFound')
134
- print("[x] Hidden tweet found, account suspended due to violation of TOS")
135
- return
136
- if datecheck(tweet.datestamp + " " + tweet.timestamp, config):
137
- output = format.Tweet(config, tweet)
138
- if config.Database:
139
- logme.debug(__name__ + ':checkData:Database')
140
- db.tweets(conn, tweet, config)
141
- if config.Pandas:
142
- logme.debug(__name__ + ':checkData:Pandas')
143
- panda.update(tweet, config)
144
- if config.Store_object:
145
- logme.debug(__name__ + ':checkData:Store_object')
146
- if hasattr(config.Store_object_tweets_list, 'append'):
147
- config.Store_object_tweets_list.append(tweet)
148
- else:
149
- tweets_list.append(tweet)
150
- if config.Elasticsearch:
151
- logme.debug(__name__ + ':checkData:Elasticsearch')
152
- elasticsearch.Tweet(tweet, config)
153
- _output(tweet, output, config)
154
- # else:
155
- # logme.critical(__name__+':checkData:copyrightedTweet')
156
-
157
-
158
- async def Tweets(tweets, config, conn):
159
- logme.debug(__name__ + ':Tweets')
160
- if config.Favorites or config.Location:
161
- logme.debug(__name__ + ':Tweets:fav+full+loc')
162
- for tw in tweets:
163
- await checkData(tw, config, conn)
164
- elif config.TwitterSearch or config.Profile:
165
- logme.debug(__name__ + ':Tweets:TwitterSearch')
166
- await checkData(tweets, config, conn)
167
- else:
168
- logme.debug(__name__ + ':Tweets:else')
169
- if int(tweets["data-user-id"]) == config.User_id or config.Retweets:
170
- await checkData(tweets, config, conn)
171
-
172
-
173
- async def Users(u, config, conn):
174
- logme.debug(__name__ + ':User')
175
- global users_list
176
-
177
- user = User(u)
178
- output = format.User(config.Format, user)
179
-
180
- if config.Database:
181
- logme.debug(__name__ + ':User:Database')
182
- db.user(conn, config, user)
183
-
184
- if config.Elasticsearch:
185
- logme.debug(__name__ + ':User:Elasticsearch')
186
- _save_date = user.join_date
187
- _save_time = user.join_time
188
- user.join_date = str(datetime.strptime(user.join_date, "%d %b %Y")).split()[0]
189
- user.join_time = str(datetime.strptime(user.join_time, "%I:%M %p")).split()[1]
190
- elasticsearch.UserProfile(user, config)
191
- user.join_date = _save_date
192
- user.join_time = _save_time
193
-
194
- if config.Store_object:
195
- logme.debug(__name__ + ':User:Store_object')
196
-
197
- if hasattr(config.Store_object_follow_list, 'append'):
198
- config.Store_object_follow_list.append(user)
199
- elif hasattr(config.Store_object_users_list, 'append'):
200
- config.Store_object_users_list.append(user)
201
- else:
202
- users_list.append(user) # twint.user.user
203
-
204
- if config.Pandas:
205
- logme.debug(__name__ + ':User:Pandas+user')
206
- panda.update(user, config)
207
-
208
- _output(user, output, config)
209
-
210
-
211
- async def Username(username, config, conn):
212
- logme.debug(__name__ + ':Username')
213
- global _follows_object
214
- global follows_list
215
- follow_var = config.Following * "following" + config.Followers * "followers"
216
-
217
- if config.Database:
218
- logme.debug(__name__ + ':Username:Database')
219
- db.follow(conn, config.Username, config.Followers, username)
220
-
221
- if config.Elasticsearch:
222
- logme.debug(__name__ + ':Username:Elasticsearch')
223
- elasticsearch.Follow(username, config)
224
-
225
- if config.Store_object:
226
- if hasattr(config.Store_object_follow_list, 'append'):
227
- config.Store_object_follow_list.append(username)
228
- else:
229
- follows_list.append(username) # twint.user.user
230
-
231
- if config.Pandas:
232
- logme.debug(__name__ + ':Username:object+pandas')
233
- try:
234
- _ = _follows_object[config.Username][follow_var]
235
- except KeyError:
236
- _follows_object.update({config.Username: {follow_var: []}})
237
- _follows_object[config.Username][follow_var].append(username)
238
- if config.Pandas_au:
239
- logme.debug(__name__ + ':Username:object+pandas+au')
240
- panda.update(_follows_object[config.Username], config)
241
- _output(username, username, config)
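
output.datecheck above converts both the tweet's stamp and the configured bounds to epoch seconds, accepting either a full timestamp or a bare date, and drops tweets outside the window. A stand-alone sketch of that filter:

from datetime import datetime

def to_epoch(stamp: str) -> int:
    # Accept "YYYY-MM-DD HH:MM:SS" first, then fall back to a bare date.
    try:
        return int(datetime.strptime(stamp, "%Y-%m-%d %H:%M:%S").timestamp())
    except ValueError:
        return int(datetime.strptime(stamp, "%Y-%m-%d").timestamp())

def datecheck(stamp: str, since=None, until=None) -> bool:
    t = to_epoch(stamp)
    if since and t < to_epoch(since):
        return False
    if until and t > to_epoch(until):
        return False
    return True

print(datecheck("2022-06-01 12:00:00", since="2022-01-01", until="2022-12-31"))  # True
print(datecheck("2021-06-01", since="2022-01-01"))                               # False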
 
twitter-scraper/twint-master/twint/run.py DELETED
@@ -1,412 +0,0 @@
1
- import sys, os, datetime
2
- from asyncio import get_event_loop, TimeoutError, ensure_future, new_event_loop, set_event_loop
3
-
4
- from . import datelock, feed, get, output, verbose, storage
5
- from .token import TokenExpiryException
6
- from . import token
7
- from .storage import db
8
- from .feed import NoMoreTweetsException
9
-
10
- import logging as logme
11
-
12
- import time
13
-
14
- bearer = 'Bearer AAAAAAAAAAAAAAAAAAAAANRILgAAAAAAnNwIzUejRCOuH5E6I8xnZz4puTs' \
15
- '%3D1Zv7ttfk8LF81IUq16cHjhLTvJu4FA33AGWWjCpTnA'
16
-
17
-
18
- class Twint:
19
- def __init__(self, config):
20
- logme.debug(__name__ + ':Twint:__init__')
21
- if config.Resume is not None and (config.TwitterSearch or config.Followers or config.Following):
22
- logme.debug(__name__ + ':Twint:__init__:Resume')
23
- self.init = self.get_resume(config.Resume)
24
- else:
25
- self.init = -1
26
-
27
- config.deleted = []
28
- self.feed: list = [-1]
29
- self.count = 0
30
- self.user_agent = ""
31
- self.config = config
32
- self.config.Bearer_token = bearer
33
- # TODO might have to make some adjustments for it to work with multi-treading
34
- # USAGE : to get a new guest token simply do `self.token.refresh()`
35
- self.token = token.Token(config)
36
- self.token.refresh()
37
- self.conn = db.Conn(config.Database)
38
- self.d = datelock.Set(self.config.Until, self.config.Since)
39
- verbose.Elastic(config.Elasticsearch)
40
-
41
- if self.config.Store_object:
42
- logme.debug(__name__ + ':Twint:__init__:clean_follow_list')
43
- output._clean_follow_list()
44
-
45
- if self.config.Pandas_clean:
46
- logme.debug(__name__ + ':Twint:__init__:pandas_clean')
47
- storage.panda.clean()
48
-
49
- def get_resume(self, resumeFile):
50
- if not os.path.exists(resumeFile):
51
- return '-1'
52
- with open(resumeFile, 'r') as rFile:
53
- _init = rFile.readlines()[-1].strip('\n')
54
- return _init
55
-
56
- async def Feed(self):
57
- logme.debug(__name__ + ':Twint:Feed')
58
- consecutive_errors_count = 0
59
- while True:
60
- # this will receive a JSON string, parse it into a `dict` and do the required stuff
61
- try:
62
- response = await get.RequestUrl(self.config, self.init)
63
- except TokenExpiryException as e:
64
- logme.debug(__name__ + 'Twint:Feed:' + str(e))
65
- self.token.refresh()
66
- response = await get.RequestUrl(self.config, self.init)
67
-
68
- if self.config.Debug:
69
- print(response, file=open("twint-last-request.log", "w", encoding="utf-8"))
70
-
71
- self.feed = []
72
- try:
73
- if self.config.Favorites:
74
- self.feed, self.init = feed.MobileFav(response)
75
- favorite_err_cnt = 0
76
- if len(self.feed) == 0 and len(self.init) == 0:
77
- while (len(self.feed) == 0 or len(self.init) == 0) and favorite_err_cnt < 5:
78
- self.user_agent = await get.RandomUserAgent(wa=False)
79
- response = await get.RequestUrl(self.config, self.init,
80
- headers=[("User-Agent", self.user_agent)])
81
- self.feed, self.init = feed.MobileFav(response)
82
- favorite_err_cnt += 1
83
- time.sleep(1)
84
- if favorite_err_cnt == 5:
85
- print("Favorite page could not be fetched")
86
- if not self.count % 40:
87
- time.sleep(5)
88
- elif self.config.Followers or self.config.Following:
89
- self.feed, self.init = feed.Follow(response)
90
- if not self.count % 40:
91
- time.sleep(5)
92
- elif self.config.Profile or self.config.TwitterSearch:
93
- try:
94
- self.feed, self.init = feed.parse_tweets(self.config, response)
95
- except NoMoreTweetsException as e:
96
- logme.debug(__name__ + ':Twint:Feed:' + str(e))
97
- print('[!] ' + str(e) + ' Scraping will stop now.')
98
- print('found {} deleted tweets in this search.'.format(len(self.config.deleted)))
99
- break
100
- break
101
- except TimeoutError as e:
102
- if self.config.Proxy_host.lower() == "tor":
103
- print("[?] Timed out, changing Tor identity...")
104
- if self.config.Tor_control_password is None:
105
- logme.critical(__name__ + ':Twint:Feed:tor-password')
106
- sys.stderr.write("Error: config.Tor_control_password must be set for proxy auto-rotation!\r\n")
107
- sys.stderr.write(
108
- "Info: What is it? See https://stem.torproject.org/faq.html#can-i-interact-with-tors"
109
- "-controller-interface-directly\r\n")
110
- break
111
- else:
112
- get.ForceNewTorIdentity(self.config)
113
- continue
114
- else:
115
- logme.critical(__name__ + ':Twint:Feed:' + str(e))
116
- print(str(e))
117
- break
118
- except Exception as e:
119
- if self.config.Profile or self.config.Favorites:
120
- print("[!] Twitter does not return more data, scrape stops here.")
121
- break
122
-
123
- logme.critical(__name__ + ':Twint:Feed:noData' + str(e))
124
- # Sometimes Twitter says there is no data. But it's a lie.
125
- # raise
126
- consecutive_errors_count += 1
127
- if consecutive_errors_count < self.config.Retries_count:
128
- # skip to the next iteration if wait time does not satisfy limit constraints
129
- delay = round(consecutive_errors_count ** self.config.Backoff_exponent, 1)
130
-
131
- # if the delay is less than users set min wait time then replace delay
132
- if self.config.Min_wait_time > delay:
133
- delay = self.config.Min_wait_time
134
-
135
- sys.stderr.write('sleeping for {} secs\n'.format(delay))
136
- time.sleep(delay)
137
- self.user_agent = await get.RandomUserAgent(wa=True)
138
- continue
139
- logme.critical(__name__ + ':Twint:Feed:Tweets_known_error:' + str(e))
140
- sys.stderr.write(str(e) + " [x] run.Feed")
141
- sys.stderr.write(
142
- "[!] if you get this error but you know for sure that more tweets exist, please open an issue and "
143
- "we will investigate it!")
144
- break
145
- if self.config.Resume:
146
- print(self.init, file=open(self.config.Resume, "a", encoding="utf-8"))
147
-
148
- async def follow(self):
149
- await self.Feed()
150
- if self.config.User_full:
151
- logme.debug(__name__ + ':Twint:follow:userFull')
152
- self.count += await get.Multi(self.feed, self.config, self.conn)
153
- else:
154
- logme.debug(__name__ + ':Twint:follow:notUserFull')
155
- for user in self.feed:
156
- self.count += 1
157
- username = user.find("a")["name"]
158
- await output.Username(username, self.config, self.conn)
159
-
160
- async def favorite(self):
161
- logme.debug(__name__ + ':Twint:favorite')
162
- await self.Feed()
163
- favorited_tweets_list = []
164
- for tweet in self.feed:
165
- tweet_dict = {}
166
- self.count += 1
167
- try:
168
- tweet_dict['data-item-id'] = tweet.find("div", {"class": "tweet-text"})['data-id']
169
- t_url = tweet.find("span", {"class": "metadata"}).find("a")["href"]
170
- tweet_dict['data-conversation-id'] = t_url.split('?')[0].split('/')[-1]
171
- tweet_dict['username'] = tweet.find("div", {"class": "username"}).text.replace('\n', '').replace(' ',
172
- '')
173
- tweet_dict['tweet'] = tweet.find("div", {"class": "tweet-text"}).find("div", {"class": "dir-ltr"}).text
174
- date_str = tweet.find("td", {"class": "timestamp"}).find("a").text
175
- # test_dates = ["1m", "2h", "Jun 21, 2019", "Mar 12", "28 Jun 19"]
176
- # date_str = test_dates[3]
177
- if len(date_str) <= 3 and (date_str[-1] == "m" or date_str[-1] == "h"): # 25m 1h
178
- dateu = str(datetime.date.today())
179
- tweet_dict['date'] = dateu
180
- elif ',' in date_str: # Aug 21, 2019
181
- sp = date_str.replace(',', '').split(' ')
182
- date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + sp[2]
183
- dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
184
- tweet_dict['date'] = dateu
185
- elif len(date_str.split(' ')) == 3: # 28 Jun 19
186
- sp = date_str.split(' ')
187
- if len(sp[2]) == 2:
188
- sp[2] = '20' + sp[2]
189
- date_str_formatted = sp[0] + ' ' + sp[1] + ' ' + sp[2]
190
- dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
191
- tweet_dict['date'] = dateu
192
- else: # Aug 21
193
- sp = date_str.split(' ')
194
- date_str_formatted = sp[1] + ' ' + sp[0] + ' ' + str(datetime.date.today().year)
195
- dateu = datetime.datetime.strptime(date_str_formatted, "%d %b %Y").strftime("%Y-%m-%d")
196
- tweet_dict['date'] = dateu
197
-
198
- favorited_tweets_list.append(tweet_dict)
199
-
200
- except Exception as e:
201
- logme.critical(__name__ + ':Twint:favorite:favorite_field_lack')
202
- print("shit: ", date_str, " ", str(e))
203
-
204
- try:
205
- self.config.favorited_tweets_list += favorited_tweets_list
206
- except AttributeError:
207
- self.config.favorited_tweets_list = favorited_tweets_list
208
-
209
- async def profile(self):
210
- await self.Feed()
211
- logme.debug(__name__ + ':Twint:profile')
212
- for tweet in self.feed:
213
- self.count += 1
214
- await output.Tweets(tweet, self.config, self.conn)
215
-
216
- async def tweets(self):
217
- await self.Feed()
218
- # TODO : need to take care of this later
219
- if self.config.Location:
220
-             logme.debug(__name__ + ':Twint:tweets:location')
-             self.count += await get.Multi(self.feed, self.config, self.conn)
-         else:
-             logme.debug(__name__ + ':Twint:tweets:notLocation')
-             for tweet in self.feed:
-                 self.count += 1
-                 await output.Tweets(tweet, self.config, self.conn)
-
-     async def main(self, callback=None):
-
-         task = ensure_future(self.run())  # Might be changed to create_task in 3.7+.
-
-         if callback:
-             task.add_done_callback(callback)
-
-         await task
-
-     async def run(self):
-         if self.config.TwitterSearch:
-             self.user_agent = await get.RandomUserAgent(wa=True)
-         else:
-             self.user_agent = await get.RandomUserAgent()
-
-         if self.config.User_id is not None and self.config.Username is None:
-             logme.debug(__name__ + ':Twint:main:user_id')
-             self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
-                                                       self.config.Guest_token)
-
-         if self.config.Username is not None and self.config.User_id is None:
-             logme.debug(__name__ + ':Twint:main:username')
-
-             self.config.User_id = await get.User(self.config.Username, self.config, self.conn, True)
-             if self.config.User_id is None:
-                 raise ValueError("Cannot find twitter account with name = " + self.config.Username)
-
-         # TODO : will need to modify it to work with the new endpoints
-         if self.config.TwitterSearch and self.config.Since and self.config.Until:
-             logme.debug(__name__ + ':Twint:main:search+since+until')
-             while self.d.since < self.d.until:
-                 self.config.Since = datetime.datetime.strftime(self.d.since, "%Y-%m-%d %H:%M:%S")
-                 self.config.Until = datetime.datetime.strftime(self.d.until, "%Y-%m-%d %H:%M:%S")
-                 if len(self.feed) > 0:
-                     await self.tweets()
-                 else:
-                     logme.debug(__name__ + ':Twint:main:gettingNewTweets')
-                     break
-
-                 if get.Limit(self.config.Limit, self.count):
-                     break
-         elif self.config.Lookup:
-             await self.Lookup()
-         else:
-             logme.debug(__name__ + ':Twint:main:not-search+since+until')
-             while True:
-                 if len(self.feed) > 0:
-                     if self.config.Followers or self.config.Following:
-                         logme.debug(__name__ + ':Twint:main:follow')
-                         await self.follow()
-                     elif self.config.Favorites:
-                         logme.debug(__name__ + ':Twint:main:favorites')
-                         await self.favorite()
-                     elif self.config.Profile:
-                         logme.debug(__name__ + ':Twint:main:profile')
-                         await self.profile()
-                     elif self.config.TwitterSearch:
-                         logme.debug(__name__ + ':Twint:main:twitter-search')
-                         await self.tweets()
-                 else:
-                     logme.debug(__name__ + ':Twint:main:no-more-tweets')
-                     break
-
-                 # logging.info("[<] " + str(datetime.now()) + ':: run+Twint+main+CallingGetLimit2')
-                 if get.Limit(self.config.Limit, self.count):
-                     logme.debug(__name__ + ':Twint:main:reachedLimit')
-                     break
-
-         if self.config.Count:
-             verbose.Count(self.count, self.config)
-
-     async def Lookup(self):
-         logme.debug(__name__ + ':Twint:Lookup')
-
-         try:
-             if self.config.User_id is not None and self.config.Username is None:
-                 logme.debug(__name__ + ':Twint:Lookup:user_id')
-                 self.config.Username = await get.Username(self.config.User_id, self.config.Bearer_token,
-                                                           self.config.Guest_token)
-             await get.User(self.config.Username, self.config, db.Conn(self.config.Database))
-
-         except Exception as e:
-             logme.exception(__name__ + ':Twint:Lookup:Unexpected exception occurred.')
-             raise
-
-
- def run(config, callback=None):
-     logme.debug(__name__ + ':run')
-     try:
-         get_event_loop()
-     except RuntimeError as e:
-         if "no current event loop" in str(e):
-             set_event_loop(new_event_loop())
-         else:
-             logme.exception(__name__ + ':run:Unexpected exception while handling an expected RuntimeError.')
-             raise
-     except Exception as e:
-         logme.exception(
-             __name__ + ':run:Unexpected exception occurred while attempting to get or create a new event loop.')
-         raise
-
-     get_event_loop().run_until_complete(Twint(config).main(callback))
-
-
- def Favorites(config):
-     logme.debug(__name__ + ':Favorites')
-     config.Favorites = True
-     config.Following = False
-     config.Followers = False
-     config.Profile = False
-     config.TwitterSearch = False
-     run(config)
-     if config.Pandas_au:
-         storage.panda._autoget("tweet")
-
-
- def Followers(config):
-     logme.debug(__name__ + ':Followers')
-     config.Followers = True
-     config.Following = False
-     config.Profile = False
-     config.Favorites = False
-     config.TwitterSearch = False
-     run(config)
-     if config.Pandas_au:
-         storage.panda._autoget("followers")
-     if config.User_full:
-         storage.panda._autoget("user")
-     if config.Pandas_clean and not config.Store_object:
-         # storage.panda.clean()
-         output._clean_follow_list()
-
-
- def Following(config):
-     logme.debug(__name__ + ':Following')
-     config.Following = True
-     config.Followers = False
-     config.Profile = False
-     config.Favorites = False
-     config.TwitterSearch = False
-     run(config)
-     if config.Pandas_au:
-         storage.panda._autoget("following")
-     if config.User_full:
-         storage.panda._autoget("user")
-     if config.Pandas_clean and not config.Store_object:
-         # storage.panda.clean()
-         output._clean_follow_list()
-
-
- def Lookup(config):
-     logme.debug(__name__ + ':Lookup')
-     config.Profile = False
-     config.Lookup = True
-     config.Favorites = False
-     config.Following = False
-     config.Followers = False
-     config.TwitterSearch = False
-     run(config)
-     if config.Pandas_au:
-         storage.panda._autoget("user")
-
-
- def Profile(config):
-     logme.debug(__name__ + ':Profile')
-     config.Profile = True
-     config.Favorites = False
-     config.Following = False
-     config.Followers = False
-     config.TwitterSearch = False
-     run(config)
-     if config.Pandas_au:
-         storage.panda._autoget("tweet")
-
-
- def Search(config, callback=None):
-     logme.debug(__name__ + ':Search')
-     config.TwitterSearch = True
-     config.Favorites = False
-     config.Following = False
-     config.Followers = False
-     config.Profile = False
-     run(config, callback)
-     if config.Pandas_au:
-         storage.panda._autoget("tweet")
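
A minimal usage sketch of the entry points above (assuming the twint package in this tree is importable; the account name and limit are invented examples):

    import twint

    c = twint.Config()
    c.Username = "jimmieakesson"   # hypothetical account
    c.Limit = 20                   # enforced via get.Limit(config.Limit, count)
    c.Pandas = True                # hand results to storage.panda

    twint.run.Search(c)            # flips TwitterSearch on, then calls run(config)

Each wrapper (Favorites, Followers, Following, Lookup, Profile, Search) follows the same pattern: set the mode flags, delegate to run(), then optionally flush results into the pandas storage layer.
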
twitter-scraper/twint-master/twint/storage/__init__.py DELETED
File without changes
twitter-scraper/twint-master/twint/storage/db.py DELETED
@@ -1,297 +0,0 @@
- import sqlite3
- import sys
- import time
- import hashlib
-
- from datetime import datetime
-
- def Conn(database):
-     if database:
-         print("[+] Inserting into Database: " + str(database))
-         conn = init(database)
-         if isinstance(conn, str):  # error
-             print(conn)
-             sys.exit(1)
-     else:
-         conn = ""
-
-     return conn
-
- def init(db):
-     try:
-         conn = sqlite3.connect(db)
-         cursor = conn.cursor()
-
-         table_users = """
-             CREATE TABLE IF NOT EXISTS
-                 users(
-                     id integer not null,
-                     id_str text not null,
-                     name text,
-                     username text not null,
-                     bio text,
-                     location text,
-                     url text,
-                     join_date text not null,
-                     join_time text not null,
-                     tweets integer,
-                     following integer,
-                     followers integer,
-                     likes integer,
-                     media integer,
-                     private integer not null,
-                     verified integer not null,
-                     profile_image_url text not null,
-                     background_image text,
-                     hex_dig text not null,
-                     time_update integer not null,
-                     CONSTRAINT users_pk PRIMARY KEY (id, hex_dig)
-                 );
-             """
-         cursor.execute(table_users)
-
-         table_tweets = """
-             CREATE TABLE IF NOT EXISTS
-                 tweets (
-                     id integer not null,
-                     id_str text not null,
-                     tweet text default '',
-                     language text default '',
-                     conversation_id text not null,
-                     created_at integer not null,
-                     date text not null,
-                     time text not null,
-                     timezone text not null,
-                     place text default '',
-                     replies_count integer,
-                     likes_count integer,
-                     retweets_count integer,
-                     user_id integer not null,
-                     user_id_str text not null,
-                     screen_name text not null,
-                     name text default '',
-                     link text,
-                     mentions text,
-                     hashtags text,
-                     cashtags text,
-                     urls text,
-                     photos text,
-                     thumbnail text,
-                     quote_url text,
-                     video integer,
-                     geo text,
-                     near text,
-                     source text,
-                     time_update integer not null,
-                     `translate` text default '',
-                     trans_src text default '',
-                     trans_dest text default '',
-                     PRIMARY KEY (id)
-                 );
-             """
-         cursor.execute(table_tweets)
-
-         table_retweets = """
-             CREATE TABLE IF NOT EXISTS
-                 retweets(
-                     user_id integer not null,
-                     username text not null,
-                     tweet_id integer not null,
-                     retweet_id integer not null,
-                     retweet_date integer,
-                     CONSTRAINT retweets_pk PRIMARY KEY(user_id, tweet_id),
-                     CONSTRAINT user_id_fk FOREIGN KEY(user_id) REFERENCES users(id),
-                     CONSTRAINT tweet_id_fk FOREIGN KEY(tweet_id) REFERENCES tweets(id)
-                 );
-             """
-         cursor.execute(table_retweets)
-
-         table_reply_to = """
-             CREATE TABLE IF NOT EXISTS
-                 replies(
-                     tweet_id integer not null,
-                     user_id integer not null,
-                     username text not null,
-                     CONSTRAINT replies_pk PRIMARY KEY (user_id, tweet_id),
-                     CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
-                 );
-             """
-         cursor.execute(table_reply_to)
-
-         table_favorites = """
-             CREATE TABLE IF NOT EXISTS
-                 favorites(
-                     user_id integer not null,
-                     tweet_id integer not null,
-                     CONSTRAINT favorites_pk PRIMARY KEY (user_id, tweet_id),
-                     CONSTRAINT user_id_fk FOREIGN KEY (user_id) REFERENCES users(id),
-                     CONSTRAINT tweet_id_fk FOREIGN KEY (tweet_id) REFERENCES tweets(id)
-                 );
-             """
-         cursor.execute(table_favorites)
-
-         table_followers = """
-             CREATE TABLE IF NOT EXISTS
-                 followers (
-                     id integer not null,
-                     follower_id integer not null,
-                     CONSTRAINT followers_pk PRIMARY KEY (id, follower_id),
-                     CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
-                     CONSTRAINT follower_id_fk FOREIGN KEY(follower_id) REFERENCES users(id)
-                 );
-             """
-         cursor.execute(table_followers)
-
-         table_following = """
-             CREATE TABLE IF NOT EXISTS
-                 following (
-                     id integer not null,
-                     following_id integer not null,
-                     CONSTRAINT following_pk PRIMARY KEY (id, following_id),
-                     CONSTRAINT id_fk FOREIGN KEY(id) REFERENCES users(id),
-                     CONSTRAINT following_id_fk FOREIGN KEY(following_id) REFERENCES users(id)
-                 );
-             """
-         cursor.execute(table_following)
-
-         table_followers_names = """
-             CREATE TABLE IF NOT EXISTS
-                 followers_names (
-                     user text not null,
-                     time_update integer not null,
-                     follower text not null,
-                     PRIMARY KEY (user, follower)
-                 );
-             """
-         cursor.execute(table_followers_names)
-
-         table_following_names = """
-             CREATE TABLE IF NOT EXISTS
-                 following_names (
-                     user text not null,
-                     time_update integer not null,
-                     follows text not null,
-                     PRIMARY KEY (user, follows)
-                 );
-             """
-         cursor.execute(table_following_names)
-
-         return conn
-     except Exception as e:
-         return str(e)
-
- def fTable(Followers):
-     if Followers:
-         table = "followers_names"
-     else:
-         table = "following_names"
-
-     return table
-
- def uTable(Followers):
-     if Followers:
-         table = "followers"
-     else:
-         table = "following"
-
-     return table
-
- def follow(conn, Username, Followers, User):
-     try:
-         time_ms = round(time.time()*1000)
-         cursor = conn.cursor()
-         entry = (User, time_ms, Username,)
-         table = fTable(Followers)
-         query = f"INSERT INTO {table} VALUES(?,?,?)"
-         cursor.execute(query, entry)
-         conn.commit()
-     except sqlite3.IntegrityError:
-         pass
-
- def get_hash_id(conn, id):
-     cursor = conn.cursor()
-     cursor.execute('SELECT hex_dig FROM users WHERE id = ? LIMIT 1', (id,))
-     resultset = cursor.fetchall()
-     return resultset[0][0] if resultset else -1
-
- def user(conn, config, User):
-     try:
-         time_ms = round(time.time()*1000)
-         cursor = conn.cursor()
-         user = [int(User.id), User.id, User.name, User.username, User.bio, User.location, User.url, User.join_date, User.join_time, User.tweets, User.following, User.followers, User.likes, User.media_count, User.is_private, User.is_verified, User.avatar, User.background_image]
-
-         hex_dig = hashlib.sha256(','.join(str(v) for v in user).encode()).hexdigest()
-         entry = tuple(user) + (hex_dig, time_ms,)
-         old_hash = get_hash_id(conn, User.id)
-
-         if old_hash == -1 or old_hash != hex_dig:
-             query = f"INSERT INTO users VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"
-             cursor.execute(query, entry)
-         else:
-             pass
-
-         if config.Followers or config.Following:
-             table = uTable(config.Followers)
-             query = f"INSERT INTO {table} VALUES(?,?)"
-             cursor.execute(query, (config.User_id, int(User.id)))
-
-         conn.commit()
-     except sqlite3.IntegrityError:
-         pass
-
- def tweets(conn, Tweet, config):
-     try:
-         time_ms = round(time.time()*1000)
-         cursor = conn.cursor()
-         entry = (Tweet.id,
-                  Tweet.id_str,
-                  Tweet.tweet,
-                  Tweet.lang,
-                  Tweet.conversation_id,
-                  Tweet.datetime,
-                  Tweet.datestamp,
-                  Tweet.timestamp,
-                  Tweet.timezone,
-                  Tweet.place,
-                  Tweet.replies_count,
-                  Tweet.likes_count,
-                  Tweet.retweets_count,
-                  Tweet.user_id,
-                  Tweet.user_id_str,
-                  Tweet.username,
-                  Tweet.name,
-                  Tweet.link,
-                  ",".join(Tweet.mentions),
-                  ",".join(Tweet.hashtags),
-                  ",".join(Tweet.cashtags),
-                  ",".join(Tweet.urls),
-                  ",".join(Tweet.photos),
-                  Tweet.thumbnail,
-                  Tweet.quote_url,
-                  Tweet.video,
-                  Tweet.geo,
-                  Tweet.near,
-                  Tweet.source,
-                  time_ms,
-                  Tweet.translate,
-                  Tweet.trans_src,
-                  Tweet.trans_dest)
-         cursor.execute('INSERT INTO tweets VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)', entry)
-
-         if config.Favorites:
-             query = 'INSERT INTO favorites VALUES(?,?)'
-             cursor.execute(query, (config.User_id, Tweet.id))
-
-         if Tweet.retweet:
-             query = 'INSERT INTO retweets VALUES(?,?,?,?,?)'
-             _d = datetime.timestamp(datetime.strptime(Tweet.retweet_date, "%Y-%m-%d %H:%M:%S"))
-             cursor.execute(query, (int(Tweet.user_rt_id), Tweet.user_rt, Tweet.id, int(Tweet.retweet_id), _d))
-
-         if Tweet.reply_to:
-             for reply in Tweet.reply_to:
-                 query = 'INSERT INTO replies VALUES(?,?,?)'
-                 cursor.execute(query, (Tweet.id, int(reply['user_id']), reply['username']))
-
-         conn.commit()
-     except sqlite3.IntegrityError:
-         pass
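
A short sketch of reading back the SQLite file that init() creates; the path is an assumption, and the column names come from the CREATE TABLE statements above:

    import sqlite3

    conn = sqlite3.connect("politweet.db")   # hypothetical database path
    cur = conn.execute(
        "SELECT date, time, screen_name, tweet "
        "FROM tweets ORDER BY created_at DESC LIMIT 5"
    )
    for row in cur.fetchall():
        print(row)
    conn.close()
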
twitter-scraper/twint-master/twint/storage/elasticsearch.py DELETED
@@ -1,364 +0,0 @@
- ## TODO - Fix Weekday situation
- from elasticsearch import Elasticsearch, helpers
- from geopy.geocoders import Nominatim
- from datetime import datetime
- import contextlib
- import sys
-
- _index_tweet_status = False
- _index_follow_status = False
- _index_user_status = False
- _is_near_def = False
- _is_location_def = False
- _near = {}
- _location = {}
-
- geolocator = Nominatim(user_agent="twint-1.2")
-
- class RecycleObject(object):
-     def write(self, junk): pass
-     def flush(self): pass
-
- def getLocation(place, **options):
-     location = geolocator.geocode(place, timeout=1000)
-     if location:
-         if options.get("near"):
-             global _near
-             _near = {"lat": location.latitude, "lon": location.longitude}
-             return True
-         elif options.get("location"):
-             global _location
-             _location = {"lat": location.latitude, "lon": location.longitude}
-             return True
-         return {"lat": location.latitude, "lon": location.longitude}
-     else:
-         return {}
-
- def handleIndexResponse(response):
-     try:
-         if response["status"] == 400:
-             return True
-     except KeyError:
-         pass
-     if response["acknowledged"]:
-         print("[+] Index \"" + response["index"] + "\" created!")
-     else:
-         print("[x] error index creation :: storage.elasticsearch.handleIndexCreation")
-     if response["shards_acknowledged"]:
-         print("[+] Shards acknowledged, everything is ready to be used!")
-         return True
-     else:
-         print("[x] error with shards :: storage.elasticsearch.HandleIndexCreation")
-         return False
-
- def createIndex(config, instance, **scope):
-     if scope.get("scope") == "tweet":
-         tweets_body = {
-             "mappings": {
-                 "properties": {
-                     "id": {"type": "long"},
-                     "conversation_id": {"type": "long"},
-                     "created_at": {"type": "text"},
-                     "date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
-                     "timezone": {"type": "keyword"},
-                     "place": {"type": "keyword"},
-                     "location": {"type": "keyword"},
-                     "tweet": {"type": "text"},
-                     "lang": {"type": "keyword"},
-                     "hashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
-                     "cashtags": {"type": "keyword", "normalizer": "hashtag_normalizer"},
-                     "user_id_str": {"type": "keyword"},
-                     "username": {"type": "keyword", "normalizer": "hashtag_normalizer"},
-                     "name": {"type": "text"},
-                     "profile_image_url": {"type": "text"},
-                     "day": {"type": "integer"},
-                     "hour": {"type": "integer"},
-                     "link": {"type": "text"},
-                     "retweet": {"type": "text"},
-                     "essid": {"type": "keyword"},
-                     "nlikes": {"type": "integer"},
-                     "nreplies": {"type": "integer"},
-                     "nretweets": {"type": "integer"},
-                     "quote_url": {"type": "text"},
-                     "video": {"type": "integer"},
-                     "thumbnail": {"type": "text"},
-                     "search": {"type": "text"},
-                     "near": {"type": "text"},
-                     "geo_near": {"type": "geo_point"},
-                     "geo_tweet": {"type": "geo_point"},
-                     "photos": {"type": "text"},
-                     "user_rt_id": {"type": "keyword"},
-                     "mentions": {"type": "keyword", "normalizer": "hashtag_normalizer"},
-                     "source": {"type": "keyword"},
-                     "user_rt": {"type": "keyword"},
-                     "retweet_id": {"type": "keyword"},
-                     "reply_to": {
-                         "type": "nested",
-                         "properties": {
-                             "user_id": {"type": "keyword"},
-                             "username": {"type": "keyword"}
-                         }
-                     },
-                     "retweet_date": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss", "ignore_malformed": True},
-                     "urls": {"type": "keyword"},
-                     "translate": {"type": "text"},
-                     "trans_src": {"type": "keyword"},
-                     "trans_dest": {"type": "keyword"},
-                 }
-             },
-             "settings": {
-                 "number_of_shards": 1,
-                 "analysis": {
-                     "normalizer": {
-                         "hashtag_normalizer": {
-                             "type": "custom",
-                             "char_filter": [],
-                             "filter": ["lowercase", "asciifolding"]
-                         }
-                     }
-                 }
-             }
-         }
-         with nostdout():
-             resp = instance.indices.create(index=config.Index_tweets, body=tweets_body, ignore=400)
-         return handleIndexResponse(resp)
-     elif scope.get("scope") == "follow":
-         follow_body = {
-             "mappings": {
-                 "properties": {
-                     "user": {"type": "keyword"},
-                     "follow": {"type": "keyword"},
-                     "essid": {"type": "keyword"}
-                 }
-             },
-             "settings": {
-                 "number_of_shards": 1
-             }
-         }
-         with nostdout():
-             resp = instance.indices.create(index=config.Index_follow, body=follow_body, ignore=400)
-         return handleIndexResponse(resp)
-     elif scope.get("scope") == "user":
-         user_body = {
-             "mappings": {
-                 "properties": {
-                     "id": {"type": "keyword"},
-                     "name": {"type": "keyword"},
-                     "username": {"type": "keyword"},
-                     "bio": {"type": "text"},
-                     "location": {"type": "keyword"},
-                     "url": {"type": "text"},
-                     "join_datetime": {"type": "date", "format": "yyyy-MM-dd HH:mm:ss"},
-                     "tweets": {"type": "integer"},
-                     "following": {"type": "integer"},
-                     "followers": {"type": "integer"},
-                     "likes": {"type": "integer"},
-                     "media": {"type": "integer"},
-                     "private": {"type": "integer"},
-                     "verified": {"type": "integer"},
-                     "avatar": {"type": "text"},
-                     "background_image": {"type": "text"},
-                     "session": {"type": "keyword"},
-                     "geo_user": {"type": "geo_point"}
-                 }
-             },
-             "settings": {
-                 "number_of_shards": 1
-             }
-         }
-         with nostdout():
-             resp = instance.indices.create(index=config.Index_users, body=user_body, ignore=400)
-         return handleIndexResponse(resp)
-     else:
-         print("[x] error index pre-creation :: storage.elasticsearch.createIndex")
-         return False
-
- @contextlib.contextmanager
- def nostdout():
-     savestdout = sys.stdout
-     sys.stdout = RecycleObject()
-     yield
-     sys.stdout = savestdout
-
- def weekday(day):
-     weekdays = {
-         "Monday": 1,
-         "Tuesday": 2,
-         "Wednesday": 3,
-         "Thursday": 4,
-         "Friday": 5,
-         "Saturday": 6,
-         "Sunday": 7,
-     }
-
-     return weekdays[day]
-
- def Tweet(Tweet, config):
-     global _index_tweet_status
-     global _is_near_def
-     date_obj = datetime.strptime(Tweet.datetime, "%Y-%m-%d %H:%M:%S %Z")
-
-     actions = []
-
-     try:
-         retweet = Tweet.retweet
-     except AttributeError:
-         retweet = None
-
-     dt = f"{Tweet.datestamp} {Tweet.timestamp}"
-
-     j_data = {
-         "_index": config.Index_tweets,
-         "_id": str(Tweet.id) + "_raw_" + config.Essid,
-         "_source": {
-             "id": str(Tweet.id),
-             "conversation_id": Tweet.conversation_id,
-             "created_at": Tweet.datetime,
-             "date": dt,
-             "timezone": Tweet.timezone,
-             "place": Tweet.place,
-             "tweet": Tweet.tweet,
-             "language": Tweet.lang,
-             "hashtags": Tweet.hashtags,
-             "cashtags": Tweet.cashtags,
-             "user_id_str": Tweet.user_id_str,
-             "username": Tweet.username,
-             "name": Tweet.name,
-             "day": date_obj.weekday(),
-             "hour": date_obj.hour,
-             "link": Tweet.link,
-             "retweet": retweet,
-             "essid": config.Essid,
-             "nlikes": int(Tweet.likes_count),
-             "nreplies": int(Tweet.replies_count),
-             "nretweets": int(Tweet.retweets_count),
-             "quote_url": Tweet.quote_url,
-             "video": Tweet.video,
-             "search": str(config.Search),
-             "near": config.Near
-         }
-     }
-     if retweet is not None:
-         j_data["_source"].update({"user_rt_id": Tweet.user_rt_id})
-         j_data["_source"].update({"user_rt": Tweet.user_rt})
-         j_data["_source"].update({"retweet_id": Tweet.retweet_id})
-         j_data["_source"].update({"retweet_date": Tweet.retweet_date})
-     if Tweet.reply_to:
-         j_data["_source"].update({"reply_to": Tweet.reply_to})
-     if Tweet.photos:
-         _photos = []
-         for photo in Tweet.photos:
-             _photos.append(photo)
-         j_data["_source"].update({"photos": _photos})
-     if Tweet.thumbnail:
-         j_data["_source"].update({"thumbnail": Tweet.thumbnail})
-     if Tweet.mentions:
-         _mentions = []
-         for mention in Tweet.mentions:
-             _mentions.append(mention)
-         j_data["_source"].update({"mentions": _mentions})
-     if Tweet.urls:
-         _urls = []
-         for url in Tweet.urls:
-             _urls.append(url)
-         j_data["_source"].update({"urls": _urls})
-     if config.Near or config.Geo:
-         if not _is_near_def:
-             __geo = ""
-             __near = ""
-             if config.Geo:
-                 __geo = config.Geo
-             if config.Near:
-                 __near = config.Near
-             _is_near_def = getLocation(__near + __geo, near=True)
-         if _near:
-             j_data["_source"].update({"geo_near": _near})
-     if Tweet.place:
-         _t_place = getLocation(Tweet.place)
-         if _t_place:
-             j_data["_source"].update({"geo_tweet": getLocation(Tweet.place)})
-     if Tweet.source:
-         j_data["_source"].update({"source": Tweet.source})
-     if config.Translate:
-         j_data["_source"].update({"translate": Tweet.translate})
-         j_data["_source"].update({"trans_src": Tweet.trans_src})
-         j_data["_source"].update({"trans_dest": Tweet.trans_dest})
-
-     actions.append(j_data)
-
-     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
-     if not _index_tweet_status:
-         _index_tweet_status = createIndex(config, es, scope="tweet")
-     with nostdout():
-         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
-     actions = []
-
- def Follow(user, config):
-     global _index_follow_status
-     actions = []
-
-     if config.Following:
-         _user = config.Username
-         _follow = user
-     else:
-         _user = user
-         _follow = config.Username
-     j_data = {
-         "_index": config.Index_follow,
-         "_id": _user + "_" + _follow + "_" + config.Essid,
-         "_source": {
-             "user": _user,
-             "follow": _follow,
-             "essid": config.Essid
-         }
-     }
-     actions.append(j_data)
-
-     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
-     if not _index_follow_status:
-         _index_follow_status = createIndex(config, es, scope="follow")
-     with nostdout():
-         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
-     actions = []
-
- def UserProfile(user, config):
-     global _index_user_status
-     global _is_location_def
-     actions = []
-
-     j_data = {
-         "_index": config.Index_users,
-         "_id": user.id + "_" + user.join_date + "_" + user.join_time + "_" + config.Essid,
-         "_source": {
-             "id": user.id,
-             "name": user.name,
-             "username": user.username,
-             "bio": user.bio,
-             "location": user.location,
-             "url": user.url,
-             "join_datetime": user.join_date + " " + user.join_time,
-             "tweets": user.tweets,
-             "following": user.following,
-             "followers": user.followers,
-             "likes": user.likes,
-             "media": user.media_count,
-             "private": user.is_private,
-             "verified": user.is_verified,
-             "avatar": user.avatar,
-             "background_image": user.background_image,
-             "session": config.Essid
-         }
-     }
-     if config.Location:
-         if not _is_location_def:
-             _is_location_def = getLocation(user.location, location=True)
-         if _location:
-             j_data["_source"].update({"geo_user": _location})
-     actions.append(j_data)
-
-     es = Elasticsearch(config.Elasticsearch, verify_certs=config.Skip_certs)
-     if not _index_user_status:
-         _index_user_status = createIndex(config, es, scope="user")
-     with nostdout():
-         helpers.bulk(es, actions, chunk_size=2000, request_timeout=200)
-     actions = []
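
A hedged sketch of querying the tweet index that createIndex(..., scope="tweet") builds; the host and index name are assumptions (twint takes them from config.Elasticsearch and config.Index_tweets), and the field names follow the mapping above:

    from elasticsearch import Elasticsearch

    es = Elasticsearch("http://localhost:9200")      # assumed local node
    resp = es.search(index="twinttweets", body={     # placeholder index name
        "query": {"match": {"tweet": "election"}},
        "size": 5,
    })
    for hit in resp["hits"]["hits"]:
        print(hit["_source"]["date"], hit["_source"]["tweet"])
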
twitter-scraper/twint-master/twint/storage/panda.py DELETED
@@ -1,196 +0,0 @@
- import datetime, pandas as pd, warnings
- from time import strftime, localtime
- from twint.tweet import Tweet_formats
-
- Tweets_df = None
- Follow_df = None
- User_df = None
-
- _object_blocks = {
-     "tweet": [],
-     "user": [],
-     "following": [],
-     "followers": []
- }
-
- weekdays = {
-     "Monday": 1,
-     "Tuesday": 2,
-     "Wednesday": 3,
-     "Thursday": 4,
-     "Friday": 5,
-     "Saturday": 6,
-     "Sunday": 7,
- }
-
- _type = ""
-
- def _concat(df, _type):
-     if df is None:
-         df = pd.DataFrame(_object_blocks[_type])
-     else:
-         _df = pd.DataFrame(_object_blocks[_type])
-         df = pd.concat([df, _df], sort=True)
-     return df
-
- def _autoget(_type):
-     global Tweets_df
-     global Follow_df
-     global User_df
-
-     if _type == "tweet":
-         Tweets_df = _concat(Tweets_df, _type)
-     elif _type == "followers" or _type == "following":
-         Follow_df = _concat(Follow_df, _type)
-     elif _type == "user":
-         User_df = _concat(User_df, _type)
-     else:
-         print("[x] Wrong type of object passed")
-
-
- def update(object, config):
-     global _type
-
-     #try:
-     #    _type = ((object.__class__.__name__ == "tweet")*"tweet" +
-     #             (object.__class__.__name__ == "user")*"user")
-     #except AttributeError:
-     #    _type = config.Following*"following" + config.Followers*"followers"
-     if object.__class__.__name__ == "tweet":
-         _type = "tweet"
-     elif object.__class__.__name__ == "user":
-         _type = "user"
-     elif object.__class__.__name__ == "dict":
-         _type = config.Following*"following" + config.Followers*"followers"
-
-     if _type == "tweet":
-         Tweet = object
-         datetime_ms = datetime.datetime.strptime(Tweet.datetime, Tweet_formats['datetime']).timestamp() * 1000
-         day = weekdays[strftime("%A", localtime(datetime_ms/1000))]
-         dt = f"{object.datestamp} {object.timestamp}"
-         _data = {
-             "id": str(Tweet.id),
-             "conversation_id": Tweet.conversation_id,
-             "created_at": datetime_ms,
-             "date": dt,
-             "timezone": Tweet.timezone,
-             "place": Tweet.place,
-             "tweet": Tweet.tweet,
-             "language": Tweet.lang,
-             "hashtags": Tweet.hashtags,
-             "cashtags": Tweet.cashtags,
-             "user_id": Tweet.user_id,
-             "user_id_str": Tweet.user_id_str,
-             "username": Tweet.username,
-             "name": Tweet.name,
-             "day": day,
-             "hour": strftime("%H", localtime(datetime_ms/1000)),
-             "link": Tweet.link,
-             "urls": Tweet.urls,
-             "photos": Tweet.photos,
-             "video": Tweet.video,
-             "thumbnail": Tweet.thumbnail,
-             "retweet": Tweet.retweet,
-             "nlikes": int(Tweet.likes_count),
-             "nreplies": int(Tweet.replies_count),
-             "nretweets": int(Tweet.retweets_count),
-             "quote_url": Tweet.quote_url,
-             "search": str(config.Search),
-             "near": Tweet.near,
-             "geo": Tweet.geo,
-             "source": Tweet.source,
-             "user_rt_id": Tweet.user_rt_id,
-             "user_rt": Tweet.user_rt,
-             "retweet_id": Tweet.retweet_id,
-             "reply_to": Tweet.reply_to,
-             "retweet_date": Tweet.retweet_date,
-             "translate": Tweet.translate,
-             "trans_src": Tweet.trans_src,
-             "trans_dest": Tweet.trans_dest
-         }
-         _object_blocks[_type].append(_data)
-     elif _type == "user":
-         user = object
-         try:
-             background_image = user.background_image
-         except:
-             background_image = ""
-         _data = {
-             "id": user.id,
-             "name": user.name,
-             "username": user.username,
-             "bio": user.bio,
-             "url": user.url,
-             "join_datetime": user.join_date + " " + user.join_time,
-             "join_date": user.join_date,
-             "join_time": user.join_time,
-             "tweets": user.tweets,
-             "location": user.location,
-             "following": user.following,
-             "followers": user.followers,
-             "likes": user.likes,
-             "media": user.media_count,
-             "private": user.is_private,
-             "verified": user.is_verified,
-             "avatar": user.avatar,
-             "background_image": background_image,
-         }
-         _object_blocks[_type].append(_data)
-     elif _type == "followers" or _type == "following":
-         _data = {
-             config.Following*"following" + config.Followers*"followers" :
-                 {config.Username: object[_type]}
-         }
-         _object_blocks[_type] = _data
-     else:
-         print("Wrong type of object passed!")
-
-
- def clean():
-     global Tweets_df
-     global Follow_df
-     global User_df
-     _object_blocks["tweet"].clear()
-     _object_blocks["following"].clear()
-     _object_blocks["followers"].clear()
-     _object_blocks["user"].clear()
-     Tweets_df = None
-     Follow_df = None
-     User_df = None
-
- def save(_filename, _dataframe, **options):
-     if options.get("dataname"):
-         _dataname = options.get("dataname")
-     else:
-         _dataname = "twint"
-
-     if not options.get("type"):
-         with warnings.catch_warnings():
-             warnings.simplefilter("ignore")
-             _store = pd.HDFStore(_filename + ".h5")
-             _store[_dataname] = _dataframe
-             _store.close()
-     elif options.get("type") == "Pickle":
-         with warnings.catch_warnings():
-             warnings.simplefilter("ignore")
-             _dataframe.to_pickle(_filename + ".pkl")
-     else:
-         print("""Please specify: filename, DataFrame, DataFrame name and type
-               (HDF5, default, or Pickle)""")
-
- def read(_filename, **options):
-     if not options.get("dataname"):
-         _dataname = "twint"
-     else:
-         _dataname = options.get("dataname")
-
-     if not options.get("type"):
-         _store = pd.HDFStore(_filename + ".h5")
-         _df = _store[_dataname]
-         return _df
-     elif options.get("type") == "Pickle":
-         _df = pd.read_pickle(_filename + ".pkl")
-         return _df
-     else:
-         print("""Please specify: DataFrame, DataFrame name (twint as default),
-               filename and type (HDF5, default, or Pickle""")
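
A minimal sketch of the DataFrame integration this module backs: with config.Pandas enabled, update() appends rows to _object_blocks and _autoget("tweet") concatenates them into Tweets_df (the search term below is an invented example):

    import twint
    from twint.storage import panda

    c = twint.Config()
    c.Search = "invandring"   # illustrative query
    c.Limit = 20
    c.Pandas = True

    twint.run.Search(c)
    df = panda.Tweets_df      # filled by _autoget("tweet")
    print(df[["date", "username", "tweet"]].head())
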
twitter-scraper/twint-master/twint/storage/write.py DELETED
@@ -1,77 +0,0 @@
- from . import write_meta as meta
- import csv
- import json
- import os
-
- def outputExt(objType, fType):
-     if objType == "str":
-         objType = "username"
-     outExt = f"/{objType}s.{fType}"
-
-     return outExt
-
- def addExt(base, objType, fType):
-     if len(base.split('.')) == 1:
-         createDirIfMissing(base)
-         base += outputExt(objType, fType)
-
-     return base
-
- def Text(entry, f):
-     print(entry.replace('\n', ' '), file=open(f, "a", encoding="utf-8"))
-
- def Type(config):
-     if config.User_full:
-         _type = "user"
-     elif config.Followers or config.Following:
-         _type = "username"
-     else:
-         _type = "tweet"
-
-     return _type
-
- def struct(obj, custom, _type):
-     if custom:
-         fieldnames = custom
-         row = {}
-         for f in fieldnames:
-             row[f] = meta.Data(obj, _type)[f]
-     else:
-         fieldnames = meta.Fieldnames(_type)
-         row = meta.Data(obj, _type)
-
-     return fieldnames, row
-
- def createDirIfMissing(dirname):
-     if not os.path.exists(dirname):
-         os.makedirs(dirname)
-
- def Csv(obj, config):
-     _obj_type = obj.__class__.__name__
-     if _obj_type == "str":
-         _obj_type = "username"
-     fieldnames, row = struct(obj, config.Custom[_obj_type], _obj_type)
-
-     base = addExt(config.Output, _obj_type, "csv")
-     dialect = 'excel-tab' if 'Tabs' in config.__dict__ else 'excel'
-
-     if not (os.path.exists(base)):
-         with open(base, "w", newline='', encoding="utf-8") as csv_file:
-             writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
-             writer.writeheader()
-
-     with open(base, "a", newline='', encoding="utf-8") as csv_file:
-         writer = csv.DictWriter(csv_file, fieldnames=fieldnames, dialect=dialect)
-         writer.writerow(row)
-
- def Json(obj, config):
-     _obj_type = obj.__class__.__name__
-     if _obj_type == "str":
-         _obj_type = "username"
-     null, data = struct(obj, config.Custom[_obj_type], _obj_type)
-
-     base = addExt(config.Output, _obj_type, "json")
-
-     with open(base, "a", newline='', encoding="utf-8") as json_file:
-         json.dump(data, json_file, ensure_ascii=False)
-         json_file.write("\n")
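
A short sketch of how Csv() above is reached in practice: a store flag plus config.Output routes each collected tweet through struct() and appends one row per tweet (the query and file name are invented examples):

    import twint

    c = twint.Config()
    c.Search = "klimat"       # illustrative query
    c.Limit = 20
    c.Store_csv = True        # makes the output layer call write.Csv per tweet
    c.Output = "tweets.csv"   # extension handling done by addExt()

    twint.run.Search(c)
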
twitter-scraper/twint-master/twint/storage/write_meta.py DELETED
@@ -1,151 +0,0 @@
- def tweetData(t):
-     data = {
-         "id": int(t.id),
-         "conversation_id": t.conversation_id,
-         "created_at": t.datetime,
-         "date": t.datestamp,
-         "time": t.timestamp,
-         "timezone": t.timezone,
-         "user_id": t.user_id,
-         "username": t.username,
-         "name": t.name,
-         "place": t.place,
-         "tweet": t.tweet,
-         "language": t.lang,
-         "mentions": t.mentions,
-         "urls": t.urls,
-         "photos": t.photos,
-         "replies_count": int(t.replies_count),
-         "retweets_count": int(t.retweets_count),
-         "likes_count": int(t.likes_count),
-         "hashtags": t.hashtags,
-         "cashtags": t.cashtags,
-         "link": t.link,
-         "retweet": t.retweet,
-         "quote_url": t.quote_url,
-         "video": t.video,
-         "thumbnail": t.thumbnail,
-         "near": t.near,
-         "geo": t.geo,
-         "source": t.source,
-         "user_rt_id": t.user_rt_id,
-         "user_rt": t.user_rt,
-         "retweet_id": t.retweet_id,
-         "reply_to": t.reply_to,
-         "retweet_date": t.retweet_date,
-         "translate": t.translate,
-         "trans_src": t.trans_src,
-         "trans_dest": t.trans_dest,
-     }
-     return data
-
- def tweetFieldnames():
-     fieldnames = [
-         "id",
-         "conversation_id",
-         "created_at",
-         "date",
-         "time",
-         "timezone",
-         "user_id",
-         "username",
-         "name",
-         "place",
-         "tweet",
-         "language",
-         "mentions",
-         "urls",
-         "photos",
-         "replies_count",
-         "retweets_count",
-         "likes_count",
-         "hashtags",
-         "cashtags",
-         "link",
-         "retweet",
-         "quote_url",
-         "video",
-         "thumbnail",
-         "near",
-         "geo",
-         "source",
-         "user_rt_id",
-         "user_rt",
-         "retweet_id",
-         "reply_to",
-         "retweet_date",
-         "translate",
-         "trans_src",
-         "trans_dest"
-     ]
-     return fieldnames
-
- def userData(u):
-     data = {
-         "id": int(u.id),
-         "name": u.name,
-         "username": u.username,
-         "bio": u.bio,
-         "location": u.location,
-         "url": u.url,
-         "join_date": u.join_date,
-         "join_time": u.join_time,
-         "tweets": int(u.tweets),
-         "following": int(u.following),
-         "followers": int(u.followers),
-         "likes": int(u.likes),
-         "media": int(u.media_count),
-         "private": u.is_private,
-         "verified": u.is_verified,
-         "profile_image_url": u.avatar,
-         "background_image": u.background_image
-     }
-     return data
-
- def userFieldnames():
-     fieldnames = [
-         "id",
-         "name",
-         "username",
-         "bio",
-         "location",
-         "url",
-         "join_date",
-         "join_time",
-         "tweets",
-         "following",
-         "followers",
-         "likes",
-         "media",
-         "private",
-         "verified",
-         "profile_image_url",
-         "background_image"
-     ]
-     return fieldnames
-
- def usernameData(u):
-     return {"username": u}
-
- def usernameFieldnames():
-     return ["username"]
-
- def Data(obj, _type):
-     if _type == "user":
-         ret = userData(obj)
-     elif _type == "username":
-         ret = usernameData(obj)
-     else:
-         ret = tweetData(obj)
-
-     return ret
-
- def Fieldnames(_type):
-     if _type == "user":
-         ret = userFieldnames()
-     elif _type == "username":
-         ret = usernameFieldnames()
-     else:
-         ret = tweetFieldnames()
-
-     return ret
twitter-scraper/twint-master/twint/token.py DELETED
@@ -1,94 +0,0 @@
- import re
- import time
-
- import requests
- import logging as logme
-
-
- class TokenExpiryException(Exception):
-     def __init__(self, msg):
-         super().__init__(msg)
-
-
- class RefreshTokenException(Exception):
-     def __init__(self, msg):
-         super().__init__(msg)
-
-
- class Token:
-     def __init__(self, config):
-         self._session = requests.Session()
-         self._session.headers.update({'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0'})
-         self.config = config
-         self._retries = 5
-         self._timeout = 10
-         self.url = 'https://twitter.com'
-
-     def _request(self):
-         for attempt in range(self._retries + 1):
-             # The request is newly prepared on each retry because of potential cookie updates.
-             req = self._session.prepare_request(requests.Request('GET', self.url))
-             logme.debug(f'Retrieving {req.url}')
-             try:
-                 r = self._session.send(req, allow_redirects=True, timeout=self._timeout)
-             except requests.exceptions.RequestException as exc:
-                 if attempt < self._retries:
-                     retrying = ', retrying'
-                     level = logme.WARNING
-                 else:
-                     retrying = ''
-                     level = logme.ERROR
-                 logme.log(level, f'Error retrieving {req.url}: {exc!r}{retrying}')
-             else:
-                 success, msg = (True, None)
-                 msg = f': {msg}' if msg else ''
-
-                 if success:
-                     logme.debug(f'{req.url} retrieved successfully{msg}')
-                     return r
-             if attempt < self._retries:
-                 # TODO : might wanna tweak this back-off timer
-                 sleep_time = 2.0 * 2 ** attempt
-                 logme.info(f'Waiting {sleep_time:.0f} seconds')
-                 time.sleep(sleep_time)
-         else:
-             msg = f'{self._retries + 1} requests to {self.url} failed, giving up.'
-             logme.fatal(msg)
-             self.config.Guest_token = None
-             raise RefreshTokenException(msg)
-
-     def refresh(self):
-         logme.debug('Retrieving guest token')
-         res = self._request()
-         match = re.search(r'\("gt=(\d+);', res.text)
-         if match:
-             logme.debug('Found guest token in HTML')
-             self.config.Guest_token = str(match.group(1))
-         else:
-             headers = {
-                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:78.0) Gecko/20100101 Firefox/78.0',
-                 'authority': 'api.twitter.com',
-                 'content-length': '0',
-                 'authorization': self.config.Bearer_token,
-                 'x-twitter-client-language': 'en',
-                 'x-csrf-token': res.cookies.get("ct0"),
-                 'x-twitter-active-user': 'yes',
-                 'content-type': 'application/x-www-form-urlencoded',
-                 'accept': '*/*',
-                 'sec-gpc': '1',
-                 'origin': 'https://twitter.com',
-                 'sec-fetch-site': 'same-site',
-                 'sec-fetch-mode': 'cors',
-                 'sec-fetch-dest': 'empty',
-                 'referer': 'https://twitter.com/',
-                 'accept-language': 'en-US',
-             }
-             self._session.headers.update(headers)
-             req = self._session.prepare_request(requests.Request('POST', 'https://api.twitter.com/1.1/guest/activate.json'))
-             res = self._session.send(req, allow_redirects=True, timeout=self._timeout)
-             if 'guest_token' in res.json():
-                 logme.debug('Found guest token in JSON')
-                 self.config.Guest_token = res.json()['guest_token']
-             else:
-                 self.config.Guest_token = None
-                 raise RefreshTokenException('Could not find the Guest token in HTML')
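
A minimal sketch of exercising Token directly (assuming a twint Config, which carries the Bearer_token and Guest_token fields that refresh() reads and writes):

    import twint
    from twint.token import Token, RefreshTokenException

    c = twint.Config()
    try:
        Token(c).refresh()   # scrapes twitter.com, falls back to activate.json
        print(c.Guest_token)
    except RefreshTokenException as e:
        print("no guest token:", e)
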
twitter-scraper/twint-master/twint/tweet.py DELETED
@@ -1,166 +0,0 @@
- from time import strftime, localtime
- from datetime import datetime, timezone
-
- import logging as logme
- from googletransx import Translator
- # ref.
- # - https://github.com/x0rzkov/py-googletrans#basic-usage
- translator = Translator()
-
-
- class tweet:
-     """Define Tweet class
-     """
-     type = "tweet"
-
-     def __init__(self):
-         pass
-
-
- def utc_to_local(utc_dt):
-     return utc_dt.replace(tzinfo=timezone.utc).astimezone(tz=None)
-
-
- Tweet_formats = {
-     'datetime': '%Y-%m-%d %H:%M:%S %Z',
-     'datestamp': '%Y-%m-%d',
-     'timestamp': '%H:%M:%S'
- }
-
-
- def _get_mentions(tw):
-     """Extract mentions from tweet
-     """
-     logme.debug(__name__ + ':get_mentions')
-     try:
-         mentions = [
-             {
-                 'screen_name': _mention['screen_name'],
-                 'name': _mention['name'],
-                 'id': _mention['id_str'],
-             } for _mention in tw['entities']['user_mentions']
-             if tw['display_text_range'][0] < _mention['indices'][0]
-         ]
-     except KeyError:
-         mentions = []
-     return mentions
-
-
- def _get_reply_to(tw):
-     try:
-         reply_to = [
-             {
-                 'screen_name': _mention['screen_name'],
-                 'name': _mention['name'],
-                 'id': _mention['id_str'],
-             } for _mention in tw['entities']['user_mentions']
-             if tw['display_text_range'][0] > _mention['indices'][1]
-         ]
-     except KeyError:
-         reply_to = []
-     return reply_to
-
-
- def getText(tw):
-     """Replace some text
-     """
-     logme.debug(__name__ + ':getText')
-     text = tw['full_text']
-     text = text.replace("http", " http")
-     text = text.replace("pic.twitter", " pic.twitter")
-     text = text.replace("\n", " ")
-
-     return text
-
-
- def Tweet(tw, config):
-     """Create Tweet object
-     """
-     logme.debug(__name__ + ':Tweet')
-     t = tweet()
-     t.id = int(tw['id_str'])
-     t.id_str = tw["id_str"]
-     t.conversation_id = tw["conversation_id_str"]
-
-     # parsing date to user-friendly format
-     _dt = tw['created_at']
-     _dt = datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
-     _dt = utc_to_local(_dt)
-     t.datetime = str(_dt.strftime(Tweet_formats['datetime']))
-     # date is of the format year,
-     t.datestamp = _dt.strftime(Tweet_formats['datestamp'])
-     t.timestamp = _dt.strftime(Tweet_formats['timestamp'])
-     t.user_id = int(tw["user_id_str"])
-     t.user_id_str = tw["user_id_str"]
-     t.username = tw["user_data"]['screen_name']
-     t.name = tw["user_data"]['name']
-     t.place = tw['geo'] if 'geo' in tw and tw['geo'] else ""
-     t.timezone = strftime("%z", localtime())
-     t.mentions = _get_mentions(tw)
-     t.reply_to = _get_reply_to(tw)
-     try:
-         t.urls = [_url['expanded_url'] for _url in tw['entities']['urls']]
-     except KeyError:
-         t.urls = []
-     try:
-         t.photos = [_img['media_url_https'] for _img in tw['entities']['media'] if _img['type'] == 'photo' and
-                     _img['expanded_url'].find('/photo/') != -1]
-     except KeyError:
-         t.photos = []
-     try:
-         t.video = 1 if len(tw['extended_entities']['media']) else 0
-     except KeyError:
-         t.video = 0
-     try:
-         t.thumbnail = tw['extended_entities']['media'][0]['media_url_https']
-     except KeyError:
-         t.thumbnail = ''
-     t.tweet = getText(tw)
-     t.lang = tw['lang']
-     try:
-         t.hashtags = [hashtag['text'] for hashtag in tw['entities']['hashtags']]
-     except KeyError:
-         t.hashtags = []
-     try:
-         t.cashtags = [cashtag['text'] for cashtag in tw['entities']['symbols']]
-     except KeyError:
-         t.cashtags = []
-     t.replies_count = tw['reply_count']
-     t.retweets_count = tw['retweet_count']
-     t.likes_count = tw['favorite_count']
-     t.link = f"https://twitter.com/{t.username}/status/{t.id}"
-     try:
-         if 'user_rt_id' in tw['retweet_data']:
-             t.retweet = True
-             t.retweet_id = tw['retweet_data']['retweet_id']
-             t.retweet_date = tw['retweet_data']['retweet_date']
-             t.user_rt = tw['retweet_data']['user_rt']
-             t.user_rt_id = tw['retweet_data']['user_rt_id']
-     except KeyError:
-         t.retweet = False
-         t.retweet_id = ''
-         t.retweet_date = ''
-         t.user_rt = ''
-         t.user_rt_id = ''
-     try:
-         t.quote_url = tw['quoted_status_permalink']['expanded'] if tw['is_quote_status'] else ''
-     except KeyError:
-         # means that the quoted tweet has been deleted
-         t.quote_url = 0
-     t.near = config.Near if config.Near else ""
-     t.geo = config.Geo if config.Geo else ""
-     t.source = config.Source if config.Source else ""
-     t.translate = ''
-     t.trans_src = ''
-     t.trans_dest = ''
-     if config.Translate:
-         try:
-             ts = translator.translate(text=t.tweet, dest=config.TranslateDest)
-             t.translate = ts.text
-             t.trans_src = ts.src
-             t.trans_dest = ts.dest
-         # ref. https://github.com/SuniTheFish/ChainTranslator/blob/master/ChainTranslator/__main__.py#L31
-         except ValueError as e:
-             logme.debug(__name__ + ':Tweet:translator.translate:' + str(e))
-             raise Exception("Invalid destination language: {} / Tweet: {}".format(config.TranslateDest, t.tweet))
-     return t
twitter-scraper/twint-master/twint/url.py DELETED
@@ -1,195 +0,0 @@
- import datetime
- import json
- from sys import platform
- import logging as logme
- from urllib.parse import urlencode
- from urllib.parse import quote
-
- mobile = "https://mobile.twitter.com"
- base = "https://api.twitter.com/2/search/adaptive.json"
-
-
- def _sanitizeQuery(_url, params):
-     _serialQuery = ""
-     _serialQuery = urlencode(params, quote_via=quote)
-     _serialQuery = _url + "?" + _serialQuery
-     return _serialQuery
-
-
- def _formatDate(date):
-     if "win" in platform:
-         return f'\"{date.split()[0]}\"'
-     try:
-         return int(datetime.datetime.strptime(date, "%Y-%m-%d %H:%M:%S").timestamp())
-     except ValueError:
-         return int(datetime.datetime.strptime(date, "%Y-%m-%d").timestamp())
-
-
- async def Favorites(username, init):
-     logme.debug(__name__ + ':Favorites')
-     url = f"{mobile}/{username}/favorites?lang=en"
-
-     if init != '-1':
-         url += f"&max_id={init}"
-
-     return url
-
-
- async def Followers(username, init):
-     logme.debug(__name__ + ':Followers')
-     url = f"{mobile}/{username}/followers?lang=en"
-
-     if init != '-1':
-         url += f"&cursor={init}"
-
-     return url
-
-
- async def Following(username, init):
-     logme.debug(__name__ + ':Following')
-     url = f"{mobile}/{username}/following?lang=en"
-
-     if init != '-1':
-         url += f"&cursor={init}"
-
-     return url
-
-
- async def MobileProfile(username, init):
-     logme.debug(__name__ + ':MobileProfile')
-     url = f"{mobile}/{username}?lang=en"
-
-     if init != '-1':
-         url += f"&max_id={init}"
-
-     return url
-
-
- async def Search(config, init):
-     logme.debug(__name__ + ':Search')
-     url = base
-     tweet_count = 100 if not config.Limit else config.Limit
-     q = ""
-     params = [
-         # ('include_blocking', '1'),
-         # ('include_blocked_by', '1'),
-         # ('include_followed_by', '1'),
-         # ('include_want_retweets', '1'),
-         # ('include_mute_edge', '1'),
-         # ('include_can_dm', '1'),
-         ('include_can_media_tag', '1'),
-         # ('skip_status', '1'),
-         # ('include_cards', '1'),
-         ('include_ext_alt_text', 'true'),
-         ('include_quote_count', 'true'),
-         ('include_reply_count', '1'),
-         ('tweet_mode', 'extended'),
-         ('include_entities', 'true'),
-         ('include_user_entities', 'true'),
-         ('include_ext_media_availability', 'true'),
-         ('send_error_codes', 'true'),
-         ('simple_quoted_tweet', 'true'),
-         ('count', tweet_count),
-         ('query_source', 'typed_query'),
-         # ('pc', '1'),
-         ('cursor', str(init)),
-         ('spelling_corrections', '1'),
-         ('ext', 'mediaStats%2ChighlightedLabel'),
-         ('tweet_search_mode', 'live'),  # this can be handled better, maybe take an argument and set it then
-     ]
-     if not config.Popular_tweets:
-         params.append(('f', 'tweets'))
-     if config.Lang:
-         params.append(("l", config.Lang))
-         params.append(("lang", "en"))
-     if config.Query:
-         q += f" from:{config.Query}"
-     if config.Username:
-         q += f" from:{config.Username}"
-     if config.Geo:
-         config.Geo = config.Geo.replace(" ", "")
-         q += f" geocode:{config.Geo}"
-     if config.Search:
-
-         q += f" {config.Search}"
-     if config.Year:
-         q += f" until:{config.Year}-1-1"
-     if config.Since:
-         q += f" since:{_formatDate(config.Since)}"
-     if config.Until:
-         q += f" until:{_formatDate(config.Until)}"
-     if config.Email:
-         q += ' "mail" OR "email" OR'
-         q += ' "gmail" OR "e-mail"'
-     if config.Phone:
-         q += ' "phone" OR "call me" OR "text me"'
-     if config.Verified:
-         q += " filter:verified"
-     if config.To:
-         q += f" to:{config.To}"
-     if config.All:
-         q += f" to:{config.All} OR from:{config.All} OR @{config.All}"
-     if config.Near:
-         q += f' near:"{config.Near}"'
-     if config.Images:
-         q += " filter:images"
-     if config.Videos:
-         q += " filter:videos"
-     if config.Media:
-         q += " filter:media"
-     if config.Replies:
-         q += " filter:replies"
-     # although this filter can still be used, but I found it broken in my preliminary testing, needs more testing
-     if config.Native_retweets:
-         q += " filter:nativeretweets"
-     if config.Min_likes:
-         q += f" min_faves:{config.Min_likes}"
-     if config.Min_retweets:
-         q += f" min_retweets:{config.Min_retweets}"
-     if config.Min_replies:
-         q += f" min_replies:{config.Min_replies}"
-     if config.Links == "include":
-         q += " filter:links"
-     elif config.Links == "exclude":
-         q += " exclude:links"
-     if config.Source:
-         q += f" source:\"{config.Source}\""
-     if config.Members_list:
-         q += f" list:{config.Members_list}"
-     if config.Filter_retweets:
-         q += f" exclude:nativeretweets exclude:retweets"
-     if config.Custom_query:
-         q = config.Custom_query
-
-     q = q.strip()
-     params.append(("q", q))
-     _serialQuery = _sanitizeQuery(url, params)
-     return url, params, _serialQuery
-
-
- def SearchProfile(config, init=None):
-     logme.debug(__name__ + ':SearchProfile')
-     _url = 'https://twitter.com/i/api/graphql/CwLU7qTfeu0doqhSr6tW4A/UserTweetsAndReplies'
-     tweet_count = 100
-     variables = {
-         "userId": config.User_id,
-         "count": tweet_count,
-         "includePromotedContent": True,
-         "withCommunity": True,
-         "withSuperFollowsUserFields": True,
-         "withBirdwatchPivots": False,
-         "withDownvotePerspective": False,
-         "withReactionsMetadata": False,
-         "withReactionsPerspective": False,
-         "withSuperFollowsTweetFields": True,
-         "withVoice": True,
-         "withV2Timeline": False,
-         "__fs_interactive_text": False,
-         "__fs_dont_mention_me_view_api_enabled": False,
-     }
-     if type(init) == str:
-         variables['cursor'] = init
-     params = [('variables', json.dumps(variables, separators=(',', ':')))]
-
-     _serialQuery = _sanitizeQuery(_url, params)
-     return _serialQuery, [], _serialQuery
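
Since Search() above is a coroutine, a quick way to inspect the query it builds is to drive it with asyncio; the config values below are invented examples:

    import asyncio
    import twint

    c = twint.Config()
    c.Search = "valet"    # illustrative search term
    c.Lang = "sv"

    _url, params, serial = asyncio.run(twint.url.Search(c, init='-1'))
    print(serial)         # the fully encoded adaptive.json request
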
twitter-scraper/twint-master/twint/user.py DELETED
@@ -1,52 +0,0 @@
- import datetime
- import logging as logme
-
-
- class user:
-     type = "user"
-
-     def __init__(self):
-         pass
-
-
- User_formats = {
-     'join_date': '%Y-%m-%d',
-     'join_time': '%H:%M:%S %Z'
- }
-
-
- # ur object must be a json from the endpoint https://api.twitter.com/graphql
- def User(ur):
-     logme.debug(__name__ + ':User')
-     if 'data' not in ur or 'user' not in ur['data']:
-         msg = 'malformed json! cannot be parsed to get user data'
-         logme.fatal(msg)
-         raise KeyError(msg)
-     _usr = user()
-     _usr.id = ur['data']['user']['rest_id']
-     _usr.name = ur['data']['user']['legacy']['name']
-     _usr.username = ur['data']['user']['legacy']['screen_name']
-     _usr.bio = ur['data']['user']['legacy']['description']
-     _usr.location = ur['data']['user']['legacy']['location']
-     _usr.url = ur['data']['user']['legacy']['url']
-     # parsing date to user-friendly format
-     _dt = ur['data']['user']['legacy']['created_at']
-     _dt = datetime.datetime.strptime(_dt, '%a %b %d %H:%M:%S %z %Y')
-     # date is of the format year,
-     _usr.join_date = _dt.strftime(User_formats['join_date'])
-     _usr.join_time = _dt.strftime(User_formats['join_time'])
-
-     # :type `int`
-     _usr.tweets = int(ur['data']['user']['legacy']['statuses_count'])
-     _usr.following = int(ur['data']['user']['legacy']['friends_count'])
-     _usr.followers = int(ur['data']['user']['legacy']['followers_count'])
-     _usr.likes = int(ur['data']['user']['legacy']['favourites_count'])
-     _usr.media_count = int(ur['data']['user']['legacy']['media_count'])
-
-     _usr.is_private = ur['data']['user']['legacy']['protected']
-     _usr.is_verified = ur['data']['user']['legacy']['verified']
-     _usr.avatar = ur['data']['user']['legacy']['profile_image_url_https']
-     _usr.background_image = ur['data']['user']['legacy']['profile_banner_url']
-     # TODO : future implementation
-     # legacy_extended_profile is also available in some cases which can be used to get DOB of user
-     return _usr
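
A hedged sketch of feeding User() a hand-built response in the GraphQL shape it expects; every value below is invented for illustration:

    sample = {"data": {"user": {
        "rest_id": "123",
        "legacy": {
            "name": "Example", "screen_name": "example", "description": "",
            "location": "", "url": None,
            "created_at": "Mon Jun 06 12:00:00 +0000 2022",
            "statuses_count": 1, "friends_count": 2, "followers_count": 3,
            "favourites_count": 4, "media_count": 0,
            "protected": False, "verified": False,
            "profile_image_url_https": "", "profile_banner_url": "",
        },
    }}}

    u = User(sample)
    print(u.username, u.join_date, u.followers)
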
twitter-scraper/twint-master/twint/verbose.py DELETED
@@ -1,18 +0,0 @@
- def Count(count, config):
-     msg = "[+] Finished: Successfully collected "
-     if config.Followers:
-         msg += f"all {count} users who follow @{config.Username}"
-     elif config.Following:
-         msg += f"all {count} users who @{config.Username} follows"
-     elif config.Favorites:
-         msg += f"{count} Tweets that @{config.Username} liked"
-     else:
-         msg += f"{count} Tweets_and_replies"
-         if config.Username:
-             msg += f" from @{config.Username}"
-     msg += "."
-     print(msg)
-
- def Elastic(elasticsearch):
-     if elasticsearch:
-         print("[+] Indexing to Elasticsearch @ " + str(elasticsearch))
twitter-scraper/twint-master/twitter_scraper.ipynb DELETED
@@ -1,265 +0,0 @@
- {
-  "cells": [
-   {
-    "cell_type": "markdown",
-    "id": "a5361789",
-    "metadata": {},
-    "source": [
-     "## Have to install these packages \n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "c9021300",
-    "metadata": {
-     "scrolled": true
-    },
-    "outputs": [],
-    "source": [
-     "%%capture \n",
-     "!pip3 install Twint \n"
-    ]
-   },
-   {
-    "cell_type": "markdown",
-    "id": "5c857dbf",
-    "metadata": {},
-    "source": [
-     "## Nessessary Imports"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "1413ab2b",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "# import asyncio\n",
-     "# import os\n",
-     "# loop = asyncio.get_event_loop()\n",
-     "# loop.is_running()\n",
-     "# import twint\n",
-     "# import nest_asyncio\n",
-     "# nest_asyncio.apply()"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "d38514f3",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "import scrape\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "a7912a91",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "from_date=\"2022-6-10 10:30:22\"\n",
-     "to_date= \"2022-6-30\"\n",
-     "num_tweets = 20\n",
-     "_data=scrape.scraper.get_tweets(\"jimmieakesson\",u_or_s=\"u\",from_date=221232,to_date=2313)\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "48d50b46",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "tweets= _data.keys()\n",
-     "for i in tweets:\n",
-     "    _data[i][\"tweet\"]\n",
-     "    print(_data[i][\"tweet\"], \"\\n\", \"__________________________________________________________\")"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "72cabcb5",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "from_date=\"2022-6-10 10:30:22\"\n",
-     "to_date= \"2022-6-30\"\n",
-     "num_tweets = 20\n",
-     "_data=scrape.scraper.string_search_user_tweets(\"jimmieakesson\",\"invandring\")\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "549e4fb3",
-    "metadata": {},
-    "outputs": [],
-    "source": [
-     "tweets= _data[\"tweet\"]\n",
-     "for i in tweets:\n",
-     "    print(i, \"\\n\", \"__________________________________________________________\")"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 3,
-    "id": "733dd44a",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "Defaulting to user installation because normal site-packages is not writeable\n",
-       "Requirement already satisfied: snscrape in /home/oxygen/.local/lib/python3.10/site-packages (0.3.4)\n",
-       "Requirement already satisfied: beautifulsoup4 in /home/oxygen/.local/lib/python3.10/site-packages (from snscrape) (4.11.1)\n",
-       "Requirement already satisfied: requests[socks] in /usr/lib/python3/dist-packages (from snscrape) (2.25.1)\n",
-       "Requirement already satisfied: lxml in /usr/lib/python3/dist-packages (from snscrape) (4.8.0)\n",
-       "Requirement already satisfied: soupsieve>1.2 in /home/oxygen/.local/lib/python3.10/site-packages (from beautifulsoup4->snscrape) (2.3.2.post1)\n",
-       "Requirement already satisfied: PySocks!=1.5.7,>=1.5.6 in /home/oxygen/.local/lib/python3.10/site-packages (from requests[socks]->snscrape) (1.7.1)\n"
-      ]
-     }
-    ],
-    "source": [
-     "#%pip install -q snscrape==0.3.4\n",
-     "!pip3 install snscrape\n",
-     "#!pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 14,
-    "id": "0d16422c",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "Note: you may need to restart the kernel to use updated packages.\n"
-      ]
-     }
-    ],
-    "source": [
-     "%pip install -q snscrape==0.3.4\n",
-     "from datetime import date\n",
-     "import os\n",
-     "import pandas as pd\n",
-     "\n",
-     "\n",
-     "def get_tweets(search_term, from_date, to_date=date.today(), num_tweets=100,u_or_s='s'):\n",
-     "    if u_or_s.lower() =='u':\n",
-     "        extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-user '{search_term} until:{to_date}' > extracted-tweets.txt\" \n",
-     "    else:\n",
-     "        extracted_tweets = \"snscrape --format '{content!r}'\"+ f\" --max-results {num_tweets} --since {from_date} twitter-search '{search_term} until:{to_date}' > extracted-tweets.txt\"\n",
-     "    \n",
-     "    os.system(extracted_tweets)\n",
-     "    if os.stat(\"extracted-tweets.txt\").st_size == 0:\n",
-     "        print('No Tweets found')\n",
-     "    else:\n",
-     "        df = pd.read_csv('extracted-tweets.txt', names=['content'])\n",
-     "        data_list=[]\n",
-     "        for row in df['content'].iteritems():\n",
-     "            temp= str(row[0])+str(row[1])\n",
-     "            temp= temp.replace(\"\\'\",\"\")\n",
-     "            data_list.append(temp)\n",
-     "        return data_list\n",
-     "\n"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": 12,
-    "id": "8e2adb35",
-    "metadata": {},
-    "outputs": [
-     {
-      "name": "stdout",
-      "output_type": "stream",
-      "text": [
-       "No Tweets found\n"
-      ]
-     },
-     {
-      "name": "stderr",
-      "output_type": "stream",
-      "text": [
-       "Traceback (most recent call last):\n",
-       "  File \"/home/oxygen/.local/bin/snscrape\", line 8, in <module>\n",
-       "    sys.exit(main())\n",
-       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 224, in main\n",
-       "    args = parse_args()\n",
-       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/cli.py\", line 159, in parse_args\n",
-       "    import snscrape.modules\n",
-       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 15, in <module>\n",
-       "    _import_modules()\n",
-       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/__init__.py\", line 12, in _import_modules\n",
-       "    module = importlib.import_module(moduleName)\n",
-       "  File \"/usr/lib/python3.10/importlib/__init__.py\", line 126, in import_module\n",
-       "    return _bootstrap._gcd_import(name[level:], package, level)\n",
-       "  File \"/home/oxygen/.local/lib/python3.10/site-packages/snscrape/modules/instagram.py\", line 12, in <module>\n",
-       "    class InstagramPost(typing.NamedTuple, snscrape.base.Item):\n",
-       "  File \"/usr/lib/python3.10/typing.py\", line 2329, in _namedtuple_mro_entries\n",
-       "    raise TypeError(\"Multiple inheritance with NamedTuple is not supported\")\n",
-       "TypeError: Multiple inheritance with NamedTuple is not supported\n"
-      ]
-     },
-     {
-      "ename": "UnboundLocalError",
-      "evalue": "local variable 'df' referenced before assignment",
-      "output_type": "error",
-      "traceback": [
-       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-       "\u001b[0;31mUnboundLocalError\u001b[0m Traceback (most recent call last)",
-       "\u001b[0;32m/tmp/ipykernel_26511/1892081786.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0md\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mget_tweets\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"jimmieakesson\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfrom_date\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0;34m\"2022-06-01\"\u001b[0m \u001b[0;34m,\u001b[0m\u001b[0mnum_tweets\u001b[0m \u001b[0;34m=\u001b[0m\u001b[0;36m5\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mu_or_s\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"u\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
-       "\u001b[0;32m/tmp/ipykernel_26511/275462205.py\u001b[0m in \u001b[0;36mget_tweets\u001b[0;34m(search_term, from_date, to_date, num_tweets, u_or_s)\u001b[0m\n\u001b[1;32m 17\u001b[0m \u001b[0mdf\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'extracted-tweets.txt'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mnames\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0mdata_list\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 19\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mrow\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'content'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0miteritems\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 20\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m+\u001b[0m\u001b[0mstr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mrow\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m=\u001b[0m \u001b[0mtemp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mreplace\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"\\'\"\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\"\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
-       "\u001b[0;31mUnboundLocalError\u001b[0m: local variable 'df' referenced before assignment"
-      ]
-     }
-    ],
-    "source": [
-     "d= get_tweets(\"jimmieakesson\",from_date= \"2022-06-01\" ,num_tweets =5, u_or_s=\"u\")"
-    ]
-   },
-   {
-    "cell_type": "code",
-    "execution_count": null,
-    "id": "a2c837f4",
-    "metadata": {},
-    "outputs": [],
-    "source": []
-   }
-  ],
-  "metadata": {
-   "kernelspec": {
-    "display_name": "Python 3.10.4 64-bit",
-    "language": "python",
-    "name": "python3"
-   },
-   "language_info": {
-    "codemirror_mode": {
-     "name": "ipython",
-     "version": 3
-    },
-    "file_extension": ".py",
-    "mimetype": "text/x-python",
-    "name": "python",
-    "nbconvert_exporter": "python",
-    "pygments_lexer": "ipython3",
-    "version": "3.10.4"
-   },
-   "vscode": {
-    "interpreter": {
-     "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
-    }
-   }
-  },
-  "nbformat": 4,
-  "nbformat_minor": 5
- }
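
The failure recorded in this deleted notebook is worth noting: on Python 3.10, snscrape 0.3.4 dies at import time because its `instagram` module declares `class InstagramPost(typing.NamedTuple, snscrape.base.Item)`, and Python 3.10 rejects multiple inheritance with `NamedTuple`. The CLI therefore never writes any tweets, `extracted-tweets.txt` stays empty, and `get_tweets` then fails with `UnboundLocalError` because `df` is only bound in the `else` branch. The notebook's own commented-out line (`pip3 install git+https://github.com/JustAnotherArchivist/snscrape.git`) points at the install-side fix. Below is a sketch of the helper with an early-return guard added; the row handling is simplified to return cleaned tweet text only, so treat it as illustrative rather than a drop-in replacement:

```python
import os
import pandas as pd

def get_tweets_fixed(search_term, from_date, to_date, num_tweets=100, u_or_s='s'):
    """Sketch of the deleted helper with the UnboundLocalError guard added."""
    mode = 'twitter-user' if u_or_s.lower() == 'u' else 'twitter-search'
    cmd = ("snscrape --format '{content!r}'"
           f" --max-results {num_tweets} --since {from_date}"
           f" {mode} '{search_term} until:{to_date}' > extracted-tweets.txt")
    os.system(cmd)
    if os.stat('extracted-tweets.txt').st_size == 0:
        print('No Tweets found')
        return []  # early return: `df` is never referenced before assignment
    df = pd.read_csv('extracted-tweets.txt', names=['content'])
    # Series.items() replaces the deprecated iteritems() the notebook used
    return [str(text).replace("'", "") for _, text in df['content'].items()]
```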
 
twitter-scraper/twitter_scraper.ipynb ADDED
@@ -0,0 +1,819 @@
 
+ {
+  "cells": [
+   {
+    "cell_type": "markdown",
+    "id": "a5361789",
+    "metadata": {},
+    "source": [
+     "## Have to install these packages \n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "c9021300",
+    "metadata": {
+     "scrolled": true
+    },
+    "outputs": [],
+    "source": [
+     "%%capture \n",
+     "!pip3 install Twint \n",
+     "\n"
+    ]
+   },
+   {
+    "cell_type": "markdown",
+    "id": "5c857dbf",
+    "metadata": {},
+    "source": [
+     "## Necessary Imports"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 3,
+    "id": "1413ab2b",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import asyncio\n",
+     "import os\n",
+     "loop = asyncio.get_event_loop()\n",
+     "loop.is_running()\n",
+     "import twint\n",
+     "import nest_asyncio\n",
+     "nest_asyncio.apply()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 4,
+    "id": "d38514f3",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import scrape\n",
+     "sc= scrape.TwitterScraper(num_tweets=10)\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 5,
+    "id": "d37e5cbf",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "[+] Finished: Successfully collected 20 Tweets.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>id</th>\n",
+        "      <th>tweet</th>\n",
+        "      <th>date</th>\n",
+        "      <th>user_id</th>\n",
+        "      <th>username</th>\n",
+        "      <th>urls</th>\n",
+        "      <th>nlikes</th>\n",
+        "      <th>nreplies</th>\n",
+        "      <th>nretweets</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>1545194541006950400</td>\n",
+        "      <td>kim sever benim gibi sevmeyecekler bıraktığın ...</td>\n",
+        "      <td>2022-07-08 01:54:21</td>\n",
+        "      <td>1396065566117466113</td>\n",
+        "      <td>heja4r</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1545192735354806274</td>\n",
+        "      <td>Kelimeler,albayım,bazı anlamalara gelmiyor..</td>\n",
+        "      <td>2022-07-08 01:47:11</td>\n",
+        "      <td>1481604485118140425</td>\n",
+        "      <td>Theguapo6</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>1545190168533008385</td>\n",
+        "      <td>@shikan213 ptdr ? y’a aucune racisme à quel mo...</td>\n",
+        "      <td>2022-07-08 01:36:59</td>\n",
+        "      <td>1476042813741617155</td>\n",
+        "      <td>srndz213__</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>0</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>1545190106910171136</td>\n",
+        "      <td>@guzzeida Men gud du har presterat så mkt bätt...</td>\n",
+        "      <td>2022-07-08 01:36:44</td>\n",
+        "      <td>34343541</td>\n",
+        "      <td>lisaxamanda</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>1545190096042860544</td>\n",
+        "      <td>Heja, heja, heja Slovensko</td>\n",
+        "      <td>2022-07-08 01:36:41</td>\n",
+        "      <td>3158344237</td>\n",
+        "      <td>ian_10_19</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " id tweet \\\n",
+        "0 1545194541006950400 kim sever benim gibi sevmeyecekler bıraktığın ... \n",
+        "1 1545192735354806274 Kelimeler,albayım,bazı anlamalara gelmiyor.. \n",
+        "2 1545190168533008385 @shikan213 ptdr ? y’a aucune racisme à quel mo... \n",
+        "3 1545190106910171136 @guzzeida Men gud du har presterat så mkt bätt... \n",
+        "4 1545190096042860544 Heja, heja, heja Slovensko \n",
+        "\n",
+        " date user_id username urls nlikes \\\n",
+        "0 2022-07-08 01:54:21 1396065566117466113 heja4r [] 1 \n",
+        "1 2022-07-08 01:47:11 1481604485118140425 Theguapo6 [] 1 \n",
+        "2 2022-07-08 01:36:59 1476042813741617155 srndz213__ [] 0 \n",
+        "3 2022-07-08 01:36:44 34343541 lisaxamanda [] 1 \n",
+        "4 2022-07-08 01:36:41 3158344237 ian_10_19 [] 0 \n",
+        "\n",
+        " nreplies nretweets \n",
+        "0 0 0 \n",
+        "1 0 0 \n",
+        "2 1 0 \n",
+        "3 0 0 \n",
+        "4 0 0 "
+       ]
+      },
+      "execution_count": 5,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "string_tr_info=sc.scrape_by_string(\"heja\")\n",
+     "string_tr_info.head()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 6,
+    "id": "902170ad",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>id</th>\n",
+        "      <th>tweet</th>\n",
+        "      <th>date</th>\n",
+        "      <th>user_id</th>\n",
+        "      <th>username</th>\n",
+        "      <th>urls</th>\n",
+        "      <th>nlikes</th>\n",
+        "      <th>nreplies</th>\n",
+        "      <th>nretweets</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>1545194541006950400</td>\n",
+        "      <td>kim sever benim gibi sevmeyecekler bıraktığın ...</td>\n",
+        "      <td>2022-07-08 01:54:21</td>\n",
+        "      <td>1396065566117466113</td>\n",
+        "      <td>heja4r</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1545192735354806274</td>\n",
+        "      <td>Kelimeler,albayım,bazı anlamalara gelmiyor..</td>\n",
+        "      <td>2022-07-08 01:47:11</td>\n",
+        "      <td>1481604485118140425</td>\n",
+        "      <td>Theguapo6</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>1545190096042860544</td>\n",
+        "      <td>Heja, heja, heja Slovensko</td>\n",
+        "      <td>2022-07-08 01:36:41</td>\n",
+        "      <td>3158344237</td>\n",
+        "      <td>ian_10_19</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>6</th>\n",
+        "      <td>1545189783747436545</td>\n",
+        "      <td>Beni sorarsan dardayım..</td>\n",
+        "      <td>2022-07-08 01:35:27</td>\n",
+        "      <td>1481604485118140425</td>\n",
+        "      <td>Theguapo6</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>2</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>12</th>\n",
+        "      <td>1545186234623991813</td>\n",
+        "      <td>Heja strandhäll. Vilket jävla block mongo</td>\n",
+        "      <td>2022-07-08 01:21:21</td>\n",
+        "      <td>1160537136250195968</td>\n",
+        "      <td>Siggydunn</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "      <td>0</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " id tweet \\\n",
+        "0 1545194541006950400 kim sever benim gibi sevmeyecekler bıraktığın ... \n",
+        "1 1545192735354806274 Kelimeler,albayım,bazı anlamalara gelmiyor.. \n",
+        "4 1545190096042860544 Heja, heja, heja Slovensko \n",
+        "6 1545189783747436545 Beni sorarsan dardayım.. \n",
+        "12 1545186234623991813 Heja strandhäll. Vilket jävla block mongo \n",
+        "\n",
+        " date user_id username urls nlikes \\\n",
+        "0 2022-07-08 01:54:21 1396065566117466113 heja4r [] 1 \n",
+        "1 2022-07-08 01:47:11 1481604485118140425 Theguapo6 [] 1 \n",
+        "4 2022-07-08 01:36:41 3158344237 ian_10_19 [] 0 \n",
+        "6 2022-07-08 01:35:27 1481604485118140425 Theguapo6 [] 2 \n",
+        "12 2022-07-08 01:21:21 1160537136250195968 Siggydunn [] 0 \n",
+        "\n",
+        " nreplies nretweets \n",
+        "0 0 0 \n",
+        "1 0 0 \n",
+        "4 0 0 \n",
+        "6 0 0 \n",
+        "12 0 0 "
+       ]
+      },
+      "execution_count": 6,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "string_t_info=sc.get_only_tweets(string_tr_info)\n",
+     "string_t_info.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 7,
+    "id": "a7912a91",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "[+] Finished: Successfully collected 20 Tweets.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>id</th>\n",
+        "      <th>tweet</th>\n",
+        "      <th>date</th>\n",
+        "      <th>user_id</th>\n",
+        "      <th>username</th>\n",
+        "      <th>urls</th>\n",
+        "      <th>nlikes</th>\n",
+        "      <th>nreplies</th>\n",
+        "      <th>nretweets</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>1544748873767424001</td>\n",
+        "      <td>Fruktansvärt att nås av beskedet att kvinnan s...</td>\n",
+        "      <td>2022-07-06 20:23:26</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>3397</td>\n",
+        "      <td>167</td>\n",
+        "      <td>140</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1538948369611210764</td>\n",
+        "      <td>@annieloof Nej, jag håller med. Tänk mer som M...</td>\n",
+        "      <td>2022-06-20 20:14:18</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1513</td>\n",
+        "      <td>89</td>\n",
+        "      <td>115</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>1537770920621879297</td>\n",
+        "      <td>Man kan ha synpunkter på en sådan lösning, men...</td>\n",
+        "      <td>2022-06-17 14:15:32</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>694</td>\n",
+        "      <td>17</td>\n",
+        "      <td>41</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>1537770809225273344</td>\n",
+        "      <td>Är det ont om plats på anstalterna så får man ...</td>\n",
+        "      <td>2022-06-17 14:15:05</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>810</td>\n",
+        "      <td>26</td>\n",
+        "      <td>57</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>1537770713368735744</td>\n",
+        "      <td>Döms man för brott, särskilt våldsbrott, ska m...</td>\n",
+        "      <td>2022-06-17 14:14:43</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1020</td>\n",
+        "      <td>26</td>\n",
+        "      <td>86</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " id tweet \\\n",
+        "0 1544748873767424001 Fruktansvärt att nås av beskedet att kvinnan s... \n",
+        "1 1538948369611210764 @annieloof Nej, jag håller med. Tänk mer som M... \n",
+        "2 1537770920621879297 Man kan ha synpunkter på en sådan lösning, men... \n",
+        "3 1537770809225273344 Är det ont om plats på anstalterna så får man ... \n",
+        "4 1537770713368735744 Döms man för brott, särskilt våldsbrott, ska m... \n",
+        "\n",
+        " date user_id username urls nlikes nreplies \\\n",
+        "0 2022-07-06 20:23:26 95972673 jimmieakesson [] 3397 167 \n",
+        "1 2022-06-20 20:14:18 95972673 jimmieakesson [] 1513 89 \n",
+        "2 2022-06-17 14:15:32 95972673 jimmieakesson [] 694 17 \n",
+        "3 2022-06-17 14:15:05 95972673 jimmieakesson [] 810 26 \n",
+        "4 2022-06-17 14:14:43 95972673 jimmieakesson [] 1020 26 \n",
+        "\n",
+        " nretweets \n",
+        "0 140 \n",
+        "1 115 \n",
+        "2 41 \n",
+        "3 57 \n",
+        "4 86 "
+       ]
+      },
+      "execution_count": 7,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "user__tr_info=sc.scrape_by_user(\"jimmieakesson\")\n",
+     "user__tr_info.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "7db69757",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "user__t_info=sc.get_only_tweets(user__tr_info)\n",
+     "user__t_info.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 8,
+    "id": "9d6b1bdf",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "[+] Finished: Successfully collected 16 Tweets from @jimmieakesson.\n"
+      ]
+     },
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>id</th>\n",
+        "      <th>tweet</th>\n",
+        "      <th>date</th>\n",
+        "      <th>user_id</th>\n",
+        "      <th>username</th>\n",
+        "      <th>urls</th>\n",
+        "      <th>nlikes</th>\n",
+        "      <th>nreplies</th>\n",
+        "      <th>nretweets</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>1363067834260201475</td>\n",
+        "      <td>Utan massiv, asylrelaterad invandring från frä...</td>\n",
+        "      <td>2021-02-20 11:07:50</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1277</td>\n",
+        "      <td>22</td>\n",
+        "      <td>105</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1363067613660778496</td>\n",
+        "      <td>Många vänsterliberaler tycks ha reagerat på de...</td>\n",
+        "      <td>2021-02-20 11:06:58</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>625</td>\n",
+        "      <td>9</td>\n",
+        "      <td>68</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>1363067558409158656</td>\n",
+        "      <td>Jag förstår — uppriktigt — inte den närmast hy...</td>\n",
+        "      <td>2021-02-20 11:06:45</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>2458</td>\n",
+        "      <td>199</td>\n",
+        "      <td>336</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>1362748777552113670</td>\n",
+        "      <td>Invandring av hundratusentals människor från f...</td>\n",
+        "      <td>2021-02-19 14:00:01</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1334</td>\n",
+        "      <td>55</td>\n",
+        "      <td>101</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>1362409505557012490</td>\n",
+        "      <td>Vårt land behöver ett totalstopp för all asyl-...</td>\n",
+        "      <td>2021-02-18 15:31:53</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>3044</td>\n",
+        "      <td>268</td>\n",
+        "      <td>404</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " id tweet \\\n",
+        "0 1363067834260201475 Utan massiv, asylrelaterad invandring från frä... \n",
+        "1 1363067613660778496 Många vänsterliberaler tycks ha reagerat på de... \n",
+        "2 1363067558409158656 Jag förstår — uppriktigt — inte den närmast hy... \n",
+        "3 1362748777552113670 Invandring av hundratusentals människor från f... \n",
+        "4 1362409505557012490 Vårt land behöver ett totalstopp för all asyl-... \n",
+        "\n",
+        " date user_id username urls nlikes nreplies \\\n",
+        "0 2021-02-20 11:07:50 95972673 jimmieakesson [] 1277 22 \n",
+        "1 2021-02-20 11:06:58 95972673 jimmieakesson [] 625 9 \n",
+        "2 2021-02-20 11:06:45 95972673 jimmieakesson [] 2458 199 \n",
+        "3 2021-02-19 14:00:01 95972673 jimmieakesson [] 1334 55 \n",
+        "4 2021-02-18 15:31:53 95972673 jimmieakesson [] 3044 268 \n",
+        "\n",
+        " nretweets \n",
+        "0 105 \n",
+        "1 68 \n",
+        "2 336 \n",
+        "3 101 \n",
+        "4 404 "
+       ]
+      },
+      "execution_count": 8,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "user__string_tr_info=sc.scrape_by_user_and_string(\"jimmieakesson\",\"invandring\")\n",
+     "user__string_tr_info.head()\n"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 9,
+    "id": "a1aede79",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "<div>\n",
+        "<style scoped>\n",
+        "    .dataframe tbody tr th:only-of-type {\n",
+        "        vertical-align: middle;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe tbody tr th {\n",
+        "        vertical-align: top;\n",
+        "    }\n",
+        "\n",
+        "    .dataframe thead th {\n",
+        "        text-align: right;\n",
+        "    }\n",
+        "</style>\n",
+        "<table border=\"1\" class=\"dataframe\">\n",
+        "  <thead>\n",
+        "    <tr style=\"text-align: right;\">\n",
+        "      <th></th>\n",
+        "      <th>id</th>\n",
+        "      <th>tweet</th>\n",
+        "      <th>date</th>\n",
+        "      <th>user_id</th>\n",
+        "      <th>username</th>\n",
+        "      <th>urls</th>\n",
+        "      <th>nlikes</th>\n",
+        "      <th>nreplies</th>\n",
+        "      <th>nretweets</th>\n",
+        "    </tr>\n",
+        "  </thead>\n",
+        "  <tbody>\n",
+        "    <tr>\n",
+        "      <th>0</th>\n",
+        "      <td>1363067834260201475</td>\n",
+        "      <td>Utan massiv, asylrelaterad invandring från frä...</td>\n",
+        "      <td>2021-02-20 11:07:50</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1277</td>\n",
+        "      <td>22</td>\n",
+        "      <td>105</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>1</th>\n",
+        "      <td>1363067613660778496</td>\n",
+        "      <td>Många vänsterliberaler tycks ha reagerat på de...</td>\n",
+        "      <td>2021-02-20 11:06:58</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>625</td>\n",
+        "      <td>9</td>\n",
+        "      <td>68</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>2</th>\n",
+        "      <td>1363067558409158656</td>\n",
+        "      <td>Jag förstår — uppriktigt — inte den närmast hy...</td>\n",
+        "      <td>2021-02-20 11:06:45</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>2458</td>\n",
+        "      <td>199</td>\n",
+        "      <td>336</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>3</th>\n",
+        "      <td>1362748777552113670</td>\n",
+        "      <td>Invandring av hundratusentals människor från f...</td>\n",
+        "      <td>2021-02-19 14:00:01</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>1334</td>\n",
+        "      <td>55</td>\n",
+        "      <td>101</td>\n",
+        "    </tr>\n",
+        "    <tr>\n",
+        "      <th>4</th>\n",
+        "      <td>1362409505557012490</td>\n",
+        "      <td>Vårt land behöver ett totalstopp för all asyl-...</td>\n",
+        "      <td>2021-02-18 15:31:53</td>\n",
+        "      <td>95972673</td>\n",
+        "      <td>jimmieakesson</td>\n",
+        "      <td>[]</td>\n",
+        "      <td>3044</td>\n",
+        "      <td>268</td>\n",
+        "      <td>404</td>\n",
+        "    </tr>\n",
+        "  </tbody>\n",
+        "</table>\n",
+        "</div>"
+       ],
+       "text/plain": [
+        " id tweet \\\n",
+        "0 1363067834260201475 Utan massiv, asylrelaterad invandring från frä... \n",
+        "1 1363067613660778496 Många vänsterliberaler tycks ha reagerat på de... \n",
+        "2 1363067558409158656 Jag förstår — uppriktigt — inte den närmast hy... \n",
+        "3 1362748777552113670 Invandring av hundratusentals människor från f... \n",
+        "4 1362409505557012490 Vårt land behöver ett totalstopp för all asyl-... \n",
+        "\n",
+        " date user_id username urls nlikes nreplies \\\n",
+        "0 2021-02-20 11:07:50 95972673 jimmieakesson [] 1277 22 \n",
+        "1 2021-02-20 11:06:58 95972673 jimmieakesson [] 625 9 \n",
+        "2 2021-02-20 11:06:45 95972673 jimmieakesson [] 2458 199 \n",
+        "3 2021-02-19 14:00:01 95972673 jimmieakesson [] 1334 55 \n",
+        "4 2021-02-18 15:31:53 95972673 jimmieakesson [] 3044 268 \n",
+        "\n",
+        " nretweets \n",
+        "0 105 \n",
+        "1 68 \n",
+        "2 336 \n",
+        "3 101 \n",
+        "4 404 "
+       ]
+      },
+      "execution_count": 9,
+      "metadata": {},
+      "output_type": "execute_result"
+     }
+    ],
+    "source": [
+     "user__string_t_info = sc.get_only_tweets(user__string_tr_info)\n",
+     "user__string_t_info.head()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "48d50b46",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "tweets= user__string_t_info[\"tweet\"]\n",
+     "for tweet in tweets:\n",
+     "    print(tweet, \"\\n\", \"__________________________________________________________\")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": null,
+    "id": "530c26e2",
+    "metadata": {},
+    "outputs": [],
+    "source": []
+   }
+  ],
+  "metadata": {
+   "kernelspec": {
+    "display_name": "Python 3.10.4 64-bit",
+    "language": "python",
+    "name": "python3"
+   },
+   "language_info": {
+    "codemirror_mode": {
+     "name": "ipython",
+     "version": 3
+    },
+    "file_extension": ".py",
+    "mimetype": "text/x-python",
+    "name": "python",
+    "nbconvert_exporter": "python",
+    "pygments_lexer": "ipython3",
+    "version": "3.10.4"
+   },
+   "vscode": {
+    "interpreter": {
+     "hash": "916dbcbb3f70747c44a77c7bcd40155683ae19c65e1c03b4aa3499c5328201f1"
+    }
+   }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 5
+ }
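
Taken together, the new notebook exercises the whole `TwitterScraper` surface added by this PR. A condensed version of the workflow, using only the constructor argument and methods actually called above; the column list is taken from the recorded outputs:

```python
import scrape

# num_tweets bounds how many tweets each scrape collects.
sc = scrape.TwitterScraper(num_tweets=10)

by_string = sc.scrape_by_string("heja")                                # keyword search
by_user = sc.scrape_by_user("jimmieakesson")                           # one user's timeline
by_both = sc.scrape_by_user_and_string("jimmieakesson", "invandring")  # keyword within a timeline

# Each call returns a pandas DataFrame with columns id, tweet, date, user_id,
# username, urls, nlikes, nreplies and nretweets; get_only_tweets() filters
# the frame down to plain tweets (replies drop out, as the re-indexed rows
# 0, 1, 4, 6, 12 in the output above show).
tweets_only = sc.get_only_tweets(by_both)
for tweet in tweets_only["tweet"]:
    print(tweet, "\n", "__________________________________________________________")
```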