diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..ca881c2dcf00158c6711f6a3d742ee4567f4910f 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe filter=lfs diff=lfs merge=lfs -text diff --git a/Hands-on-WebScraping-master.zip b/Hands-on-WebScraping-master.zip new file mode 100644 index 0000000000000000000000000000000000000000..c01b24f9627368f8bf4c107eef4cd3aec334b655 --- /dev/null +++ b/Hands-on-WebScraping-master.zip @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8634d80c9a416f3346c02ce5b6f25f96fda953cc24212a45f461a510d145f04c +size 15838 diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..bc3ca9210a493323162741598b9f73ab8d101d4b --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/.gitignore @@ -0,0 +1,241 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + + + + +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# TypeScript v1 declaration files +typings/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test + +# parcel-bundler cache (https://parceljs.org/) +.cache + +# Next.js build output +.next + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..78668187abc7c5a2b942139d657aa76ec5117bb6 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Amit Upreti + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md new file mode 100644 index 0000000000000000000000000000000000000000..91c5a8cae4501bc68157caebfaf2c81bbfaea8f9 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/README.md @@ -0,0 +1,2 @@ +# Hands-on-WebScraping (NO LONGER MAINTAINED) +This repo is part of a blog series on several web scraping projects where we explore scraping techniques to crawl data from simple websites to websites using advanced protection. diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..da59ad7c4d6259e053d17bd74f0daf8858a6df5e --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/Readme.md @@ -0,0 +1,118 @@ +#### Deprecated. No longer maintained. + +# Twitter Hashtag crawler +> A fast and unofficial Twitter crawler to collect tweets using hashtag search. + +> Notice: The crawler is meant to be used for collecting data for academic and research purposes only. I am not responsible for any legal issues that might arise from any unintended use of this crawler. + +[![Python 3](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) +[![twitter crawler](https://img.shields.io/badge/twittercrawler-1.0-green)](https://github.com/amitupreti/Hands-on-WebScraping/tree/master/project1_twitter_hashtag_crawler) + +This is written using Scrapy and Python. The logic is straightforward: we simply send GET requests to the mobile version of Twitter (mobile.twitter.com) to collect the list of tweets and GET requests to the web version to parse tweet details. +![](header.png) + +## Installation + +OS X & Linux: + +1. Download the project + +```sh +git clone https://github.com/amitupreti/Hands-on-WebScraping + +cd Hands-on-WebScraping/project1_twitter_hashtag_crawler +``` +2. Install the dependencies + +```sh +pip install -r requirements.txt --user +``` + +3. Verify the crawler spider exists + +```sh +scrapy list +``` +If you see `twittercrawler` then you are all set. + + +Windows: +1. Install [python3](https://www.python.org/downloads/) if you haven't already +2. Download the project. https://github.com/amitupreti/Hands-on-WebScraping/archive/master.zip +3. Extract the project +4. Open cmd and navigate inside the project directory +```sh +cd Hands-on-WebScraping/project1_twitter_hashtag_crawler +``` +5. Follow steps 2 and 3 from the OS X & Linux installation + + + +## Usage example + +1. Put the hashtags in a CSV file separated by newlines. For example, I have included `myhashtags.csv` as a sample. + +![Hashtags file](https://i.paste.pics/225079df0d3dc27d66430b1553b2ac39.png) + +2.
Run the crawler with your hashtag file and the desired [output format](https://docs.scrapy.org/en/latest/topics/feed-exports.html) (JSON, JSON Lines, CSV, XML) + +* For CSV + ```sh + scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv + + ``` + +* For JSON + ```sh + scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json + + ``` +![sample images](https://i.paste.pics/4a5826a6a090522e5326bb11838258df.png) +![sample images](https://i.paste.pics/68a64bab743150e00af4cd9eea9af8dc.png) + + +### Speeding up the crawls +If you feel the crawler is a little slow, find the hashtag.py file in the project and edit the custom settings. +```py +custom_settings = { + 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36', + 'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 1, 'LOG_LEVEL': 'INFO'} +``` +> Here CONCURRENT_REQUESTS is the number of URLs that will be processed in parallel and DOWNLOAD_DELAY is the wait between requests. So, increase CONCURRENT_REQUESTS and decrease DOWNLOAD_DELAY (the minimum value for DOWNLOAD_DELAY is 0). + + +## Data Columns +* username +* full_name +* twitter_url +* tweet_text +* tweet_time +* number_of_likes +* no_of_retweets +* no_of_replies +* mentions +* no_of_mentions +* hashtags +* no_of_hashtags +* call_to_action +* image_url + +## Release History + +* 1.0.0 + * First release: crawl by hashtags + +## Meta + +Amit Upreti – [@amitupreti](https://www.linkedin.com/in/amitupreti/) + +Distributed under the MIT license. See ``LICENSE`` for more information. + + +## Contributing + +1. Fork it () +2. Create your feature branch (`git checkout -b feature/fooBar`) +3. Commit your changes (`git commit -am 'Add some fooBar'`) +4. Push to the branch (`git push origin feature/fooBar`) +5.
Create a new Pull Request diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py new file mode 100644 index 0000000000000000000000000000000000000000..d4ba6f2d8db57565a17246fd99647b2a77dba1c4 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class TwitterhashtagcrawlerItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..051c3df3f4f76df7a7664770ebaf0a1192d01bce --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class TwitterhashtagcrawlerSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). 
+ for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class TwitterhashtagcrawlerDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. + + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..8cd03944ce5b46c23cbde31f6168d47008be0f4d --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class TwitterhashtagcrawlerPipeline(object): + def process_item(self, item, spider): + return item diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..3d24912ef5930e443344ce9817e3a0ffeaada039 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for TwitterHashTagCrawler project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'TwitterHashTagCrawler' + +SPIDER_MODULES = ['TwitterHashTagCrawler.spiders'] +NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..ebd689ac51d69c5e1dbbe80083c2b20a39f8bb79 --- /dev/null +++ 
b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py new file mode 100644 index 0000000000000000000000000000000000000000..aef23a0268d3a3ea1df8d465433b813838b9c009 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py @@ -0,0 +1,112 @@ +# -*- coding: utf-8 -*- +import scrapy +import ipdb +import re +from dateutil import parser +import sys +from scrapy.crawler import CrawlerProcess +from utils import get_links, get_hashtags, get_mentions +import logging + +class HashtagSpider(scrapy.Spider): + name = 'twittercrawler' + allowed_domains = ["twitter.com"] + + # custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50 . + # Override here + custom_settings = { + 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36', + 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'} + + def __init__(self, filename=''): + if not filename: + sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv') + self.filename = filename + + # the crawler will execute start_requests function at first. 
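+ # Reads one hashtag per line from the input file and queues a search request to https://mobile.twitter.com/hashtag/<hashtag> for each; the responses are handled by find_tweets.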
+ def start_requests(self): + with open(self.filename, 'r') as f: + hashtags = f.read().splitlines() + if len(hashtags) == 0: + sys.exit('Emplty File detected.Please provide hashtags separated by newlines') + else: + logging.info(f'{len(hashtags)} hashtags found') + for hashtag in hashtags: + if hashtag: + search_url = "https://mobile.twitter.com/hashtag/" + hashtag.lower() + + yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True) + + def find_tweets(self, response): + tweets = response.xpath('//table[@class="tweet "]/@href').getall() + logging.info(f'{len(tweets)} tweets found') + for tweet_id in tweets: + tweet_id = re.findall("\d+", tweet_id)[-1] + tweet_url = 'https://twitter.com/anyuser/status/' + \ + str(tweet_id) + yield scrapy.Request(tweet_url, callback=self.parse_tweet) + + # finding and visiting next page + next_page = response.xpath( + '//*[@class="w-button-more"]/a/@href').get(default='') + logging.info('Next page found:') + if next_page != '': + next_page = 'https://mobile.twitter.com' + next_page + yield scrapy.Request(next_page, callback=self.find_tweets) + + def parse_tweet(self, response): + logging.info('Processing --> ' + response.url) + username = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get( + default='') + full_name = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get( + default='') + + try: + tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip() + + except: + tweet_text = ' '.join(response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip() + image_list = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall() + date_time = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get( + default='') + + date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S') + retweets = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get( + default='') + + likes = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get( + default='') + replies = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get( + default='') + + mentions = get_mentions(tweet_text) + hashtags = get_hashtags(tweet_text) + cta = get_links(tweet_text) + + result = { + 'username': username.lower(), + 'full_name': full_name, + 'twitter_url': response.url, + 'tweet_text': tweet_text, + 'tweet_time': str(date_time), + 'number_of_likes': str(likes), + 'no_of_retweets': str(retweets), + 'no_of_replies': str(replies), + 'mentions': ' | '.join(mentions), + 'no_of_mentions': 
str(len(mentions)), + 'hashtags': ' | '.join(hashtags), + 'no_of_hashtags': str(len(hashtags)), + 'call_to_action': ' | '.join(cta), + 'image_url': ' | '.join(image_list), + + } + yield result + diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv new file mode 100644 index 0000000000000000000000000000000000000000..be8ebcc52ead7b0f82e736ee32b3c7639af0616e --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/myhashtags.csv @@ -0,0 +1,2 @@ +cats +dogs diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..fa3771a478dc2d769dc6e0c1769848cc8ccfbb05 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/requirements.txt @@ -0,0 +1,2 @@ +scrapy +dateutil \ No newline at end of file diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv new file mode 100644 index 0000000000000000000000000000000000000000..bc292c4d6fd3bf756709e953211a65a474e08257 --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/sampledata.csv @@ -0,0 +1,12 @@ +username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url +cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou. +२०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg +,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg +kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg +mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. 
Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg +prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination + +We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg +kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today. + +Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,, diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..eb263c302efcdac4fdcf2b536f11b9f76a4e142a --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = TwitterHashTagCrawler.settings + +[deploy] +#url = http://localhost:6800/ +project = TwitterHashTagCrawler diff --git a/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..459f69ae27424fea8411d3b3ca7e0af6c69bb21e --- /dev/null +++ b/Hands-on-WebScraping-master/Hands-on-WebScraping-master/project1_twitter_hashtag_crawler/utils.py @@ -0,0 +1,67 @@ +import re + + +def find_emails(text): + """ + It will parse the given string and return a list of emails if found + + Example: + >>find_emails('hello\n find me here\nemail@gmail.com') + ['email@gmail.com'] + + :param text: string + :return: list + """ + return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text) + + +def get_mentions(text): + """ + It will return mentions from the text i.e @someone + + :param text: string + :return: list + + example + >>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. 
BTW say hi to @heroine for me') + ['hero','hero2','heroine'] + """ + result = re.findall(r"(^|[^@\w])@(\w{1,15})", text) + if len(result) != 0: + result = [i[1] for i in result] + return result + + +def get_hashtags(text): + """ + It will return hashtags from the text i.e. #something + + :param text: string + :return: list + + example + >>> get_hashtags('my first code #programmer #python #awesome #grepsr') + ['programmer','python','awesome','grepsr'] + """ + + result = re.findall(r"(^|[^@\w])#(\w{1,15})", text) + if len(result) != 0: + result = [i[1] for i in result] + return result + + +def get_links(text): + """ + It will return website links from the text + + :param text: string + :return: list + + example + >>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454' + >>> get_links(message) + ['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454'] + + """ + result = re.findall(r"(?P<url>https?://[^\s]+)", text) + return result diff --git a/Hands-on-WebScraping/.gitignore b/Hands-on-WebScraping/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..057bffa493b20b624ef2b4f2531bd75c565730bf --- /dev/null +++ b/Hands-on-WebScraping/.gitignore @@ -0,0 +1,241 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g.
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + + + + +# Logs +logs +*.log +npm-debug.log* +yarn-debug.log* +yarn-error.log* +lerna-debug.log* + +# Diagnostic reports (https://nodejs.org/api/report.html) +report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json + +# Runtime data +pids +*.pid +*.seed +*.pid.lock + +# Directory for instrumented libs generated by jscoverage/JSCover +lib-cov + +# Coverage directory used by tools like istanbul +coverage +*.lcov + +# nyc test coverage +.nyc_output + +# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files) +.grunt + +# Bower dependency directory (https://bower.io/) +bower_components + +# node-waf configuration +.lock-wscript + +# Compiled binary addons (https://nodejs.org/api/addons.html) +build/Release + +# Dependency directories +node_modules/ +jspm_packages/ + +# TypeScript v1 declaration files +typings/ + +# TypeScript cache +*.tsbuildinfo + +# Optional npm cache directory +.npm + +# Optional eslint cache +.eslintcache + +# Microbundle cache +.rpt2_cache/ +.rts2_cache_cjs/ +.rts2_cache_es/ +.rts2_cache_umd/ + +# Optional REPL history +.node_repl_history + +# Output of 'npm pack' +*.tgz + +# Yarn Integrity file +.yarn-integrity + +# dotenv environment variables file +.env +.env.test + +# parcel-bundler cache (https://parceljs.org/) +.cache + +# Next.js build output +.next + +# Nuxt.js build / generate output +.nuxt +dist + +# Gatsby files +.cache/ +# Comment in the public line in if your project uses Gatsby and not Next.js +# https://nextjs.org/blog/next-9-1#public-directory-support +# public + +# vuepress build output +.vuepress/dist + +# Serverless directories +.serverless/ + +# FuseBox cache +.fusebox/ + +# DynamoDB Local files +.dynamodb/ + +# TernJS port file +.tern-port + +# Stores VSCode versions used for testing VSCode extensions +.vscode-test + diff --git a/Hands-on-WebScraping/LICENSE b/Hands-on-WebScraping/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..18a81255c3dea970314fc8a1cdcd69510f268e01 --- /dev/null +++ b/Hands-on-WebScraping/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2020 Amit Upreti + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Hands-on-WebScraping/README.md b/Hands-on-WebScraping/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4e5b4d55f16884af0d5db113d23e55c2afe88a35 --- /dev/null +++ b/Hands-on-WebScraping/README.md @@ -0,0 +1,2 @@ +# Hands-on-WebScraping (NO LONGER MAINTAINED) +This repo is part of a blog series on several web scraping projects where we explore scraping techniques to crawl data from simple websites to websites using advanced protection. diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md new file mode 100644 index 0000000000000000000000000000000000000000..b85daefcb4bc6c78560b0fb076e62dd5e59cc570 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/Readme.md @@ -0,0 +1,118 @@ +#### Deprecated. No longer maintained. + +# Twitter Hashtag crawler +> A fast and unofficial Twitter crawler to collect tweets using hashtag search. + +> Notice: The crawler is meant to be used for collecting data for academic and research purposes only. I am not responsible for any legal issues that might arise from any unintended use of this crawler. + +[![Python 3](https://img.shields.io/badge/python-3.6-blue.svg)](https://www.python.org/downloads/release/python-360/) +[![twitter crawler](https://img.shields.io/badge/twittercrawler-1.0-green)](https://github.com/amitupreti/Hands-on-WebScraping/tree/master/project1_twitter_hashtag_crawler) + +This is written using Scrapy and Python. The logic is straightforward: we simply send GET requests to the mobile version of Twitter (mobile.twitter.com) to collect the list of tweets and GET requests to the web version to parse tweet details. +![](header.png) + +## Installation + +OS X & Linux: + +1. Download the project + +```sh +git clone https://github.com/amitupreti/Hands-on-WebScraping + +cd Hands-on-WebScraping/project1_twitter_hashtag_crawler +``` +2. Install the dependencies + +```sh +pip install -r requirements.txt --user +``` + +3. Verify the crawler spider exists + +```sh +scrapy list +``` +If you see `twittercrawler` then you are all set. + + +Windows: +1. Install [python3](https://www.python.org/downloads/) if you haven't already +2. Download the project. https://github.com/amitupreti/Hands-on-WebScraping/archive/master.zip +3. Extract the project +4. Open cmd and navigate inside the project directory +```sh +cd Hands-on-WebScraping/project1_twitter_hashtag_crawler +``` +5. Follow steps 2 and 3 from the OS X & Linux installation + + + +## Usage example + +1. Put the hashtags in a CSV file separated by newlines. For example, I have included `myhashtags.csv` as a sample. + +![Hashtags file](https://i.paste.pics/225079df0d3dc27d66430b1553b2ac39.png) + +2.
Run the crawler with your hashtag file and the desired [output format](https://docs.scrapy.org/en/latest/topics/feed-exports.html) (JSON, JSON Lines, CSV, XML) + +* For CSV + ```sh + scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.csv + + ``` + +* For JSON + ```sh + scrapy crawl twittercrawler -a filename=myhashtags.csv -o mydata.json + + ``` +![sample images](https://i.paste.pics/4a5826a6a090522e5326bb11838258df.png) +![sample images](https://i.paste.pics/68a64bab743150e00af4cd9eea9af8dc.png) + + +### Speeding up the crawls +If you feel the crawler is a little slow, find the hashtag.py file in the project and edit the custom settings. +```py +custom_settings = { + 'USER_AGENT': 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; Googlebot/2.1; +http://www.google.com/bot.html) Safari/537.36', + 'CONCURRENT_REQUESTS': 2, 'DOWNLOAD_DELAY': 1, 'LOG_LEVEL': 'INFO'} +``` +> Here CONCURRENT_REQUESTS is the number of URLs that will be processed in parallel and DOWNLOAD_DELAY is the wait between requests. So, increase CONCURRENT_REQUESTS and decrease DOWNLOAD_DELAY (the minimum value for DOWNLOAD_DELAY is 0). + + +## Data Columns +* username +* full_name +* twitter_url +* tweet_text +* tweet_time +* number_of_likes +* no_of_retweets +* no_of_replies +* mentions +* no_of_mentions +* hashtags +* no_of_hashtags +* call_to_action +* image_url + +## Release History + +* 1.0.0 + * First release: crawl by hashtags + +## Meta + +Amit Upreti – [@amitupreti](https://www.linkedin.com/in/amitupreti/) + +Distributed under the MIT license. See ``LICENSE`` for more information. + + +## Contributing + +1. Fork it () +2. Create your feature branch (`git checkout -b feature/fooBar`) +3. Commit your changes (`git commit -am 'Add some fooBar'`) +4. Push to the branch (`git push origin feature/fooBar`) +5.
Create a new Pull Request diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..42d452f115d2d40c150b551c6b060fa43407a7c1 Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/__init__.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5c6c664bbcea0c948ea7c16f0eb7663af945387 Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/__pycache__/settings.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py new file mode 100644 index 0000000000000000000000000000000000000000..6cc454f4f95a3733891feb68f04375ce188bc5d3 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/items.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class TwitterhashtagcrawlerItem(scrapy.Item): + # define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..386a074e6503fafba636d783edcf0d6d4e11bc66 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/middlewares.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class TwitterhashtagcrawlerSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(self, response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(self, response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. 
+ for i in result: + yield i + + def process_spider_exception(self, response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(self, start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) + + +class TwitterhashtagcrawlerDownloaderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the downloader middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_request(self, request, spider): + # Called for each request that goes through the downloader + # middleware. + + # Must either: + # - return None: continue processing this request + # - or return a Response object + # - or return a Request object + # - or raise IgnoreRequest: process_exception() methods of + # installed downloader middleware will be called + return None + + def process_response(self, request, response, spider): + # Called with the response returned from the downloader. + + # Must either; + # - return a Response object + # - return a Request object + # - or raise IgnoreRequest + return response + + def process_exception(self, request, exception, spider): + # Called when a download handler or a process_request() + # (from other downloader middleware) raises an exception. 
+ + # Must either: + # - return None: continue processing this exception + # - return a Response object: stops process_exception() chain + # - return a Request object: stops process_exception() chain + pass + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..6e4eeefcc9b41f1c789dc26cb8d200fcff843aab --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/pipelines.py @@ -0,0 +1,11 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html + + +class TwitterhashtagcrawlerPipeline(object): + def process_item(self, item, spider): + return item diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..2887d1deb6f1151600c25e78b72229403eb6ab04 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/settings.py @@ -0,0 +1,90 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for TwitterHashTagCrawler project +# +# For simplicity, this file contains only settings considered important or +# commonly used. You can find more settings consulting the documentation: +# +# https://doc.scrapy.org/en/latest/topics/settings.html +# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +# https://doc.scrapy.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'TwitterHashTagCrawler' + +SPIDER_MODULES = ['TwitterHashTagCrawler.spiders'] +NEWSPIDER_MODULE = 'TwitterHashTagCrawler.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'TwitterHashTagCrawler (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = False + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +#DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'TwitterHashTagCrawler.middlewares.TwitterhashtagcrawlerDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See 
https://doc.scrapy.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html +#ITEM_PIPELINES = { +# 'TwitterHashTagCrawler.pipelines.TwitterhashtagcrawlerPipeline': 300, +#} + +# Enable and configure the AutoThrottle extension (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca581dc70652bc451062dd6efa6d8b4d3848a75 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. 
diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..586b71ce0494be2d91f7027359557c1c0e7a0d80 Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/__init__.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..89f1e6b44fb96504852455cd4e97f14b16c15165 Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a4dee7d73c2d09aedf707f66340bf91545f2fb2f Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag2.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e5e7b88fbefd983cbd73aad7e8d175bbea99c387 Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/__pycache__/hashtag3.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py new file mode 100644 index 0000000000000000000000000000000000000000..870d7eb86427ef0bf74e840d5c4fe3987b4f3c39 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag.py @@ -0,0 +1,118 @@ +# -*- coding: utf-8 -*- +import scrapy +import ipdb +import re +from dateutil import parser +import sys +from scrapy.crawler import CrawlerProcess +from utils import get_links, get_hashtags, get_mentions +import logging + +class HashtagSpider(scrapy.Spider): + name = 'hashtag' + allowed_domains = ["twitter.com"] + + # custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50 . + # Override here + custom_settings = { + 'USER_AGENT': 'my-cool-project (http://example.com)', + 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'} + + def __init__(self, filename=''): + if not filename: + sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv') + self.filename = filename + + # the crawler will execute start_requests function at first. 
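+ # NOTE: in this copy the hashtag list is hardcoded below (the file read is commented out) and requests go to https://twitter.com/hashtag/<hashtag>; the responses are handled by find_tweets.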
+ def start_requests(self): + #with open(self.filename, 'r') as f: + #hashtags = ['danaher'] + hashtags= ['danaher'] + if len(hashtags) == 0: + sys.exit('Emplty File detected.Please provide hashtags separated by newlines') + else: + logging.info(f'{len(hashtags)} hashtags found') + print('hashtag is..',hashtags) + for hashtag in hashtags: + if hashtag: + search_url = "https://twitter.com/hashtag/" + hashtag.lower() + print('search_url is...', search_url) + + yield scrapy.Request(search_url, callback=self.find_tweets, dont_filter=True) + + def find_tweets(self, response): + print("I am in find_tweets") + tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall() + print("Tweets is...", tweets) + print(f'{len(tweets)} tweets found') + for tweet in tweets: + # tweet_id = re.findall("\d+", tweet_id)[-1] + # tweet_url = 'https://twitter.com/anyuser/status/' + \ + # str(tweet_id) + print(tweet) + yield scrapy.Request(tweet_url, callback=self.parse_tweet) + + # finding and visiting next page + next_page = response.xpath( + '//*[@class="w-button-more"]/a/@href').get(default='') + logging.info('Next page found:') + if next_page != '': + next_page = 'https://mobile.twitter.com' + next_page + yield scrapy.Request(next_page, callback=self.find_tweets) + + def parse_tweet(self, response): + logging.info('Processing --> ' + response.url) + username = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get( + default='') + full_name = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get( + default='') + + try: + tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip() + + except: + tweet_text = ' '.join(response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip() + image_list = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall() + date_time = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get( + default='') + + date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S') + retweets = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get( + default='') + + likes = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get( + default='') + replies = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get( + default='') + + mentions = get_mentions(tweet_text) + hashtags = get_hashtags(tweet_text) + cta = get_links(tweet_text) + + result = { + 'username': username.lower(), + 'full_name': full_name, + 'twitter_url': response.url, + 'tweet_text': tweet_text, + 'tweet_time': str(date_time), + 
'number_of_likes': str(likes), + 'no_of_retweets': str(retweets), + 'no_of_replies': str(replies), + 'mentions': ' | '.join(mentions), + 'no_of_mentions': str(len(mentions)), + 'hashtags': ' | '.join(hashtags), + 'no_of_hashtags': str(len(hashtags)), + 'call_to_action': ' | '.join(cta), + 'image_url': ' | '.join(image_list), + + } + yield result + diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py new file mode 100644 index 0000000000000000000000000000000000000000..5c8201ecace4cd3a05f5b2fd30feb406b9d88476 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag2.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +import scrapy +import ipdb +import re +from dateutil import parser +import sys +from scrapy.crawler import CrawlerProcess +from utils import get_links, get_hashtags, get_mentions +from scrapy.http.request import Request +import logging + +class HashtagSpider(scrapy.Spider): + name = 'hashtag2' + allowed_domains = ["twitter.com"] + + # custom settings for user agent and proxy. Default will get chrome as user agent and use a proxypool of 50 . + # Override here + custom_settings = { + 'USER_AGENT': 'my-cool-project (http://example.com)', + 'CONCURRENT_REQUESTS': 5, 'DOWNLOAD_DELAY': 0, 'LOG_LEVEL': 'INFO'} + + def __init__(self, filename=''): + if not filename: + sys.exit('Please provide the input filename also. Example \n\n$ python3 hashtags.py myinput.csv') + self.filename = filename + + # the crawler will execute start_requests function at first. + def start_requests(self): + #with open(self.filename, 'r') as f: + #hashtags = ['danaher'] + hashtags= ['danaher'] + if len(hashtags) == 0: + sys.exit('Emplty File detected.Please provide hashtags separated by newlines') + else: + logging.info(f'{len(hashtags)} hashtags found') + print('hashtag is..',hashtags) + for hashtag in hashtags: + if hashtag: + search_url = "https://twitter.com/hashtag/" + hashtag.lower() + print('search_url is...', search_url) + try: + yield Request(search_url, callback=self.find_tweets, dont_filter=True) + except Exception as e: + print(e) + + def find_tweets(self, response): + print("I am in find_tweets") + tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall() + print("Tweets is...", tweets) + print(f'{len(tweets)} tweets found') + for tweet in tweets: + # tweet_id = re.findall("\d+", tweet_id)[-1] + # tweet_url = 'https://twitter.com/anyuser/status/' + \ + # str(tweet_id) + print(tweet) + #yield scrapy.Request(tweet_url, callback=self.parse_tweet) + + # finding and visiting next page + next_page = response.xpath( + '//*[@class="w-button-more"]/a/@href').get(default='') + logging.info('Next page found:') + if next_page != '': + next_page = 'https://mobile.twitter.com' + next_page + yield scrapy.Request(next_page, callback=self.find_tweets) + + def parse_tweet(self, response): + logging.info('Processing --> ' + response.url) + username = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()').get( + default='') + full_name = response.xpath( + '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()').get( + default='') + + try: + tweet_text = response.xpath('//title/text()').get(default='').split(':')[1].strip() + + except: + tweet_text = ' '.join(response.xpath( + 
'//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()').getall()).strip() + image_list = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src').getall() + date_time = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()').get( + default='') + + date_time = parser.parse(date_time.replace('-', '')).strftime('%Y-%m-%d %H:%M:%S') + retweets = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()').get( + default='') + + likes = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()').get( + default='') + replies = response.xpath( + '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count').get( + default='') + + mentions = get_mentions(tweet_text) + hashtags = get_hashtags(tweet_text) + cta = get_links(tweet_text) + + result = { + 'username': username.lower(), + 'full_name': full_name, + 'twitter_url': response.url, + 'tweet_text': tweet_text, + 'tweet_time': str(date_time), + 'number_of_likes': str(likes), + 'no_of_retweets': str(retweets), + 'no_of_replies': str(replies), + 'mentions': ' | '.join(mentions), + 'no_of_mentions': str(len(mentions)), + 'hashtags': ' | '.join(hashtags), + 'no_of_hashtags': str(len(hashtags)), + 'call_to_action': ' | '.join(cta), + 'image_url': ' | '.join(image_list), + + } + yield result + diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py new file mode 100644 index 0000000000000000000000000000000000000000..0fc1b08f3e19f8b172072ef574af33e9a847a640 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/TwitterHashTagCrawler/spiders/hashtag3.py @@ -0,0 +1,142 @@ + +from operator import concat +import scrapy +import time +import pandas as pd +from scrapy.http.request import Request +from scrapy import signals +#from sentence_transformers import SentenceTransformer, util +#import numpy as np +#import yake +##import nltk.data +#import nltk +#nltk.download('punkt') +#from nltk.tokenize import sent_tokenize +#from keybert import KeyBERT +#from statistics import mean +#from urllib.parse import urlparse +#import random + + +from transformers import AutoTokenizer, AutoModel + +import torch + +from sklearn.metrics.pairwise import cosine_similarity + + +#text =[] +text2 ='' +res =[] +len_res = 0 +len_res2 = 0 + +list1 = [] +df = pd.DataFrame() +df_sim = pd.DataFrame() +allowed_domains = [] +list_start_urls = [] +list_companies = [] +index = 0 +len_df = 0 +mean_embedding = [] +list_df_sim = [] + +class BioSpider(scrapy.Spider): + name = "hashtag3" + + custom_settings = {'CONCURRENT_REQUESTS': '1', 'CONCURRENT_REQUESTS_PER_DOMAIN':'1', 'ROBOTSTXT_OBEY' : False \ + , 
"REQUEST_FINGERPRINTER_IMPLEMENTATION": "2.7", 'USER_AGENT':'my-cool-project (http://example.com)'} + + + global allowed_domains + allowed_domains = ["twitter.com"] + global list_start_urls + global list_companies + global res + global index + list_start_urls2 = [] + global len_df + global df + + #data=pd.read_excel("C:\\Users\\RSPRASAD\OneDrive - Danaher\\Bec_LS\\2023\\D_and_B_Project\\Segmentation\\Customer focus list 2023 NGW.xlsx", sheet_name='Sheet1') + + #df= data[['Company', 'Website']] + #df.drop_duplicates(inplace = True) + #df['Content'] = '' + + i = 0 + len_df = 1 + # for i in range(0, len(df)): + # #df.loc[i, 'company_name']= df.loc[i, 'Company'] + # #df.loc[i, 'company_website']= df.loc[i, 'Website'] + # list_start_urls.append(df.loc[i, 'Website']) + # list_companies.append(df.loc[i, 'Company']) + # domain = urlparse(df.loc[i, 'Website']).netloc + # allowed_domains.append(domain) + # print(allowed_domains) + # upper_len_websites = 5 + start_index = 0 + # if(len_df >upper_len_websites): + # list_start_urls= list_start_urls[start_index:upper_len_websites] + # df = df.iloc[start_index:upper_len_websites,:] + + + list_start_urls = ['https://twitter.com/hashtag/danaher/'] + # df = df.iloc[start_index:upper_len_websites,:] + # df = df.reset_index() + # df = df.drop('index', axis = 1) + # len_df = len(df) + + # print("Dataframe for crawling website is ..") + # print(df) + + + + print(list_start_urls) + + + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super(BioSpider, cls).from_crawler(crawler, *args, **kwargs) + #crawler.signals.connect(spider.spider_opened, signals.spider_opened) + crawler.signals.connect(spider.spider_closed, signals.spider_closed) + return spider + + def start_requests(self): + + global list_start_urls + global index + global res + + + index =0 + index2 = len(list_start_urls) + print(" i am in start_requests") + + + + try: + + yield Request(list_start_urls[0].strip(), callback = self.parse)#, meta={'priority': index2}) + + except Exception as e: + print("There is exception and exception is..",e) + + + + + def parse(self, response): + print("I am in parse..") + print("I am in find_tweets") + tweets = response.xpath('//*[@data-testid="tweetText"]/span[1]/text()').getall() + print("Tweets is...", tweets) + print(f'{len(tweets)} tweets found') + for tweet in tweets: + print(tweet) + count += 1 + if (count >5): + break + + def spider_closed(self, spider): + print("I am in spider closed...") diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..990ef76dff2c9326bec1ac44e594544b5dbe825c Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/__pycache__/utils.cpython-310.pyc differ diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/mydata.csv new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv new file mode 100644 index 0000000000000000000000000000000000000000..f2dcf43c8d4ba9718fd0b2e3addab939eb28753f Binary files /dev/null and b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtag.csv differ diff --git 
a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv new file mode 100644 index 0000000000000000000000000000000000000000..ebe48d54d8e7fe33f40b3a030b3acab9bb55b447 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/myhashtags.csv @@ -0,0 +1 @@ +danaher diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e62b6e9d5d327d41776f77e0014046fd5fece411 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/requirements.txt @@ -0,0 +1,2 @@ +scrapy +dateutil \ No newline at end of file diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv new file mode 100644 index 0000000000000000000000000000000000000000..bc292c4d6fd3bf756709e953211a65a474e08257 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/sampledata.csv @@ -0,0 +1,12 @@ +username,full_name,twitter_url,tweet_text,tweet_time,number_of_likes,no_of_retweets,no_of_replies,mentions,no_of_mentions,hashtags,no_of_hashtags,call_to_action,image_url +cctvasiapacific,CCTV Asia Pacific,https://twitter.com/CCTVAsiaPacific/status/1212269072328491008,"Turning off the stereotype of political faces, Hou Yanqi, the # ChineseAmbssdor to # Nepal , amazes Nepalese and gains popularity on twitter by posting her ad-like photos and wishes: ""True beauty always touches the deep heart"", said Hou. +२०२० नेपाल भ्रमाण वर्ष सफलताको शुभकामना pic.twitter.com/z0N8ru2vNd",2019-12-31 23:07:00,804,171,35,,0,,0,,https://pbs.twimg.com/media/ENLYSqlU4AAgiFh.jpg | https://pbs.twimg.com/media/ENLYSqoVAAASSS-.jpg | https://pbs.twimg.com/media/ENLYSqmU0AAZEyK.jpg +,,https://twitter.com/BishowParajuli/status/1213037950549626882,"Zimbabwe is beautiful! Glad to hear your mountain climbing adventure ; If you wish to climb further higher, another beautiful place is # Nepal ! You will you can also enjoy some terrific historical spots: pic.twitter.com/ofsCppyp0O",2020-01-03 02:02:00,27,3,1,,0,,0,,https://pbs.twimg.com/media/ENWTkzmUEAEKS1k.jpg | https://pbs.twimg.com/media/ENWTkznU4AAtVxK.jpg | https://pbs.twimg.com/media/ENWTkzoUwAEgMpX.jpg | https://pbs.twimg.com/media/ENWTkzlU4AEYxor.jpg +kopinoora,kpila,https://twitter.com/kopinoora/status/1213481511967690752,# VisitNepal2020 official inauguration at London Nepal Embassy. # pic.twitter.com/e4N9XulBH7,2020-01-04 07:25:00,3,,0,,0,,0,,https://pbs.twimg.com/media/ENcnABiXsAE7_sw.jpg | https://pbs.twimg.com/media/ENcnABsXUAAnuBL.jpg +mahbub_nazif,Nazif Mahbub,https://twitter.com/mahbub_nazif/status/1213328288271089664,"The joy of being Innocent. Durbar square, kathmandu, nepal pic.twitter.com/sbsfxTzeHN",2020-01-03 21:16:00,4,,0,,0,,0,,https://pbs.twimg.com/media/ENabn-uWwAcbUfb.jpg +prabhumteverest,Prastuti_प्रश्तुती,https://twitter.com/PrabhuMteverest/status/1213178026457878528,"Visit nepal2020. where heaven meets and you won't feel regret choosing Nepal as your destination + +We are eager to welcome you with our beautiful destinations and warm hospitality pic.twitter.com/l7GQfk2ha6",2020-01-03 11:19:00,5,,0,,0,,0,,https://pbs.twimg.com/media/ENYS_CLUwAAVypp.jpg +kashishds,Kashish Das Shrestha,https://twitter.com/kashishds/status/1213120581412876295,"Marpha bazaar, Mustang, Nepal. Today. 
+ +Requested my friend & Marpha resident Dipesh Hirachan for this clip. This is just outside his Apple orchard there. pic.twitter.com/oOFy88ylIt",2020-01-03 07:30:00,123,20,4,,0,,0,, diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..4d8f8c589ae72d8cabd0d0290b58e698dfae0af7 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.io/en/latest/deploy.html + +[settings] +default = TwitterHashTagCrawler.settings + +[deploy] +#url = http://localhost:6800/ +project = TwitterHashTagCrawler diff --git a/Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..c54d6ef1b730c78d9e4861b946741a6a5c7aa929 --- /dev/null +++ b/Hands-on-WebScraping/project1_twitter_hashtag_crawler/utils.py @@ -0,0 +1,67 @@ +import re + + +def find_emails(text): + """ + It will parse the given string and return a list of emails if found + + Example: + >>find_emails('hello\n find me here\nemail@gmail.com') + ['email@gmail.com'] + + :param text: string + :return: list + """ + return re.findall(r"([a-zA-Z0-9+._-]+@[a-zA-Z0-9._-]+\.[a-zA-Z0-9_-]+)", text) + + +def get_mentions(text): + """ + It will return mentions from the text i.e @someone + + :param text: string + :return: list + + example + >>> get_mentions('Hi @hero, How are you? I hope @hero2 is fine. BTW say hi to @heroine for me') + ['hero','hero2','heroine'] + """ + result = re.findall("(^|[^@\w])@(\w{1,15})", text) + if len(result) != 0: + result = [i[1] for i in result] + return result + + +def get_hashtags(text): + """ + It will return hashtags from the text i.e #something + + :param text: string + :return: list + + example + >>> get_hashtags('my first code #programmer #python #awesome #grepsr') + ['programmer','python','awesome','grepsr'] + """ + + result = re.findall("(^|[^@\w])#(\w{1,15})", text) + if len(result) != 0: + result = [i[1] for i in result] + return result + + +def get_links(text): + """ + It will return website links from the text + + :param text: string + :return: list + + example + >>> message = 'http://twitter.com Project URL: https://app.grepsr.com/app/project/message/70454' + >>> get_links(message) + ['http://twitter.com', 'https://app.grepsr.com/app/project/message/70454'] + + """ + result = re.findall(r"(?Phttps?://[^\s]+)", text) + return result diff --git a/README.md b/README.md index 623630051c7d6c86c84d803714e7b1776996d5ed..f1f4125180402638ec4fcdc7bee2487bdde7f031 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,6 @@ --- -title: Twitter Sentiment -emoji: 📉 -colorFrom: pink -colorTo: red +title: twitter_sentiment +app_file: test.py sdk: gradio sdk_version: 4.31.1 -app_file: app.py -pinned: false --- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference diff --git a/__pycache__/test.cpython-39.pyc b/__pycache__/test.cpython-39.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4fb27dff9b4fac104bc420111a9f70d68c1f7c68 Binary files /dev/null and b/__pycache__/test.cpython-39.pyc differ diff --git a/__pycache__/twitter_crawl.cpython-310.pyc b/__pycache__/twitter_crawl.cpython-310.pyc new file mode 
100644 index 0000000000000000000000000000000000000000..517eab5bb5ac5447753fa87223293e4af1c1dc71 Binary files /dev/null and b/__pycache__/twitter_crawl.cpython-310.pyc differ diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..bf59e0ab60564d62c1545465531f14a0e28cd28e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +streamlit +transformers==4.40.2 +tensorflow==2.16.1 +tweetnlp \ No newline at end of file diff --git a/scrapper.ipynb b/scrapper.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..d8e59de7ff00f55be5ce47ff0f82d865a45bed6f --- /dev/null +++ b/scrapper.ipynb @@ -0,0 +1,168 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "import csv\n", + "from getpass import getpass\n", + "from time import sleep\n", + "from selenium.webdriver.common.keys import Keys\n", + "from selenium.common.exceptions import NoSuchElementException\n", + "from msedge.selenium_tools import Edge, EdgeOptions " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def get_tweet_data(card):\n", + " \"\"\"Extract data from tweet card\"\"\"\n", + " username = card.find_element_by_xpath('.//span').text\n", + " try:\n", + " handle = card.find_element_by_xpath('.//span[contains(text(), \"@\")]').text\n", + " except NoSuchElementException:\n", + " return\n", + " \n", + " try:\n", + " postdate = card.find_element_by_xpath('.//time').get_attribute('datetime')\n", + " except NoSuchElementException:\n", + " return\n", + " \n", + " comment = card.find_element_by_xpath('.//div[2]/div[2]/div[1]').text\n", + " responding = card.find_element_by_xpath('.//div[2]/div[2]/div[2]').text\n", + " text = comment + responding\n", + " reply_cnt = card.find_element_by_xpath('.//div[@data-testid=\"reply\"]').text\n", + " retweet_cnt = card.find_element_by_xpath('.//div[@data-testid=\"retweet\"]').text\n", + " like_cnt = card.find_element_by_xpath('.//div[@data-testid=\"like\"]').text\n", + "\n", + " \n", + " tweet = (username, handle, postdate, text, reply_cnt, retweet_cnt, like_cnt)\n", + " return tweet " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "search_term = input('search term: ')\n", + "\n", + "# create instance of web driver\n", + "options = EdgeOptions()\n", + "options.use_chromium = True\n", + "driver = Edge(options=options)\n", + "\n", + "# navigate to login screen\n", + "driver.get('https://twitter.com/search')\n", + "driver.maximize_window()\n", + "sleep(5)\n", + "\n", + "# find search input and search for term\n", + "search_input = driver.find_element_by_xpath('//input[@aria-label=\"Search query\"]')\n", + "search_input.send_keys(search_term)\n", + "search_input.send_keys(Keys.RETURN)\n", + "sleep(1)\n", + "\n", + "# navigate to historical 'latest' tab\n", + "driver.find_element_by_link_text('Latest').click()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# get all tweets on the page\n", + "data = []\n", + "tweet_ids = set()\n", + "last_position = driver.execute_script(\"return window.pageYOffset;\")\n", + "scrolling = True\n", + "\n", + "while scrolling:\n", + " page_cards = driver.find_elements_by_xpath('//article[@data-testid=\"tweet\"]')\n", + " for card in page_cards[-15:]:\n", + " tweet = get_tweet_data(card)\n", + " if tweet:\n", + " tweet_id 
= ''.join(tweet)\n", + " if tweet_id not in tweet_ids:\n", + " tweet_ids.add(tweet_id)\n", + " data.append(tweet)\n", + " \n", + " scroll_attempt = 0\n", + " while True:\n", + " # check scroll position\n", + " driver.execute_script('window.scrollTo(0, document.body.scrollHeight);')\n", + " sleep(2)\n", + " curr_position = driver.execute_script(\"return window.pageYOffset;\")\n", + " if last_position == curr_position:\n", + " scroll_attempt += 1\n", + " \n", + " # end of scroll region\n", + " if scroll_attempt >= 3:\n", + " scrolling = False\n", + " break\n", + " else:\n", + " sleep(2) # attempt another scroll\n", + " else:\n", + " last_position = curr_position\n", + " break\n", + "\n", + "# close the web driver\n", + "driver.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Saving the tweet data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "with open('turkcell_tweets.csv', 'w', newline='', encoding='utf-8') as f:\n", + " header = ['UserName', 'Handle', 'Timestamp', 'Text', 'Comments', 'Likes', 'Retweets']\n", + " writer = csv.writer(f)\n", + " writer.writerow(header)\n", + " writer.writerows(data)" + ] + } + ], + "metadata": { + "interpreter": { + "hash": "306b4709344c791e982a258cf5494139869959872aa39c2c4102a54cca0d2138" + }, + "kernelspec": { + "display_name": "Python 3.7.0 64-bit", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/scrapy-twitter/.gitignore b/scrapy-twitter/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..2d493b6ecedcce33d9ebf5b85331c7ab1caeec57 --- /dev/null +++ b/scrapy-twitter/.gitignore @@ -0,0 +1,187 @@ + +# Created by https://www.gitignore.io/api/vim,linux,python,windows,sublimetext + +_data + +### Linux ### +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +env/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*,cover +.hypothesis/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# dotenv +.env + +# virtualenv +.venv +venv/ +ENV/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +### SublimeText ### +# cache files for sublime text +*.tmlanguage.cache +*.tmPreferences.cache +*.stTheme.cache + +# workspace files are user-specific +*.sublime-workspace + +# project files should be checked into the repository, unless a significant +# proportion of contributors will probably not be using SublimeText +# *.sublime-project + +# sftp configuration file +sftp-config.json + +# Package control specific files +Package Control.last-run +Package Control.ca-list +Package Control.ca-bundle +Package Control.system-ca-bundle +Package Control.cache/ +Package Control.ca-certs/ +Package Control.merged-ca-bundle +Package Control.user-ca-bundle +oscrypto-ca-bundle.crt +bh_unicode_properties.cache + +# Sublime-github package stores a github token in this file +# https://packagecontrol.io/packages/sublime-github +GitHub.sublime-settings + +### Vim ### +# swap +[._]*.s[a-v][a-z] +[._]*.sw[a-p] +[._]s[a-v][a-z] +[._]sw[a-p] +# session +Session.vim +# temporary +.netrwhist +# auto-generated tag files +tags + +### Windows ### +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# End of https://www.gitignore.io/api/vim,linux,python,windows,sublimetext diff --git a/scrapy-twitter/LICENSE.md b/scrapy-twitter/LICENSE.md new file mode 100644 index 0000000000000000000000000000000000000000..e11b346d627a08e0c9566f767127a64bfed9678d --- /dev/null +++ b/scrapy-twitter/LICENSE.md @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) [2017] [Kevin Lloyd H. Bernal] + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/scrapy-twitter/README.md b/scrapy-twitter/README.md new file mode 100644 index 0000000000000000000000000000000000000000..3fea8d704271f44c14a6432c652864344215fc74 --- /dev/null +++ b/scrapy-twitter/README.md @@ -0,0 +1,20 @@ +This is a web scraper for fetching tweets from a list of user accounts, +without using twitter's API to avoid its rate limiting. + +## USAGE + +`scrapy crawl twitter -a urls_file=url.txt -a urls_link=https://pastebin.com/raw/XXX123 -a combine_urls=True` + +**Parameters**|**Description** +:-----:|:-----: +urls_file|local path to file +urls_link|link to an online resource +combine_urls|*Optional*. Links from both *urls_file* and *urls_link* are combined. *Default: False* + +Both `urls_file` and `urls_link` must only contain links which are newline separated. + +## MOTIVATION + +I use this personally to keep track of twitter users who consistently tweet stock trading +speculations for the **Philippine Stock Exchange** (*PSE*). Spiders in this project are +deployed on my personal Scrapinghub platform. diff --git a/scrapy-twitter/requirements.txt b/scrapy-twitter/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..6359ca5585b19f4e51fa4dc5915ec3676e4ffcae --- /dev/null +++ b/scrapy-twitter/requirements.txt @@ -0,0 +1,3 @@ +Scrapy==1.3.3 +requests==2.13.0 +shub==2.5.1 diff --git a/scrapy-twitter/scrapy.cfg b/scrapy-twitter/scrapy.cfg new file mode 100644 index 0000000000000000000000000000000000000000..9cb6b03ad1f098cb45640db18e1330639c9ba18a --- /dev/null +++ b/scrapy-twitter/scrapy.cfg @@ -0,0 +1,11 @@ +# Automatically created by: scrapy startproject +# +# For more information about the [deploy] section see: +# https://scrapyd.readthedocs.org/en/latest/deploy.html + +[settings] +default = twitter.settings + +[deploy] +#url = http://localhost:6800/ +project = twitter diff --git a/scrapy-twitter/twitter/__init__.py b/scrapy-twitter/twitter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/scrapy-twitter/twitter/__pycache__/__init__.cpython-310.pyc b/scrapy-twitter/twitter/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6aa475a07dd204554471d683d855f34d9a016870 Binary files /dev/null and b/scrapy-twitter/twitter/__pycache__/__init__.cpython-310.pyc differ diff --git a/scrapy-twitter/twitter/__pycache__/pipelines.cpython-310.pyc b/scrapy-twitter/twitter/__pycache__/pipelines.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c48c141e39c1158a175ff676294eba35e3a60c85 Binary files /dev/null and b/scrapy-twitter/twitter/__pycache__/pipelines.cpython-310.pyc differ diff --git a/scrapy-twitter/twitter/__pycache__/settings.cpython-310.pyc b/scrapy-twitter/twitter/__pycache__/settings.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..264d603c79740df3aabf75983ad602ca43c56146 Binary files /dev/null and b/scrapy-twitter/twitter/__pycache__/settings.cpython-310.pyc differ diff --git a/scrapy-twitter/twitter/items.py b/scrapy-twitter/twitter/items.py new file mode 100644 index 0000000000000000000000000000000000000000..0e211dd1d91bfee6dd9dbfa8e0c229bbc0cec78c --- /dev/null +++ b/scrapy-twitter/twitter/items.py @@ -0,0 +1,14 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your scraped items +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/items.html + +import scrapy + + +class TwitterItem(scrapy.Item): + # 
define the fields for your item here like: + # name = scrapy.Field() + pass diff --git a/scrapy-twitter/twitter/middlewares.py b/scrapy-twitter/twitter/middlewares.py new file mode 100644 index 0000000000000000000000000000000000000000..780890c82c4c3531cd0dd5a4ab0b0bdd3ad87b94 --- /dev/null +++ b/scrapy-twitter/twitter/middlewares.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- + +# Define here the models for your spider middleware +# +# See documentation in: +# http://doc.scrapy.org/en/latest/topics/spider-middleware.html + +from scrapy import signals + + +class TwitterSpiderMiddleware(object): + # Not all methods need to be defined. If a method is not defined, + # scrapy acts as if the spider middleware does not modify the + # passed objects. + + @classmethod + def from_crawler(cls, crawler): + # This method is used by Scrapy to create your spiders. + s = cls() + crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) + return s + + def process_spider_input(response, spider): + # Called for each response that goes through the spider + # middleware and into the spider. + + # Should return None or raise an exception. + return None + + def process_spider_output(response, result, spider): + # Called with the results returned from the Spider, after + # it has processed the response. + + # Must return an iterable of Request, dict or Item objects. + for i in result: + yield i + + def process_spider_exception(response, exception, spider): + # Called when a spider or process_spider_input() method + # (from other spider middleware) raises an exception. + + # Should return either None or an iterable of Response, dict + # or Item objects. + pass + + def process_start_requests(start_requests, spider): + # Called with the start requests of the spider, and works + # similarly to the process_spider_output() method, except + # that it doesn’t have a response associated. + + # Must return only requests (not items). + for r in start_requests: + yield r + + def spider_opened(self, spider): + spider.logger.info('Spider opened: %s' % spider.name) diff --git a/scrapy-twitter/twitter/pipelines.py b/scrapy-twitter/twitter/pipelines.py new file mode 100644 index 0000000000000000000000000000000000000000..9d3869f2a48d69874d5a7c6d7237b04b0218cd50 --- /dev/null +++ b/scrapy-twitter/twitter/pipelines.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- + +# Define your item pipelines here +# +# Don't forget to add your pipeline to the ITEM_PIPELINES setting +# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html + +import re +import os +import json + +from scrapy.exceptions import DropItem +from scrapy.conf import settings + +class FilterTweetsPipeline(object): + """This drops items that are either of the following: + * retweets + * tweet having content less than the specified in the settings + """ + + def __init__(self): + self.min_tweet_length = settings['MIN_TWEET_LENGTH'] + + + def process_item(self, item, spider): + + tweet = item['tweet'] + + if not self._is_retweet(tweet): + raise DropItem("item is a retweet.") + + if not self._has_enough_content(tweet, self.min_tweet_length): + raise DropItem("item has less than {} characters.".format(self.min_tweet_length)) + + return item + + + def _is_retweet(self, tweet): + """This determines if a certain tweet is a retweet since it contains the 'data-retweet-id' attribute. 
+ + :returns: True if tweet is a retweet; False otherwise + """ + + if tweet.css('::attr(data-retweet-id)').extract_first() is None: + return True + return False + + + def _has_enough_content(self, tweet, length): + """This returns False if a tweet contains characters which are less than the specified length.""" + + content = tweet.css('p.tweet-text::text').extract_first() + + if content is None or len(content) < length: + return False + return True + + +class DataShapePipeline(object): + """This extracts the necessary text-data from the Selectors returned by Spiders.""" + + def process_item(self, item, spider): + + data = { + 'tweet_id': item['tweet'].css('::attr(data-tweet-id)').extract_first(), + 'user': item['user'], + 'time_epoch':item['tweet'].css('span._timestamp::attr(data-time)').extract_first(), + 'tweet': ''.join(item['tweet'].css('p.tweet-text ::text').extract()) + } + + return data + + +class CleanTweetsPipeline(object): + """This removes unnecessary texts in tweet texts.""" + + # any substrings matched in these regexes are removed from the tweet itself + REGEX = [ + # This matches urls starting in 'http' + re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'), + + # This are for the links included in tweets with images uploaded + re.compile(r'pic\.twitter\.com\/[a-zA-Z1-9]+') + ] + + def process_item(self, item, spider): + + for rgx in self.REGEX: + match = rgx.search(item['tweet']) + + if match is not None: + item['tweet'] = rgx.sub('', item['tweet']) + spider.log("'{0}' has been removed from the tweet.".format(match.group())) + + return item + + +class FileSavePipeline(object): + """This pipeline saves the data parsed into a file""" + + def __init__(self): + self.save_path = settings['SAVE_PATH']['tweets'] + self.output_file = os.path.join(self.save_path, 'output.json') + + + def open_spider(self, spider): + self.file = open(self.output_file, 'w') + + + def close_spider(self, spider): + self.file.close() + + + def process_item(self, item, spider): + self._write_to_file(item) + return item + + + def _write_to_file(self, item): + line = json.dumps(dict(item)) + "\n" + self.file.write(line) diff --git a/scrapy-twitter/twitter/settings.py b/scrapy-twitter/twitter/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..0b3c732decd04bf33bcee6338737dce429f1d9e5 --- /dev/null +++ b/scrapy-twitter/twitter/settings.py @@ -0,0 +1,101 @@ +# -*- coding: utf-8 -*- + +# Scrapy settings for twitter project +# +# For simplicity, this file contains only settings considered important or +# commonly used. 
You can find more settings consulting the documentation: +# +# http://doc.scrapy.org/en/latest/topics/settings.html +# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html + +BOT_NAME = 'twitter' + +SPIDER_MODULES = ['twitter.spiders'] +NEWSPIDER_MODULE = 'twitter.spiders' + + +# Crawl responsibly by identifying yourself (and your website) on the user-agent +#USER_AGENT = 'twitter (+http://www.yourdomain.com)' + +# Obey robots.txt rules +ROBOTSTXT_OBEY = True + +# Configure maximum concurrent requests performed by Scrapy (default: 16) +#CONCURRENT_REQUESTS = 32 + +# Configure a delay for requests for the same website (default: 0) +# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay +# See also autothrottle settings and docs +DOWNLOAD_DELAY = 3 +# The download delay setting will honor only one of: +#CONCURRENT_REQUESTS_PER_DOMAIN = 16 +#CONCURRENT_REQUESTS_PER_IP = 16 + +# Disable cookies (enabled by default) +#COOKIES_ENABLED = False + +# Disable Telnet Console (enabled by default) +#TELNETCONSOLE_ENABLED = False + +# Override the default request headers: +#DEFAULT_REQUEST_HEADERS = { +# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', +# 'Accept-Language': 'en', +#} + +# Enable or disable spider middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html +#SPIDER_MIDDLEWARES = { +# 'twitter.middlewares.TwitterSpiderMiddleware': 543, +#} + +# Enable or disable downloader middlewares +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html +#DOWNLOADER_MIDDLEWARES = { +# 'twitter.middlewares.MyCustomDownloaderMiddleware': 543, +#} + +# Enable or disable extensions +# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html +#EXTENSIONS = { +# 'scrapy.extensions.telnet.TelnetConsole': None, +#} + +# Configure item pipelines +# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html +ITEM_PIPELINES = { + 'twitter.pipelines.FilterTweetsPipeline': 200, + 'twitter.pipelines.DataShapePipeline': 300, + 'twitter.pipelines.CleanTweetsPipeline': 400, + 'twitter.pipelines.FileSavePipeline': 500 +} + +SAVE_PATH = { + 'tweets' : './_data/tweets/' +} + +MIN_TWEET_LENGTH = 40 + +CLOSESPIDER_ITEMCOUNT = 50000 + +# Enable and configure the AutoThrottle extension (disabled by default) +# See http://doc.scrapy.org/en/latest/topics/autothrottle.html +#AUTOTHROTTLE_ENABLED = True +# The initial download delay +#AUTOTHROTTLE_START_DELAY = 5 +# The maximum download delay to be set in case of high latencies +#AUTOTHROTTLE_MAX_DELAY = 60 +# The average number of requests Scrapy should be sending in parallel to +# each remote server +#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 +# Enable showing throttling stats for every response received: +#AUTOTHROTTLE_DEBUG = False + +# Enable and configure HTTP caching (disabled by default) +# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings +#HTTPCACHE_ENABLED = True +#HTTPCACHE_EXPIRATION_SECS = 0 +#HTTPCACHE_DIR = 'httpcache' +#HTTPCACHE_IGNORE_HTTP_CODES = [] +#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' diff --git a/scrapy-twitter/twitter/spiders/__init__.py b/scrapy-twitter/twitter/spiders/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca581dc70652bc451062dd6efa6d8b4d3848a75 --- /dev/null +++ 
b/scrapy-twitter/twitter/spiders/__init__.py @@ -0,0 +1,4 @@ +# This package will contain the spiders of your Scrapy project +# +# Please refer to the documentation for information on how to create and manage +# your spiders. diff --git a/scrapy-twitter/twitter/spiders/__pycache__/__init__.cpython-310.pyc b/scrapy-twitter/twitter/spiders/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e4b4dd98d189f3cd4f0bcc8ef72761a76857f580 Binary files /dev/null and b/scrapy-twitter/twitter/spiders/__pycache__/__init__.cpython-310.pyc differ diff --git a/scrapy-twitter/twitter/spiders/__pycache__/twitter.cpython-310.pyc b/scrapy-twitter/twitter/spiders/__pycache__/twitter.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e6575f7e0b8ff07c27b7e62d05efd30dbe4c5e29 Binary files /dev/null and b/scrapy-twitter/twitter/spiders/__pycache__/twitter.cpython-310.pyc differ diff --git a/scrapy-twitter/twitter/spiders/twitter.py b/scrapy-twitter/twitter/spiders/twitter.py new file mode 100644 index 0000000000000000000000000000000000000000..e72f0cf76623a6c5130c91d4032cf7e92917cbb7 --- /dev/null +++ b/scrapy-twitter/twitter/spiders/twitter.py @@ -0,0 +1,143 @@ +import re +import json +import scrapy +import requests + +from scrapy.selector import Selector + +class TwiterUserSpider(scrapy.Spider): + name = "twitter" + allowed_domains = ["twitter.com"] + + def __init__(self, urls_file=None, urls_link=None, combine_urls=False): + self.page_position_rgx = re.compile(r'data-min-position="([^"]+?)"') + self.scroll_content = 'https://twitter.com/i/profiles/show/{user}' \ + '/timeline/tweets?include_available_features=1' \ + '&include_entities=1&max_position={page_position}' \ + '&reset_error_state=false' + + self.urls = self._populate_urls(urls_file, urls_link, combine_urls) + + + def start_requests(self): + for url in self.urls: + yield scrapy.Request(url=url, callback=self.parse) + + + def parse(self, response): + user = self._get_user(response) + + yield from self._yield_tweets(user, response) + yield from self._load_scroll_content(user, response) + + + def _load_scroll_content(self, user, response, page_position=None): + """Twiter has an infinite scroll style of loading more tweets on a single page. + + This triggers that dynamic content loading. 
+ """ + + url_for_scroll_load = self._get_scroll_url(user, response, page_position) + + request = scrapy.Request(url=url_for_scroll_load, callback=self.parse_scroll_content) + request.meta['user'] = user + yield request + + + def parse_scroll_content(self, response): + + user = response.meta['user'] + + raw_scroll_data = json.loads(response.body) + + html_page = Selector(text=raw_scroll_data['items_html']) + page_position = raw_scroll_data['min_position'] + + yield from self._yield_tweets(user, html_page) + + if raw_scroll_data['has_more_items'] is False: + return + + yield from self._load_scroll_content(user, response, page_position) + + + def _yield_tweets(self, user, response): + for tweet in response.css('div.tweet'): + yield { + 'tweet': tweet, + 'user': user + } + + + def _get_scroll_url(self, user, response, page_position=None): + """This returns the URL that contains the next set of tweet data.""" + + if page_position is None: + page_position = self._get_page_position(response) + + return self.scroll_content.format(user=user, page_position=page_position) + + + def _get_page_position(self, response): + """This parses the crawled response content to find and return the number contained in data-min-positon.""" + + rgx_match = self.page_position_rgx.search(response.body.decode('utf-8')) + captured_groups = rgx_match.groups() + + if len(captured_groups) > 0: + return captured_groups[0] + + + def _get_user(self, response): + """On the base page of the requested URL, this returns the username that belongs to the URL.""" + + user = response.css('h1.ProfileHeaderCard-name a::attr(href)').extract_first() + return user[1:] # remove preceeding slash + + + def _populate_urls(self, urls_file, urls_link, combine_urls): + """This return the list urls to crawl based on the arguments provided in the command line. + + Moreover, urls from both resources can be opted to be combined or not via the `combine` boolean flag. + Combining them would need both url resource arguments to be present. + + Consequently, if `combine_urls` is set to False, it takes the following precedence based on availability: + 1. FILE + 2. 
LINK + """ + + from_file = self._read_url_file(urls_file) + from_link = self._read_url_link(urls_link) + + if combine_urls: + if not (from_file and from_link): + raise AttributeError("URL resources from file and link must both be present to combine.") + return list(set(from_file + from_link)) + + # not combining URLS would have employ the precedence + return from_file or from_link + + + def _read_url_file(self, urls_file): + if urls_file is None: + return None + + urls = [] + + with open(urls_file, 'r') as f: + for line in f: + urls.append(line.strip()) + + return urls + + + def _read_url_link(self, urls_link): + if urls_link is None: + return None + + req = requests.get(urls_link) + + if req.status_code != 200: + return None + + return [url.strip() for url in req.text.split('\n')] diff --git a/test.py b/test.py new file mode 100644 index 0000000000000000000000000000000000000000..228eff77f34cefb481855ed9007ec29b403fc73e --- /dev/null +++ b/test.py @@ -0,0 +1,121 @@ +import gradio as gr +from selenium.webdriver.common.keys import Keys +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver import Chrome, ChromeOptions +import time +import pandas as pd +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +import tweetnlp +from time import sleep + + + + + + + + + +global name +global password +global topic + +def login_and_work(username_text, password_text, topic_text): + options = ChromeOptions() + options.use_chromium = True + driver = Chrome() + driver.get('https://twitter.com/i/flow/login') + time.sleep(5) + username = driver.find_element("xpath",'//input[@name="text"]') + username.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[3]').click() # in prevoius block + + try: + time.sleep(10) + y = driver.find_element("xpath",'//h1//span') + if(y.text =='Enter your phone number or username'): + username2 = driver.find_element("xpath",'//input[@name="text"]') + username2.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[2]').click() + except: + + x = 10 + + password = driver.find_element("xpath",'//input[@name="password"]') + password.send_keys(password_text) + driver.find_element("xpath",'(//*[@role = "button"])[4]').click() + + + try: + driver.maximize_window() + time.sleep(10) + driver.maximize_window() + search = driver.find_element("xpath",'//input[@placeholder="Search"]') + search.send_keys(Keys.CONTROL + "a") + search.send_keys(Keys.DELETE) + search.send_keys(topic_text) + search.send_keys(Keys.RETURN) + + df = pd.DataFrame() + df['Name']='' + df['Tweet'] = '' + #df['Sentiment'] = '' + tweet_count = 6 + i = 1 + element = driver.find_element("xpath","//body") + time.sleep(20) + while True and i <= tweet_count: + + try: + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + #print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + except: + time.sleep(10) + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + 
#print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + + + #model = tweetnlp.load_model('sentiment', multilingual=True) # Or `model = tweetnlp.Sentiment()` + + # for i in range(1,len(df)+1): + # print("text is...", df.loc[i,'Tweet']) + + + # #y = sentiment_pipeline(df.loc[i, 'Tweet']) + # #print('label is..', y[0]['label']) + # print('model sentiment is..', model.sentiment(df.loc[i, 'Tweet'])) + # df.loc[i, 'Sentiment'] = str(model.sentiment(df.loc[i, 'Tweet'])) + + return df + + except Exception as e: + x = 10 + +with gr.Blocks() as demo: + + name = gr.Textbox(label="Username") + password = gr.Textbox(label="Password", type = 'password') + topic= gr.Textbox(label="Topic") + + btn = gr.Button(value="Submit") + btn.click(login_and_work, inputs=[name, password, topic], outputs= gr.Dataframe(headers=['Name', 'Tweet'])) + +demo.launch() \ No newline at end of file diff --git a/tw_s2.py b/tw_s2.py new file mode 100644 index 0000000000000000000000000000000000000000..2eaac6b47d4fc9aae8937e281f69d982a0de7ff6 --- /dev/null +++ b/tw_s2.py @@ -0,0 +1,162 @@ +import streamlit as st +import random +import toml +import streamlit as st +import pandas as pd +from datetime import date +import re +import csv +from getpass import getpass +from time import sleep +from selenium import webdriver +from selenium.webdriver.chrome.options import Options +from selenium.webdriver.chrome.service import Service +from webdriver_manager.chrome import ChromeDriverManager +from webdriver_manager.core.os_manager import ChromeType + +from selenium.webdriver.common.keys import Keys +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver import Chrome, ChromeOptions +import time +import pandas as pd +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +import tweetnlp + + + +df = pd.DataFrame() +options = '' + +@st.experimental_singleton +def get_driver(): + global options + return webdriver.Chrome( + service=Service( + ChromeDriverManager(chrome_type=ChromeType.CHROMIUM).install() + ), + options=options + ) + + + + + +def login_twitter(username_text, password_text, topic_text): + # global options + # options = Options() + # options.add_argument("--disable-gpu") + # options.add_argument("--headless") + # driver = get_driver() + # options.use_chromium = True + options = ChromeOptions() + options.use_chromium = True + driver = webdriver.Chrome() + + driver.get('https://twitter.com/i/flow/login') + time.sleep(5) + username = driver.find_element("xpath",'//input[@name="text"]') + username.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[3]').click() # in prevoius block + + try: + time.sleep(10) + y = driver.find_element("xpath",'//h1//span') + if(y.text =='Enter your phone number or username'): + username2 = driver.find_element("xpath",'//input[@name="text"]') + username2.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[2]').click() + except: + st.write(" I am in exception and didnt get 'Enter your phone number or username'") + x = 10 + + password = driver.find_element("xpath",'//input[@name="password"]') + password.send_keys(password_text) + driver.find_element("xpath",'(//*[@role = "button"])[4]').click() + + + try: + driver.maximize_window() + time.sleep(10) + driver.maximize_window() + search = 
driver.find_element("xpath",'//input[@placeholder="Search"]') + search.send_keys(Keys.CONTROL + "a") + search.send_keys(Keys.DELETE) + search.send_keys(topic_text) + search.send_keys(Keys.RETURN) + + df = pd.DataFrame() + df['Name']='' + df['Tweet'] = '' + df['Sentiment'] = '' + tweet_count = 6 + i = 1 + element = driver.find_element("xpath","//body") + time.sleep(20) + while True and i <= tweet_count: + + try: + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + #print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + except: + time.sleep(10) + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + #print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + + + model = tweetnlp.load_model('sentiment', multilingual=True) # Or `model = tweetnlp.Sentiment()` + + for i in range(1,len(df)+1): + print("text is...", df.loc[i,'Tweet']) + + + #y = sentiment_pipeline(df.loc[i, 'Tweet']) + #print('label is..', y[0]['label']) + print('model sentiment is..', model.sentiment(df.loc[i, 'Tweet'])) + df.loc[i, 'Sentiment'] = str(model.sentiment(df.loc[i, 'Tweet'])) + st.dataframe(df) + + + except Exception as e: + st.write(e) + # driver.maximize_window() + # time.sleep(10) + # driver.maximize_window() + # search = driver.find_element("xpath",'//input[@placeholder="Search"]') + # search.send_keys(Keys.CONTROL + "a") + # search.send_keys(Keys.DELETE) + # search.send_keys(topic_text) + + + + +with st.sidebar: + username = st.text_input("Username") + password_text = st.text_input("Password", type = "password") + topic = st.text_input("topic") + connect = st.button("Login Twitter",\ + on_click = login_twitter, + args = [username, password_text, topic] + ) + if('is_ready' not in st.session_state): + st.session_state['is_ready'] = False + + if(st.session_state['is_ready'] == True): + st.write('Connected!') \ No newline at end of file diff --git a/twitter_crawl.py b/twitter_crawl.py new file mode 100644 index 0000000000000000000000000000000000000000..556b1267730c5a570a7b4d071e2e9d1db7ba0539 --- /dev/null +++ b/twitter_crawl.py @@ -0,0 +1,43 @@ +import scrapy + +class TwitterSpider(scrapy.Spider): + name = "twitter_spider" + start_urls = [ + "https://twitter.com/search?q=Scrapy" + ] + + def start_requests(self): + for url in self.start_urls: + yield scrapy.Request(url, callback=self.parse) + + def parse(self, response): + # Extract the tweets from the page + tweets = response.css('.tweet-text::text').getall() + + # Print the tweets + for tweet in tweets: + print(tweet) + + # Find the URL of the next page of search results + next_page = response.css('.next-page::attr(href)').get() + + # Check if there is a next page + if next_page: + # Send a request to the next page + yield scrapy.Request(response.urljoin(next_page), callback=self.parse_page) + + def parse_page(self, response): + # Extract the tweets from the page + tweets = response.css('.tweet-text::text').getall() + + # Print the tweets + for tweet in tweets: + print(tweet) + 
+ # Find the URL of the next page of search results + next_page = response.css('.next-page::attr(href)').get() + + # Check if there is a next page + if next_page: + # Send a request to the next page + yield scrapy.Request(response.urljoin(next_page), callback=self.parse_page) \ No newline at end of file diff --git a/twitter_scraper_without_API/.github/workflows/python-app.yml b/twitter_scraper_without_API/.github/workflows/python-app.yml new file mode 100644 index 0000000000000000000000000000000000000000..b85b537df6a0deb8975b9cbce3b3f99c4a7b8e7b --- /dev/null +++ b/twitter_scraper_without_API/.github/workflows/python-app.yml @@ -0,0 +1,61 @@ +# This workflow will install Python dependencies, run tests and lint with a single version of Python +# For more information see: https://help.github.com/actions/language-and-framework-guides/using-python-with-github-actions + +name: Python application + +on: + push: + branches: [ main ] + pull_request: + branches: [ main ] + +permissions: + contents: read + +jobs: + build: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python 3.10 + uses: actions/setup-python@v3 + with: + python-version: "3.10" + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install flake8 pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + - name: Lint with flake8 + run: | + # stop the build if there are Python syntax errors or undefined names + flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics + # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide + flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + #- name: Test with pytest + # run: | + # pytest + + deploy: + + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v3 + - name: Set up Python + uses: actions/setup-python@v3 + with: + python-version: '3.x' + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install build + - name: Build package + run: python -m build + - name: Publish package + uses: pypa/gh-action-pypi-publish@27b31702a0e7fc50959f5ad993c78deac1bdfc29 + with: + user: __token__ + password: ${{ secrets.PYPI_API_TOKEN }} \ No newline at end of file diff --git a/twitter_scraper_without_API/.gitignore b/twitter_scraper_without_API/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..64669d82f760fc2c830f0f7885b7f460dd4c4733 --- /dev/null +++ b/twitter_scraper_without_API/.gitignore @@ -0,0 +1,3 @@ +firefox-geckodriver/* +geckodriver.log +src/twitter_scraper_without_api/__pycache__/* diff --git a/twitter_scraper_without_API/MANIFEST.in b/twitter_scraper_without_API/MANIFEST.in new file mode 100644 index 0000000000000000000000000000000000000000..54fa53db6387bcf37828eb871bac3d09c7d00066 --- /dev/null +++ b/twitter_scraper_without_API/MANIFEST.in @@ -0,0 +1,3 @@ +include README.MD +include LICENS +include requirements.txt \ No newline at end of file diff --git a/twitter_scraper_without_API/README.MD b/twitter_scraper_without_API/README.MD new file mode 100644 index 0000000000000000000000000000000000000000..2417070d459c5d12f91ac1685295a8d05f3b2039 --- /dev/null +++ b/twitter_scraper_without_API/README.MD @@ -0,0 +1,61 @@ +

Twitter scraper selenium

+

A Python package to scrape Twitter's front end easily with Selenium.

+
+
+[![PyPI license](https://img.shields.io/pypi/l/ansicolortags.svg)](https://opensource.org/licenses/MIT) [![Python >=3.6.9](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/)
+[![Maintenance](https://img.shields.io/badge/Maintained-Yes-green.svg)](https://github.com/shaikhsajid1111/facebook_page_scraper/graphs/commit-activity)
+
+# Twitter_scraper_without_API
+
+This package extracts information from Twitter without using the official API, which has usage limits and costs. You can search by keyword and time frame (in minutes) and extract an unlimited number of tweets.
+
+## Prerequisites
+
+ - Python 3.6+
+ - Firefox browser
+
+## Installation
+
+You can install from source:
+
+    git clone https://github.com/HamedMinaeizaeim/twitter_scraper_without_API.git
+
+and then run
+
+    python setup.py install
+
+or
+
+    pip install -r requirements.txt
+
+Alternatively, you can install from **PyPI**:
+
+    pip install twitter_scraper_without_API
+
+## How to use
+
+To use this library, import the TwitterScraper class and specify your search keyword. By default, it returns tweets from the last minute; set `last_n_mins` to extract tweets from the last n minutes:
+
+    from src.twitter_scraper_without_api import TwitterScraper
+    twitter = TwitterScraper('bitcoin')
+    twitter.last_n_mins = 3
+    twitter.fetch_data()
+
+## Export options
+
+You can export the data as JSON, a pandas DataFrame, or CSV:
+
+    df = twitter.store_data('dataFrame')
+    csv = twitter.store_data('csv')
+    json = twitter.store_data('json')
+
+## Privacy
+
+The search only uses publicly available information, so the library raises no privacy concerns.
diff --git a/twitter_scraper_without_API/firefox-geckodriver/geckodriver-v0.34.0.zip b/twitter_scraper_without_API/firefox-geckodriver/geckodriver-v0.34.0.zip
new file mode 100644
index 0000000000000000000000000000000000000000..779828df014decb22935b6d7a5b8acd278f5e190
--- /dev/null
+++ b/twitter_scraper_without_API/firefox-geckodriver/geckodriver-v0.34.0.zip
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7d748367f7ed501a114fbbd5152a206061923db7276930e4e9739d29b857220f
+size 1837855
diff --git a/twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe b/twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe
new file mode 100644
index 0000000000000000000000000000000000000000..640bcb7840af3270f10f72c3cb1184b26579e431
--- /dev/null
+++ b/twitter_scraper_without_API/firefox-geckodriver/geckodriver.exe
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:989c9e22c08924ecb0ce8901889dcb4dc8db33b0b4c8c88ffea38fe89f04c6aa
+size 4418464
diff --git a/twitter_scraper_without_API/geckodriver.log b/twitter_scraper_without_API/geckodriver.log
new file mode 100644
index 0000000000000000000000000000000000000000..01f965850ff523d51adc0d3dd99c8921e283af41
--- /dev/null
+++ b/twitter_scraper_without_API/geckodriver.log
@@ -0,0 +1,2 @@
+1715008570261 geckodriver INFO Listening on 127.0.0.1:60016
+1715008621681 geckodriver INFO Listening on 127.0.0.1:60045
diff --git a/twitter_scraper_without_API/pyproject.toml b/twitter_scraper_without_API/pyproject.toml
new file mode 100644
index 0000000000000000000000000000000000000000..29df43b135c8baa8ec1b1d3197d45de5fd4055e8
--- /dev/null
+++ b/twitter_scraper_without_API/pyproject.toml
@@ -0,0 +1,3 @@
+[build-system]
+requires = ["setuptools>=42"] +build-backend = "setuptools.build_meta" \ No newline at end of file diff --git a/twitter_scraper_without_API/requirements.txt b/twitter_scraper_without_API/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..32fc842cf5739316b0f3f592f1eef731196a0939 --- /dev/null +++ b/twitter_scraper_without_API/requirements.txt @@ -0,0 +1,53 @@ +async-generator==1.10 +attrs==21.4.0 +beautifulsoup4==4.11.1 +build==0.7.0 +certifi==2022.5.18.1 +cffi==1.15.0 +charset-normalizer==2.0.12 +colorama==0.4.4 +cryptography==37.0.2 +distlib==0.3.4 +distro==1.7.0 +filelock==3.7.0 +h11==0.13.0 +idna==3.3 +importlib-metadata==4.11.3 +mozdownload==1.26.0 +mozfile==2.1.0 +mozinfo==1.2.2 +numpy==1.21.6 +outcome==1.1.0 +packaging==21.3 +pandas==1.3.5 +pep517==0.12.0 +platformdirs==2.5.2 +platinfo==0.15.0 +progressbar2==4.0.0 +py-firefox-driver-manager==0.0.4 +pycparser==2.21 +pyOpenSSL==22.0.0 +pyparsing==3.0.9 +PySocks==1.7.1 +python-dateutil==2.8.2 +python-utils==3.3.0 +pytz==2022.1 +pywin32-ctypes==0.2.0 +redo==2.0.3 +requests==2.27.1 +selenium==4.1.5 +six==1.16.0 +sniffio==1.2.0 +sortedcontainers==2.4.0 +soupsieve==2.3.2.post1 +style==1.1.0 +tomli==2.0.1 +treeherder-client==5.0.0 +trio==0.20.0 +trio-websocket==0.9.2 +typing_extensions==4.2.0 +update==0.0.1 +urllib3==1.26.9 +virtualenv==20.14.1 +wsproto==1.1.0 +zipp==3.8.0 diff --git a/twitter_scraper_without_API/setup.py b/twitter_scraper_without_API/setup.py new file mode 100644 index 0000000000000000000000000000000000000000..524f09f108556d66f958c904c3866d29fbf3aae8 --- /dev/null +++ b/twitter_scraper_without_API/setup.py @@ -0,0 +1,40 @@ +from setuptools import setup +import setuptools +import os +import sys + +def read_file(filename): + with open(os.path.join(os.path.dirname(__file__), filename)) as file: + return file.read() + +thelibFolder = os.path.dirname(os.path.realpath(__file__)) +requirementPath = thelibFolder + '/requirements.txt' +install_requires = [] # Here we'll get: ["gunicorn", "docutils>=0.3", "lxml==0.5a7"] +if os.path.isfile(requirementPath): + with open(requirementPath,encoding='utf-8') as f: + install_requires = f.read().splitlines() + +print(install_requires) +setup( + name='twitter_scraper_without_api', + version='0.0.6', + license='', + author='Hamed', + author_email='hamed.minaei@gmail.com', + description='twitter_scraper without API', + long_description=read_file('README.MD'), + long_description_content_type="text/markdown", + url="https://github.com/HamedMinaeizaeim/twitter_scraper", + project_urls={ + "Bug Tracker": "https://github.com/HamedMinaeizaeim/twitter_scraper/issues", + }, + classifiers=[ + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ], + install_requires=install_requires, + packages=['twitter_scraper_without_api'], + package_dir={'': 'src'}, + python_requires=">=3.6", +) diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__init__.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..fbc9543626198f408383103465da2bc219b709a8 --- /dev/null +++ b/twitter_scraper_without_API/src/twitter_scraper_without_api/__init__.py @@ -0,0 +1,6 @@ + +from .scraping_utilities import * +from .driver_utils import * +from .driver_initialisation import * +from .element_finder import * +from .twitter_scraper import * \ No newline at end of file diff --git 
a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/__init__.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..28e94eb4b7dac34f77f2e7ab859de853cc07efc2 Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/__init__.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_initialisation.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_initialisation.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..72d7fd3d4d239a536e8581494c1cc99acdcbc90c Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_initialisation.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_utils.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_utils.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6798b52ffe9d696fe834435328eef91d2145e2b5 Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/driver_utils.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/element_finder.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/element_finder.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..24873b2999f6087e99763c6721a50f96b7365bc6 Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/element_finder.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/scraping_utilities.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/scraping_utilities.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a3750694639dccafc5ca866b5510e91cd2d8feb9 Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/scraping_utilities.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/twitter_scraper.cpython-310.pyc b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/twitter_scraper.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3918a733b81e2e07fce98858d6becfb8e20d15f2 Binary files /dev/null and b/twitter_scraper_without_API/src/twitter_scraper_without_api/__pycache__/twitter_scraper.cpython-310.pyc differ diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_initialisation.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_initialisation.py new file mode 100644 index 0000000000000000000000000000000000000000..42972897bae3780907f461041e109e5838895a04 --- /dev/null +++ b/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_initialisation.py @@ -0,0 +1,71 @@ +from selenium import webdriver +# to add capabilities for chrome and firefox, import their Options with different aliases +from selenium.webdriver.chrome.options import Options as ChromeOptions +from selenium.webdriver.firefox.options import Options as FirefoxOptions +# import webdriver for downloading respective 
driver for the browser + +from py_firefox_driver_manager import GeckoFireFoxdriverManager + +class DriverInitilizer: + def __init__(self, proxy=None): + self.proxy = proxy + + def set_properties(self, browser_option): + + browser_option.add_argument( + '--headless') # runs browser in headless mode + browser_option.add_argument('--no-sandbox') + browser_option.add_argument("--disable-dev-shm-usage") + browser_option.add_argument('--ignore-certificate-errors') + browser_option.add_argument('--disable-gpu') + browser_option.add_argument('--log-level=3') + browser_option.add_argument('--disable-notifications') + browser_option.add_argument('--disable-popup-blocking') + return browser_option + + def setup_profile(self): + """ + This code is setup the profile + :param fileLocation: location of file to be save + :return profile: + """ + profile = webdriver.FirefoxProfile() + #profile.set_preference("browser.download.dir", self.file_location); + profile.set_preference("browser.download.folderList", 2); + profile.set_preference("browser.helperApps.neverAsk.saveToDisk", + "application/csv,application/excel,application/vnd.msexcel,application/vnd.ms-excel,text/anytext,text/comma-separated-values,text/csv,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/octet-stream"); + profile.set_preference("browser.download.manager.showWhenStarting", False); + profile.set_preference("browser.helperApps.neverAsk.openFile", + "application/csv,application/excel,application/vnd.msexcel,application/vnd.ms-excel,text/anytext,text/comma-separated-values,text/csv,application/vnd.ms-excel,application/vnd.openxmlformats-officedocument.spreadsheetml.sheet,application/octet-stream"); + profile.set_preference("browser.helperApps.alwaysAsk.force", False); + profile.set_preference("browser.download.manager.useWindow", False); + profile.set_preference("browser.download.manager.focusWhenStarting", False); + profile.set_preference("browser.download.manager.alertOnEXEOpen", False); + profile.set_preference("browser.download.manager.showAlertOnComplete", False); + profile.set_preference("browser.download.manager.closeWhenDone", True); + profile.set_preference("pdfjs.disabled", True) + profile.set_preference('permissions.default.stylesheet', 2) + profile.set_preference('permissions.default.image', 2) + profile.set_preference('dom.ipc.plugins.enabled.libflashplayer.so', 'false') + profile.set_preference("http.response.timeout", 500) + profile.set_preference("dom.max_script_run_time", 500) + return profile + + def set_driver_for_browser(self): + """expects browser name and returns a driver instance""" + browser_option = FirefoxOptions() + if self.proxy is not None: + options = { + 'https': 'https://{}'.format(self.proxy.replace(" ", "")), + 'http': 'http://{}'.format(self.proxy.replace(" ", "")), + 'no_proxy': 'localhost, 127.0.0.1' + } + + return webdriver.Firefox(executable_path=GeckoFireFoxdriverManager().install_geckodriver(), + options=self.set_properties(browser_option), seleniumwire_options=options) + + + return webdriver.Firefox(executable_path=GeckoFireFoxdriverManager().install_geckodriver(), + options=self.set_properties(browser_option)) + + diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_utils.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_utils.py new file mode 100644 index 0000000000000000000000000000000000000000..539405ebf9deff4396b313ce36cbe5ffb539f434 --- /dev/null +++ 
b/twitter_scraper_without_API/src/twitter_scraper_without_api/driver_utils.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 +try: + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.common.by import By + from selenium.common.exceptions import WebDriverException + import time + from selenium.webdriver.common.by import By + from selenium.webdriver.support.ui import WebDriverWait + from selenium.webdriver.support import expected_conditions as EC + from selenium.webdriver.common.keys import Keys + from inspect import currentframe + from random import randint +except Exception as ex: + frameinfo = currentframe() + print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex)) + +frameinfo = currentframe() + +class Utilities: + """this class contains all the method related to driver behaviour, + like scrolling, waiting for element to appear, it contains all static + method, which accepts driver instance as a argument""" + + @staticmethod + def __wait_until_tweets_appear(driver): + try: + WebDriverWait(driver, 10).until(EC.presence_of_element_located( + (By.CSS_SELECTOR, '[data-testid="tweet"]'))) + except WebDriverException: + print("Tweets did not appear!") + + @staticmethod + def __scroll_down(driver): + try: + body = driver.find_element_by_css_selector('body') + for _ in range(3): + body.send_keys(Keys.PAGE_DOWN) + except Exception as ex: + print("Error on line no. {} : {}".format(frameinfo.f_lineno,ex)) + + @staticmethod + def __wait_until_completion(driver): + """waits until the page have completed loading""" + try: + state = "" + while state != "complete": + time.sleep(randint(3, 5)) + state = driver.execute_script("return document.readyState") + except Exception as ex: + print(ex) diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/element_finder.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/element_finder.py new file mode 100644 index 0000000000000000000000000000000000000000..b9d88157a43d14a66b1d3de447a7b715a8ac50ee --- /dev/null +++ b/twitter_scraper_without_API/src/twitter_scraper_without_api/element_finder.py @@ -0,0 +1,165 @@ +#!/usr/bin/env python3 + +from selenium.common.exceptions import NoSuchElementException +from .scraping_utilities import Scraping_utilities +from inspect import currentframe +from dateutil.parser import parse + + + +class Finder: + """ + this class should contain all the static method to find that accept + webdriver instance and perform operation to find elements and return the + found element. 
+ method should follow convention like so: + + @staticmethod + def __method_name(parameters): + """ + + @staticmethod + def __fetch_all_tweets(driver): + try: + return driver.find_elements_by_css_selector('[data-testid="tweet"]') + except Exception as ex: + print("Error at method fetch_all_tweets on line no : {}".format(ex)) + + @staticmethod + def __find_replies(tweet): + try: + replies_element = tweet.find_element_by_css_selector('[data-testid="reply"]') + replies = replies_element.get_attribute("aria-label") + return Scraping_utilities._Scraping_utilities__extract_digits(replies) + except Exception as ex: + print("Error at method find_replies on line no : {}".format( ex)) + return "" + + @staticmethod + def __find_shares(tweet): + try: + shares_element = tweet.find_element_by_css_selector('[data-testid="retweet"]') + shares = shares_element.get_attribute("aria-label") + return Scraping_utilities._Scraping_utilities__extract_digits(shares) + except Exception as ex: + print("Error at method find_shares on line no: {}".format( ex)) + return "" + + @staticmethod + def __find_status(tweet): + try: + anchor = tweet.find_element_by_css_selector("a.r-bcqeeo.r-3s2u2q.r-qvutc0") + return (anchor.get_attribute("href").split("/"), anchor.get_attribute("href")) + except Exception as ex: + print("Error at method find_status on line no: {}".format( ex)) + return [] + + @staticmethod + def __find_all_anchor_tags(tweet): + try: + return tweet.find_elements_by_tag_name('a') + except Exception as ex: + print("Error at method find_all_anchor_tags on line no : {}".format( + ex)) + + @staticmethod + def __find_timestamp(tweet): + try: + timestamp = tweet.find_element_by_tag_name( + "time").get_attribute("datetime") + #posted_time = parse(timestamp).isoformat() + return timestamp + except Exception as ex: + print("Error at method find_timestamp on line no.: {}".format( + ex)) + + + @staticmethod + def __find_content(tweet): + try: + #content_element = tweet.find_element_by_css_selector('.//*[@dir="auto"]')[4] + content_element = tweet.find_element_by_css_selector('div[lang]') + return content_element.text + except NoSuchElementException: + return "" + except Exception as ex: + print("Error at method find_content on line no: {}".format( + ex)) + + @staticmethod + def __find_like(tweet): + try: + like_element = tweet.find_element_by_css_selector('[data-testid="like"]') + likes = like_element.get_attribute("aria-label") + return Scraping_utilities._Scraping_utilities__extract_digits(likes) + except Exception as ex: + print("Error at method find_like on line no: {}".format( + ex)) + @staticmethod + def __find_images(tweet): + try: + image_element = tweet.find_elements_by_css_selector( + 'div[data-testid="tweetPhoto"]') + images = [] + for image_div in image_element: + href = image_div.find_element_by_tag_name("img").get_attribute("src") + images.append(href) + return images + except Exception as ex: + print("Error at method __find_images on line no : {}".format( + ex)) + + @staticmethod + def __find_videos(tweet): + try: + image_element = tweet.find_elements_by_css_selector( + 'div[data-testid="videoPlayer"]') + videos = [] + for video_div in image_element: + href = video_div.find_element_by_tag_name("video").get_attribute("src") + videos.append(href) + return videos + except Exception as ex: + print("Error at method find_videos on line no: {}".format( + ex)) + + @staticmethod + def __is_retweet(tweet): + try: + tweet.find_element_by_css_selector('div.r-92ng3h.r-qvutc0') + return True + except 
NoSuchElementException: + return False + except Exception as ex: + print("Error at method is_retweet on line no: {}".format( + ex)) + return False + + @staticmethod + def __find_name_from_post(tweet,is_retweet=False): + try: + name = "NA" + anchors = Finder.__find_all_anchor_tags(tweet) + if len(anchors) > 2: + if is_retweet: + name = anchors[2].text.strip() + else: + name = anchors[1].text.split("\n")[0] + return name + except Exception as ex: + print("Error at method __find_name_from_post on line no: {}".format( + ex)) + + @staticmethod + def __find_external_link(tweet): + try: + card = tweet.find_element_by_css_selector('[data-testid="card.wrapper"]') + href = card.find_element_by_tag_name('a') + return href.get_attribute("href") + + except NoSuchElementException: + return "" + except Exception as ex: + print("Error at method __find_external_link on line no: {}".format( + ex)) + diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/scraping_utilities.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/scraping_utilities.py new file mode 100644 index 0000000000000000000000000000000000000000..6f1aca6811d575b040b17a7b6f3ec817787a7afd --- /dev/null +++ b/twitter_scraper_without_API/src/twitter_scraper_without_api/scraping_utilities.py @@ -0,0 +1,22 @@ +#!/usr/bin/env python3 + +from inspect import currentframe +import re + + + +class Scraping_utilities: + + @staticmethod + def __parse_name(string): + try: + return string.split("(")[0].strip() + except Exception as ex: + print("Error on line no: {}".format( ex)) + + @staticmethod + def __extract_digits(string): + try: + return int(re.search(r'\d+', string).group(0)) + except Exception as ex: + print("Error on line no.: {}".format( ex)) diff --git a/twitter_scraper_without_API/src/twitter_scraper_without_api/twitter_scraper.py b/twitter_scraper_without_API/src/twitter_scraper_without_api/twitter_scraper.py new file mode 100644 index 0000000000000000000000000000000000000000..b3cc18f397cee87cbd722c664087560178f8c15d --- /dev/null +++ b/twitter_scraper_without_API/src/twitter_scraper_without_api/twitter_scraper.py @@ -0,0 +1,194 @@ +from selenium import webdriver +from selenium.webdriver.support.ui import Select +import time +import pytz +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC +from selenium.common.exceptions import TimeoutException +from selenium.webdriver.firefox.options import Options +from selenium.webdriver.common.action_chains import ActionChains +import time +from datetime import datetime +import datetime +from selenium.webdriver.firefox.firefox_binary import FirefoxBinary +from datetime import datetime, timedelta +from selenium.webdriver.common.keys import Keys +from bs4 import BeautifulSoup +import pandas as pd +import numpy as np +from urllib.parse import quote +from .element_finder import Finder +from .driver_initialisation import DriverInitilizer +from .driver_utils import Utilities +import re, json, os, csv +import dateutil + +class TwitterScraper: + + def __init__(self, keyword): + self.keyword = keyword + self.since = self.set_since() + self.until = self.set_untill() + self.url = "https://twitter.com/search?q={}%20until%3A{}%20since%3A{}&src=typed_query&f=live".format( + quote(keyword), self.until, self.since) + self.driver = self.setup_driver() + self.retry = 10 + self.data = {} + self._last_n_mins = 1 + + def __repr__(self): + return "TwitterScraper('bitcoin', 60 )" + + 
def __str__(self): + return "" + + @property + def last_n_mins(self): + return self._last_n_mins + + @last_n_mins.setter + def last_n_mins(self, value): + if str(value).isnumeric(): + self._last_n_mins = value + else: + print("you must enter numeric value in mints - 1 mins defult value was replaced") + self._last_n_mins = 1 + + @staticmethod + def str_to_datetime(str_datetime): + datetime_old_zone = dateutil.parser.isoparse(str_datetime) + #datetime_old_zone = datetime.strptime(str_datetime, "%Y-%m-%dT%H:%M:%S.%z") + nz_datetime_time = datetime_old_zone.replace(tzinfo=pytz.utc).astimezone(pytz.timezone("Pacific/Auckland")) + return nz_datetime_time + + @staticmethod + def convert_json_to_dataframe(json_data): + df=[] + for key in json_data: + df=[pd.json_normalize(json_data[key]) for key in json_data] + return pd.concat(df) + + def set_since(self): + yesterday = datetime.now()-timedelta(days=1) + return yesterday.strftime('%Y-%m-%d') + + def set_untill(self): + tomorrow = datetime.now()+timedelta(days=1) + return tomorrow.strftime('%Y-%m-%d') + + def __check_tweets_presence(self, tweet_list): + if len(tweet_list) <= 0: + self.retry -= 1 + + def __check_retry(self): + return self.retry <= 0 + + def setup_driver(self): + # driver = webdriver.Firefox( + # executable_path=r"C:\Users\Hamed\PycharmProjects\Twitter_Scraper\geckodriver.exe", + # firefox_profile=self.setup_profile()) + firefox = DriverInitilizer() + driver = firefox.set_driver_for_browser() + driver.get(self.url) + driver.set_page_load_timeout(6000) + return driver + + + def obtain_info_from_tweet(self, tweet): + name = Finder._Finder__find_name_from_post(tweet) + status, tweet_url = Finder._Finder__find_status(tweet) + replies = Finder._Finder__find_replies(tweet) + retweets = Finder._Finder__find_shares(tweet) + username = tweet_url.split("/")[3] + status = status[-1] + is_retweet = Finder._Finder__is_retweet(tweet) + posted_time = Finder._Finder__find_timestamp(tweet) + posted_time = TwitterScraper.str_to_datetime(posted_time) + content = Finder._Finder__find_content(tweet) + likes = Finder._Finder__find_like(tweet) + images = Finder._Finder__find_images(tweet) + videos = Finder._Finder__find_videos(tweet) + hashtags = re.findall(r"#(\w+)", content) + mentions = re.findall(r"@(\w+)", content) + profile_picture = "https://twitter.com/{}/photo".format(username) + link = Finder._Finder__find_external_link(tweet) + return link, profile_picture, mentions, hashtags,\ + videos, images, likes, content, posted_time,\ + is_retweet, status, username, retweets, replies,\ + tweet_url, name + + + def update_tweet_data(self, link, profile_picture, mentions, hashtags, + videos, images, likes, content, posted_time, + is_retweet, status, username, retweets, replies, + tweet_url, name): + self.data[status] = { + "tweet_id": status, + "username": username, + "name": name, + "profile_picture": profile_picture, + "replies": replies, + "retweets": retweets, + "likes": likes, + "is_retweet": is_retweet, + "posted_time": posted_time, + "content": content, + "hashtags": hashtags, + "mentions": mentions, + "images": images, + "videos": videos, + "tweet_url": tweet_url, + "link": link + } + + def fetch_data(self): + #try: + all_ready_fetched_posts = [] + time.sleep(4) + present_tweets = Finder._Finder__fetch_all_tweets(self.driver) + self.__check_tweets_presence(present_tweets) + all_ready_fetched_posts.extend(present_tweets) + latest_time_now = datetime.now() + latest_time_now = 
latest_time_now.replace(tzinfo=None).astimezone(pytz.timezone("Pacific/Auckland")) + ref_date_time = latest_time_now-timedelta(minutes=self._last_n_mins) + + while (latest_time_now-ref_date_time).total_seconds()>0: + + for tweet in present_tweets: + + link, profile_picture, mentions, hashtags, \ + videos, images, likes, content, posted_time, \ + is_retweet, status, username, retweets, replies, \ + tweet_url, name = self.obtain_info_from_tweet(tweet) + self.update_tweet_data(link, profile_picture, mentions, hashtags, + videos, images, likes, content, posted_time, + is_retweet, status, username, retweets, replies, + tweet_url, name) + + if (posted_time-latest_time_now).total_seconds()<0: + latest_time_now = posted_time + + Utilities._Utilities__scroll_down(self.driver) + Utilities._Utilities__wait_until_completion(self.driver) + Utilities._Utilities__wait_until_tweets_appear(self.driver) + present_tweets = Finder._Finder__fetch_all_tweets(self.driver) + present_tweets = [post for post in present_tweets if post not in all_ready_fetched_posts] + self.__check_tweets_presence(present_tweets) + all_ready_fetched_posts.extend(present_tweets) + if self.__check_retry() is True: + break + self.driver.quit() + + def store_data(self, format='Json'): + if format.lower()=='json': + return self.data + elif format.lower()=='dataframe': + return TwitterScraper.convert_json_to_dataframe(self.data) + elif format.lower()=='csv': + df = TwitterScraper.convert_json_to_dataframe(self.data) + return df.to_csv() + else: + print("it dose not sopport that format") + + diff --git a/twitter_scraper_without_API/test.py b/twitter_scraper_without_API/test.py new file mode 100644 index 0000000000000000000000000000000000000000..0a8a1d3b373d9a4edf7b8c5e7d4f7289b30fd794 --- /dev/null +++ b/twitter_scraper_without_API/test.py @@ -0,0 +1,4 @@ +from src.twitter_scraper_without_api import TwitterScraper +twitter = TwitterScraper('bitcoin') +twitter.last_n_mins = 3 +twitter.fetch_data() \ No newline at end of file diff --git a/twitter_sentiment.py b/twitter_sentiment.py new file mode 100644 index 0000000000000000000000000000000000000000..e751cb4e3e77d7c5dffa6712a2df61d50122daf6 --- /dev/null +++ b/twitter_sentiment.py @@ -0,0 +1,135 @@ +import streamlit as st +import random +import toml +import streamlit as st +import pandas as pd +from datetime import date +import re +import csv +from getpass import getpass +from time import sleep +from selenium.webdriver.common.keys import Keys +from selenium.common.exceptions import NoSuchElementException +from selenium.webdriver import Chrome, ChromeOptions +import time +import pandas as pd +from selenium.webdriver.support import expected_conditions as EC +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +import tweetnlp + + + +df = pd.DataFrame() + +def login_twitter(username_text, password_text, topic_text): + options = ChromeOptions() + options.use_chromium = True + driver = Chrome() + driver.get('https://twitter.com/i/flow/login') + time.sleep(5) + username = driver.find_element("xpath",'//input[@name="text"]') + username.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[3]').click() # in prevoius block + + try: + time.sleep(10) + y = driver.find_element("xpath",'//h1//span') + if(y.text =='Enter your phone number or username'): + username2 = driver.find_element("xpath",'//input[@name="text"]') + username2.send_keys(username_text) + driver.find_element("xpath",'(//*[@role="button"])[2]').click() + except: + 
st.write(" I am in exception and didnt get 'Enter your phone number or username'") + x = 10 + + password = driver.find_element("xpath",'//input[@name="password"]') + password.send_keys(password_text) + driver.find_element("xpath",'(//*[@role = "button"])[4]').click() + + + try: + driver.maximize_window() + time.sleep(10) + driver.maximize_window() + search = driver.find_element("xpath",'//input[@placeholder="Search"]') + search.send_keys(Keys.CONTROL + "a") + search.send_keys(Keys.DELETE) + search.send_keys(topic_text) + search.send_keys(Keys.RETURN) + + df = pd.DataFrame() + df['Name']='' + df['Tweet'] = '' + df['Sentiment'] = '' + tweet_count = 6 + i = 1 + element = driver.find_element("xpath","//body") + time.sleep(20) + while True and i <= tweet_count: + + try: + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + #print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + except: + time.sleep(10) + tweet_name = driver.find_element("xpath",f'(//*[@data-testid="User-Name"])[{i}]') + df.loc[i, 'Name'] = tweet_name.text + + tweet_div = driver.find_element("xpath",f'(//*[@data-testid="tweetText"])[{i}]') + if(tweet_div): + #print(driver.find_element("xpath",'(//*[@data-testid="tweetText"])[1]').getText()) + df.loc[i, 'Tweet'] = tweet_div.text + i += 1 + element.send_keys(Keys.PAGE_DOWN) + time.sleep(10) + + + model = tweetnlp.load_model('sentiment', multilingual=True) # Or `model = tweetnlp.Sentiment()` + + for i in range(1,len(df)+1): + print("text is...", df.loc[i,'Tweet']) + + + #y = sentiment_pipeline(df.loc[i, 'Tweet']) + #print('label is..', y[0]['label']) + print('model sentiment is..', model.sentiment(df.loc[i, 'Tweet'])) + df.loc[i, 'Sentiment'] = str(model.sentiment(df.loc[i, 'Tweet'])) + st.dataframe(df) + + + except Exception as e: + st.write(e) + # driver.maximize_window() + # time.sleep(10) + # driver.maximize_window() + # search = driver.find_element("xpath",'//input[@placeholder="Search"]') + # search.send_keys(Keys.CONTROL + "a") + # search.send_keys(Keys.DELETE) + # search.send_keys(topic_text) + + + + +with st.sidebar: + username = st.text_input("Username") + password_text = st.text_input("Password", type = "password") + topic = st.text_input("topic") + connect = st.button("Login Twitter",\ + on_click = login_twitter, + args = [username, password_text, topic] + ) + if('is_ready' not in st.session_state): + st.session_state['is_ready'] = False + + if(st.session_state['is_ready'] == True): + st.write('Connected!') + \ No newline at end of file