RAGTheDocs-mila-qc

Sleeping

App Files Files Community

RAGTheDocs-mila-qc / rtd_scraper /tutorial /settings.py

jerpint

First commit

ac493ec about 1 year ago

raw

history blame contribute delete

3.61 kB

	# Scrapy settings for tutorial project
	#
	# For simplicity, this file contains only settings considered important or
	# commonly used. You can find more settings consulting the documentation:
	#
	# https://docs.scrapy.org/en/latest/topics/settings.html
	# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

	from scrapy.utils.log import configure_logging

	# Disable default Scrapy log settings.
	# This call is a top-level statement, so it runs as a side effect every time
	# this settings module is imported.
	# NOTE(review): install_root_handler=False presumably keeps Scrapy from
	# attaching its own handler to the root logger, leaving log output to the
	# host application -- confirm against the Scrapy logging documentation.
	configure_logging(install_root_handler=False)
	# Name of this Scrapy project.
	BOT_NAME = "tutorial"

	# Both spider settings point at the same package, addressed by its full
	# dotted path under ``rtd_scraper``; the list is derived from that one path.
	NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders"
	SPIDER_MODULES = [NEWSPIDER_MODULE]

	# Alternative module paths without the ``rtd_scraper`` prefix (left disabled):
	# SPIDER_MODULES = ["tutorial.spiders"]
	# NEWSPIDER_MODULE = "tutorial.spiders"

	# Logging is switched off here; the level is still pinned to "INFO".
	LOG_LEVEL = "INFO"
	LOG_ENABLED = False

	# Crawl responsibly by identifying yourself (and your website) on the user-agent
	# USER_AGENT = "tutorial (+http://www.yourdomain.com)"

	# Obey robots.txt rules
	ROBOTSTXT_OBEY = True

	# Configure maximum concurrent requests performed by Scrapy (default: 16)
	# CONCURRENT_REQUESTS = 32

	# Configure a delay for requests for the same website (default: 0)
	# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
	# See also autothrottle settings and docs
	# DOWNLOAD_DELAY = 3
	# The download delay setting will honor only one of:
	# CONCURRENT_REQUESTS_PER_DOMAIN = 16
	# CONCURRENT_REQUESTS_PER_IP = 16

	# Disable cookies (enabled by default)
	# COOKIES_ENABLED = False

	# Disable Telnet Console (enabled by default)
	# TELNETCONSOLE_ENABLED = False

	# Override the default request headers:
	# DEFAULT_REQUEST_HEADERS = {
	#    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
	#    "Accept-Language": "en",
	# }

	# Enable or disable spider middlewares
	# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
	# SPIDER_MIDDLEWARES = {
	# "tutorial.middlewares.TutorialSpiderMiddleware": 543,
	# }

	# Enable or disable downloader middlewares
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
	# DOWNLOADER_MIDDLEWARES = {
	# "tutorial.middlewares.TutorialDownloaderMiddleware": 543,
	# }

	# Enable or disable extensions
	# See https://docs.scrapy.org/en/latest/topics/extensions.html
	# EXTENSIONS = {
	# "scrapy.extensions.telnet.TelnetConsole": None,
	# }

	# Configure item pipelines
	# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
	# ITEM_PIPELINES = {
	# "tutorial.pipelines.TutorialPipeline": 300,
	# }

	# Enable and configure the AutoThrottle extension (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/autothrottle.html
	# AUTOTHROTTLE_ENABLED = True
	# The initial download delay
	# AUTOTHROTTLE_START_DELAY = 5
	# The maximum download delay to be set in case of high latencies
	# AUTOTHROTTLE_MAX_DELAY = 60
	# The average number of requests Scrapy should be sending in parallel to
	# each remote server
	# AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
	# Enable showing throttling stats for every response received:
	# AUTOTHROTTLE_DEBUG = False

	# Enable and configure HTTP caching (disabled by default)
	# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
	# HTTPCACHE_ENABLED = True
	# HTTPCACHE_EXPIRATION_SECS = 0
	# HTTPCACHE_DIR = "httpcache"
	# HTTPCACHE_IGNORE_HTTP_CODES = []
	# HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"

	# Pin the settings whose default value is deprecated to future-proof values,
	# so the project does not depend on those defaults.
	FEED_EXPORT_ENCODING = "utf-8"
	TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
	REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"