jerpint committed on
Commit
ac493ec
0 Parent(s):

First commit

.gitignore ADDED
@@ -0,0 +1,2 @@
+ outputs/
+ __pycache__/
app.py ADDED
@@ -0,0 +1,161 @@
+ import logging
+ import os
+ from typing import Optional, Tuple
+
+ import gradio as gr
+ import pandas as pd
+ from buster.completers import Completion
+ from buster.utils import extract_zip
+
+ import cfg
+ from cfg import setup_buster
+
+ # Create a handler to control where log messages go (e.g., console, file)
+ handler = (
+     logging.StreamHandler()
+ )  # Console output; change to a file handler if needed
+
+ # Set the handler's level to INFO
+ handler.setLevel(logging.INFO)
+ logging.basicConfig(level=logging.INFO)
+
+ # Check if an OpenAI API key is set as an env. variable
+ if os.getenv("OPENAI_API_KEY") is None:
+     print(
+         "Warning: No OpenAI API key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
+     )
+
+ # Type hint for chatbot history
+ ChatHistory = list[list[Optional[str], Optional[str]]]
+
+ buster = setup_buster(cfg.buster_cfg)
+
+
+ def add_user_question(
+     user_question: str, chat_history: Optional[ChatHistory] = None
+ ) -> ChatHistory:
+     """Adds a user's question to the chat history.
+
+     If no history is provided, a new one is created with the user's question as its first entry.
+     """
+     if chat_history is None:
+         chat_history = []
+     chat_history.append([user_question, None])
+     return chat_history
+
+
+ def format_sources(matched_documents: pd.DataFrame) -> str:
+     if len(matched_documents) == 0:
+         return ""
+
+     matched_documents.similarity_to_answer = (
+         matched_documents.similarity_to_answer * 100
+     )
+
+     # drop duplicate pages (by title), keep highest ranking ones
+     matched_documents = matched_documents.sort_values(
+         "similarity_to_answer", ascending=False
+     ).drop_duplicates("title", keep="first")
+
+     documents_answer_template: str = "📝 Here are the sources I used to answer your question:\n\n{documents}\n\n{footnote}"
+     document_template: str = "[🔗 {document.title}]({document.url}), relevance: {document.similarity_to_answer:2.1f} %"
+
+     documents = "\n".join(
+         [
+             document_template.format(document=document)
+             for _, document in matched_documents.iterrows()
+         ]
+     )
+     footnote: str = "I'm a bot 🤖 and not always perfect."
+
+     return documents_answer_template.format(documents=documents, footnote=footnote)
+
+
+ def add_sources(history, completion):
+     if completion.answer_relevant:
+         formatted_sources = format_sources(completion.matched_documents)
+         history.append([None, formatted_sources])
+
+     return history
+
+
+ def chat(chat_history: ChatHistory) -> Tuple[ChatHistory, Completion]:
+     """Answer a user's question using retrieval augmented generation."""
+
+     # We assume that the question is the user's last interaction
+     user_input = chat_history[-1][0]
+
+     # Do retrieval + augmented generation with buster
+     completion = buster.process_input(user_input)
+
+     # Stream tokens one at a time to the user
+     chat_history[-1][1] = ""
+     for token in completion.answer_generator:
+         chat_history[-1][1] += token
+
+         yield chat_history, completion
+
+
+ demo = gr.Blocks()
+ with demo:
+     with gr.Row():
+         gr.Markdown("<h3><center>RAGTheDocs</center></h3>")
+
+     chatbot = gr.Chatbot()
+
+     with gr.Row():
+         question = gr.Textbox(
+             label="What's your question?",
+             placeholder="Type your question here...",
+             lines=1,
+         )
+         submit = gr.Button(value="Send", variant="secondary")
+
+     examples = gr.Examples(
+         examples=[
+             "How can I install the library?",
+             "How do I deal with noisy data?",
+             "How do I deal with noisy data in 2 words?",
+         ],
+         inputs=question,
+     )
+
+     gr.Markdown(
+         "This application uses GPT to search the docs for relevant info and answer questions."
+     )
+
+     response = gr.State()
+
+     # fmt: off
+     submit.click(
+         add_user_question,
+         inputs=[question],
+         outputs=[chatbot]
+     ).then(
+         chat,
+         inputs=[chatbot],
+         outputs=[chatbot, response]
+     ).then(
+         add_sources,
+         inputs=[chatbot, response],
+         outputs=[chatbot]
+     )
+
+     question.submit(
+         add_user_question,
+         inputs=[question],
+         outputs=[chatbot],
+     ).then(
+         chat,
+         inputs=[chatbot],
+         outputs=[chatbot, response]
+     ).then(
+         add_sources,
+         inputs=[chatbot, response],
+         outputs=[chatbot]
+     )
+     # fmt: on
+
+
+ demo.queue(concurrency_count=16)
+ demo.launch(share=False)
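
To see how the callbacks above compose, here is a minimal sketch of the same chain run outside Gradio (an illustration only; it assumes OPENAI_API_KEY is set and that outputs/deeplake_store has already been built):

history = add_user_question("How can I install the library?")
completion = None
for history, completion in chat(history):
    pass  # chat() is a generator; each yield carries the partially streamed answer
history = add_sources(history, completion)
print(history[-1][1])  # last message: formatted sources if the answer was judged relevant, otherwise the answer itself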
cfg.py ADDED
@@ -0,0 +1,130 @@
+ import logging
+ import sys
+
+ from buster.busterbot import Buster, BusterConfig
+ from buster.completers import ChatGPTCompleter, DocumentAnswerer
+ from buster.formatters.documents import DocumentsFormatterJSON
+ from buster.formatters.prompts import PromptFormatter
+ from buster.retriever import DeepLakeRetriever, Retriever
+ from buster.tokenizers import GPTTokenizer
+ from buster.validators import QuestionAnswerValidator, Validator
+
+ from rtd_scraper.scrape_rtd import scrape_rtd
+
+ # Set the root logger's level to INFO
+ logging.basicConfig(level=logging.INFO)
+
+
+ homepage_url = "https://buster.readthedocs.io/"
+
+
+ scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
+
+ # Disable logging for third-party libraries at DEBUG level
+ for name in logging.root.manager.loggerDict:
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+
+
+ buster_cfg = BusterConfig(
+     validator_cfg={
+         "unknown_response_templates": [
+             "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+         ],
+         "unknown_threshold": 0.85,
+         "embedding_model": "text-embedding-ada-002",
+         "use_reranking": True,
+         "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+         "check_question_prompt": """You are a chatbot answering questions on artificial intelligence.
+
+ Your job is to determine whether or not a question is valid, and should be answered.
+ More general questions are not considered valid, even if you might know the response.
+ A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+ For example:
+
+ Q: What is backpropagation?
+ true
+
+ Q: What is the meaning of life?
+ false
+
+ A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+         "completion_kwargs": {
+             "model": "gpt-3.5-turbo",
+             "stream": False,
+             "temperature": 0,
+         },
+     },
+     retriever_cfg={
+         "path": "outputs/deeplake_store",
+         "top_k": 3,
+         "thresh": 0.7,
+         "max_tokens": 2000,
+         "embedding_model": "text-embedding-ada-002",
+     },
+     documents_answerer_cfg={
+         "no_documents_message": "No documents are available for this question.",
+     },
+     completion_cfg={
+         "completion_kwargs": {
+             "model": "gpt-3.5-turbo",
+             "stream": True,
+             "temperature": 0,
+         },
+     },
+     tokenizer_cfg={
+         "model_name": "gpt-3.5-turbo",
+     },
+     documents_formatter_cfg={
+         "max_tokens": 3500,
+         "columns": ["content", "title", "source"],
+     },
+     prompt_formatter_cfg={
+         "max_tokens": 3500,
+         "text_before_docs": (
+             "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+             "You can only respond to a question if the content necessary to answer the question is contained in the following provided documentation. "
+             "If the answer is in the documentation, summarize it in a helpful way to the user. "
+             "If it isn't, simply reply that you cannot answer the question. "
+             "Do not refer to the documentation directly, but use the instructions provided within it to answer questions. "
+             "Here is the documentation:\n"
+         ),
+         "text_after_docs": (
+             "REMEMBER:\n"
+             "You are a chatbot assistant answering technical questions about artificial intelligence (AI). "
+             "Here are the rules you must follow:\n"
+             "1) You must only respond with information contained in the documentation above. Say you do not know if the information is not provided.\n"
+             "2) Make sure to format your answers in Markdown format, including code blocks and snippets.\n"
+             "3) Do not reference any links, URLs or hyperlinks in your answers.\n"
+             "4) If you do not know the answer to a question, or if it is completely irrelevant to the library usage, simply reply with:\n"
+             "'I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?'\n"
+             "5) Do not refer to the documentation directly, but use the instructions provided within it to answer questions.\n"
+             "For example:\n"
+             "What is the meaning of life for a QA bot?\n"
+             "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?\n"
+             "Now answer the following question:\n"
+         ),
+     },
+ )
+
+
+ def setup_buster(buster_cfg: BusterConfig):
+     """Initialize Buster from a BusterConfig instance."""
+     retriever: Retriever = DeepLakeRetriever(**buster_cfg.retriever_cfg)
+     tokenizer = GPTTokenizer(**buster_cfg.tokenizer_cfg)
+     document_answerer: DocumentAnswerer = DocumentAnswerer(
+         completer=ChatGPTCompleter(**buster_cfg.completion_cfg),
+         documents_formatter=DocumentsFormatterJSON(
+             tokenizer=tokenizer, **buster_cfg.documents_formatter_cfg
+         ),
+         prompt_formatter=PromptFormatter(
+             tokenizer=tokenizer, **buster_cfg.prompt_formatter_cfg
+         ),
+         **buster_cfg.documents_answerer_cfg,
+     )
+     validator: Validator = QuestionAnswerValidator(**buster_cfg.validator_cfg)
+     buster: Buster = Buster(
+         retriever=retriever, document_answerer=document_answerer, validator=validator
+     )
+     return buster
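
For reference, a minimal sketch of using this configuration without the Gradio app (assumes the DeepLake store exists at outputs/deeplake_store and an OpenAI key is available; note that importing cfg also triggers the scrape_rtd call at module level):

from cfg import buster_cfg, setup_buster

buster = setup_buster(buster_cfg)
completion = buster.process_input("How can I install the library?")
print("".join(completion.answer_generator))  # completion_cfg streams tokens, so join them here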
requirements.txt ADDED
@@ -0,0 +1,2 @@
+ buster-doctalk
+ scrapy
rtd_scraper/__init__.py ADDED
File without changes
rtd_scraper/scrape_rtd.py ADDED
@@ -0,0 +1,71 @@
+ import logging
+ import os
+
+ from buster.docparser import get_all_documents
+ from buster.documents_manager import DeepLakeDocumentsManager
+ from buster.parser import SphinxParser
+ from scrapy.crawler import CrawlerProcess
+ from scrapy.exceptions import CloseSpider
+ from scrapy.utils.project import get_project_settings
+
+ from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider
+
+ # Scrapy seems to set all loggers to DEBUG, so raise them back to INFO here...
+ for name in logging.root.manager.loggerDict:
+     logger = logging.getLogger(name)
+     logger.setLevel(logging.INFO)
+
+
+ def run_spider(homepage_url, save_directory):
+     # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, i.e. from cfg.py
+     # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
+
+     process = CrawlerProcess(settings=get_project_settings())
+     process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)
+
+     # Start the crawling process
+     process.start()
+
+     # To stop the crawling process gracefully
+     process.stop()
+
+
+ def scrape_rtd(homepage_url, save_directory):
+     # Crawl the website using scrapy
+     run_spider(homepage_url, save_directory=save_directory)
+
+     # Convert the .html pages into chunks using Buster's SphinxParser
+     root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
+
+     # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
+     df = get_all_documents(
+         root_dir=root_dir,
+         base_url=homepage_url,
+         parser_cls=SphinxParser,
+         min_section_length=100,
+         max_section_length=1000,
+     )
+
+     # Add the source column
+     df["source"] = "readthedocs"
+
+     # # Initialize the DeepLake vector store
+     # dm = DeepLakeDocumentsManager(
+     #     vector_store_path=os.path.join(save_directory, "deeplake_store"),
+     #     overwrite=True,
+     #     required_columns=["url", "content", "source", "title"],
+     # )
+     #
+     # # Add all embeddings to the vector store
+     # dm.batch_add(
+     #     df=df,
+     #     batch_size=3000,
+     #     min_time_interval=60,
+     #     num_workers=32,
+     # )
+     #
+
+
+ if __name__ == "__main__":
+     homepage_url = "https://buster.readthedocs.io/"
+     scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
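
The embedding-ingestion step is left commented out above; enabling it would look roughly like the commented block, reproduced here as a sketch (it assumes an OpenAI key is available for computing embeddings and would run inside scrape_rtd after df is built):

dm = DeepLakeDocumentsManager(
    vector_store_path=os.path.join(save_directory, "deeplake_store"),
    overwrite=True,
    required_columns=["url", "content", "source", "title"],
)
dm.batch_add(
    df=df,
    batch_size=3000,
    min_time_interval=60,
    num_workers=32,
)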
rtd_scraper/scrapy.cfg ADDED
@@ -0,0 +1,11 @@
+ # Automatically created by: scrapy startproject
+ #
+ # For more information about the [deploy] section see:
+ # https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+ [settings]
+ default = tutorial.settings
+
+ [deploy]
+ #url = http://localhost:6800/
+ project = tutorial
rtd_scraper/tutorial/__init__.py ADDED
File without changes
rtd_scraper/tutorial/middlewares.py ADDED
@@ -0,0 +1,102 @@
+ # Define here the models for your spider middleware
+ #
+ # See documentation in:
+ # https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ # useful for handling different item types with a single interface
+ from itemadapter import ItemAdapter, is_item
+ from scrapy import signals
+
+
+ class TutorialSpiderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the spider middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_spider_input(self, response, spider):
+         # Called for each response that goes through the spider
+         # middleware and into the spider.
+
+         # Should return None or raise an exception.
+         return None
+
+     def process_spider_output(self, response, result, spider):
+         # Called with the results returned from the Spider, after
+         # it has processed the response.
+
+         # Must return an iterable of Request, or item objects.
+         for i in result:
+             yield i
+
+     def process_spider_exception(self, response, exception, spider):
+         # Called when a spider or process_spider_input() method
+         # (from other spider middleware) raises an exception.
+
+         # Should return either None or an iterable of Request or item objects.
+         pass
+
+     def process_start_requests(self, start_requests, spider):
+         # Called with the start requests of the spider, and works
+         # similarly to the process_spider_output() method, except
+         # that it doesn’t have a response associated.
+
+         # Must return only requests (not items).
+         for r in start_requests:
+             yield r
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
+
+
+ class TutorialDownloaderMiddleware:
+     # Not all methods need to be defined. If a method is not defined,
+     # scrapy acts as if the downloader middleware does not modify the
+     # passed objects.
+
+     @classmethod
+     def from_crawler(cls, crawler):
+         # This method is used by Scrapy to create your spiders.
+         s = cls()
+         crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
+         return s
+
+     def process_request(self, request, spider):
+         # Called for each request that goes through the downloader
+         # middleware.
+
+         # Must either:
+         # - return None: continue processing this request
+         # - or return a Response object
+         # - or return a Request object
+         # - or raise IgnoreRequest: process_exception() methods of
+         #   installed downloader middleware will be called
+         return None
+
+     def process_response(self, request, response, spider):
+         # Called with the response returned from the downloader.
+
+         # Must either:
+         # - return a Response object
+         # - return a Request object
+         # - or raise IgnoreRequest
+         return response
+
+     def process_exception(self, request, exception, spider):
+         # Called when a download handler or a process_request()
+         # (from other downloader middleware) raises an exception.
+
+         # Must either:
+         # - return None: continue processing this exception
+         # - return a Response object: stops process_exception() chain
+         # - return a Request object: stops process_exception() chain
+         pass
+
+     def spider_opened(self, spider):
+         spider.logger.info("Spider opened: %s" % spider.name)
rtd_scraper/tutorial/settings.py ADDED
@@ -0,0 +1,102 @@
+ # Scrapy settings for tutorial project
+ #
+ # For simplicity, this file contains only settings considered important or
+ # commonly used. You can find more settings consulting the documentation:
+ #
+ #     https://docs.scrapy.org/en/latest/topics/settings.html
+ #     https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ #     https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+
+ from scrapy.utils.log import configure_logging
+
+ # Disable default Scrapy log settings.
+ configure_logging(install_root_handler=False)
+ BOT_NAME = "tutorial"
+
+ SPIDER_MODULES = ["rtd_scraper.tutorial.spiders"]
+ NEWSPIDER_MODULE = "rtd_scraper.tutorial.spiders"
+
+ # SPIDER_MODULES = ["tutorial.spiders"]
+ # NEWSPIDER_MODULE = "tutorial.spiders"
+
+ LOG_ENABLED = False
+ LOG_LEVEL = "INFO"
+
+ # Crawl responsibly by identifying yourself (and your website) on the user-agent
+ # USER_AGENT = "tutorial (+http://www.yourdomain.com)"
+
+ # Obey robots.txt rules
+ ROBOTSTXT_OBEY = True
+
+ # Configure maximum concurrent requests performed by Scrapy (default: 16)
+ # CONCURRENT_REQUESTS = 32
+
+ # Configure a delay for requests for the same website (default: 0)
+ # See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay
+ # See also autothrottle settings and docs
+ # DOWNLOAD_DELAY = 3
+ # The download delay setting will honor only one of:
+ # CONCURRENT_REQUESTS_PER_DOMAIN = 16
+ # CONCURRENT_REQUESTS_PER_IP = 16
+
+ # Disable cookies (enabled by default)
+ # COOKIES_ENABLED = False
+
+ # Disable Telnet Console (enabled by default)
+ # TELNETCONSOLE_ENABLED = False
+
+ # Override the default request headers:
+ # DEFAULT_REQUEST_HEADERS = {
+ #     "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+ #     "Accept-Language": "en",
+ # }
+
+ # Enable or disable spider middlewares
+ # See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
+ # SPIDER_MIDDLEWARES = {
+ #     "tutorial.middlewares.TutorialSpiderMiddleware": 543,
+ # }
+
+ # Enable or disable downloader middlewares
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
+ # DOWNLOADER_MIDDLEWARES = {
+ #     "tutorial.middlewares.TutorialDownloaderMiddleware": 543,
+ # }
+
+ # Enable or disable extensions
+ # See https://docs.scrapy.org/en/latest/topics/extensions.html
+ # EXTENSIONS = {
+ #     "scrapy.extensions.telnet.TelnetConsole": None,
+ # }
+
+ # Configure item pipelines
+ # See https://docs.scrapy.org/en/latest/topics/item-pipeline.html
+ # ITEM_PIPELINES = {
+ #     "tutorial.pipelines.TutorialPipeline": 300,
+ # }
+
+ # Enable and configure the AutoThrottle extension (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/autothrottle.html
+ # AUTOTHROTTLE_ENABLED = True
+ # The initial download delay
+ # AUTOTHROTTLE_START_DELAY = 5
+ # The maximum download delay to be set in case of high latencies
+ # AUTOTHROTTLE_MAX_DELAY = 60
+ # The average number of requests Scrapy should be sending in parallel to
+ # each remote server
+ # AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
+ # Enable showing throttling stats for every response received:
+ # AUTOTHROTTLE_DEBUG = False
+
+ # Enable and configure HTTP caching (disabled by default)
+ # See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+ # HTTPCACHE_ENABLED = True
+ # HTTPCACHE_EXPIRATION_SECS = 0
+ # HTTPCACHE_DIR = "httpcache"
+ # HTTPCACHE_IGNORE_HTTP_CODES = []
+ # HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+
+ # Set settings whose default value is deprecated to a future-proof value
+ REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
+ TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
+ FEED_EXPORT_ENCODING = "utf-8"
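
If get_project_settings() cannot find this settings module from the working directory, it can be pointed at explicitly via an environment variable, a sketch based on the commented-out lines in scrape_rtd.py:

import os
os.environ.setdefault("SCRAPY_SETTINGS_MODULE", "rtd_scraper.tutorial.settings")

from scrapy.utils.project import get_project_settings
settings = get_project_settings()
assert settings["BOT_NAME"] == "tutorial"  # confirms the module above was picked up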
rtd_scraper/tutorial/spiders/__init__.py ADDED
@@ -0,0 +1,4 @@
+ # This package will contain the spiders of your Scrapy project
+ #
+ # Please refer to the documentation for information on how to create and manage
+ # your spiders.
rtd_scraper/tutorial/spiders/docs_spider.py ADDED
@@ -0,0 +1,44 @@
+ import logging
+ from pathlib import Path
+ from urllib.parse import urlparse
+
+ import scrapy
+
+ logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
+
+
+ class DocsSpider(scrapy.Spider):
+     name = "docs"
+
+     def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
+         super(DocsSpider, self).__init__(*args, **kwargs)
+
+         if not homepage_url.startswith("https://"):
+             homepage_url = "https://" + homepage_url
+
+         project: str = homepage_url.split(".")[0].split("https://")[1]  # e.g. "buster" from "https://buster.readthedocs.io/"
+         self.allowed_domains = [f"{project}.readthedocs.io"]
+         self.start_urls = [homepage_url]
+         self.base_dir = Path(save_dir)
+
+     def parse(self, response):
+         parsed_uri = urlparse(response.url)
+         # Create a Path from the parsed URL. If it ends with '/', we add 'index.html' as the filename.
+         if parsed_uri.path.endswith("/"):
+             filepath = (
+                 self.base_dir
+                 / parsed_uri.netloc
+                 / parsed_uri.path.strip("/")
+                 / "index.html"
+             )
+         else:
+             filepath = self.base_dir / parsed_uri.netloc / parsed_uri.path.strip("/")
+         filepath.parent.mkdir(parents=True, exist_ok=True)
+
+         print(f"{filepath=}")
+         with open(filepath, "wb") as f:
+             f.write(response.body)
+
+         # Follow links to other documentation pages
+         for href in response.css("a::attr(href)").getall():
+             yield response.follow(href, self.parse)
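
For completeness, a minimal standalone run of DocsSpider, roughly what run_spider() in scrape_rtd.py does (a sketch; it assumes Scrapy's default settings are acceptable for a one-off crawl):

from scrapy.crawler import CrawlerProcess

from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider

process = CrawlerProcess()
process.crawl(DocsSpider, homepage_url="https://buster.readthedocs.io/", save_dir="outputs/")
process.start()  # blocks until the crawl finishes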