jerpint committed
Commit
75f72d8
1 Parent(s): ac493ec

support target_versions

app.py CHANGED
@@ -1,5 +1,4 @@
 import logging
-import os
 from typing import Optional, Tuple
 
 import gradio as gr
@@ -19,12 +18,6 @@ handler = (
 handler.setLevel(logging.INFO)
 logging.basicConfig(level=logging.INFO)
 
-# Check if an openai key is set as an env. variable
-if os.getenv("OPENAI_API_KEY") is None:
-    print(
-        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
-    )
-
 # Typehint for chatbot history
 ChatHistory = list[list[Optional[str], Optional[str]]]
 
@@ -114,21 +107,21 @@ with demo:
     examples = gr.Examples(
         examples=[
             "How can I install the library?",
-            "How do I deal with noisy data?",
-            "How do I deal with noisy data in 2 words?",
+            "What dependencies are required?",
         ],
         inputs=question,
     )
 
     gr.Markdown(
-        "This application uses GPT to search the docs for relevant info and answer questions."
+        "This app uses [Buster 🤖](github.com/jerpint/buster) and ChatGPT to search the docs for relevant info and answer questions."
    )
 
     response = gr.State()
 
     # fmt: off
-    submit.click(
-        add_user_question,
+    gr.on(
+        triggers=[submit.click, question.submit],
+        fn=add_user_question,
         inputs=[question],
         outputs=[chatbot]
     ).then(
@@ -141,21 +134,6 @@ with demo:
         outputs=[chatbot]
     )
 
-    question.submit(
-        add_user_question,
-        inputs=[question],
-        outputs=[chatbot],
-    ).then(
-        chat,
-        inputs=[chatbot],
-        outputs=[chatbot, response]
-    ).then(
-        add_sources,
-        inputs=[chatbot, response],
-        outputs=[chatbot]
-    )
-    # fmt: on
-
 
 demo.queue(concurrency_count=16)
 demo.launch(share=False)
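
Note on the app.py change: the two duplicated event chains (submit.click and question.submit) are collapsed into a single gr.on(...) call that fires on both triggers. Below is a minimal, self-contained sketch of that pattern; the handlers are placeholders, not the app's real logic, and it assumes a Gradio version that provides gr.on.

import gradio as gr

def add_user_question(question):
    # Placeholder: start a fresh chat history with the user's question.
    return [[question, None]]

def chat(history):
    # Placeholder completion step; the real app answers with Buster + ChatGPT.
    history[-1][1] = "..."
    return history, "..."

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    question = gr.Textbox(label="Ask a question")
    submit = gr.Button("Submit")
    response = gr.State()

    # One wiring block instead of repeating the chain for click and submit.
    gr.on(
        triggers=[submit.click, question.submit],
        fn=add_user_question,
        inputs=[question],
        outputs=[chatbot],
    ).then(
        chat,
        inputs=[chatbot],
        outputs=[chatbot, response],
    )

demo.launch()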
cfg.py CHANGED
@@ -1,5 +1,5 @@
+import os
 import logging
-import sys
 
 from buster.busterbot import Buster, BusterConfig
 from buster.completers import ChatGPTCompleter, DocumentAnswerer
@@ -14,11 +14,17 @@ from rtd_scraper.scrape_rtd import scrape_rtd
 # Set the root logger's level to INFO
 logging.basicConfig(level=logging.INFO)
 
+# Check if an openai key is set as an env. variable
+if os.getenv("OPENAI_API_KEY") is None:
+    print(
+        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
+    )
 
-homepage_url = "https://buster.readthedocs.io/"
-
+homepage_url = os.getenv("RTD_URL", "https://orion.readthedocs.io/")
+target_version = os.getenv("RTD_VERSION", "en/stable")
 
-scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
+# scrape and embed content from readthedocs website
+scrape_rtd(homepage_url=homepage_url, save_directory="outputs/", target_version=target_version)
 
 # Disable logging for third-party libraries at DEBUG level
 for name in logging.root.manager.loggerDict:
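
With this change, cfg.py is driven by environment variables: OPENAI_API_KEY for the completer, plus RTD_URL and RTD_VERSION to choose which ReadTheDocs project and version get scraped at import time. A hypothetical usage sketch, setting those variables before importing cfg (the key, URL, and version below are placeholders only):

import os

# Placeholders; substitute a real key and the docs site you want indexed.
os.environ["OPENAI_API_KEY"] = "sk-..."
os.environ["RTD_URL"] = "https://buster.readthedocs.io/"
os.environ["RTD_VERSION"] = "en/latest"  # substring matched against crawled URLs

import cfg  # importing cfg runs scrape_rtd() with the values above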
rtd_scraper/scrape_rtd.py CHANGED
@@ -5,10 +5,10 @@ from buster.docparser import get_all_documents
 from buster.documents_manager import DeepLakeDocumentsManager
 from buster.parser import SphinxParser
 from scrapy.crawler import CrawlerProcess
-from scrapy.exceptions import CloseSpider
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider
+# from tutorial.spiders.docs_spider import DocsSpider
 
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
 for name in logging.root.manager.loggerDict:
@@ -16,12 +16,9 @@ for name in logging.root.manager.loggerDict:
     logger.setLevel(logging.INFO)
 
 
-def run_spider(homepage_url, save_directory):
-    # settings_file_path = 'rtd_scraper.tutorial.settings'  # The path seen from top-level, ie. from cfg.py
-    # os.environ.setdefault('SCRAPY_SETTINGS_MODULE', settings_file_path)
-
+def run_spider(homepage_url, save_directory, target_version=None):
     process = CrawlerProcess(settings=get_project_settings())
-    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory)
+    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory, target_version=target_version)
 
     # Start the crawling process
     process.start()
@@ -30,11 +27,11 @@ def run_spider(homepage_url, save_directory):
     process.stop()
 
 
-def scrape_rtd(homepage_url, save_directory):
+def scrape_rtd(homepage_url, save_directory, target_version=None):
     # Crawl the website using scrapy
-    run_spider(homepage_url, save_directory=save_directory)
+    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)
 
-    # Convert the .html pages into chunks using Buster's SphinxParser
+    # # Convert the .html pages into chunks using Buster's SphinxParser
     root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
 
     # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
@@ -49,23 +46,23 @@ def scrape_rtd(homepage_url, save_directory):
     # Add the source column
     df["source"] = "readthedocs"
 
-    # # Initialize the DeepLake vector store
-    # dm = DeepLakeDocumentsManager(
-    #     vector_store_path=os.path.join(save_directory, "deeplake_store"),
-    #     overwrite=True,
-    #     required_columns=["url", "content", "source", "title"],
-    # )
-    #
-    # # Add all embeddings to the vector store
-    # dm.batch_add(
-    #     df=df,
-    #     batch_size=3000,
-    #     min_time_interval=60,
-    #     num_workers=32,
-    # )
-    #
+    # Initialize the DeepLake vector store
+    dm = DeepLakeDocumentsManager(
+        vector_store_path=os.path.join(save_directory, "deeplake_store"),
+        overwrite=True,
+        required_columns=["url", "content", "source", "title"],
+    )
+
+    # Add all embeddings to the vector store
+    dm.batch_add(
+        df=df,
+        batch_size=3000,
+        min_time_interval=60,
+        num_workers=32,
+    )
+
 
 
 if __name__ == "__main__":
-    homepage_url = "https://buster.readthedocs.io/"
-    scrape_rtd(homepage_url=homepage_url, save_directory="outputs/")
+    homepage_url = "https://orion.readthedocs.io/"
+    scrape_rtd(homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/")
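
Sketch of calling the updated scrape_rtd() directly, mirroring the __main__ block above (the URL and version are just the example values used there). Crawled pages land under save_directory/<domain>/ and embeddings go into the DeepLake store at save_directory/deeplake_store; as cfg.py warns, an OpenAI key should be set before the embedding step.

from rtd_scraper.scrape_rtd import scrape_rtd

scrape_rtd(
    homepage_url="https://orion.readthedocs.io/",
    save_directory="outputs/",
    target_version="v0.2.7",  # only links whose full URL contains this substring are followed
)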
rtd_scraper/tutorial/spiders/docs_spider.py CHANGED
@@ -6,20 +6,39 @@ import scrapy
 
 logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
 
+from urllib.parse import urlparse
+
+def extract_domain(url):
+    """
+    Extract the domain (including subdomains) from a given URL.
+
+    Args:
+    - url (str): The URL from which the domain needs to be extracted.
+
+    Returns:
+    - str: The domain (with subdomains) extracted from the URL.
+           For example, 'www.example.com' for the URL 'https://www.example.com/path/to/something'.
+
+    """
+    parsed_uri = urlparse(url)
+    # The netloc attribute will contain the domain name
+    domain = parsed_uri.netloc
+    return domain
+
 
 class DocsSpider(scrapy.Spider):
     name = "docs"
 
-    def __init__(self, homepage_url: str, save_dir="crawled_pages", *args, **kwargs):
+    def __init__(self, homepage_url: str, save_dir="crawled_pages", target_version=None, *args, **kwargs):
         super(DocsSpider, self).__init__(*args, **kwargs)
 
         if not homepage_url.startswith("https://"):
             homepage_url = "https://" + homepage_url
 
-        project: str = homepage_url.split(".")[0].split("https://")[1]
-        self.allowed_domains = [f"{project}.readthedocs.io"]
+        self.allowed_domains = [extract_domain(homepage_url)]
         self.start_urls = [homepage_url]
         self.base_dir = Path(save_dir)
+        self.target_version = target_version
 
     def parse(self, response):
         parsed_uri = urlparse(response.url)
@@ -39,6 +58,13 @@ class DocsSpider(scrapy.Spider):
         with open(filepath, "wb") as f:
             f.write(response.body)
 
-        # Follow links to other documentation pages
+        # Follow links to other documentation pages only if they contain the target version in the full URL
         for href in response.css("a::attr(href)").getall():
-            yield response.follow(href, self.parse)
+            if self.target_version:
+                # A version was specified, check to see if it's the correct version from url
+                full_url = response.urljoin(href)  # Expand href to a full URL
+                if self.target_version in full_url:
+                    yield response.follow(href, self.parse)
+            else:
+                # no version specified, follow all links
+                yield response.follow(href, self.parse)
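
A small standalone illustration of the two behaviours introduced in this spider: extract_domain() keeps the full netloc (so allowed_domains no longer assumes a *.readthedocs.io project name), and the version filter is a plain substring test against the expanded URL. The URLs below are examples only.

from urllib.parse import urlparse

def extract_domain(url):
    # Same idea as the helper above: netloc keeps subdomains.
    return urlparse(url).netloc

print(extract_domain("https://orion.readthedocs.io/en/stable/"))  # -> orion.readthedocs.io

target_version = "v0.2.7"
candidate_links = [
    "https://orion.readthedocs.io/en/v0.2.7/install.html",
    "https://orion.readthedocs.io/en/latest/install.html",
]
# Only the first link would be followed by the spider when target_version is set.
print([url for url in candidate_links if target_version in url])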