jerpint committed
Commit db19951
1 Parent(s): 75f72d8
cfg.py CHANGED
@@ -24,7 +24,9 @@ homepage_url = os.getenv("RTD_URL", "https://orion.readthedocs.io/")
 target_version = os.getenv("RTD_VERSION", "en/stable")
 
 # scrape and embed content from readthedocs website
-scrape_rtd(homepage_url=homepage_url, save_directory="outputs/", target_version=target_version)
+scrape_rtd(
+    homepage_url=homepage_url, save_directory="outputs/", target_version=target_version
+)
 
 # Disable logging for third-party libraries at DEBUG level
 for name in logging.root.manager.loggerDict:
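For reference, everything this reformatted call consumes comes from environment variables, so the scrape target and docs version can be swapped without editing cfg.py. A minimal sketch of the same pattern, assuming scrape_rtd is importable from the module changed below (the RTD_URL / RTD_VERSION names and their defaults are taken verbatim from the hunk above):

import os

from rtd_scraper.scrape_rtd import scrape_rtd

# Defaults match the hunk above; override via the environment, e.g.
#   RTD_URL=https://orion.readthedocs.io/ RTD_VERSION=en/stable python cfg.py
homepage_url = os.getenv("RTD_URL", "https://orion.readthedocs.io/")
target_version = os.getenv("RTD_VERSION", "en/stable")

scrape_rtd(
    homepage_url=homepage_url,
    save_directory="outputs/",
    target_version=target_version,
)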
rtd_scraper/scrape_rtd.py CHANGED
@@ -8,6 +8,7 @@ from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider
+
 # from tutorial.spiders.docs_spider import DocsSpider
 
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
@@ -18,7 +19,12 @@ for name in logging.root.manager.loggerDict:
 
 def run_spider(homepage_url, save_directory, target_version=None):
     process = CrawlerProcess(settings=get_project_settings())
-    process.crawl(DocsSpider, homepage_url=homepage_url, save_dir=save_directory, target_version=target_version)
+    process.crawl(
+        DocsSpider,
+        homepage_url=homepage_url,
+        save_dir=save_directory,
+        target_version=target_version,
+    )
 
     # Start the crawling process
     process.start()
@@ -29,7 +35,9 @@ def run_spider(homepage_url, save_directory, target_version=None):
 
 def scrape_rtd(homepage_url, save_directory, target_version=None):
     # Crawl the website using scrapy
-    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)
+    run_spider(
+        homepage_url, save_directory=save_directory, target_version=target_version
+    )
 
     # # Convert the .html pages into chunks using Buster's SphinxParser
     root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
@@ -62,7 +70,8 @@ def scrape_rtd(homepage_url, save_directory, target_version=None):
 )
 
 
-
 if __name__ == "__main__":
     homepage_url = "https://orion.readthedocs.io/"
-    scrape_rtd(homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/")
+    scrape_rtd(
+        homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/"
+    )
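Two behaviours of this file are easy to miss in a reformat-only diff. First, run_spider drives Scrapy's CrawlerProcess, and process.start() blocks until the crawl finishes, so scrape_rtd only moves on to parsing once the pages are on disk. Second, scrape_rtd derives the folder it parses from by stripping the scheme off the homepage URL. A quick illustration of that path logic, using the values from the __main__ block above:

import os

save_directory = "outputs/"
homepage_url = "https://orion.readthedocs.io/"

# homepage_url.split("https://")[1] -> "orion.readthedocs.io/"
root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
print(root_dir)  # outputs/orion.readthedocs.io/

Note that a plain http:// URL would make homepage_url.split("https://")[1] raise an IndexError, which is presumably why DocsSpider checks startswith("https://") in its __init__ below.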
rtd_scraper/tutorial/spiders/docs_spider.py CHANGED
@@ -8,6 +8,7 @@ logging.basicConfig(format="%(levelname)s: %(message)s", level=logging.ERROR)
 
 from urllib.parse import urlparse
 
+
 def extract_domain(url):
     """
     Extract the domain (including subdomains) from a given URL.
@@ -29,7 +30,14 @@ def extract_domain(url):
 class DocsSpider(scrapy.Spider):
     name = "docs"
 
-    def __init__(self, homepage_url: str, save_dir="crawled_pages", target_version=None, *args, **kwargs):
+    def __init__(
+        self,
+        homepage_url: str,
+        save_dir="crawled_pages",
+        target_version=None,
+        *args,
+        **kwargs,
+    ):
         super(DocsSpider, self).__init__(*args, **kwargs)
 
         if not homepage_url.startswith("https://"):
@@ -67,4 +75,4 @@ class DocsSpider(scrapy.Spider):
                 yield response.follow(href, self.parse)
             else:
                 # no version specified, follow all links
-                yield response.follow(href, self.parse)
+                yield response.follow(href, self.parse)
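The spider's constructor is never called by hand here: CrawlerProcess.crawl(DocsSpider, ...) in scrape_rtd.py forwards its keyword arguments (homepage_url, save_dir, target_version) to this __init__. The body of extract_domain sits outside the hunks, but given the urllib.parse import above and its docstring, it is presumably a thin wrapper around urlparse; a hypothetical sketch (the implementation below is an assumption, not the committed code):

from urllib.parse import urlparse

def extract_domain(url):
    """
    Extract the domain (including subdomains) from a given URL.
    (Docstring from the diff; the body below is an assumed implementation.)
    """
    # urlparse(...).netloc keeps subdomains: "orion.readthedocs.io"
    return urlparse(url).netloc

print(extract_domain("https://orion.readthedocs.io/en/stable/"))
# orion.readthedocs.io

Also worth noting: the removed and added lines in the final hunk are textually identical, which is characteristic of a trailing-newline fix at end of file rather than a content change.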