jerpint committed on
Commit
df044c6
1 Parent(s): f8c09da

sanitize web urls

Browse files
app.py CHANGED
@@ -4,8 +4,8 @@ from typing import Optional, Tuple
4
  import gradio as gr
5
  import pandas as pd
6
  from buster.completers import Completion
7
- from buster.utils import extract_zip
8
 
 
9
  from rtd_scraper.scrape_rtd import scrape_rtd
10
  import cfg
11
  from cfg import setup_buster
@@ -18,8 +18,8 @@ if os.getenv("OPENAI_API_KEY") is None:
18
  )
19
 
20
 
21
- homepage_url = os.getenv("READTHEDOCS_URL") # e.g. "https://orion.readthedocs.io/"
22
- target_version = os.getenv("READTHEDOCS_VERSION") # e.g. "en/stable"
23
 
24
  # scrape and embed content from readthedocs website
25
  # comment out if already embedded locally to avoid extra costs
@@ -117,10 +117,8 @@ with demo:
117
  answer questions.
118
  View the code on the [project homepage](https://github.com/jerpint/RAGTheDocs)
119
  """
120
-
121
  )
122
 
123
-
124
  chatbot = gr.Chatbot()
125
 
126
  with gr.Row():
@@ -135,7 +133,7 @@ with demo:
135
  examples=[
136
  "How can I install the library?",
137
  "What dependencies are required?",
138
- "Give a brief overview of the library."
139
  ],
140
  inputs=question,
141
  )
 
4
  import gradio as gr
5
  import pandas as pd
6
  from buster.completers import Completion
 
7
 
8
+ # from embed_docs import embed_rtd_website
9
  from rtd_scraper.scrape_rtd import scrape_rtd
10
  import cfg
11
  from cfg import setup_buster
 
18
  )
19
 
20
 
21
+ homepage_url = os.getenv("READTHEDOCS_URL") # e.g. "https://orion.readthedocs.io/"
22
+ target_version = os.getenv("READTHEDOCS_VERSION") # e.g. "en/stable"
23
 
24
  # scrape and embed content from readthedocs website
25
  # comment out if already embedded locally to avoid extra costs
 
117
  answer questions.
118
  View the code on the [project homepage](https://github.com/jerpint/RAGTheDocs)
119
  """
 
120
  )
121
 
 
122
  chatbot = gr.Chatbot()
123
 
124
  with gr.Row():
 
133
  examples=[
134
  "How can I install the library?",
135
  "What dependencies are required?",
136
+ "Give a brief overview of the library.",
137
  ],
138
  inputs=question,
139
  )
rtd_scraper/scrape_rtd.py CHANGED
@@ -7,7 +7,7 @@ from buster.parser import SphinxParser
7
  from scrapy.crawler import CrawlerProcess
8
  from scrapy.utils.project import get_project_settings
9
 
10
- from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider
11
 
12
  # from tutorial.spiders.docs_spider import DocsSpider
13
 
@@ -34,6 +34,10 @@ def run_spider(homepage_url, save_directory, target_version=None):
34
 
35
 
36
  def scrape_rtd(homepage_url, save_directory, target_version=None):
 
 
 
 
37
  # Crawl the website using scrapy
38
  run_spider(
39
  homepage_url, save_directory=save_directory, target_version=target_version
 
7
  from scrapy.crawler import CrawlerProcess
8
  from scrapy.utils.project import get_project_settings
9
 
10
+ from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
11
 
12
  # from tutorial.spiders.docs_spider import DocsSpider
13
 
 
34
 
35
 
36
  def scrape_rtd(homepage_url, save_directory, target_version=None):
37
+
38
+ # adds https:// and trailing slash
39
+ homepage_url = sanitize_url(homepage_url)
40
+
41
  # Crawl the website using scrapy
42
  run_spider(
43
  homepage_url, save_directory=save_directory, target_version=target_version
rtd_scraper/tutorial/spiders/docs_spider.py CHANGED
@@ -27,21 +27,30 @@ def extract_domain(url):
27
  return domain
28
 
29
 
 
 
 
 
 
 
 
 
 
 
30
  class DocsSpider(scrapy.Spider):
31
  name = "docs"
32
 
33
  def __init__(
34
  self,
35
  homepage_url: str,
36
- save_dir="crawled_pages",
37
  target_version=None,
38
  *args,
39
  **kwargs,
40
  ):
41
  super(DocsSpider, self).__init__(*args, **kwargs)
42
 
43
- if not homepage_url.startswith("https://"):
44
- homepage_url = "https://" + homepage_url
45
 
46
  self.allowed_domains = [extract_domain(homepage_url)]
47
  self.start_urls = [homepage_url]
 
27
  return domain
28
 
29
 
30
def sanitize_url(url: str) -> str:
    """Return *url* normalized with a scheme and a trailing slash.

    - Prepends ``https://`` when no scheme is present. An explicit
      ``http://`` prefix is left untouched (the previous check only
      looked for ``https://`` and would have produced the broken
      ``https://http://...``).
    - Appends a trailing ``/`` when missing, so downstream path joins
      and domain extraction behave consistently.
    """
    # str.startswith accepts a tuple of prefixes — one check covers both schemes.
    if not url.startswith(("https://", "http://")):
        url = "https://" + url

    if not url.endswith("/"):
        url = url + "/"
    return url
38
+
39
+
40
  class DocsSpider(scrapy.Spider):
41
  name = "docs"
42
 
43
  def __init__(
44
  self,
45
  homepage_url: str,
46
+ save_dir="outputs/",
47
  target_version=None,
48
  *args,
49
  **kwargs,
50
  ):
51
  super(DocsSpider, self).__init__(*args, **kwargs)
52
 
53
+ homepage_url = sanitize_url(homepage_url)
 
54
 
55
  self.allowed_domains = [extract_domain(homepage_url)]
56
  self.start_urls = [homepage_url]