jerpint committed
Commit
2ae8bfe
1 Parent(s): ef80e27

remove unused code, add exceptions if variables not set

Files changed (3):
  1. app.py +34 -14
  2. cfg.py +28 -30
  3. rtd_scraper/scrape_rtd.py +0 -53
app.py CHANGED
@@ -6,34 +6,54 @@ import pandas as pd
 from buster.completers import Completion
 
 # from embed_docs import embed_rtd_website
-from rtd_scraper.scrape_rtd import scrape_rtd
+# from rtd_scraper.scrape_rtd import scrape_rtd
+from embed_docs import embed_documents
 import cfg
 from cfg import setup_buster
 
+# Typehint for chatbot history
+ChatHistory = list[list[Optional[str], Optional[str]]]
 
-# Check if an openai key is set as an env. variable
-if os.getenv("OPENAI_API_KEY") is None:
+
+# Because this is a one-click deploy app, we will be relying on env. variables being set
+openai_api_key = os.getenv("OPENAI_API_KEY")  # Mandatory for app to work
+readthedocs_url = os.getenv("READTHEDOCS_URL")  # Mandatory for app to work as intended
+readthedocs_version = os.getenv("READTHEDOCS_VERSION")
+
+if openai_api_key is None:
     print(
-        "Warning: No openai key detected. You can set it with 'export OPENAI_API_KEY=sk-...'."
+        "Warning: No OPENAI_API_KEY detected. Set it with 'export OPENAI_API_KEY=sk-...'."
     )
 
+if readthedocs_url is None:
+    raise ValueError(
+        "No READTHEDOCS_URL detected. Set it with e.g. 'export READTHEDOCS_URL=https://orion.readthedocs.io/'"
+    )
 
-homepage_url = os.getenv("READTHEDOCS_URL")  # e.g. "https://orion.readthedocs.io/"
-target_version = os.getenv("READTHEDOCS_VERSION")  # e.g. "en/stable"
+if readthedocs_version is None:
+    print(
+        """
+        Warning: No READTHEDOCS_VERSION detected. If multiple versions of the docs exist, they will all be scraped.
+        Set it with e.g. 'export READTHEDOCS_VERSION=en/stable'
+        """
+    )
 
-# scrape and embed content from readthedocs website
-# comment out if already embedded locally to avoid extra costs
-scrape_rtd(
-    homepage_url=homepage_url, save_directory="outputs/", target_version=target_version
-)
 
+# Override to put it anywhere
+save_directory = "outputs/"
 
-# Typehint for chatbot history
-ChatHistory = list[list[Optional[str], Optional[str]]]
+# scrape and embed content from readthedocs website
+embed_documents(
+    homepage_url=readthedocs_url,
+    save_directory=save_directory,
+    target_version=readthedocs_version,
+)
 
+# Setup RAG agent
 buster = setup_buster(cfg.buster_cfg)
 
 
+# Setup Gradio app
 def add_user_question(
     user_question: str, chat_history: Optional[ChatHistory] = None
 ) -> ChatHistory:
@@ -157,5 +177,5 @@ with demo:
     )
 
 
-demo.queue(concurrency_count=16)
+demo.queue(concurrency_count=8)
 demo.launch(share=False)
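
Note: the guards above assume all configuration arrives through environment variables before app.py runs. A minimal sketch of a local launch under that assumption (the values are placeholders taken from the diff's own examples, not real settings):

import os

# Mandatory: the app raises ValueError without it (see guard above)
os.environ["READTHEDOCS_URL"] = "https://orion.readthedocs.io/"
# Optional: without it, every available docs version gets scraped
os.environ["READTHEDOCS_VERSION"] = "en/stable"
# Mandatory for embedding/completion calls; a missing key only prints a warning
os.environ["OPENAI_API_KEY"] = "sk-..."  # placeholder

# Importing app runs the module-level scrape/embed step, then launches Gradio
import app  # noqa: E402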
cfg.py CHANGED
@@ -6,37 +6,7 @@ from buster.retriever import DeepLakeRetriever, Retriever
 from buster.tokenizers import GPTTokenizer
 from buster.validators import QuestionAnswerValidator, Validator
 
-from rtd_scraper.scrape_rtd import scrape_rtd
-
 buster_cfg = BusterConfig(
-    validator_cfg={
-        "unknown_response_templates": [
-            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
-        ],
-        "unknown_threshold": 0.85,
-        "embedding_model": "text-embedding-ada-002",
-        "use_reranking": True,
-        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
-        "check_question_prompt": """You are a chatbot answering questions on python libraries.
-
-Your job is to determine whether or not a question is valid, and should be answered.
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
-
-For example:
-
-Q: How can I install the library?
-true
-
-Q: What is the meaning of life?
-false
-
-A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
-        "completion_kwargs": {
-            "model": "gpt-3.5-turbo",
-            "stream": False,
-            "temperature": 0,
-        },
-    },
     retriever_cfg={
         "path": "outputs/deeplake_store",
         "top_k": 3,
@@ -87,6 +57,34 @@ A user will submit a question. Respond 'true' if it is valid, respond 'false' if
             "Now answer the following question:\n"
         ),
     },
+    validator_cfg={
+        "unknown_response_templates": [
+            "I'm sorry, but I am an AI language model trained to assist with questions related to AI. I cannot answer that question as it is not relevant to the library or its usage. Is there anything else I can assist you with?",
+        ],
+        "unknown_threshold": 0.85,
+        "embedding_model": "text-embedding-ada-002",
+        "use_reranking": True,
+        "invalid_question_response": "This question does not seem relevant to my current knowledge.",
+        "check_question_prompt": """You are a chatbot answering questions on python libraries.
+
+Your job is to determine whether or not a question is valid, and should be answered.
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.
+
+For example:
+
+Q: How can I install the library?
+true
+
+Q: What is the meaning of life?
+false
+
+A user will submit a question. Respond 'true' if it is valid, respond 'false' if it is invalid.""",
+        "completion_kwargs": {
+            "model": "gpt-3.5-turbo",
+            "stream": False,
+            "temperature": 0,
+        },
+    },
 )
 
 
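Note: retriever_cfg["path"] ("outputs/deeplake_store") must point at the same DeepLake store that the embed step in app.py writes under its save_directory ("outputs/"). A hedged sketch of overriding both together, assuming BusterConfig exposes its sections as plain attributes (not confirmed by this diff):

import os

import cfg

save_directory = "custom_outputs/"  # hypothetical override
# Keep the retriever reading from wherever the embedding step writes
cfg.buster_cfg.retriever_cfg["path"] = os.path.join(save_directory, "deeplake_store")
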
rtd_scraper/scrape_rtd.py CHANGED
@@ -1,16 +1,11 @@
 import logging
 import os
 
-from buster.docparser import get_all_documents
-from buster.documents_manager import DeepLakeDocumentsManager
-from buster.parser import SphinxParser
 from scrapy.crawler import CrawlerProcess
 from scrapy.utils.project import get_project_settings
 
 from rtd_scraper.tutorial.spiders.docs_spider import DocsSpider, sanitize_url
 
-# from tutorial.spiders.docs_spider import DocsSpider
-
 # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here...
 for name in logging.root.manager.loggerDict:
     logger = logging.getLogger(name)
@@ -31,51 +26,3 @@ def run_spider(homepage_url, save_directory, target_version=None):
 
     # To stop the crawling process gracefully
     process.stop()
-
-
-def scrape_rtd(homepage_url, save_directory, target_version=None):
-
-    # adds https:// and trailing backslash
-    homepage_url = sanitize_url(homepage_url)
-
-    # Crawl the website using scrapy
-    run_spider(
-        homepage_url, save_directory=save_directory, target_version=target_version
-    )
-
-    # # Convert the .html pages into chunks using Buster's SphinxParser
-    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])
-
-    # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/
-    df = get_all_documents(
-        root_dir=root_dir,
-        base_url=homepage_url,
-        parser_cls=SphinxParser,
-        min_section_length=100,
-        max_section_length=1000,
-    )
-
-    # Add the source column
-    df["source"] = "readthedocs"
-
-    # Initialize the DeepLake vector store
-    dm = DeepLakeDocumentsManager(
-        vector_store_path=os.path.join(save_directory, "deeplake_store"),
-        overwrite=True,
-        required_columns=["url", "content", "source", "title"],
-    )
-
-    # Add all embeddings to the vector store
-    dm.batch_add(
-        df=df,
-        batch_size=3000,
-        min_time_interval=60,
-        num_workers=32,
-    )
-
-
-if __name__ == "__main__":
-    homepage_url = "https://orion.readthedocs.io/"
-    scrape_rtd(
-        homepage_url=homepage_url, target_version="v0.2.7", save_directory="outputs/"
-    )
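
Note: app.py now imports embed_documents from embed_docs, a file this commit does not touch. Judging from the call site and the scrape_rtd body deleted above, it presumably carries the same scrape-parse-embed pipeline. A minimal sketch under that assumption, reconstructed from the deleted code rather than from the actual embed_docs.py:

import os

from buster.docparser import get_all_documents
from buster.documents_manager import DeepLakeDocumentsManager
from buster.parser import SphinxParser

from rtd_scraper.scrape_rtd import run_spider
from rtd_scraper.tutorial.spiders.docs_spider import sanitize_url


def embed_documents(homepage_url, save_directory, target_version=None):
    # sanitize_url adds https:// and a trailing slash if missing
    homepage_url = sanitize_url(homepage_url)

    # Crawl the website using scrapy
    run_spider(homepage_url, save_directory=save_directory, target_version=target_version)

    # Folder containing the scraped content, e.g. outputs/orion.readthedocs.io/
    root_dir = os.path.join(save_directory, homepage_url.split("https://")[1])

    # Convert the .html pages into chunks using Buster's SphinxParser
    df = get_all_documents(
        root_dir=root_dir,
        base_url=homepage_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )
    df["source"] = "readthedocs"

    # Embed the chunks into a DeepLake vector store
    dm = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )
    dm.batch_add(df=df, batch_size=3000, min_time_interval=60, num_workers=32)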