jerpint committed on
Commit
ef80e27
1 Parent(s): df044c6
Files changed (1) hide show
  1. embed_docs.py +62 -0
embed_docs.py ADDED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+
4
+ from buster.docparser import get_all_documents
5
+ from buster.documents_manager import DeepLakeDocumentsManager
6
+ from buster.parser import SphinxParser
7
+
8
+ from rtd_scraper.scrape_rtd import sanitize_url, run_spider
9
+
10
# Scrapy appears to switch every logger to DEBUG, which floods the output.
# Pin each logger registered at import time to INFO instead.
for registered_name in logging.root.manager.loggerDict:
    logging.getLogger(registered_name).setLevel(logging.INFO)
14
+
15
+
16
def embed_documents(homepage_url, save_directory, target_version=None):
    """Crawl a documentation site and store embeddings of its pages.

    The site is scraped with ``run_spider``, the saved pages are split into
    chunks by ``get_all_documents`` using ``SphinxParser``, and the chunks
    are added to a DeepLake vector store located at
    ``<save_directory>/deeplake_store``. The store is opened with
    ``overwrite=True``.

    Args:
        homepage_url: Address of the documentation homepage. It is passed
            through ``sanitize_url`` first (expected to add ``https://``
            and a trailing slash).
        save_directory: Folder that receives both the crawled pages and
            the vector store.
        target_version: Forwarded unchanged to ``run_spider``.
    """
    site_url = sanitize_url(homepage_url)

    # Step 1: crawl the website with scrapy.
    run_spider(site_url, save_directory=save_directory, target_version=target_version)

    # Step 2: chunk the scraped .html pages with Buster's SphinxParser.
    # The scraped content lives under the host part of the URL,
    # e.g. crawled_outputs/buster.readthedocs.io/
    site_folder = site_url.split("https://")[1]
    scraped_root = os.path.join(save_directory, site_folder)
    documents = get_all_documents(
        root_dir=scraped_root,
        base_url=site_url,
        parser_cls=SphinxParser,
        min_section_length=100,
        max_section_length=1000,
    )
    # "source" is one of the columns the documents manager requires below.
    documents["source"] = "readthedocs"

    # Step 3: set up the DeepLake vector store and add all embeddings to it.
    manager = DeepLakeDocumentsManager(
        vector_store_path=os.path.join(save_directory, "deeplake_store"),
        overwrite=True,
        required_columns=["url", "content", "source", "title"],
    )
    manager.batch_add(
        df=documents,
        batch_size=3000,
        min_time_interval=60,
        num_workers=32,
    )
52
+
53
+
54
if __name__ == "__main__":
    # Example run: embed the Orion docs at a pinned version into outputs/.
    embed_documents(
        homepage_url="https://orion.readthedocs.io/",
        target_version="v0.2.7",
        save_directory="outputs/",
    )