import logging import os from buster.documents_manager import DeepLakeDocumentsManager from buster.parsers import SphinxParser, get_all_documents from rtd_scraper.scrape_rtd import sanitize_url, run_spider # When using scrapy it seems to set logging for all apps at DEBUG, so simply shut it off here... for name in logging.root.manager.loggerDict: logger = logging.getLogger(name) logger.setLevel(logging.INFO) def embed_documents(homepage_url, save_directory, target_version=None): # adds https:// and trailing slash homepage_url = sanitize_url(homepage_url) # Crawl the website using scrapy run_spider( homepage_url, save_directory=save_directory, target_version=target_version ) # # Convert the .html pages into chunks using Buster's SphinxParser # root_dir is the folder containing the scraped content e.g. crawled_outputs/buster.readthedocs.io/ root_dir = os.path.join(save_directory, homepage_url.split("https://")[1]) df = get_all_documents( root_dir=root_dir, base_url=homepage_url, parser_cls=SphinxParser, min_section_length=100, max_section_length=1000, ) df["source"] = "readthedocs" # Add the source column # Initialize the DeepLake vector store vector_store_path = os.path.join(save_directory, "deeplake_store") dm = DeepLakeDocumentsManager( vector_store_path=vector_store_path, overwrite=True, required_columns=["url", "content", "source", "title"], ) # Add all embeddings to the vector store dm.batch_add( df=df, batch_size=3000, min_time_interval=60, num_workers=32, ) if __name__ == "__main__": homepage_url = "https://orion.readthedocs.io/" target_version = "v0.2.7" save_directory = "outputs/" embed_documents( homepage_url=homepage_url, target_version=target_version, save_directory=save_directory, )