from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
# from qdrant_client import QdrantClient

from config import DB_CONFIG

CHUNK_SIZE = 500


def _remove_prefix_path(p: str) -> str:
    """Strip the local download prefix so only the doc-relative path remains."""
    prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
    return p.removeprefix(prefix)


def get_documents(path: str):
    """Load the HTML pages fetched from Read the Docs and enrich their metadata."""
    loader = ReadTheDocsLoader(path, encoding="utf-8")
    docs = loader.load()
    base_url = "https://nvdajp-book.readthedocs.io/"
    category = "ja-book"
    for doc in docs:
        org_metadata = doc.metadata
        source = _remove_prefix_path(org_metadata["source"])
        add_meta = {
            "category": category,
            "source": source,
            "url": f"{base_url}{source}",
        }
        doc.metadata = org_metadata | add_meta
        yield doc


def get_text_chunk(docs):
    """Split documents into chunks of at most CHUNK_SIZE characters (no overlap)."""
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=0)
    texts = text_splitter.split_documents(docs)
    return texts


def store(texts):
    """Embed the chunks with OpenAI and store them in the configured Qdrant collection."""
    embeddings = OpenAIEmbeddings()
    db_url, db_api_key, db_collection_name = DB_CONFIG
    # client = QdrantClient(url=db_url, api_key=db_api_key, prefer_grpc=True)
    _ = Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )


def main(path: str):
    docs = get_documents(path)
    texts = get_text_chunk(docs)
    store(texts)


if __name__ == "__main__":
    """
    $ python store.py "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"
    """
    import sys

    args = sys.argv
    if len(args) != 2:
        # No path given: print usage, then load the default path and drop into a debugger.
        print("Usage: python store.py <html_path>")
        docs = get_documents("data/rtdocs/nvdajp-book.readthedocs.io/ja/latest")
        print(type(docs))
        breakpoint()
    else:
        path = args[1]
        # dir_name = args[2]
        main(path)
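
# --- Retrieval sketch (not part of the ingestion run) ------------------------
# A minimal, hedged example of how the stored collection could be queried back
# with the same embeddings. It assumes the same DB_CONFIG tuple and the older
# `langchain.vectorstores.Qdrant` constructor used above; the query string and
# the printed fields are only illustrations. Kept commented out so importing or
# running this script is unaffected.
#
# from qdrant_client import QdrantClient
#
# db_url, db_api_key, db_collection_name = DB_CONFIG
# client = QdrantClient(url=db_url, api_key=db_api_key)
# db = Qdrant(
#     client=client,
#     collection_name=db_collection_name,
#     embeddings=OpenAIEmbeddings(),
# )
# hits = db.similarity_search("example query about NVDA", k=4)
# for hit in hits:
#     print(hit.metadata["url"], hit.page_content[:80])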