"""Load scraped Read the Docs HTML, split it into chunks, and store it in Qdrant.

Usage:
    $ python store.py "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"
"""

import sys

from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant

from config import get_db_config

# Maximum chunk size handed to the text splitter
# (characters, assuming the splitter's default length function).
CHUNK_SIZE = 500


def get_documents(path: str):
    """Load every document found under *path* with ``ReadTheDocsLoader``.

    Args:
        path: Directory passed to ``ReadTheDocsLoader``; files are read as UTF-8.

    Returns:
        Whatever ``ReadTheDocsLoader.load()`` returns (the loaded documents).
    """
    loader = ReadTheDocsLoader(path, encoding="utf-8")
    return loader.load()


def get_text_chunk(docs):
    """Split *docs* into chunks of at most ``CHUNK_SIZE`` with no overlap.

    Args:
        docs: Documents as returned by ``get_documents``.

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    )
    return text_splitter.split_documents(docs)


def store(texts) -> None:
    """Embed *texts* with ``OpenAIEmbeddings`` and write them to Qdrant.

    The Qdrant URL, API key and collection name come from
    ``config.get_db_config()`` and are passed straight to
    ``Qdrant.from_documents``; no client object is built here.

    Args:
        texts: Document chunks as returned by ``get_text_chunk``.
    """
    embeddings = OpenAIEmbeddings()
    db_url, db_api_key, db_collection_name = get_db_config()
    # Called for its side effect only; the returned vector store is not needed.
    Qdrant.from_documents(
        texts,
        embeddings,
        url=db_url,
        api_key=db_api_key,
        collection_name=db_collection_name,
    )


def main(path: str) -> None:
    """Run the full pipeline: load documents, split them, store the chunks.

    Args:
        path: Directory containing the scraped documentation.
    """
    docs = get_documents(path)
    texts = get_text_chunk(docs)
    store(texts)


if __name__ == "__main__":
    # Exactly one positional argument is required. On misuse, sys.exit()
    # with a string writes it to stderr and exits with status 1, so shell
    # callers can detect the failure.
    if len(sys.argv) != 2:
        sys.exit(f"Usage: python {sys.argv[0]} <html_path>")
    main(sys.argv[1])