File size: 1,975 Bytes
6ab28e5
 
 
 
 
9bc4a6c
6ab28e5
 
 
 
 
9bc4a6c
 
 
 
 
6ab28e5
 
 
9bc4a6c
8d3ec3e
9bc4a6c
 
 
8d3ec3e
9bc4a6c
 
6ab28e5
 
 
 
 
 
 
 
 
 
9bc4a6c
6ab28e5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9bc4a6c
 
 
6ab28e5
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
from langchain.document_loaders import ReadTheDocsLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Qdrant
# from qdrant_client import QdrantClient
from config import DB_CONFIG


# Value passed as ``chunk_size`` to RecursiveCharacterTextSplitter when
# documents are split into chunks.
CHUNK_SIZE = 500


def _remove_prefix_path(p: str):
    prefix = "data/rtdocs/nvdajp-book.readthedocs.io/"
    return p.removeprefix(prefix)


def get_documents(path: str):
    """Lazily yield the documents found under *path* with extra metadata.

    Documents are loaded with ``ReadTheDocsLoader`` using UTF-8. For each one,
    the ``source`` metadata entry is rewritten relative to the local mirror
    directory, and ``category`` and ``url`` entries are added. This is a
    generator, so nothing is loaded until the first item is requested.
    """
    site_root = "https://nvdajp-book.readthedocs.io/"
    label = "ja-book"
    for document in ReadTheDocsLoader(path, encoding="utf-8").load():
        relative = _remove_prefix_path(document.metadata["source"])
        # Later keys win, so "source" is overwritten with the relative path.
        document.metadata = {
            **document.metadata,
            "category": label,
            "source": relative,
            "url": f"{site_root}{relative}",
        }
        yield document


def get_text_chunk(docs):
    """Split *docs* with ``chunk_size=CHUNK_SIZE`` and no overlap.

    Returns whatever ``RecursiveCharacterTextSplitter.split_documents``
    produces for the given documents.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_overlap=0,
        chunk_size=CHUNK_SIZE,
    )
    return splitter.split_documents(docs)


def store(texts):
    """Embed *texts* with ``OpenAIEmbeddings`` and send them to Qdrant.

    Connection settings are read from ``DB_CONFIG``, which is unpacked as
    (url, api key, collection name). The store object returned by
    ``Qdrant.from_documents`` is discarded; this function returns ``None``.
    """
    embedder = OpenAIEmbeddings()
    url, api_key, collection = DB_CONFIG
    Qdrant.from_documents(
        texts,
        embedder,
        url=url,
        api_key=api_key,
        collection_name=collection,
    )


def main(path: str):
    """Load the documents under *path*, split them into chunks, store them."""
    store(get_text_chunk(get_documents(path)))


if __name__ == "__main__":
    # Usage:
    #   $ python store.py "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"
    import sys

    if len(sys.argv) != 2:
        # Exactly one argument (the HTML directory) is required. Fail fast
        # with a usage message rather than dropping into a debugger, which
        # would hang any non-interactive run.
        print(
            "usage: python store.py <html_path>\n"
            'example: python store.py "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"',
            file=sys.stderr,
        )
        sys.exit(2)

    main(sys.argv[1])