File size: 1,746 Bytes
d3a1fe2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
import os
import shutil


def create_db(
    chunk_size,
    chunk_overlap,
    INPUT_PATH="./data/books/",
    INPUT_GLOB=None,
    MODEL_NAME="Alibaba-NLP/gte-multilingual-base",
    CHROMA_PATH="./chromadb/",
    DEVICE="cuda",
):
    """Rebuild the Chroma vector store at CHROMA_PATH from files in INPUT_PATH.

    The files are loaded, split into overlapping chunks, embedded with a
    HuggingFace model and written to a brand-new Chroma database. Any
    database already present at CHROMA_PATH is deleted first, but only once
    there is at least one chunk to store.

    Args:
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with
            ``len``.
        INPUT_PATH: Directory handed to ``DirectoryLoader``.
        INPUT_GLOB: Glob pattern(s) selecting the files to load. ``None``
            (the default) means ``["*.txt", "*.md"]``.
        MODEL_NAME: Name of the HuggingFace embedding model.
        CHROMA_PATH: Directory in which the Chroma database is persisted.
        DEVICE: Device string forwarded to the embedding model
            (for example ``"cuda"`` or ``"cpu"``).

    Returns:
        The ``Chroma`` store created from the chunks.

    Raises:
        ValueError: If loading and splitting produced no chunks. The
            existing database, if any, is left untouched in that case.
    """
    # A None sentinel avoids sharing one mutable list across calls.
    if INPUT_GLOB is None:
        INPUT_GLOB = ["*.txt", "*.md"]

    # setup embeddings
    # NOTE(review): trust_remote_code lets the model repository supply its own
    # loading code -- only point MODEL_NAME at repositories you trust.
    embeddings = HuggingFaceEmbeddings(
        model_name=MODEL_NAME,
        model_kwargs={"device": DEVICE, "trust_remote_code": True},
        encode_kwargs={"normalize_embeddings": True},
    )

    # load documents
    raw_documents = DirectoryLoader(INPUT_PATH, glob=INPUT_GLOB).load()
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        add_start_index=True,
    )
    documents = text_splitter.split_documents(raw_documents)
    print(f"Split {len(raw_documents)} documents into {len(documents)} chunks.")

    # Bail out before the destructive step: with nothing to index, removing
    # the old database would only lose data (e.g. after a mistyped glob).
    if not documents:
        raise ValueError(
            f"No chunks were produced from {INPUT_PATH!r} "
            f"(glob {INPUT_GLOB!r}); leaving {CHROMA_PATH!r} untouched."
        )

    # Clear out the database first.
    if os.path.exists(CHROMA_PATH):
        shutil.rmtree(CHROMA_PATH)

    # Create a new DB from the documents.
    db = Chroma.from_documents(
        documents,
        embeddings,
        persist_directory=CHROMA_PATH,
        collection_metadata={"hnsw:space": "cosine"},
    )
    print(f"Saved {len(documents)} chunks to {CHROMA_PATH}.")

    return db


if __name__ == "__main__":
    # script entry point: rebuild the index for the segmented Dracula texts
    build_options = {
        "INPUT_PATH": "./data/books/dracula_segmented/",
        "INPUT_GLOB": ["*.txt"],
        "MODEL_NAME": "Alibaba-NLP/gte-multilingual-base",
        "CHROMA_PATH": "./chromadb/",
    }
    # positional arguments are chunk_size and chunk_overlap, in that order
    create_db(1000, 500, **build_options)