File size: 1,157 Bytes
444dc2c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
import tiktoken


# Recursively pick up every PDF under ./apple_amazon_intel and parse each one
# with UnstructuredPDFLoader.
loader = DirectoryLoader(
    path="./apple_amazon_intel",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader,
)
documents = loader.load()

# loader = DirectoryLoader("./data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
# documents = loader.load()
# print(documents)


def tiktoken_len(text: str) -> int:
    """Return how many tokens ``text`` occupies under the "gpt-4" encoding.

    The text is encoded with ``disallowed_special=()``, which turns off
    tiktoken's check for special-token strings so that such strings in the
    input are counted as ordinary text instead of raising.
    """
    encoding = tiktoken.encoding_for_model("gpt-4")
    return len(encoding.encode(text, disallowed_special=()))


# Chunk sizes and overlaps are measured with tiktoken_len (i.e. in tokens, not
# characters). The splitter tries the separators in order: paragraph breaks,
# line breaks, spaces, and finally individual characters.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ""],
    length_function=tiktoken_len,
    chunk_size=4000,
    chunk_overlap=400,
)
texts = text_splitter.split_documents(documents)

# Directory in which the Chroma index is stored.
persist_directory = "db_index"
# Misspelled legacy name, kept as an alias so existing references keep working.
persist_direcory = persist_directory

# embeddings = OpenAIEmbeddings()
# Embed the chunks with the Hugging Face "all-MiniLM-L6-v2" model.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk and build the vector store backed by persist_directory.
db = Chroma.from_documents(
    texts, embedding=embeddings, persist_directory=persist_directory
)
# Explicitly write the collection out so it can be reloaded later.
db.persist()
print("done")