"""Build a persistent Chroma vector index from a directory of PDF files.

Pipeline: load every PDF under ``./apple_amazon_intel``, split the documents
into overlapping chunks whose size is measured in gpt-4 tokens, embed the
chunks with a HuggingFace sentence-transformer model, and store the result
in a Chroma index persisted under ``db_index``.
"""

import tiktoken
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma

# Resolve the encoding once: the splitter calls tiktoken_len many times, and
# there is no reason to repeat the model -> encoding lookup on every call.
_TOKENIZER = tiktoken.encoding_for_model("gpt-4")


def tiktoken_len(text: str) -> int:
    """Return the number of gpt-4 tokens that *text* encodes to.

    ``disallowed_special=()`` turns off tiktoken's special-token check, so
    text that happens to look like a special token is encoded as ordinary
    text instead of raising.
    """
    return len(_TOKENIZER.encode(text, disallowed_special=()))


# Recursively pick up every PDF below the source directory.
loader = DirectoryLoader(
    "./apple_amazon_intel",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader,
)
documents = loader.load()

# chunk_size and chunk_overlap are counted in gpt-4 tokens (via
# tiktoken_len), not characters.
# NOTE(review): the embedding model used below has its own, much shorter
# input limit (the all-MiniLM-L6-v2 model card describes truncation at 256
# word pieces) -- confirm that 4000-token chunks are intended.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=400,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)
texts = text_splitter.split_documents(documents)

persist_directory = "db_index"
# Original (misspelled) name kept as an alias in case other code refers to it.
persist_direcory = persist_directory

embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

db = Chroma.from_documents(
    texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
db.persist()
print("done")