import tiktoken
from langchain.document_loaders import DirectoryLoader, UnstructuredPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
# Recursively collect every PDF under ./apple_amazon_intel and parse each one
# with UnstructuredPDFLoader into LangChain documents.
loader = DirectoryLoader(
    "./apple_amazon_intel",
    glob="**/*.pdf",
    loader_cls=UnstructuredPDFLoader,
)
documents = loader.load()

# Alternative loader kept for reference (PyPDFLoader is not imported in this
# file and would have to be imported before enabling these lines):
# loader = DirectoryLoader("./data/", glob="**/*.pdf", loader_cls=PyPDFLoader)
# documents = loader.load()
# print(documents)
def tiktoken_len(text: str) -> int:
    """Return the number of tokens in *text* using the gpt-4 tokenizer.

    Intended as a ``length_function`` for a text splitter, so that chunk
    sizes are measured in tokens rather than characters.

    Args:
        text: The string to measure.

    Returns:
        The count of tokens produced by the encoding that tiktoken
        associates with the "gpt-4" model.
    """
    tokenizer = tiktoken.encoding_for_model("gpt-4")
    # disallowed_special=() turns off tiktoken's check that raises when the
    # input contains special-token text (e.g. "<|endoftext|>"); such text is
    # then encoded as ordinary characters instead of aborting the run.
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)
# Split the loaded documents into overlapping chunks. Sizes are measured with
# tiktoken_len, so chunk_size and chunk_overlap are token counts, not
# character counts. Separators are tried in order: paragraph, line, word, and
# finally single characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=4000,
    chunk_overlap=400,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)
texts = text_splitter.split_documents(documents)

# Directory in which Chroma stores the index on disk.
persist_directory = "db_index"
# Misspelled original name, kept as an alias in case other code reads it.
persist_direcory = persist_directory

# embeddings = OpenAIEmbeddings()
# NOTE(review): the all-MiniLM-L6-v2 model card says inputs longer than 256
# word pieces are truncated, so with 4000-token chunks the embedding probably
# reflects only the start of each chunk. Confirm whether this is intended or
# whether chunk_size should be reduced.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Embed every chunk and store the vectors in a Chroma collection backed by
# persist_directory.
db = Chroma.from_documents(
    texts,
    embedding=embeddings,
    persist_directory=persist_directory,
)
# Explicitly write the collection to disk.
# NOTE(review): newer Chroma releases may persist automatically; verify
# against the installed version before removing this call.
db.persist()
print("done")