"""Ingest the goingnowhere.org site mirror into a persistent Chroma vector store.

Loads every HTML and PDF file under ./www.goingnowhere.org, splits the
documents into overlapping chunks, embeds them with a local
SentenceTransformer model, and persists the result to ./chroma_db.
"""

from datasets import load_dataset
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.cleaners.core import clean_extra_whitespace

# Root of the crawled site mirror; both loaders walk it recursively.
SITE_DIR = "./www.goingnowhere.org"

# Chunking: 500 chars with 30-char overlap so sentences spanning a chunk
# boundary are not lost to retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)


def _load_and_split(glob_pattern):
    """Load all files under SITE_DIR matching *glob_pattern* and split them."""
    loader = DirectoryLoader(SITE_DIR, glob=glob_pattern, show_progress=True)
    return text_splitter.split_documents(loader.load())


texts = []
texts.extend(_load_and_split("**/*.html"))
texts.extend(_load_and_split("**/*.pdf"))

# create the open-source embedding function (runs locally, no API key needed)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma, persisted on disk so later runs can reuse the index
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")

# NOTE(review): _collection is a private attribute of the Chroma wrapper;
# there is no public count API in this langchain_community version.
print("There are", db._collection.count(), "documents in the collection")