# File size: 1,262 Bytes
# 3b327ab
# (file-viewer line-number gutter: 1-34)
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datasets import load_dataset
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from unstructured.cleaners.core import clean_extra_whitespace
# Build a persistent Chroma vector index from a local mirror of the site:
# load every HTML and PDF file, split the documents into small overlapping
# chunks, embed the chunks and store them under ./chroma_db.

# Root directory of the local site mirror; both loaders walk it recursively.
site_dir = './www.goingnowhere.org'

# One loader per file type, selected by a recursive glob pattern.
html_loader = DirectoryLoader(site_dir, glob="**/*.html", show_progress=True)
pdf_loader = DirectoryLoader(site_dir, glob="**/*.pdf", show_progress=True)
html_docs = html_loader.load()
pdf_docs = pdf_loader.load()

# Chunks are at most 500 units long as measured by len() (i.e. characters),
# with a 30-character overlap between neighbouring chunks. Separators are
# treated as plain strings, not regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)

# Split everything in one pass; HTML chunks come first, then PDF chunks,
# which is the same order the two separate extend() calls produced.
texts = text_splitter.split_documents(html_docs + pdf_docs)

# create the open-source embedding function
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma
# NOTE(review): when ./chroma_db already exists, this presumably adds to the
# stored collection instead of replacing it, so re-running the script may
# duplicate chunks — confirm, and clear the directory first if that matters.
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")

# _collection is a private attribute of the Chroma wrapper; it is used here
# only to report how many entries the collection holds.
print("There are", db._collection.count(), "in the collection")