"""Ingest the goingnowhere.org site mirror into a persistent Chroma vector store.

Loads every HTML and PDF file under ./www.goingnowhere.org, splits the
documents into overlapping chunks, embeds them with a local
SentenceTransformer model, and persists the result to ./chroma_db.
"""

from datasets import load_dataset
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from unstructured.cleaners.core import clean_extra_whitespace

# Root of the crawled site mirror; both loaders walk it recursively.
SITE_DIR = "./www.goingnowhere.org"

# Chunking: 500 chars with 30-char overlap so sentences spanning a chunk
# boundary are not lost to retrieval.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)


def _load_and_split(glob_pattern):
    """Load all files under SITE_DIR matching *glob_pattern* and split them."""
    loader = DirectoryLoader(SITE_DIR, glob=glob_pattern, show_progress=True)
    return text_splitter.split_documents(loader.load())


texts = []
texts.extend(_load_and_split("**/*.html"))
texts.extend(_load_and_split("**/*.pdf"))

# create the open-source embedding function (runs locally, no API key needed)
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# load it into Chroma, persisted on disk so later runs can reuse the index
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")

# NOTE(review): _collection is a private attribute of the Chroma wrapper;
# there is no public count API in this langchain_community version.
print("There are", db._collection.count(), "documents in the collection")