goingnowhere / create_dataset.py
robkaandorp's picture
Add create_dataset.py
3b327ab
raw history blame
No virus
1.26 kB
from langchain_community.document_loaders import DirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from datasets import load_dataset
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings.sentence_transformer import (
SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from unstructured.cleaners.core import clean_extra_whitespace
# Build one DirectoryLoader per file type over the local
# ./www.goingnowhere.org tree. The "**/" glob prefix selects matching files at
# any depth, and show_progress=True is passed to both loaders.
html_loader, pdf_loader = (
    DirectoryLoader("./www.goingnowhere.org", glob=pattern, show_progress=True)
    for pattern in ("**/*.html", "**/*.pdf")
)

# Load the HTML documents first, then the PDF documents.
html_docs, pdf_docs = html_loader.load(), pdf_loader.load()
# Splitter configuration: chunk_size=500 and chunk_overlap=30 are measured with
# the built-in len (i.e. in characters), and separators are treated as literal
# strings rather than regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    is_separator_regex=False,
    length_function=len,
    chunk_overlap=30,
    chunk_size=500,
)

# Collect every chunk in a single list: HTML chunks first, PDF chunks after.
texts = [
    *text_splitter.split_documents(html_docs),
    *text_splitter.split_documents(pdf_docs),
]
# Embedding function backed by the "all-MiniLM-L6-v2" sentence-transformer
# model.
embedding_function = SentenceTransformerEmbeddings(
    model_name="all-MiniLM-L6-v2",
)

# Embed the chunks and store them in a Chroma vector store that uses
# ./chroma_db as its persist directory.
db = Chroma.from_documents(
    documents=texts,
    embedding=embedding_function,
    persist_directory="./chroma_db",
)

# Report how many entries the collection holds. NOTE: `_collection` is a
# private attribute of the Chroma wrapper, so this may break across versions.
print(
    "There are",
    db._collection.count(),
    "in the collection",
)