robkaandorp
/

goingnowhere

Model card Files Files and versions Community

goingnowhere / create_dataset.py

robkaandorp's picture

Add create_dataset.py

3b327ab 2 months ago

raw history blame

No virus

1.26 kB

	from langchain_community.document_loaders import DirectoryLoader
	from langchain_text_splitters import RecursiveCharacterTextSplitter
	from datasets import load_dataset
	from langchain_community.document_loaders import TextLoader
	from langchain_community.embeddings.sentence_transformer import (
	SentenceTransformerEmbeddings,
	)
	from langchain_community.vectorstores import Chroma
	from unstructured.cleaners.core import clean_extra_whitespace

	# Load every HTML and PDF file found anywhere under the mirrored site directory.
	# The globs must be recursive ("**/*.ext"): the previous "*/.html" / "*/.pdf"
	# patterns only matched files literally named ".html" / ".pdf" sitting exactly
	# one directory below the root, so virtually no documents were picked up.
	html_loader = DirectoryLoader(
		'./www.goingnowhere.org',
		glob="**/*.html",
		show_progress=True,
	)
	pdf_loader = DirectoryLoader(
		'./www.goingnowhere.org',
		glob="**/*.pdf",
		show_progress=True,
	)

	# Read the matched files into document objects (one list per file type).
	html_docs = html_loader.load()
	pdf_docs = pdf_loader.load()

	# Splitter configured for 500-character chunks with a 30-character overlap;
	# chunk length is measured with len() and separators are treated as plain
	# strings rather than regular expressions.
	text_splitter = RecursiveCharacterTextSplitter(
		chunk_size=500,
		chunk_overlap=30,
		length_function=len,
		is_separator_regex=False,
	)

	# Split the HTML documents first, then the PDF documents, and gather all
	# resulting chunks into one flat list in that order.
	texts = [
		chunk
		for doc_group in (html_docs, pdf_docs)
		for chunk in text_splitter.split_documents(doc_group)
	]

	# Embedding function backed by the open-source "all-MiniLM-L6-v2" model.
	embedding_function = SentenceTransformerEmbeddings(
		model_name="all-MiniLM-L6-v2",
	)

	# Build the Chroma vector store from the chunks, using ./chroma_db as its
	# persist directory.
	db = Chroma.from_documents(
		documents=texts,
		embedding=embedding_function,
		persist_directory="./chroma_db",
	)

	# Report how many entries the store's collection holds.
	# NOTE(review): _collection is an underscore-prefixed (non-public) attribute
	# of the Chroma wrapper and may change between library versions.
	print("There are", db._collection.count(), "in the collection")