from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_community.vectorstores import Chroma
from datasets import load_dataset
from unstructured.cleaners.core import clean_extra_whitespace

# Load every HTML and PDF file from the local mirror of the site
html_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.html", show_progress=True)
pdf_loader = DirectoryLoader('./www.goingnowhere.org', glob="**/*.pdf", show_progress=True)

html_docs = html_loader.load()
pdf_docs = pdf_loader.load()

# Split the documents into overlapping chunks suitable for embedding
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=30,
    length_function=len,
    is_separator_regex=False,
)

texts = []
texts.extend(text_splitter.split_documents(html_docs))
texts.extend(text_splitter.split_documents(pdf_docs))

# Embed each chunk with a local sentence-transformers model
embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

# Build a persistent Chroma vector store from the embedded chunks
db = Chroma.from_documents(texts, embedding_function, persist_directory="./chroma_db")

print("There are", db._collection.count(), "chunks in the collection")