# Spaces: Sleeping  (status banner captured along with this script; not code)
import time

import chromadb
import torch
from chromadb.config import Settings
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
# Splitter used to cut loaded documents into chunks. The target chunk size is
# 1000 (length measured with len, i.e. in characters), and chunk_overlap is
# 100 so neighbouring chunks share some context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
# Stage one: read all the docs, split them into chunks.
# NOTE(review): `loader` is not defined in the visible part of this script;
# it has to be created (an object exposing .load()) before this stage runs.
# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments while the documents are loading.
st = time.perf_counter()
print('Loading documents ...')
docs = loader.load()
# split_documents() forwards each document's page_content and metadata to
# create_documents(), so every chunk keeps the metadata of its source doc.
chunks = text_splitter.split_documents(docs)
et = time.perf_counter() - st
print(f'Time taken: {et} seconds.')
# Stage two: embed the chunks and load them into the Chroma vector store.
# The all-mpnet-base-v2 sentence transformer converts each piece of text into
# the vector that is stored in the vector store.
model_name = "sentence-transformers/all-mpnet-base-v2"
# Prefer the GPU, but fall back to the CPU so the script still runs on
# machines without CUDA instead of failing when the model is loaded.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

print('Loading chunks into vector store ...')
# perf_counter() is monotonic, which makes it the right clock for durations.
st = time.perf_counter()
# The chunks are passed through filter_complex_metadata() before being handed
# to Chroma; the store is created with persist_directory="/content/chroma_db".
db = Chroma.from_documents(
    filter_complex_metadata(chunks),
    embeddings,
    persist_directory="/content/chroma_db",
)
et = time.perf_counter() - st
print(f'Time taken: {et} seconds.')