# docs-qachat-demo / localembedding.py
import time

import chromadb
from chromadb.config import Settings
from langchain.vectorstores import Chroma
from langchain.vectorstores.utils import filter_complex_metadata
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Assumed import: the original snippet uses `loader` without ever defining it,
# so a DirectoryLoader is pulled in here as a placeholder.
from langchain.document_loaders import DirectoryLoader

# Split documents into 1000-character chunks with 100 characters of overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
# Stage one: read all the docs and split them into chunks.
# `loader` was undefined in the original; a DirectoryLoader over "docs/" is assumed.
loader = DirectoryLoader("docs/")
st = time.time()
print('Loading documents ...')
docs = loader.load()
chunks = text_splitter.create_documents(
    [doc.page_content for doc in docs],
    metadatas=[doc.metadata for doc in docs],
)
et = time.time() - st
print(f'Time taken: {et:.2f} seconds.')

# Stage two: embed the chunks.
# Use the all-mpnet-base-v2 sentence-transformer to convert the text chunks
# into vectors for the vector store. The "cuda" device assumes a GPU is
# available; change to "cpu" otherwise.
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cuda"}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

# Load the chunks into a persistent Chroma vector store, filtering out
# metadata types Chroma cannot store.
print('Loading chunks into vector store ...')
st = time.time()
db = Chroma.from_documents(filter_complex_metadata(chunks), embeddings, persist_directory="/content/chroma_db")
et = time.time() - st
print(f'Time taken: {et:.2f} seconds.')
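
# Usage sketch (added, not part of the original file): run a quick similarity
# search against the freshly built store to confirm the pipeline works end to
# end. The query string below is a placeholder.
query = "What does this documentation cover?"
results = db.similarity_search(query, k=4)
if results:
    print(f'Top match preview: {results[0].page_content[:200]}')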