# api/experiments/embed_pipeline.py
# Author: Chandima Prabhath
# Refactor code structure for improved readability and maintainability (commit 10b392a)
from dotenv import load_dotenv
load_dotenv()
import os
from glob import glob
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document
# ——— CONFIG ———
DOCS_FOLDER = "docs/" # source folder scanned for documents (.txt, .md, etc.)
OLLAMA_URL = os.getenv("OLLAMA_SERVER")  # Ollama server base URL, read from env (.env via load_dotenv)
EMBED_MODEL = "nomic-embed-text:latest"  # Ollama embedding model name
PERSIST_DIR = "chroma_db/" # on-disk Chroma store directory
CHUNK_SIZE = 2000  # characters per chunk for the recursive splitter
CHUNK_OVERLAP = 10  # characters of overlap between consecutive chunks
# ——————————
def embed_all_docs():
    """Load every file in DOCS_FOLDER, split it into chunks, and persist
    their embeddings to an on-disk Chroma collection ("my_docs").

    Files that fail UTF-8/autodetect decoding are re-read leniently with
    undecodable bytes dropped, so a single bad file never aborts the run.

    Returns:
        None. Side effects: writes to PERSIST_DIR and prints progress.
    """
    # The splitter is pure configuration and identical for every file,
    # so build it once instead of once per loop iteration.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    all_chunks = []
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))
    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
        except UnicodeDecodeError:
            # 2) If that still fails, fall back to a lenient read that
            # silently drops undecodable bytes rather than crashing.
            print(f"⚠️ Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split into chunks
        chunks = splitter.split_documents(docs)
        print(f"→ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    # Guard the empty case: Chroma.add_documents rejects an empty list,
    # and spinning up the embedder for nothing is wasted work anyway.
    if not all_chunks:
        print(f"⚠️ No documents found in '{DOCS_FOLDER}'; nothing to embed")
        return

    # 4) Embed & persist on-disk Chroma
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs"
    )
    vectordb.add_documents(all_chunks)
    print(f"✅ Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")
# Script entry point: run the full embed-and-persist pipeline when executed
# directly (not when imported as a module).
if __name__ == "__main__":
    embed_all_docs()