File size: 2,108 Bytes
10b392a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
from dotenv import load_dotenv
load_dotenv()
import os
from glob import glob

from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_chroma import Chroma
from langchain.schema import Document

# β€”β€”β€” CONFIG β€”β€”β€”
DOCS_FOLDER   = "docs/"         # folder with .txt, .md, etc.
OLLAMA_URL    = os.getenv("OLLAMA_SERVER")  # Ollama base URL; None if OLLAMA_SERVER is unset
EMBED_MODEL   = "nomic-embed-text:latest"   # embedding model name served by Ollama
PERSIST_DIR   = "chroma_db/"    # on-disk Chroma store
CHUNK_SIZE    = 2000            # max characters per chunk
CHUNK_OVERLAP = 10              # characters shared between adjacent chunks
# β€”β€”β€”β€”β€”β€”β€”β€”β€”β€”

def embed_all_docs():
    """Load every file in DOCS_FOLDER, split it into chunks, embed the chunks
    via Ollama, and persist them to the on-disk Chroma collection "my_docs".

    Files that cannot be decoded are re-read leniently (errors="ignore") so a
    single bad file never aborts the whole ingest. Prints per-file progress
    and a final summary; returns None.
    """
    # The splitter is pure configuration — build it once, not once per file.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )

    all_chunks = []
    files = glob(os.path.join(DOCS_FOLDER, "*.*"))
    if not files:
        # Nothing to do — avoid creating an embedder/DB connection for no work.
        print(f"⚠️  No files matched in '{DOCS_FOLDER}' — nothing to embed")
        return

    for path in files:
        try:
            # 1) Try loading with UTF-8 + autodetect fallback
            loader = TextLoader(
                path,
                encoding="utf-8",
                autodetect_encoding=True
            )
            docs = loader.load()
        except (UnicodeDecodeError, RuntimeError):
            # 2) With autodetect_encoding=True, TextLoader swallows the
            #    UnicodeDecodeError and raises RuntimeError when detection
            #    also fails — catch both so this fallback actually runs.
            print(f"⚠️  Decoding error on {path}, falling back to ignore-errors mode")
            with open(path, "r", encoding="utf-8", errors="ignore") as f:
                text = f.read()
            docs = [Document(page_content=text, metadata={"source": path})]

        # 3) Split into chunks
        chunks = splitter.split_documents(docs)
        print(f"β†’ {len(chunks)} chunks from {os.path.basename(path)}")
        all_chunks.extend(chunks)

    if not all_chunks:
        # chromadb rejects empty embedding batches — bail out explicitly.
        print("⚠️  All files produced zero chunks — nothing to persist")
        return

    # 4) Embed & persist on-disk Chroma (langchain-chroma persists automatically
    #    when persist_directory is set)
    embedder = OllamaEmbeddings(base_url=OLLAMA_URL, model=EMBED_MODEL)
    vectordb = Chroma(
        embedding_function=embedder,
        persist_directory=PERSIST_DIR,
        collection_name="my_docs"
    )
    vectordb.add_documents(all_chunks)
    print(f"βœ… Persisted {len(all_chunks)} chunks to '{PERSIST_DIR}'")

if __name__ == "__main__":
    # Run the full ingest pipeline when executed as a script.
    embed_all_docs()