# Teapack1's picture
# Update ingest.py
# cd7b78b verified
# ingest.py – works with LangChain v0.2+
from pathlib import Path
from typing import List
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_openai import OpenAIEmbeddings # optional
class Ingest:
def __init__(
self,
*,
english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2",
czech_embedding_model: str = "Seznam/retromae-small-cs",
use_openai_embeddings: bool = False,
openai_embedding_model: str = "text-embedding-3-large",
openai_api_key: str | None = None,
chunk: int = 512,
overlap: int = 256,
english_store: str = "stores/english_512",
czech_store: str = "stores/czech_512",
data_english: str = "data/english",
data_czech: str = "data/czech",
):
self.english_embedding_model = english_embedding_model
self.czech_embedding_model = czech_embedding_model
self.use_openai_embeddings = use_openai_embeddings
self.openai_embedding_model = openai_embedding_model
self.openai_api_key = openai_api_key
self.chunk = chunk
self.overlap = overlap
self.english_store = Path(english_store)
self.czech_store = Path(czech_store)
self.data_english = Path(data_english)
self.data_czech = Path(data_czech)
# ------------------------------------------------------------------ utils
@staticmethod
def _load(folder: Path):
return DirectoryLoader(
str(folder),
recursive=True,
loader_cls=PyPDFLoader,
use_multithreading=True,
show_progress=True,
).load()
@staticmethod
def _split(docs: List, chunk: int, overlap: int):
splitter = RecursiveCharacterTextSplitter(chunk_size=chunk,
chunk_overlap=overlap)
return splitter.split_documents(docs)
# ------------------------------------------------------------------ ENG
def ingest_english(self):
if self.use_openai_embeddings:
if not self.openai_api_key:
raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.")
embed = OpenAIEmbeddings(
openai_api_key=self.openai_api_key,
model=self.openai_embedding_model,
)
mode = f"OpenAI {self.openai_embedding_model}"
else:
embed = HuggingFaceEmbeddings(
model_name=self.english_embedding_model,
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": False},
)
mode = f"HuggingFace {self.english_embedding_model}"
print(f"β€’ English ingest with {mode}")
texts = self._split(self._load(self.data_english), self.chunk, self.overlap)
FAISS.from_documents(texts, embed).save_local(str(self.english_store))
print("βœ“ English store saved to", self.english_store)
# ------------------------------------------------------------------ CZ
def ingest_czech(self):
embed = HuggingFaceEmbeddings(
model_name=self.czech_embedding_model,
model_kwargs={"device": "cpu"},
encode_kwargs={"normalize_embeddings": False},
)
print(f"β€’ Czech ingest with {self.czech_embedding_model}")
texts = self._split(self._load(self.data_czech), self.chunk, self.overlap)
FAISS.from_documents(texts, embed).save_local(str(self.czech_store))
print("βœ“ Czech store saved to", self.czech_store)