Spaces:
Runtime error
Runtime error
| # ingest.py — works with LangChain v0.2+ | |
| from pathlib import Path | |
| from typing import List | |
| from langchain_community.vectorstores import FAISS | |
| from langchain_text_splitters import RecursiveCharacterTextSplitter | |
| from langchain_community.document_loaders import DirectoryLoader, PyPDFLoader | |
| from langchain_huggingface.embeddings import HuggingFaceEmbeddings | |
| from langchain_openai import OpenAIEmbeddings # optional | |
| class Ingest: | |
| def __init__( | |
| self, | |
| *, | |
| english_embedding_model: str = "sentence-transformers/all-MiniLM-L6-v2", | |
| czech_embedding_model: str = "Seznam/retromae-small-cs", | |
| use_openai_embeddings: bool = False, | |
| openai_embedding_model: str = "text-embedding-3-large", | |
| openai_api_key: str | None = None, | |
| chunk: int = 512, | |
| overlap: int = 256, | |
| english_store: str = "stores/english_512", | |
| czech_store: str = "stores/czech_512", | |
| data_english: str = "data/english", | |
| data_czech: str = "data/czech", | |
| ): | |
| self.english_embedding_model = english_embedding_model | |
| self.czech_embedding_model = czech_embedding_model | |
| self.use_openai_embeddings = use_openai_embeddings | |
| self.openai_embedding_model = openai_embedding_model | |
| self.openai_api_key = openai_api_key | |
| self.chunk = chunk | |
| self.overlap = overlap | |
| self.english_store = Path(english_store) | |
| self.czech_store = Path(czech_store) | |
| self.data_english = Path(data_english) | |
| self.data_czech = Path(data_czech) | |
| # ------------------------------------------------------------------ utils | |
| def _load(folder: Path): | |
| return DirectoryLoader( | |
| str(folder), | |
| recursive=True, | |
| loader_cls=PyPDFLoader, | |
| use_multithreading=True, | |
| show_progress=True, | |
| ).load() | |
| def _split(docs: List, chunk: int, overlap: int): | |
| splitter = RecursiveCharacterTextSplitter(chunk_size=chunk, | |
| chunk_overlap=overlap) | |
| return splitter.split_documents(docs) | |
| # ------------------------------------------------------------------ ENG | |
| def ingest_english(self): | |
| if self.use_openai_embeddings: | |
| if not self.openai_api_key: | |
| raise ValueError("OPENAI_API_KEY missing for OpenAI embeddings.") | |
| embed = OpenAIEmbeddings( | |
| openai_api_key=self.openai_api_key, | |
| model=self.openai_embedding_model, | |
| ) | |
| mode = f"OpenAI {self.openai_embedding_model}" | |
| else: | |
| embed = HuggingFaceEmbeddings( | |
| model_name=self.english_embedding_model, | |
| model_kwargs={"device": "cpu"}, | |
| encode_kwargs={"normalize_embeddings": False}, | |
| ) | |
| mode = f"HuggingFace {self.english_embedding_model}" | |
| print(f"β’ English ingest with {mode}") | |
| texts = self._split(self._load(self.data_english), self.chunk, self.overlap) | |
| FAISS.from_documents(texts, embed).save_local(str(self.english_store)) | |
| print("β English store saved to", self.english_store) | |
| # ------------------------------------------------------------------ CZ | |
| def ingest_czech(self): | |
| embed = HuggingFaceEmbeddings( | |
| model_name=self.czech_embedding_model, | |
| model_kwargs={"device": "cpu"}, | |
| encode_kwargs={"normalize_embeddings": False}, | |
| ) | |
| print(f"β’ Czech ingest with {self.czech_embedding_model}") | |
| texts = self._split(self._load(self.data_czech), self.chunk, self.overlap) | |
| FAISS.from_documents(texts, embed).save_local(str(self.czech_store)) | |
| print("β Czech store saved to", self.czech_store) | |