"""Build the semantic search index over the meme texts stored in the database.

Intended to be run as a script: the ``__main__`` guard puts the project root
(the directory above this file's directory) on ``sys.path`` so that the
``src`` package can be imported by ``main``.
"""

import os
import sys
from pathlib import Path
from typing import Dict, Any, List

from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def get_meme_corpus(db, crud) -> List[str]:
    """
    Retrieve all meme texts from the database.

    Args:
        db: Database session, passed straight to ``crud.get_all_memes``.
        crud: CRUD operations module exposing ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of every returned meme, in the
        order ``crud.get_all_memes`` yields them.
    """
    memes = crud.get_all_memes(db)
    corpus = [meme.text for meme in memes]
    logger.info(f"Retrieved {len(corpus)} memes from the database")
    return corpus


def build_semantic_index(
        corpus: List[str],
        config: Dict[str, Any],
        SemanticIndexer) -> None:
    """
    Build and save the semantic index.

    Args:
        corpus (List[str]): List of meme texts.
        config (Dict[str, Any]): Configuration dictionary. Reads
            ``semantic_search.model``, ``semantic_search.document_prefix``
            and ``index_folders.semantic``.
        SemanticIndexer: SemanticIndexer class; instantiated as
            ``SemanticIndexer(corpus, model=..., prefix=...)`` and then asked
            to ``create_index(folder)``.

    Raises:
        KeyError: If one of the configuration keys listed above is missing.
    """
    semantic_cfg = config['semantic_search']
    model = semantic_cfg['model']
    prefix = semantic_cfg['document_prefix']

    indexer = SemanticIndexer(corpus, model=model, prefix=prefix)

    semantic_index_folder = config['index_folders']['semantic']
    # Make sure the output folder exists before the indexer is pointed at it.
    os.makedirs(semantic_index_folder, exist_ok=True)

    indexer.create_index(semantic_index_folder)
    logger.info(f"Semantic index created and saved in {semantic_index_folder}")


def main() -> None:
    """
    Load the configuration, read the meme corpus and build the semantic index.

    ``config.yaml`` and the ``logs/`` folder are relative paths, so they are
    resolved against the current working directory.
    """
    # Imported here rather than at module level: when this file is run as a
    # script, the project root is only added to sys.path inside the __main__
    # guard, which executes after the module-level imports.
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers.
    config = OmegaConf.load('config.yaml')
    config = OmegaConf.to_container(config)

    engine = create_engine(config['database']['url'])
    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

    try:
        db = SessionLocal()
        try:
            corpus = get_meme_corpus(db, crud)
            build_semantic_index(corpus, config, SemanticIndexer)
        finally:
            db.close()
    finally:
        # Release the engine's connection pool even if indexing failed.
        engine.dispose()

    logger.info("Semantic index building completed")


if __name__ == "__main__":
    # Set up project root path so that `src.*` imports inside main() resolve.
    project_root = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(project_root))

    main()