"""Build a BM25 index over the meme texts stored in the database."""

import os
import sys
from pathlib import Path
from typing import Dict, Any, List

from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker


def get_meme_corpus(db, crud) -> List[str]:
    """
    Retrieve all meme texts from the database.

    Args:
        db: Database session.
        crud: CRUD operations module; must expose ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of every meme returned by
        ``crud.get_all_memes``, in the order they were returned.
    """
    memes = crud.get_all_memes(db)
    corpus = [meme.text for meme in memes]
    logger.info(f"Retrieved {len(corpus)} memes from the database")
    return corpus


def build_bm25_index(
    corpus: List[str],
    config: Dict[str, Any],
    mystem_tokenizer,
    BM25Indexer,
) -> None:
    """
    Build and save the BM25 index.

    Args:
        corpus (List[str]): List of meme texts.
        config (Dict[str, Any]): Configuration dictionary; the output folder
            is read from ``config['index_folders']['bm25']``.
        mystem_tokenizer: MystemTokenizer instance; its ``tokenize`` method is
            handed to the indexer.
        BM25Indexer: BM25Indexer class.
    """
    indexer = BM25Indexer(corpus, mystem_tokenizer.tokenize)

    bm25_index_folder = config['index_folders']['bm25']
    # Make sure the target folder exists before the indexer writes into it.
    os.makedirs(bm25_index_folder, exist_ok=True)

    indexer.create_index(bm25_index_folder)
    logger.info(f"BM25S index created and saved in {bm25_index_folder}")


def main() -> None:
    """
    Load the configuration, read the meme corpus and build the BM25 index.

    The database session is closed and the engine's connection pool is
    disposed of even when corpus retrieval or index building fails.
    """
    # Project imports are deferred to call time: the ``__main__`` guard below
    # puts the project root on ``sys.path`` just before calling ``main()``.
    from src.db import crud
    from src.preprocessing.mystem_tokenizer import MystemTokenizer
    from src.indexing.bm25_indexer import BM25Indexer

    logger.add("logs/build_bm25s_index.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers.
    config = OmegaConf.load('config.yaml')
    config = OmegaConf.to_container(config)

    engine = create_engine(config['database']['url'])
    try:
        SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
        db = SessionLocal()
        try:
            corpus = get_meme_corpus(db, crud)
            mystem_tokenizer = MystemTokenizer()
            build_bm25_index(corpus, config, mystem_tokenizer, BM25Indexer)
        finally:
            db.close()
    finally:
        # Release pooled connections; the engine is not reused after this run.
        engine.dispose()

    logger.info("BM25S index building completed")


if __name__ == "__main__":
    # Set up project root path
    project_root = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(project_root))

    main()