textmeme_search / scripts /build_semantic_index.py
Futyn-Maker
Deploy the app
7e1f5f6
import os
import sys
from pathlib import Path
from typing import Dict, Any, List
from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def get_meme_corpus(db, crud) -> List[str]:
    """
    Collect the text of every meme stored in the database.

    Args:
        db: Database session, passed through to the CRUD layer.
        crud: CRUD operations module; must provide ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of each meme, in the order the
            CRUD layer returned them.
    """
    texts = [record.text for record in crud.get_all_memes(db)]
    logger.info(f"Retrieved {len(texts)} memes from the database")
    return texts
def build_semantic_index(
        corpus: List[str], config: Dict[str, Any], SemanticIndexer):
    """
    Create a semantic index over the corpus and persist it to disk.

    Args:
        corpus (List[str]): Meme texts to index.
        config (Dict[str, Any]): Configuration dictionary. The keys read
            are ``semantic_search.model``,
            ``semantic_search.document_prefix`` and
            ``index_folders.semantic``.
        SemanticIndexer: SemanticIndexer class. It is instantiated with
            the corpus plus ``model`` and ``prefix`` keyword arguments,
            then asked to ``create_index`` in the target folder.
    """
    search_settings = config['semantic_search']
    indexer = SemanticIndexer(
        corpus,
        model=search_settings['model'],
        prefix=search_settings['document_prefix'],
    )

    # The output folder is created on demand; an existing one is reused.
    output_dir = config['index_folders']['semantic']
    os.makedirs(output_dir, exist_ok=True)

    indexer.create_index(output_dir)
    logger.info(f"Semantic index created and saved in {output_dir}")
def main():
    """
    Build the semantic index from the memes stored in the database.

    Loads ``config.yaml`` (path relative to the current working
    directory), opens a database session for ``database.url``, gathers the
    meme corpus and passes it to the index builder. The session is closed
    whether or not the build succeeds.
    """
    # Project-local imports are resolved at call time; the script entry
    # point extends sys.path before invoking main().
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load configuration and convert it to plain Python containers
    settings = OmegaConf.to_container(OmegaConf.load('config.yaml'))

    engine = create_engine(settings['database']['url'])
    session_factory = sessionmaker(
        autocommit=False, autoflush=False, bind=engine)
    session = session_factory()
    try:
        build_semantic_index(
            get_meme_corpus(session, crud), settings, SemanticIndexer)
    finally:
        session.close()

    logger.info("Semantic index building completed")
if __name__ == "__main__":
    # Put the directory two levels above this file (the project root) at
    # the front of sys.path so that main() can import the `src` package.
    script_path = Path(__file__).resolve()
    sys.path.insert(0, str(script_path.parents[1]))
    main()