# Scraped page-status residue ("Spaces: Sleeping"); not part of the script.
import os
import sys
from pathlib import Path
from typing import Dict, Any, List

from loguru import logger
from omegaconf import OmegaConf
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker
def get_meme_corpus(db, crud) -> List[str]:
    """
    Retrieve all meme texts from the database.

    Args:
        db: Database session; passed unchanged to ``crud.get_all_memes``.
        crud: CRUD operations module; must expose ``get_all_memes(db)``.

    Returns:
        List[str]: The ``text`` attribute of every meme returned by
        ``crud.get_all_memes``, in the order they were returned.
    """
    memes = crud.get_all_memes(db)
    # Keep one entry per meme, in retrieval order.
    corpus = [meme.text for meme in memes]
    logger.info(f"Retrieved {len(corpus)} memes from the database")
    return corpus
def build_semantic_index(
    corpus: List[str], config: Dict[str, Any], SemanticIndexer
) -> None:
    """
    Build and save the semantic index.

    Reads ``config['semantic_search']['model']``,
    ``config['semantic_search']['document_prefix']`` and
    ``config['index_folders']['semantic']``; a missing key raises
    ``KeyError``.

    Args:
        corpus (List[str]): List of meme texts.
        config (Dict[str, Any]): Configuration dictionary.
        SemanticIndexer: SemanticIndexer class. It is instantiated as
            ``SemanticIndexer(corpus, model=..., prefix=...)`` and its
            ``create_index(folder)`` method is then called.
    """
    model = config['semantic_search']['model']
    prefix = config['semantic_search']['document_prefix']
    indexer = SemanticIndexer(corpus, model=model, prefix=prefix)

    semantic_index_folder = config['index_folders']['semantic']
    # Make sure the target folder exists before the indexer is asked to
    # write into it; exist_ok keeps reruns from failing.
    os.makedirs(semantic_index_folder, exist_ok=True)
    indexer.create_index(semantic_index_folder)
    logger.info(f"Semantic index created and saved in {semantic_index_folder}")
def main():
    """
    Build the semantic index for every meme stored in the database.

    Loads ``config.yaml`` from the current working directory, opens a
    database session against ``config['database']['url']``, reads the meme
    corpus and hands it to ``build_semantic_index``. The session is closed
    and the engine disposed whether or not indexing succeeds; the final
    completion message is only logged on success.
    """
    # Project-local imports are deferred to call time; presumably this is
    # because the __main__ guard adds the project root to sys.path just
    # before calling main() — confirm before moving them to module level.
    from src.db import crud
    from src.indexing.semantic_indexer import SemanticIndexer

    logger.add("logs/build_semantic_index.log", rotation="10 MB")

    # Load configuration
    config = OmegaConf.load('config.yaml')
    # Convert to plain Python containers so it can be indexed like a dict.
    config = OmegaConf.to_container(config)

    engine = create_engine(config['database']['url'])
    SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    db = SessionLocal()
    try:
        corpus = get_meme_corpus(db, crud)
        build_semantic_index(corpus, config, SemanticIndexer)
    finally:
        db.close()
        # Release the engine's pooled connections as well as the session.
        engine.dispose()
    logger.info("Semantic index building completed")
if __name__ == "__main__":
    # Set up project root path: the directory one level above the folder
    # containing this script. It is put first on sys.path before main()
    # runs, since main() imports from the ``src`` package.
    project_root = Path(__file__).resolve().parents[1]
    sys.path.insert(0, str(project_root))
    main()