# Repository page header (not code): VerdictAI / build_municipal_faiss.py
# Avatar: brandonmusic
# Commit message: Update build_municipal_faiss.py
# Commit: c57b827 (verified)
# build_municipal_faiss.py
# Updated to incorporate preparation script changes, with added logging and error handling for combined embeddings
import os
import logging
import numpy as np
import faiss
from datasets import load_from_disk
# Input: on-disk dataset directory read with datasets.load_from_disk().
MUNICIPAL_EMBEDDINGS_PATH = "/data/municipal_embeddings"
# Output: file the finished FAISS index is written to.
MUNICIPAL_FAISS_INDEX_PATH = "/data/municipal_faiss.index"

# Configure root logging at INFO and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main():
    """Build a flat L2 FAISS index from the saved municipal embeddings.

    Loads the dataset stored at ``MUNICIPAL_EMBEDDINGS_PATH``, converts its
    ``embedding`` column into a 2-D float32 matrix, adds every row to a
    ``faiss.IndexFlatL2`` and writes the index to
    ``MUNICIPAL_FAISS_INDEX_PATH``.

    Any failure is logged with its traceback and is not raised to the
    caller; it is reported through the return value instead.

    Returns:
        int: 0 if the index was written, 1 if any step failed.
    """
    try:
        logger.info("Starting build_municipal_faiss.py")
        embeddings_dataset = load_from_disk(MUNICIPAL_EMBEDDINGS_PATH)
        record_count = len(embeddings_dataset)
        logger.info(f"Loaded combined embeddings dataset with {record_count} records")
        if record_count == 0:
            raise ValueError("Embeddings dataset is empty. Check prepare_municipal_embeddings.py output.")

        # Extract embeddings (column name from dataset card). Converting
        # directly to float32 avoids first building an array in NumPy's
        # default dtype and then copying it with astype().
        embeddings = np.asarray(list(embeddings_dataset['embedding']), dtype=np.float32)

        # Fail with a clear message instead of an IndexError on shape[1]
        # when the column does not form a (records x dimension) matrix.
        if embeddings.ndim != 2 or embeddings.shape[1] == 0:
            raise ValueError(
                "Expected a 2-D (records x dimension) embedding matrix, "
                f"got shape {embeddings.shape}."
            )

        dimension = embeddings.shape[1]
        logger.info(f"Embeddings dimension: {dimension}")

        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        faiss.write_index(index, MUNICIPAL_FAISS_INDEX_PATH)
        logger.info(f"Built and saved FAISS index to {MUNICIPAL_FAISS_INDEX_PATH}")
    except Exception as e:
        # Top-level boundary of the script: log message plus traceback, then
        # report failure through the return value.
        logger.exception("Error building municipal FAISS index: %s", e)
        return 1
    return 0


if __name__ == "__main__":
    # Propagate failure to the shell so a calling pipeline does not treat a
    # failed build as success.
    raise SystemExit(main())