Spaces:
Sleeping
Sleeping
| # build_municipal_faiss.py | |
| # Updated to incorporate preparation script changes, with added logging and error handling for combined embeddings | |
| import os | |
| import logging | |
| import numpy as np | |
| import faiss | |
| from datasets import load_from_disk | |
# Filesystem locations: the saved embeddings dataset (input) and the
# serialized FAISS index (output).
MUNICIPAL_EMBEDDINGS_PATH = "/data/municipal_embeddings"
MUNICIPAL_FAISS_INDEX_PATH = "/data/municipal_faiss.index"

# Configure root logging at INFO level and create this module's logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def main():
    """Build a flat L2 FAISS index from the saved embeddings dataset.

    Loads the dataset stored at ``MUNICIPAL_EMBEDDINGS_PATH``, stacks its
    ``embedding`` column into a float32 matrix, adds every row to a
    ``faiss.IndexFlatL2`` and writes the index to
    ``MUNICIPAL_FAISS_INDEX_PATH``.

    Returns:
        int: ``0`` when the index was written, ``1`` when any step failed.
        Failures are logged with a traceback instead of being raised, so
        callers that ignore the return value behave as before.
    """
    try:
        logger.info("Starting build_municipal_faiss.py")
        embeddings_dataset = load_from_disk(MUNICIPAL_EMBEDDINGS_PATH)
        record_count = len(embeddings_dataset)
        logger.info(f"Loaded combined embeddings dataset with {record_count} records")
        if record_count == 0:
            raise ValueError("Embeddings dataset is empty. Check prepare_municipal_embeddings.py output.")

        # Extract embeddings (column name from dataset card). Building the
        # array as float32 directly avoids a float64 intermediate copy.
        embeddings = np.asarray(list(embeddings_dataset['embedding']), dtype='float32')

        # The index needs a (n_vectors, dimension) matrix; fail with a clear
        # message instead of an IndexError on shape[1] further down.
        if embeddings.ndim != 2 or embeddings.shape[1] == 0:
            raise ValueError(
                f"Expected a 2-D embeddings matrix, got shape {embeddings.shape}. "
                "Check prepare_municipal_embeddings.py output."
            )

        dimension = embeddings.shape[1]
        logger.info(f"Embeddings dimension: {dimension}")

        index = faiss.IndexFlatL2(dimension)
        index.add(embeddings)
        faiss.write_index(index, MUNICIPAL_FAISS_INDEX_PATH)
        logger.info(f"Built and saved FAISS index to {MUNICIPAL_FAISS_INDEX_PATH}")
    except Exception as e:
        # Top-level boundary: record the traceback, then report failure
        # through the return value rather than letting the error vanish.
        logger.exception(f"Error building municipal FAISS index: {e}")
        return 1
    return 0


if __name__ == "__main__":
    # Propagate the result as the process exit status so shells and CI
    # pipelines can tell a failed build from a successful one.
    raise SystemExit(main())