|
|
import faiss |
|
|
import numpy as np |
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def build_faiss_index(embeddings):
    """
    Build a FAISS inner-product index over L2-normalized embeddings.

    Every row is scaled to unit length before it is added, so the
    inner-product scores produced by the index are cosine similarities.

    Args:
        embeddings (list | np.ndarray): A single embedding vector (1D) or a
            batch of equally sized vectors (2D). Values are converted to
            float32; the caller's data is never modified.

    Returns:
        faiss.IndexFlatIP: Index containing one entry per input row.

    Raises:
        ValueError: If ``embeddings`` is None, empty, or not 1D/2D.
        TypeError: If ``embeddings`` is neither a list nor a numpy array.
    """
    if embeddings is None:
        raise ValueError("β No embeddings provided to build FAISS index.")

    if not isinstance(embeddings, (list, np.ndarray)):
        raise TypeError(f"β Unexpected embeddings type: {type(embeddings)}")

    # np.array copies by default, so the in-place normalization below cannot
    # leak into the caller's array; order="C" yields the contiguous float32
    # buffer that the FAISS bindings expect.
    vectors = np.array(embeddings, dtype="float32", order="C")

    if vectors.size == 0:
        raise ValueError("β Empty embeddings array provided.")

    # Treat a lone vector (list or array) as a batch of one.
    if vectors.ndim == 1:
        vectors = vectors.reshape(1, -1)

    # Reject scalars and 3D+ input before handing anything to FAISS.
    if vectors.ndim != 2:
        raise ValueError(
            f"Embeddings must be 1D or 2D, got {vectors.ndim}D input "
            f"with shape {vectors.shape}."
        )

    # Unit-length rows turn inner product into cosine similarity.
    faiss.normalize_L2(vectors)

    dim = vectors.shape[1]
    index = faiss.IndexFlatIP(dim)
    index.add(vectors)

    print(f"β FAISS index built successfully β {index.ntotal} vectors | dim={dim} | cosine similarity mode.")
    return index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def search_faiss(query_embedding, index, chunks, top_k=3):
    """
    Search a FAISS index for the chunks most similar to a query embedding.

    The query is L2-normalized before the search so that inner-product
    scores are cosine similarities. When a 2D batch of queries is passed,
    only the neighbours of the first row are returned.

    Args:
        query_embedding (np.ndarray | list): Query vector (1D or 2D).
        index (faiss.IndexFlatIP): Built FAISS index.
        chunks (list[str]): Original document chunks, positionally aligned
            with the vectors stored in ``index``.
        top_k (int): Number of results to return; must be at least 1.

    Returns:
        list[str]: Top-matching chunks, best match first. May contain fewer
        than ``top_k`` items when a returned position has no matching chunk.

    Raises:
        ValueError: If the index is missing or empty, ``top_k`` is below 1,
            the query is not 1D/2D, or its dimension differs from the index.
    """
    if index is None or index.ntotal == 0:
        raise ValueError("β FAISS index is empty or not initialized.")

    if top_k < 1:
        raise ValueError(f"top_k must be a positive integer, got {top_k}.")

    # Copy into a C-contiguous float32 array: accepts plain lists as well as
    # arrays, and keeps the in-place normalization away from the caller's data.
    query_vector = np.array(query_embedding, dtype="float32", order="C")
    if query_vector.ndim == 1:
        query_vector = query_vector.reshape(1, -1)
    if query_vector.ndim != 2:
        raise ValueError(
            f"Query embedding must be 1D or 2D, got {query_vector.ndim}D input "
            f"with shape {query_vector.shape}."
        )

    # Validate the dimension before doing any FAISS work on the query.
    if query_vector.shape[1] != index.d:
        raise ValueError(
            f"β Embedding dimension mismatch: query={query_vector.shape[1]}, index={index.d}. "
            "Rebuild FAISS index with embeddings from the same model."
        )

    faiss.normalize_L2(query_vector)

    # Similarity scores are not part of the return value, so they are dropped.
    _, indices = index.search(query_vector, top_k)

    # FAISS reports missing neighbours as -1; the bounds check also skips
    # positions that have no corresponding entry in `chunks`.
    results = [chunks[idx] for idx in indices[0] if 0 <= idx < len(chunks)]

    print(f"π FAISS search completed β retrieved {len(results)} chunks (top_k={top_k})")
    return results
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def save_faiss_index(index, path="faiss_index.bin"):
    """
    Save a FAISS index to disk.

    Args:
        index: Built FAISS index to serialize.
        path (str | os.PathLike): Destination file. Defaults to
            ``faiss_index.bin`` in the current working directory.

    Raises:
        ValueError: If ``index`` is None.
    """
    if index is None:
        raise ValueError("Cannot save FAISS index: index is not initialized.")

    # Hand FAISS a plain string filename; pathlib-style objects are unwrapped
    # first because the binding is not expected to accept them directly.
    target = os.fspath(path) if isinstance(path, os.PathLike) else path
    faiss.write_index(index, target)
    print(f"πΎ FAISS index saved to {path}")
|
|
|
|
|
|
|
|
def load_faiss_index(path="faiss_index.bin"):
    """
    Load a FAISS index from disk.

    Args:
        path (str | os.PathLike): File previously written with
            ``faiss.write_index``. Defaults to ``faiss_index.bin``.

    Returns:
        The deserialized FAISS index.

    Raises:
        FileNotFoundError: If ``path`` does not point to an existing regular
            file (a missing path or a directory).
    """
    # isfile rather than exists: a directory at `path` is not a usable index
    # and should be reported the same way as a missing file.
    if not os.path.isfile(path):
        raise FileNotFoundError(f"β No FAISS index found at {path}")

    # Hand FAISS a plain string filename; unwrap pathlib-style objects first.
    source = os.fspath(path) if isinstance(path, os.PathLike) else path
    index = faiss.read_index(source)
    print(f"π FAISS index loaded from {path}")
    return index
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Smoke test: index three toy vectors, then fetch the two chunks closest
    # to a query that points in roughly the same direction as the first one.
    demo_chunks = ["Chunk A", "Chunk B", "Chunk C"]
    demo_vectors = np.array(
        [
            [0.1, 0.2, 0.3],
            [0.2, 0.1, 0.4],
            [0.9, 0.8, 0.7],
        ],
        dtype="float32",
    )
    demo_query = np.array([0.15, 0.18, 0.35], dtype="float32")

    demo_index = build_faiss_index(demo_vectors)
    top_chunks = search_faiss(demo_query, demo_index, demo_chunks, top_k=2)

    print("π Top Results:")
    for chunk in top_chunks:
        print("-", chunk)
|
|
|