Spaces:
Sleeping
Sleeping
| """Tests for EmbeddingEngine — TASK-001.""" | |
| import pytest | |
| import numpy as np | |
| from apohara_context_forge.embeddings.embedding_engine import EmbeddingEngine | |
| faiss_spec = __import__('importlib').util.find_spec | |
| pytestmark = pytest.mark.skipif( | |
| not faiss_spec('onnxruntime'), | |
| reason="onnxruntime not installed — GPU/DevCloud environment required" | |
| ) | |
| async def engine(): | |
| """Get EmbeddingEngine singleton.""" | |
| return await EmbeddingEngine.get_instance(dim=512, use_onnx=False) | |
| class TestEmbeddingEngine: | |
| """Tests for EmbeddingEngine core functionality.""" | |
| async def test_get_instance_returns_singleton(self, engine): | |
| """get_instance() returns the same instance on repeated calls.""" | |
| engine2 = await EmbeddingEngine.get_instance() | |
| assert engine is engine2 | |
| async def test_encode_returns_normalized_vector(self, engine): | |
| """encode() returns L2-normalized embedding.""" | |
| embedding = await engine.encode("test prompt") | |
| assert isinstance(embedding, np.ndarray) | |
| assert embedding.shape[0] == 512 # dim=512 | |
| norm = np.linalg.norm(embedding) | |
| assert abs(norm - 1.0) < 1e-6 | |
| async def test_encode_batch_returns_list(self, engine): | |
| """encode_batch() returns list of embeddings.""" | |
| texts = ["prompt one", "prompt two", "prompt three"] | |
| embeddings = await engine.encode_batch(texts) | |
| assert isinstance(embeddings, list) | |
| assert len(embeddings) == 3 | |
| for emb in embeddings: | |
| assert isinstance(emb, np.ndarray) | |
| assert emb.shape[0] == 512 | |
| async def test_simhash_returns_int(self, engine): | |
| """simhash() returns 64-bit integer.""" | |
| token_ids = [101, 2003, 1996, 3007, 102] | |
| h = await engine.simhash(token_ids) | |
| assert isinstance(h, int) | |
| assert h >= 0 | |
| async def test_simhash_deterministic(self, engine): | |
| """simhash() is deterministic for same input.""" | |
| token_ids = [101, 2003, 1996, 3007, 102] | |
| h1 = await engine.simhash(token_ids) | |
| h2 = await engine.simhash(token_ids) | |
| assert h1 == h2 | |
| async def test_simhash_different_for_different_inputs(self, engine): | |
| """simhash() returns different values for different token sequences.""" | |
| h1 = await engine.simhash([101, 2003, 1996]) | |
| h2 = await engine.simhash([101, 3007, 102]) | |
| assert h1 != h2 | |
| async def test_encode_caching(self, engine): | |
| """Identical text returns cached embedding.""" | |
| text = "shared system prompt" | |
| e1 = await engine.encode(text) | |
| e2 = await engine.encode(text) | |
| assert np.allclose(e1, e2) |