Spaces:
Sleeping
Sleeping
| import pytest | |
| from mediastorm.vectorize.embedder import Embedder | |
@pytest.fixture
def embedder():
    """Provide an ``Embedder`` constructed with its default arguments.

    Registered as a pytest fixture so that test functions declaring an
    ``embedder`` parameter receive this object; without the decorator
    pytest cannot resolve that parameter.
    """
    return Embedder()
def test_embed_query_returns_384_dimensions(embedder):
    """A query embedding must contain exactly 384 components."""
    embedding = embedder.embed_query("documentary about war")
    assert len(embedding) == 384
def test_embed_query_is_normalized(embedder):
    """The Euclidean (L2) length of a query embedding must be about 1."""
    embedding = embedder.embed_query("human rights in Africa")
    # Length = square root of the summed squared components.
    squared_total = sum(component ** 2 for component in embedding)
    length = squared_total ** 0.5
    assert length == pytest.approx(1.0, abs=1e-4)
def test_embed_query_repeated_calls_return_identical_results(embedder):
    """Embedding the same text twice must yield equal vectors."""
    query = "award winning photography"
    first_result = embedder.embed_query(query)
    second_result = embedder.embed_query(query)
    assert first_result == second_result
def test_embed_query_cache_avoids_recomputation(embedder):
    """Two identical queries must reach ``embed_texts`` only once."""
    # Begin from an empty cache so earlier lookups cannot satisfy this query.
    embedder._embed_query_cached.cache_clear()

    real_embed_texts = embedder.embed_texts
    recorded_calls = []

    def spy_embed_texts(texts):
        # Log the call, then delegate to the real implementation.
        recorded_calls.append(texts)
        return real_embed_texts(texts)

    embedder.embed_texts = spy_embed_texts
    try:
        for _ in range(2):
            embedder.embed_query("cache test query")
    finally:
        # Always put the real method back, even if embedding raised.
        embedder.embed_texts = real_embed_texts

    call_count = len(recorded_calls)
    assert call_count == 1, f"embed_texts called {call_count} times, expected 1"
def test_embed_query_different_texts_produce_different_vectors(embedder):
    """Two distinct queries must not map to the same embedding."""
    war_embedding = embedder.embed_query("war documentary")
    cooking_embedding = embedder.embed_query("cooking show")
    assert war_embedding != cooking_embedding
def test_embed_query_matches_embed_texts_output(embedder):
    """``embed_query(text)`` must agree with ``embed_texts([text])[0]``."""
    query = "journalism and press freedom"
    # The reference comes from embed_texts directly, so it is not taken
    # from embed_query's cache.
    reference = embedder.embed_texts([query])[0]
    actual = embedder.embed_query(query)
    assert actual == pytest.approx(reference, abs=1e-6)