davidheineman
/

colbert-acl

Model card Files Files and versions Community

davidheineman committed on Apr 26

Commit

7f8aaec

•

1 Parent(s): b8d9cff

implement colbert passthrough

Browse files

Files changed (1) hide show

search.py +28 -24

search.py CHANGED Viewed

@@ -140,26 +140,30 @@ def _calculate_colbert(Q: torch.Tensor, unfiltered_pids: torch.Tensor = None):
     Multi-stage ColBERT pipeline. Implemented using the PLAID engine, see fig. 5:
     https://arxiv.org/pdf/2205.09707.pdf#page=5
     """
-    # Stage 1 (Initial Candidate Generation): Find the closest candidates to the Q centroid score
-    _, centroid_scores = get_candidates(Q, ivf)
-    print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
-    # print(centroid_scores.shape)  # (num_questions, 32, hidden_dim)
-    # print(unfiltered_pids.shape)  # (num_passage_candidates)
-    # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
-    idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
-    # pids = filter_pids(
-    #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
-    # )
-    # C++ : Filter pids under the centroid score threshold
-    pids_true = IndexScorer.filter_pids(
-        unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
-    )
-    pids = pids_true
-    assert torch.equal(pids_true, pids), f'\n{pids_true}\n{pids}'
-    print('Stage 2 filtering:', unfiltered_pids.shape, '->', pids.shape) # (n_docs) -> (n_docs/4)
     # Stage 3.5 (Decompression) - Get the true passage embeddings for calculating maxsim
     D_packed = IndexScorer.decompress_residuals(
@@ -188,16 +192,16 @@ def search_colbert(query, year, k):
     """
     ColBERT search with a query.
     """
     # Get kNN closest passages using naive kNN search
     query_embed = OPENAI.embed_query(query)
     knn_results = MONGO.vector_knn_search(query_embed, year, k=k)
     unfiltered_pids = torch.tensor([r['id'] for r in knn_results], dtype=torch.int)
     print(f'Stage 0: Retreive passage candidates from kNN: {unfiltered_pids.shape}')
-    # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
-    Q = searcher.encode(query)
-    Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
     scores, pids = _calculate_colbert(Q, unfiltered_pids=unfiltered_pids)
     # Sort values

     Multi-stage ColBERT pipeline. Implemented using the PLAID engine, see fig. 5:
     https://arxiv.org/pdf/2205.09707.pdf#page=5
     """
+    if unfiltered_pids is None:
+        # Stage 1 (Initial Candidate Generation): Find the closest candidates to the Q centroid score
+        _, centroid_scores = get_candidates(Q, ivf)
+        print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
+        # print(centroid_scores.shape)  # (num_questions, 32, hidden_dim)
+        # print(unfiltered_pids.shape)  # (num_passage_candidates)
+        # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
+        idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
+        # pids = filter_pids(
+        #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
+        # )
+        # C++ : Filter pids under the centroid score threshold
+        pids_true = IndexScorer.filter_pids(
+            unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
+        )
+        pids = pids_true
+        assert torch.equal(pids_true, pids), f'\n{pids_true}\n{pids}'
+        print('Stage 2 filtering:', unfiltered_pids.shape, '->', pids.shape) # (n_docs) -> (n_docs/4)
+    else:
+        # Skip centroid interaction as we have performed this with kNN comparison
+        pids = unfiltered_pids
     # Stage 3.5 (Decompression) - Get the true passage embeddings for calculating maxsim
     D_packed = IndexScorer.decompress_residuals(
     """
     ColBERT search with a query.
     """
+    # Encode query using ColBERT model, using the appropriate [Q], [D] tokens
+    Q = searcher.encode(query)
+    Q = Q[:, :QUERY_MAX_LEN] # Cut off query to maxlen tokens
     # Get kNN closest passages using naive kNN search
     query_embed = OPENAI.embed_query(query)
     knn_results = MONGO.vector_knn_search(query_embed, year, k=k)
     unfiltered_pids = torch.tensor([r['id'] for r in knn_results], dtype=torch.int)
     print(f'Stage 0: Retreive passage candidates from kNN: {unfiltered_pids.shape}')
     scores, pids = _calculate_colbert(Q, unfiltered_pids=unfiltered_pids)
     # Sort values