davidheineman
/

colbert-acl

Model card Files Files and versions Community

davidheineman committed on Apr 17

Commit

00b3aaf

•

1 Parent(s): 3d8408f

fix bug

Browse files

Files changed (2) hide show

index/metadata.json +2 -2
search.py +14 -7

index/metadata.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fabe6f5e95f0eb8bee525adc7ab82d7fe275dc862e354f200eb494a74b2b23ea
-size 45753744

 version https://git-lfs.github.com/spec/v1
+oid sha256:fe45c70053d561277a4e751a25295c89fccf73b235410ce7779dc4b5aed11106
+size 1501

search.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import os, shutil, ujson, tqdm
 import torch
 import torch.nn.functional as F
@@ -12,7 +12,9 @@ from utils import filter_pids
 INDEX_NAME = os.getenv("INDEX_NAME", 'index')
 INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
 INDEX_PATH = os.path.join(INDEX_ROOT, INDEX_NAME)
 # Move index to ColBERT experiment path
 src_path = os.path.join(INDEX_ROOT, INDEX_NAME)
@@ -22,7 +24,11 @@ if not os.path.exists(dest_path):
     os.makedirs(dest_path)
     shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
-searcher = Searcher(index=INDEX_NAME)
 NCELLS = 1
 CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
@@ -93,7 +99,7 @@ def colbert_score(Q: torch.Tensor, D_padded: torch.Tensor, D_mask: torch.Tensor)
     return scores
-def get_candidates(centroid_scores: torch.Tensor, ivf: StridedTensor) -> torch.Tensor:
     """
     First find centroids closest to Q, then return all the passages in all
     centroids.
@@ -101,7 +107,10 @@ def get_candidates(centroid_scores: torch.Tensor, ivf: StridedTensor) -> torch.T
     We can replace this function with a k-NN search finding the closest passages
     using BERT similarity.
     """
     # Get the closest centroids via a matrix multiplication + argmax
     if NCELLS == 1:
         cells = centroid_scores.argmax(dim=0, keepdim=True).permute(1, 0)
     else:
@@ -116,7 +125,7 @@ def get_candidates(centroid_scores: torch.Tensor, ivf: StridedTensor) -> torch.T
     # Sort and return values
     pids = pids.sort().values
     pids, _ = torch.unique_consecutive(pids, return_counts=True)
-    return pids
 def _calculate_colbert(Q: torch.Tensor):
@@ -125,9 +134,7 @@ def _calculate_colbert(Q: torch.Tensor):
     https://arxiv.org/pdf/2205.09707.pdf#page=5
     """
     # Stage 1 (Initial Candidate Generation): Find the closest candidates to the Q centroid score
-    Q = Q.squeeze(0)
-    centroid_scores = (centroids @ Q.T)
-    unfiltered_pids = get_candidates(centroid_scores, ivf)
     print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
     # print(centroid_scores.shape)  # (num_questions, 32, hidden_dim)

+import os, shutil, json, ujson, tqdm
 import torch
 import torch.nn.functional as F
 INDEX_NAME = os.getenv("INDEX_NAME", 'index')
 INDEX_ROOT = os.getenv("INDEX_ROOT", '.')
 INDEX_PATH = os.path.join(INDEX_ROOT, INDEX_NAME)
+COLLECTION_PATH = os.path.join(INDEX_ROOT, 'collection.json')
 # Move index to ColBERT experiment path
 src_path = os.path.join(INDEX_ROOT, INDEX_NAME)
     os.makedirs(dest_path)
     shutil.copytree(src_path, dest_path, dirs_exist_ok=True)
+# Load abstracts as a collection
+with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
+    collection = json.load(f)
+searcher = Searcher(index=INDEX_NAME, collection=collection)
 NCELLS = 1
 CENTROID_SCORE_THRESHOLD = 0.5 # How close a document has to be to a centroid to be considered
     return scores
+def get_candidates(Q: torch.Tensor, ivf: StridedTensor) -> torch.Tensor:
     """
     First find centroids closest to Q, then return all the passages in all
     centroids.
     We can replace this function with a k-NN search finding the closest passages
     using BERT similarity.
     """
+    Q = Q.squeeze(0)
     # Get the closest centroids via a matrix multiplication + argmax
+    centroid_scores = (centroids @ Q.T)
     if NCELLS == 1:
         cells = centroid_scores.argmax(dim=0, keepdim=True).permute(1, 0)
     else:
     # Sort and return values
     pids = pids.sort().values
     pids, _ = torch.unique_consecutive(pids, return_counts=True)
+    return pids, centroid_scores
 def _calculate_colbert(Q: torch.Tensor):
     https://arxiv.org/pdf/2205.09707.pdf#page=5
     """
     # Stage 1 (Initial Candidate Generation): Find the closest candidates to the Q centroid score
+    unfiltered_pids, centroid_scores = get_candidates(Q, ivf)
     print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
     # print(centroid_scores.shape)  # (num_questions, 32, hidden_dim)