Commit 992c5b6 · Parent: 8f175aa
add more comments

Files changed:
- index_large/metadata.json (+1, -8)
- search.py (+9, -4)
- server.py (+2, -1)
index_large/metadata.json (+1, -8)

@@ -35,14 +35,7 @@
     "mask_punctuation": true,
     "checkpoint": "colbert-ir\/colbertv2.0",
     "triples": null,
-    "collection": [
-        "list with 46880 elements starting with...",
-        [
-            "Position paper for YRRSDS 2023",
-            "In this position paper, I will present the research interests in my PostDoc on safety and robustness specific to conversational AI, including then relevant overlap from my PhD.",
-            "Speech production is nuanced and unique to every individual, but today{'}s Spoken Dialogue Systems (SDSs) are trained to use general speech patterns to successfully improve performance on various evaluation metrics. However, these patterns do not apply to certain user groups - often the very people that can benefit the most from SDSs. For example, people with dementia produce more disfluent speech than the general population. The healthcare domain is now a popular setting for spoken dialogue and human-robot interaction research. This trend is similar when observing company behaviour. Charities promote industry voice assistants, the creators are getting HIPAA compliance, and their features sometimes target vulnerable user groups. It is therefore critical to adapt SDSs to be more accessible."
-        ]
-    ],
+    "collection": [],
     "queries": null,
     "index_name": "index",
     "overwrite": false,
search.py (+9, -4)

@@ -79,6 +79,10 @@ def init_colbert(index_path=INDEX_PATH, load_index_with_mmap=False):
 
 
 def colbert_score(Q, D_padded, D_mask):
+    """
+    Computes late interaction between question (Q) and documents (D)
+    See Figure 1: https://aclanthology.org/2022.naacl-main.272.pdf#page=3
+    """
     assert Q.dim() == 3, Q.size()
     assert D_padded.dim() == 3, D_padded.size()
     assert Q.size(0) in [1, D_padded.size(0)]
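For reference, the late interaction that the new docstring describes is ColBERT's MaxSim: each query token takes the score of its best-matching document token, and those per-token maxima are summed per document. Below is a minimal sketch of that computation, assuming Q has shape (1, q_len, dim), D_padded has shape (n_docs, max_toks, dim), and D_lengths holds each document's true token count; the function name and masking details are illustrative, not the repo's exact implementation.

import torch

def maxsim_scores(Q, D_padded, D_lengths):
    # Token-level similarities (embeddings are already L2-normalized):
    # (n_docs, max_toks, dim) @ (dim, q_len) -> (n_docs, max_toks, q_len)
    sim = D_padded @ Q.squeeze(0).transpose(0, 1)
    # Mask padding positions beyond each document's true length
    positions = torch.arange(D_padded.size(1))
    mask = positions[None, :] < D_lengths[:, None]           # (n_docs, max_toks)
    sim = sim.masked_fill(~mask[:, :, None], float("-inf"))
    # MaxSim: best document token per query token, summed over the query
    return sim.max(dim=1).values.sum(dim=-1)                 # (n_docs,)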
@@ -109,8 +113,7 @@ def generate_candidates(Q):
     pids, _ = ivf.lookup(cells)
 
     # Sort and retun values
-
-    pids = sorter.values
+    pids = pids.sort().values
     pids, _ = torch.unique_consecutive(pids, return_counts=True)
     return pids, centroid_scores
 
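The one-line fix above matters because torch.unique_consecutive only collapses adjacent duplicates, so the candidate pids must be sorted first (the previous pids = sorter.values referenced what is presumably a leftover name). A quick illustration with made-up pids:

import torch

pids = torch.tensor([3, 1, 3, 2, 1])
print(torch.unique_consecutive(pids))                # tensor([3, 1, 3, 2, 1]) - no adjacent duplicates
print(torch.unique_consecutive(pids.sort().values))  # tensor([1, 2, 3]) - deduplicated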
@@ -130,11 +133,13 @@ def _calculate_colbert(Q):
     # print(ivf_1.shape)
     # print(ivf_2.shape)
 
-    # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
+    # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
     idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
     pids = filter_pids(
         unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
     )
+
+    # C++ : Filter pids under the centroid score threshold
     # pids_true = IndexScorer.filter_pids(
     #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
     # )
@@ -150,7 +155,7 @@ def _calculate_colbert(Q):
     D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
     D_mask = doclens[pids.long()]
     D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()
-    print('Stage 3.5 decompression:', pids.shape, '->', D_padded.shape) # (n_docs/4) -> (n_docs/4,
+    print('Stage 3.5 decompression:', pids.shape, '->', D_padded.shape) # (n_docs/4) -> (n_docs/4, num_toks, hidden_dim)
 
     # Stage 4 (Final Ranking w/ Decompression) - Calculate the final (expensive) maxsim scores with ColBERT
     scores = colbert_score(Q, D_padded, D_lengths)
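The completed shape comment in the last hunk documents what decompression produces: a flat list of candidate pids becomes a padded tensor of per-document token embeddings. A rough sketch of that padding step, assuming StridedTensor behaves like splitting a packed embedding matrix by doclens and padding to the longest document (toy shapes, not the repo's helper):

import torch
from torch.nn.utils.rnn import pad_sequence

D_packed = torch.randn(7, 4)                 # 7 tokens total, hidden_dim = 4
doclens = torch.tensor([3, 2, 2])            # token counts for 3 candidate docs
chunks = torch.split(D_packed, doclens.tolist())
D_padded = pad_sequence(chunks, batch_first=True)
print(D_padded.shape)                        # torch.Size([3, 3, 4]): (n_docs, num_toks, hidden_dim)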
server.py (+2, -1)

@@ -67,6 +67,7 @@ if __name__ == "__main__":
     http://localhost:8893/api/search?k=25&query=How to extend context windows?
     """
     init_colbert()
-    #
+    # test_response = api_search_query("What is NLP?", 2)
+    # print(test_response)
     print(f'Test it at: http://localhost:8893/api/search?k=25&query=How to extend context windows?')
    app.run("0.0.0.0", PORT)
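The commented-out lines mirror what the HTTP endpoint exposes. An equivalent smoke test against the running server, using the URL the script prints (the requests call is illustrative; the response shape is whatever api_search_query returns):

import requests

resp = requests.get(
    "http://localhost:8893/api/search",
    params={"k": 2, "query": "What is NLP?"},
)
print(resp.json())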