davidheineman
/

colbert-acl

Model card Files and versions Community

davidheineman committed on Apr 26

Commit

6d2b619

•

1 Parent(s): f9ad19d

remove comments

Browse files

Files changed (3) hide show

search.py +0 -13
server.py +0 -15
server_placeholder.py +0 -111

search.py CHANGED Viewed

@@ -8,8 +8,6 @@ from colbert.search.strided_tensor import StridedTensor
 from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
 from colbert.indexing.codecs.residual import ResidualCodec
-from utils import filter_pids, decompress_residuals
 from openai_embed import QueryEmbedder
 from knn_db_access import MongoDBAccess
@@ -145,14 +143,8 @@ def _calculate_colbert(Q: torch.Tensor, unfiltered_pids: torch.Tensor = None):
         _, centroid_scores = get_candidates(Q, ivf)
         print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
-        # print(centroid_scores.shape)  # (num_questions, 32, hidden_dim)
-        # print(unfiltered_pids.shape)  # (num_passage_candidates)
         # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
         idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
-        # pids = filter_pids(
-        #     unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
-        # )
         # C++ : Filter pids under the centroid score threshold
         pids_true = IndexScorer.filter_pids(
@@ -171,11 +163,6 @@ def _calculate_colbert(Q: torch.Tensor, unfiltered_pids: torch.Tensor = None):
         codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
         centroids, codec.dim, nbits
     )
-    # D_packed = decompress_residuals(
-    #     pids, doclens, offsets, bucket_weights, codec.reversed_bit_map,
-    #     codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
-    #     centroids, codec.dim, nbits
-    # )
     D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
     D_mask = doclens[pids.long()]
     D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()

 from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
 from colbert.indexing.codecs.residual import ResidualCodec
 from openai_embed import QueryEmbedder
 from knn_db_access import MongoDBAccess
         _, centroid_scores = get_candidates(Q, ivf)
         print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
         # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
         idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
         # C++ : Filter pids under the centroid score threshold
         pids_true = IndexScorer.filter_pids(
         codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
         centroids, codec.dim, nbits
     )
     D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
     D_mask = doclens[pids.long()]
     D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()

server.py CHANGED Viewed

@@ -11,16 +11,6 @@ app = Flask(__name__)
 counter = {"api" : 0}
-# # Load data
-# COLLECTION_PATH = 'collection.json'
-# DATASET_PATH    = 'dataset.json'
-# with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
-#     collection = json.loads(f.read())
-# with open(DATASET_PATH, 'r', encoding='utf-8') as f:
-#     dataset = json.loads(f.read())
-# dataset = [d for d in dataset if 'abstract' in d.keys()] # We only indexed the entries containing abstracts
 @lru_cache(maxsize=1000000)
 def api_search_query(query, year, k=10):
@@ -43,8 +33,6 @@ def api_search_query(query, year, k=10):
             'rank': rank,
             'score': score,
             'prob': prob,
-            # 'text': collection[pid],
-            # 'entry': dataset[pid]
         }]
     topk = list(sorted(topk, key=lambda p: (-1 * p['score'], p['pid'])))
@@ -91,7 +79,4 @@ if __name__ == "__main__":
     http://localhost:8893/api/search?k=25&query=How to extend context windows?
     """
     init_colbert()
-    # test_response = api_search_query("What is NLP?", 2)
-    # print(test_response)
-    # print(f'Test it at: http://localhost:8893/api/search?k=25&query=How to extend context windows?')
     app.run("0.0.0.0", PORT)

 counter = {"api" : 0}
 @lru_cache(maxsize=1000000)
 def api_search_query(query, year, k=10):
             'rank': rank,
             'score': score,
             'prob': prob,
         }]
     topk = list(sorted(topk, key=lambda p: (-1 * p['score'], p['pid'])))
     http://localhost:8893/api/search?k=25&query=How to extend context windows?
     """
     init_colbert()
     app.run("0.0.0.0", PORT)

server_placeholder.py DELETED Viewed

@@ -1,111 +0,0 @@
-import os
-from flask import Flask, request
-"""
-This is a placeholder server, which gives an example response from ColBERT,
-for development use. The response would be returned on the query:
-    http://localhost:8893/api/search?k=3&query=Can you give some examples of NLP classification tasks?
-"""
-PORT = int(os.getenv("PORT", 8893))
-app = Flask(__name__)
-counter = {"api" : 0}
-example_response = {
-  "query": "Can you give some examples of NLP classification tasks?",
-  "topk": [
-    {
-      "entry": {
-        "ENTRYTYPE": "inproceedings",
-        "ID": "bang-etal-2023-enabling",
-        "abstract": "Many NLP classification tasks, such as sexism/racism detection or toxicity detection, are based on human values. Yet, human values can vary under diverse cultural conditions. Therefore, we introduce a framework for value-aligned classification that performs prediction based on explicitly written human values in the command. Along with the task, we propose a practical approach that distills value-aligned knowledge from large-scale language models (LLMs) to construct value-aligned classifiers in two steps. First, we generate value-aligned training data from LLMs by prompt-based few-shot learning. Next, we fine-tune smaller classification models with the generated data for the task. Empirical results show that our VA-Models surpass multiple baselines by at least 15.56{\\%} on the F1-score, including few-shot learning with OPT-175B and existing text augmentation methods. We suggest that using classifiers with explicit human value input improves both inclusivity {\\&} explainability in AI.",
-        "address": "Toronto, Canada",
-        "author": "Bang, Yejin  and\nYu, Tiezheng  and\nMadotto, Andrea  and\nLin, Zhaojiang  and\nDiab, Mona  and\nFung, Pascale",
-        "booktitle": "Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)",
-        "doi": "10.18653/v1/2023.trustnlp-1.27",
-        "editor": "Ovalle, Anaelia  and\nChang, Kai-Wei  and\nMehrabi, Ninareh  and\nPruksachatkun, Yada  and\nGalystan, Aram  and\nDhamala, Jwala  and\nVerma, Apurv  and\nCao, Trista  and\nKumar, Anoop  and\nGupta, Rahul",
-        "month": "July",
-        "pages": "311--325",
-        "publisher": "Association for Computational Linguistics",
-        "title": "Enabling Classifiers to Make Judgements Explicitly Aligned with Human Values",
-        "url": "https://aclanthology.org/2023.trustnlp-1.27",
-        "year": "2023"
-      },
-      "pid": 308,
-      "prob": 0.911249780843833,
-      "rank": 1,
-      "score": 24.9432468414307,
-      "text": "Many NLP classification tasks, such as sexism/racism detection or toxicity detection, are based on human values. Yet, human values can vary under diverse cultural conditions. Therefore, we introduce a framework for value-aligned classification that performs prediction based on explicitly written human values in the command. Along with the task, we propose a practical approach that distills value-aligned knowledge from large-scale language models (LLMs) to construct value-aligned classifiers in two steps. First, we generate value-aligned training data from LLMs by prompt-based few-shot learning. Next, we fine-tune smaller classification models with the generated data for the task. Empirical results show that our VA-Models surpass multiple baselines by at least 15.56{\\%} on the F1-score, including few-shot learning with OPT-175B and existing text augmentation methods. We suggest that using classifiers with explicit human value input improves both inclusivity {\\&} explainability in AI."
-    },
-    {
-      "entry": {
-        "ENTRYTYPE": "inproceedings",
-        "ID": "schick-schutze-2021-exploiting",
-        "abstract": "Some NLP tasks can be solved in a fully unsupervised fashion by providing a pretrained language model with {``}task descriptions{''} in natural language (e.g., Radford et al., 2019). While this approach underperforms its supervised counterpart, we show in this work that the two ideas can be combined: We introduce Pattern-Exploiting Training (PET), a semi-supervised training procedure that reformulates input examples as cloze-style phrases to help language models understand a given task. These phrases are then used to assign soft labels to a large set of unlabeled examples. Finally, standard supervised training is performed on the resulting training set. For several tasks and languages, PET outperforms supervised training and strong semi-supervised approaches in low-resource settings by a large margin.",
-        "address": "Online",
-        "author": "Schick, Timo  and\nSch{\\\"u}tze, Hinrich",
-        "booktitle": "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
-        "doi": "10.18653/v1/2021.eacl-main.20",
-        "editor": "Merlo, Paola  and\nTiedemann, Jorg  and\nTsarfaty, Reut",
-        "month": "April",
-        "pages": "255--269",
-        "publisher": "Association for Computational Linguistics",
-        "title": "Exploiting Cloze-Questions for Few-Shot Text Classification and Natural Language Inference",
-        "url": "https://aclanthology.org/2021.eacl-main.20",
-        "year": "2021"
-      },
-      "pid": 20173,
-      "prob": 0.052513318016947,
-      "rank": 2,
-      "score": 22.0894966125488,
-      "text": "Some NLP tasks can be solved in a fully unsupervised fashion by providing a pretrained language model with {``}task descriptions{''} in natural language (e.g., Radford et al., 2019). While this approach underperforms its supervised counterpart, we show in this work that the two ideas can be combined: We introduce Pattern-Exploiting Training (PET), a semi-supervised training procedure that reformulates input examples as cloze-style phrases to help language models understand a given task. These phrases are then used to assign soft labels to a large set of unlabeled examples. Finally, standard supervised training is performed on the resulting training set. For several tasks and languages, PET outperforms supervised training and strong semi-supervised approaches in low-resource settings by a large margin."
-    },
-    {
-      "entry": {
-        "ENTRYTYPE": "inproceedings",
-        "ID": "cattan-etal-2023-champ",
-        "abstract": "Various NLP tasks require a complex hierarchical structure over nodes, where each node is a cluster of items. Examples include generating entailment graphs, hierarchical cross-document coreference resolution, annotating event and subevent relations, etc. To enable efficient annotation of such hierarchical structures, we release CHAMP, an open source tool allowing to incrementally construct both clusters and hierarchy simultaneously over any type of texts. This incremental approach significantly reduces annotation time compared to the common pairwise annotation approach and also guarantees maintaining transitivity at the cluster and hierarchy levels. Furthermore, CHAMP includes a consolidation mode, where an adjudicator can easily compare multiple cluster hierarchy annotations and resolve disagreements.",
-        "address": "Singapore",
-        "author": "Cattan, Arie  and\nHope, Tom  and\nDowney, Doug  and\nBar-Haim, Roy  and\nEden, Lilach  and\nKantor, Yoav  and\nDagan, Ido",
-        "booktitle": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
-        "doi": "10.18653/v1/2023.emnlp-demo.37",
-        "editor": "Feng, Yansong  and\nLefever, Els",
-        "month": "December",
-        "pages": "403--412",
-        "publisher": "Association for Computational Linguistics",
-        "title": "{CHAMP}: Efficient Annotation and Consolidation of Cluster Hierarchies",
-        "url": "https://aclanthology.org/2023.emnlp-demo.37",
-        "year": "2023"
-      },
-      "pid": 5258,
-      "prob": 0.0362369011392198,
-      "rank": 3,
-      "score": 21.7185077667236,
-      "text": "Various NLP tasks require a complex hierarchical structure over nodes, where each node is a cluster of items. Examples include generating entailment graphs, hierarchical cross-document coreference resolution, annotating event and subevent relations, etc. To enable efficient annotation of such hierarchical structures, we release CHAMP, an open source tool allowing to incrementally construct both clusters and hierarchy simultaneously over any type of texts. This incremental approach significantly reduces annotation time compared to the common pairwise annotation approach and also guarantees maintaining transitivity at the cluster and hierarchy levels. Furthermore, CHAMP includes a consolidation mode, where an adjudicator can easily compare multiple cluster hierarchy annotations and resolve disagreements."
-    }
-  ]
-}
-@app.route("/api/search", methods=["GET"])
-def api_search():
-    if request.method == "GET":
-        counter["api"] += 1
-        print("API request count:", counter["api"])
-        print(f'Recieved: {request.args.get("query")} {request.args.get("k")}')
-        return example_response
-    else:
-        return ('', 405)
-if __name__ == "__main__":
-    """
-    Example usage:
-    python server.py
-    http://localhost:8893/api/search?k=3&query=Can you give some examples of NLP classification tasks?
-    """
-    print(f'Test it at: http://localhost:8893/api/search?k=25&query=Can you give some examples of NLP classification tasks?')
-    app.run("0.0.0.0", PORT)