davidheineman committed on
Commit
6d2b619
1 Parent(s): f9ad19d

remove comments

Browse files
Files changed (3) hide show
  1. search.py +0 -13
  2. server.py +0 -15
  3. server_placeholder.py +0 -111
search.py CHANGED
@@ -8,8 +8,6 @@ from colbert.search.strided_tensor import StridedTensor
8
  from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
9
  from colbert.indexing.codecs.residual import ResidualCodec
10
 
11
- from utils import filter_pids, decompress_residuals
12
-
13
  from openai_embed import QueryEmbedder
14
  from knn_db_access import MongoDBAccess
15
 
@@ -145,14 +143,8 @@ def _calculate_colbert(Q: torch.Tensor, unfiltered_pids: torch.Tensor = None):
145
  _, centroid_scores = get_candidates(Q, ivf)
146
  print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
147
 
148
- # print(centroid_scores.shape) # (num_questions, 32, hidden_dim)
149
- # print(unfiltered_pids.shape) # (num_passage_candidates)
150
-
151
  # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
152
  idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
153
- # pids = filter_pids(
154
- # unfiltered_pids, centroid_scores, embeddings.codes, doclens, offsets, idx, NDOCS
155
- # )
156
 
157
  # C++ : Filter pids under the centroid score threshold
158
  pids_true = IndexScorer.filter_pids(
@@ -171,11 +163,6 @@ def _calculate_colbert(Q: torch.Tensor, unfiltered_pids: torch.Tensor = None):
171
  codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
172
  centroids, codec.dim, nbits
173
  )
174
- # D_packed = decompress_residuals(
175
- # pids, doclens, offsets, bucket_weights, codec.reversed_bit_map,
176
- # codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
177
- # centroids, codec.dim, nbits
178
- # )
179
  D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
180
  D_mask = doclens[pids.long()]
181
  D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()
 
8
  from colbert.indexing.codecs.residual_embeddings_strided import ResidualEmbeddingsStrided
9
  from colbert.indexing.codecs.residual import ResidualCodec
10
 
 
 
11
  from openai_embed import QueryEmbedder
12
  from knn_db_access import MongoDBAccess
13
 
 
143
  _, centroid_scores = get_candidates(Q, ivf)
144
  print(f'Stage 1 candidate generation: {unfiltered_pids.shape}')
145
 
 
 
 
146
  # Stage 2 and 3 (Centroid Interaction with Pruning, then without Pruning)
147
  idx = centroid_scores.max(-1).values >= CENTROID_SCORE_THRESHOLD
 
 
 
148
 
149
  # C++ : Filter pids under the centroid score threshold
150
  pids_true = IndexScorer.filter_pids(
 
163
  codec.decompression_lookup_table, embeddings.residuals, embeddings.codes,
164
  centroids, codec.dim, nbits
165
  )
 
 
 
 
 
166
  D_packed = F.normalize(D_packed.to(torch.float32), p=2, dim=-1)
167
  D_mask = doclens[pids.long()]
168
  D_padded, D_lengths = StridedTensor(D_packed, D_mask, use_gpu=False).as_padded_tensor()
server.py CHANGED
@@ -11,16 +11,6 @@ app = Flask(__name__)
11
 
12
  counter = {"api" : 0}
13
 
14
- # # Load data
15
- # COLLECTION_PATH = 'collection.json'
16
- # DATASET_PATH = 'dataset.json'
17
-
18
- # with open(COLLECTION_PATH, 'r', encoding='utf-8') as f:
19
- # collection = json.loads(f.read())
20
- # with open(DATASET_PATH, 'r', encoding='utf-8') as f:
21
- # dataset = json.loads(f.read())
22
- # dataset = [d for d in dataset if 'abstract' in d.keys()] # We only indexed the entries containing abstracts
23
-
24
 
25
  @lru_cache(maxsize=1000000)
26
  def api_search_query(query, year, k=10):
@@ -43,8 +33,6 @@ def api_search_query(query, year, k=10):
43
  'rank': rank,
44
  'score': score,
45
  'prob': prob,
46
- # 'text': collection[pid],
47
- # 'entry': dataset[pid]
48
  }]
49
 
50
  topk = list(sorted(topk, key=lambda p: (-1 * p['score'], p['pid'])))
@@ -91,7 +79,4 @@ if __name__ == "__main__":
91
  http://localhost:8893/api/search?k=25&query=How to extend context windows?
92
  """
93
  init_colbert()
94
- # test_response = api_search_query("What is NLP?", 2)
95
- # print(test_response)
96
- # print(f'Test it at: http://localhost:8893/api/search?k=25&query=How to extend context windows?')
97
  app.run("0.0.0.0", PORT)
 
11
 
12
  counter = {"api" : 0}
13
 
 
 
 
 
 
 
 
 
 
 
14
 
15
  @lru_cache(maxsize=1000000)
16
  def api_search_query(query, year, k=10):
 
33
  'rank': rank,
34
  'score': score,
35
  'prob': prob,
 
 
36
  }]
37
 
38
  topk = list(sorted(topk, key=lambda p: (-1 * p['score'], p['pid'])))
 
79
  http://localhost:8893/api/search?k=25&query=How to extend context windows?
80
  """
81
  init_colbert()
 
 
 
82
  app.run("0.0.0.0", PORT)
server_placeholder.py DELETED
@@ -1,111 +0,0 @@
1
- import os
2
-
3
- from flask import Flask, request
4
-
5
- """
6
-
7
- This is a placeholder server, which gives an example response from ColBERT,
8
- for development use. The response would be returned on the query:
9
- http://localhost:8893/api/search?k=3&query=Can you give some examples of NLP classification tasks?
10
-
11
- """
12
-
13
- PORT = int(os.getenv("PORT", 8893))
14
- app = Flask(__name__)
15
-
16
- counter = {"api" : 0}
17
-
18
- example_response = {
19
- "query": "Can you give some examples of NLP classification tasks?",
20
- "topk": [
21
- {
22
- "entry": {
23
- "ENTRYTYPE": "inproceedings",
24
- "ID": "bang-etal-2023-enabling",
25
- "abstract": "Many NLP classification tasks, such as sexism/racism detection or toxicity detection, are based on human values. Yet, human values can vary under diverse cultural conditions. Therefore, we introduce a framework for value-aligned classification that performs prediction based on explicitly written human values in the command. Along with the task, we propose a practical approach that distills value-aligned knowledge from large-scale language models (LLMs) to construct value-aligned classifiers in two steps. First, we generate value-aligned training data from LLMs by prompt-based few-shot learning. Next, we fine-tune smaller classification models with the generated data for the task. Empirical results show that our VA-Models surpass multiple baselines by at least 15.56{\\%} on the F1-score, including few-shot learning with OPT-175B and existing text augmentation methods. We suggest that using classifiers with explicit human value input improves both inclusivity {\\&} explainability in AI.",
26
- "address": "Toronto, Canada",
27
- "author": "Bang, Yejin and\nYu, Tiezheng and\nMadotto, Andrea and\nLin, Zhaojiang and\nDiab, Mona and\nFung, Pascale",
28
- "booktitle": "Proceedings of the 3rd Workshop on Trustworthy Natural Language Processing (TrustNLP 2023)",
29
- "doi": "10.18653/v1/2023.trustnlp-1.27",
30
- "editor": "Ovalle, Anaelia and\nChang, Kai-Wei and\nMehrabi, Ninareh and\nPruksachatkun, Yada and\nGalystan, Aram and\nDhamala, Jwala and\nVerma, Apurv and\nCao, Trista and\nKumar, Anoop and\nGupta, Rahul",
31
- "month": "July",
32
- "pages": "311--325",
33
- "publisher": "Association for Computational Linguistics",
34
- "title": "Enabling Classifiers to Make Judgements Explicitly Aligned with Human Values",
35
- "url": "https://aclanthology.org/2023.trustnlp-1.27",
36
- "year": "2023"
37
- },
38
- "pid": 308,
39
- "prob": 0.911249780843833,
40
- "rank": 1,
41
- "score": 24.9432468414307,
42
- "text": "Many NLP classification tasks, such as sexism/racism detection or toxicity detection, are based on human values. Yet, human values can vary under diverse cultural conditions. Therefore, we introduce a framework for value-aligned classification that performs prediction based on explicitly written human values in the command. Along with the task, we propose a practical approach that distills value-aligned knowledge from large-scale language models (LLMs) to construct value-aligned classifiers in two steps. First, we generate value-aligned training data from LLMs by prompt-based few-shot learning. Next, we fine-tune smaller classification models with the generated data for the task. Empirical results show that our VA-Models surpass multiple baselines by at least 15.56{\\%} on the F1-score, including few-shot learning with OPT-175B and existing text augmentation methods. We suggest that using classifiers with explicit human value input improves both inclusivity {\\&} explainability in AI."
43
- },
44
- {
45
- "entry": {
46
- "ENTRYTYPE": "inproceedings",
47
- "ID": "schick-schutze-2021-exploiting",
48
- "abstract": "Some NLP tasks can be solved in a fully unsupervised fashion by providing a pretrained language model with {``}task descriptions{''} in natural language (e.g., Radford et al., 2019). While this approach underperforms its supervised counterpart, we show in this work that the two ideas can be combined: We introduce Pattern-Exploiting Training (PET), a semi-supervised training procedure that reformulates input examples as cloze-style phrases to help language models understand a given task. These phrases are then used to assign soft labels to a large set of unlabeled examples. Finally, standard supervised training is performed on the resulting training set. For several tasks and languages, PET outperforms supervised training and strong semi-supervised approaches in low-resource settings by a large margin.",
49
- "address": "Online",
50
- "author": "Schick, Timo and\nSch{\\\"u}tze, Hinrich",
51
- "booktitle": "Proceedings of the 16th Conference of the European Chapter of the Association for Computational Linguistics: Main Volume",
52
- "doi": "10.18653/v1/2021.eacl-main.20",
53
- "editor": "Merlo, Paola and\nTiedemann, Jorg and\nTsarfaty, Reut",
54
- "month": "April",
55
- "pages": "255--269",
56
- "publisher": "Association for Computational Linguistics",
57
- "title": "Exploiting Cloze-Questions for Few-Shot Text Classification and Natural Language Inference",
58
- "url": "https://aclanthology.org/2021.eacl-main.20",
59
- "year": "2021"
60
- },
61
- "pid": 20173,
62
- "prob": 0.052513318016947,
63
- "rank": 2,
64
- "score": 22.0894966125488,
65
- "text": "Some NLP tasks can be solved in a fully unsupervised fashion by providing a pretrained language model with {``}task descriptions{''} in natural language (e.g., Radford et al., 2019). While this approach underperforms its supervised counterpart, we show in this work that the two ideas can be combined: We introduce Pattern-Exploiting Training (PET), a semi-supervised training procedure that reformulates input examples as cloze-style phrases to help language models understand a given task. These phrases are then used to assign soft labels to a large set of unlabeled examples. Finally, standard supervised training is performed on the resulting training set. For several tasks and languages, PET outperforms supervised training and strong semi-supervised approaches in low-resource settings by a large margin."
66
- },
67
- {
68
- "entry": {
69
- "ENTRYTYPE": "inproceedings",
70
- "ID": "cattan-etal-2023-champ",
71
- "abstract": "Various NLP tasks require a complex hierarchical structure over nodes, where each node is a cluster of items. Examples include generating entailment graphs, hierarchical cross-document coreference resolution, annotating event and subevent relations, etc. To enable efficient annotation of such hierarchical structures, we release CHAMP, an open source tool allowing to incrementally construct both clusters and hierarchy simultaneously over any type of texts. This incremental approach significantly reduces annotation time compared to the common pairwise annotation approach and also guarantees maintaining transitivity at the cluster and hierarchy levels. Furthermore, CHAMP includes a consolidation mode, where an adjudicator can easily compare multiple cluster hierarchy annotations and resolve disagreements.",
72
- "address": "Singapore",
73
- "author": "Cattan, Arie and\nHope, Tom and\nDowney, Doug and\nBar-Haim, Roy and\nEden, Lilach and\nKantor, Yoav and\nDagan, Ido",
74
- "booktitle": "Proceedings of the 2023 Conference on Empirical Methods in Natural Language Processing: System Demonstrations",
75
- "doi": "10.18653/v1/2023.emnlp-demo.37",
76
- "editor": "Feng, Yansong and\nLefever, Els",
77
- "month": "December",
78
- "pages": "403--412",
79
- "publisher": "Association for Computational Linguistics",
80
- "title": "{CHAMP}: Efficient Annotation and Consolidation of Cluster Hierarchies",
81
- "url": "https://aclanthology.org/2023.emnlp-demo.37",
82
- "year": "2023"
83
- },
84
- "pid": 5258,
85
- "prob": 0.0362369011392198,
86
- "rank": 3,
87
- "score": 21.7185077667236,
88
- "text": "Various NLP tasks require a complex hierarchical structure over nodes, where each node is a cluster of items. Examples include generating entailment graphs, hierarchical cross-document coreference resolution, annotating event and subevent relations, etc. To enable efficient annotation of such hierarchical structures, we release CHAMP, an open source tool allowing to incrementally construct both clusters and hierarchy simultaneously over any type of texts. This incremental approach significantly reduces annotation time compared to the common pairwise annotation approach and also guarantees maintaining transitivity at the cluster and hierarchy levels. Furthermore, CHAMP includes a consolidation mode, where an adjudicator can easily compare multiple cluster hierarchy annotations and resolve disagreements."
89
- }
90
- ]
91
- }
92
-
93
- @app.route("/api/search", methods=["GET"])
94
- def api_search():
95
- if request.method == "GET":
96
- counter["api"] += 1
97
- print("API request count:", counter["api"])
98
- print(f'Recieved: {request.args.get("query")} {request.args.get("k")}')
99
- return example_response
100
- else:
101
- return ('', 405)
102
-
103
-
104
- if __name__ == "__main__":
105
- """
106
- Example usage:
107
- python server.py
108
- http://localhost:8893/api/search?k=3&query=Can you give some examples of NLP classification tasks?
109
- """
110
- print(f'Test it at: http://localhost:8893/api/search?k=25&query=Can you give some examples of NLP classification tasks?')
111
- app.run("0.0.0.0", PORT)