geonmin-kim committed on
Commit
d6585f5
1 Parent(s): d571cf0

Upload folder using huggingface_hub

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .dockerignore +6 -0
  2. .gitattributes +2 -0
  3. .gitignore +23 -0
  4. .gitmodules +3 -0
  5. Dockerfile +53 -0
  6. LICENSE.txt +191 -0
  7. MANIFEST.in +5 -0
  8. README.md +257 -8
  9. __pycache__/search_online.cpython-38.pyc +0 -0
  10. analyze_answer_inclusion_in_retrieval.py +100 -0
  11. collections/.gitkeep +1 -0
  12. convert_cast22_eval_json_to_queries_tsv.py +24 -0
  13. download_oracle_jdk.sh +15 -0
  14. encode_8shards_cuda0.sh +26 -0
  15. encode_8shards_cuda1.sh +26 -0
  16. encode_8shards_cuda2.sh +26 -0
  17. encode_8shards_cuda3.sh +26 -0
  18. entrypoint.sh +23 -0
  19. flagged/log.csv +272 -0
  20. integrations/clprf/test_clprf.py +951 -0
  21. integrations/clprf/test_trec_covid_r3.py +85 -0
  22. integrations/clprf/test_trec_covid_r4.py +91 -0
  23. integrations/clprf/test_trec_covid_r5.py +141 -0
  24. integrations/dense/test_ance.py +185 -0
  25. integrations/dense/test_distilbert_kd.py +45 -0
  26. integrations/dense/test_distilbert_tasb.py +44 -0
  27. integrations/dense/test_dpr.py +339 -0
  28. integrations/dense/test_encode.py +137 -0
  29. integrations/dense/test_kilt.py +69 -0
  30. integrations/dense/test_sbert.py +67 -0
  31. integrations/dense/test_tct_colbert-v2.py +132 -0
  32. integrations/dense/test_tct_colbert.py +183 -0
  33. integrations/lucenesearcher_anserini_checker.py +78 -0
  34. integrations/lucenesearcher_score_checker.py +71 -0
  35. integrations/papers/test_ecir2023.py +91 -0
  36. integrations/papers/test_sigir2021.py +180 -0
  37. integrations/papers/test_sigir2022.py +100 -0
  38. integrations/run_lucenesearcher.py +51 -0
  39. integrations/sparse/test_lucenesearcher_check_core17.py +59 -0
  40. integrations/sparse/test_lucenesearcher_check_core18.py +59 -0
  41. integrations/sparse/test_lucenesearcher_check_irst.py +355 -0
  42. integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py +60 -0
  43. integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py +59 -0
  44. integrations/sparse/test_lucenesearcher_check_robust04.py +59 -0
  45. integrations/sparse/test_lucenesearcher_check_robust05.py +59 -0
  46. integrations/sparse/test_lucenesearcher_multithread.py +79 -0
  47. integrations/sparse/test_nmslib.py +66 -0
  48. integrations/sparse/test_prebuilt_msmarco_v1_doc.py +170 -0
  49. integrations/sparse/test_prebuilt_msmarco_v1_passage.py +96 -0
  50. integrations/sparse/test_prebuilt_robust04.py +42 -0
.dockerignore ADDED
@@ -0,0 +1,6 @@
+ datasets/
+ embeddings/
+ sparse/
+ Dockerfile
+ indexes
+ .dockerignore
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ pyserini/resources/jars/anserini-0.21.1-SNAPSHOT-fatjar.jar filter=lfs diff=lfs merge=lfs -text
+ runs/q=NQtest_c=wikidpr_m=bm25.run filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,23 @@
+ *~
+ .DS_Store
+ .idea
+ __pycache__
+ *.pyc
+ pyserini/resources/jars/*.jar
+ collections/*
+ indexes/*
+ .vscode/
+ venv/
+ datasets/
+ embeddings/
+
+ # build directories from `python3 setup.py sdist bdist_wheel`
+ build/
+ dist/
+ pyserini.egg-info/
+
+ # runs should be ignored
+ runs/
+
+ # logs should also be ignored
+ logs/
.gitmodules ADDED
@@ -0,0 +1,3 @@
+ [submodule "tools"]
+ path = tools
+ url = https://github.com/castorini/anserini-tools.git
Dockerfile ADDED
@@ -0,0 +1,53 @@
+ FROM nvcr.io/nvidia/pytorch:22.01-py3
+
+ ENV HOME=/root
+ ENV LC_ALL C.UTF-8
+ ENV LANG C.UTF-8
+ ARG DEBIAN_FRONTEND=noninteractive
+
+ # APP_PATH: assumed to be volume-mounted with the host
+ ENV APP_PATH=$HOME/np_app_text_retrieval_inference
+ ENV PYTHONPATH $APP_PATH
+
+ # EXTERNAL_PATH: holds system-specific files; not volume-mounted with the host
+ ENV EXTERNAL_PATH=/workspace/external
+ RUN mkdir -p $EXTERNAL_PATH
+
+ WORKDIR $APP_PATH
+ COPY . $APP_PATH/
+
+ ### Ubuntu packages
+ RUN apt-get update
+ RUN apt-get install git vim unzip wget ffmpeg libsm6 libxext6 -y
+
+ ### conda
+ # note that the retriever (based on pyserini) supports CPU-only faiss
+ RUN conda install -c conda-forge faiss-cpu
+
+ ### pip
+ # install in editable mode
+ RUN pip install --upgrade pip && \
+     pip install --no-binary :all: nmslib && \
+     pip install -e .
+
+ # update Java version to >= 11
+ WORKDIR $EXTERNAL_PATH
+ RUN cp $APP_PATH/download_oracle_jdk.sh $EXTERNAL_PATH/
+ RUN sh download_oracle_jdk.sh
+ RUN ln -sf ${EXTERNAL_PATH}/jdk-20.0.2/bin/java /opt/conda/bin/java
+ ENV JAVA_HOME=${EXTERNAL_PATH}/jdk-20.0.2
+
+ # anserini: needed for sparse retrieval/indexing
+ RUN conda install -c conda-forge maven
+ RUN mvn -v
+ RUN git clone https://github.com/castorini/anserini.git
+ RUN cd anserini && \
+     mvn clean -Dmaven.test.skip=true package appassembler:assemble
+
+ # copying the fatjar from anserini into the app is done at entrypoint
+ #COPY $EXTERNAL_PATH/anserini/target/anserini-*-fatjar.jar $APP_PATH/pyserini/resources/jars
+
+ ## entrypoint: call entrypoint.sh instead
+ WORKDIR $APP_PATH
+ #CMD ["bash"]
LICENSE.txt ADDED
@@ -0,0 +1,191 @@
+
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ Copyright 2019-2021 Pyserini authors
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
MANIFEST.in ADDED
@@ -0,0 +1,5 @@
+ include *.md
+ include requirements.txt
+ include pyserini/2cr/*.yaml
+ include pyserini/2cr/*.template
+ include pyserini/resources/jars/anserini-*-fatjar.jar
README.md CHANGED
@@ -1,12 +1,261 @@
  ---
- title: NetsPresso QA
- emoji: 📉
- colorFrom: red
- colorTo: pink
+ title: NetsPresso_QA
+ app_file: run_ralm_netspresso_doc.py
  sdk: gradio
- sdk_version: 3.43.2
- app_file: app.py
- pinned: false
+ sdk_version: 3.41.2
  ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # Text retrieval inference (indexing, search)
+
+ ## Installation
+ 1. Clone the repository
+
+ ```bash
+ git clone https://github.com/nota-github/np_app_text_retrieval_inference
+ ```
+
+ 2. Build and run the Docker image that defines the model environment
+ ```bash
+ cd np_app_text_retrieval_inference
+ docker build --cache-from notadockerhub/np_app_text_retrieval_inference:latest -t notadockerhub/np_app_text_retrieval_inference:latest -f ./Dockerfile .
+ docker run --name {container_name} --shm-size=8g -it --gpus '"device=0"' -v {your_code_dir}:/root/np_app_text_retrieval_inference -v /{your_data_dir}:/workspace/datasets notadockerhub/np_app_text_retrieval_inference:latest
+ ```
+
+ * During retrieval, the GPU is used only for BERT-based query encoding. This takes a small fraction of the total time, so running on CPU alone makes little difference in speed.
+ * When indexing your own documents, the passages are encoded once with BERT; in that case a GPU saves substantial time compared to a CPU.
+ * The current implementation supports a single GPU only; to use multiple GPUs, launch individual processes and run them in parallel.
+
+ * Most of the code is based on [pyserini](https://github.com/castorini/pyserini).
+
+ ## Dataset
+ ```
+ datasets
+ |-- dataset_name
+ |   |-- collection.jsonl
+ |   |-- queries.tsv
+ |   |-- qrels.txt (optional, for quantitative evaluation)
+ ```
+
+ * collection.jsonl: each line is `{"id": "PASSAGE_ID", "contents": "CONTENTS"}`.
+ * queries.tsv: each line is `QUERY_ID\tCONTENTS`.
+ * qrels.txt: each line is `QUERY_ID QUERY_TYPE PASSAGE_ID RELEVANCE_SCORE`. (A sketch of all three formats follows below.)
+
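For concreteness, here is a minimal sketch that writes a toy version of the three files above; the ids and text are hypothetical, and only the formats described in the bullets are assumed.

```python
import json
import os

# Hypothetical toy dataset illustrating the three formats above.
os.makedirs("datasets/dataset_name", exist_ok=True)

# collection.jsonl: one JSON object per line
with open("datasets/dataset_name/collection.jsonl", "w", encoding="utf-8") as f:
    f.write(json.dumps({"id": "p1", "contents": "Seoul is the capital of South Korea."}) + "\n")

# queries.tsv: QUERY_ID <tab> query text
with open("datasets/dataset_name/queries.tsv", "w", encoding="utf-8") as f:
    f.write("q1\tWhat is the capital of South Korea?\n")

# qrels.txt: QUERY_ID QUERY_TYPE PASSAGE_ID RELEVANCE_SCORE
# (QUERY_TYPE is conventionally the literal string "Q0" in TREC-style qrels)
with open("datasets/dataset_name/qrels.txt", "w", encoding="utf-8") as f:
    f.write("q1 Q0 p1 1\n")
```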
+ ## Recommended retriever
+ * sparse model: BM25
+ * dense model
+   * multi-lingual: mDPR, mContriever
+   * multi-vector: colBERT
+ * hybrid model: sparse (first-pass) + dense (reranking)
+ * The baseline model for encoding multilingual queries is `castorini/mdpr-question-nq`.
+ * A variety of pre-trained models per language can be found on the [HuggingFace model hub](https://huggingface.co/models).
+
+ ## Sample dataset
+ * [mrtydi-korean](https://github.com/castorini/mr.tydi)
+   * a benchmark dataset for multilingual retrieval covering 11 languages
+   * for Korean, it provides 1,496,126 passages and 421 test queries
+   * multi-field data with `title` and `text` can be used (in general, only `text` is available)
+   * the raw data and pre-built indexing results can be downloaded from the [data hub](https://www.notion.so/notaai/Data-Hub-V1-Current-Version-45b0b0aa62084b3e985244ebb264d444?pvs=4)
+     * @data_hub:/ssd2/np_app/Dataset_Hub/Semantic_Search/{corpus,indexes}
+
+ ## Procedure
+
+ ### 1. Indexing
+ * For fast retrieval, the index over the collection's passages is computed in advance
+ * A pre-built index can be used instead
+   * for mrtydi-korean, one is available on the data hub
+   * [pre-built indexes for benchmark corpus/model](https://github.com/castorini/pyserini/blob/master/docs/prebuilt-indexes.md)
+ </br>
+
+ * dense model
+ ```
+ python -m pyserini.encode \
+   input   --corpus /path/to/dataset/collection.jsonl \
+           --fields text \
+   output  --embeddings indexes/dataset_name/dense \
+           --to-faiss \
+   encoder --encoder huggingface_model_name_or_checkpoint_path \
+           --fields text \
+           --max-length $MAX_LENGTH \
+           --batch $BATCH_SIZE \
+           --fp16
+ ```
+ * huggingface_model_name_or_checkpoint_path: a model name on the huggingface model hub, or a checkpoint path
+   * e.g., for mrtydi: use `castorini/mdpr-passage-nq` (for query encoding at retrieval time: `castorini/mdpr-question-nq`)
+   * tied (vs. split) means the passage/query encoders are the same (vs. different)
+
+ * sparse model
+ ```
+ python -m pyserini.index.lucene \
+   --collection JsonCollection \
+   --input datasets/dataset_name/collection.jsonl \
+   --index /path/to/indexing/sparse \
+   --fields text \
+   --generator DefaultLuceneDocumentGenerator \
+   --language $LANG_CODE \
+   --threads $NUM_THREADS \
+   --storePositions --storeDocvectors --storeRaw
+ ```
+ * Language codes follow ISO 639-1 (e.g., en, ko, ja, zh)
+
+ * To use multiple fields, separate the fields with \n inside the "contents" text of the collection and pass the field names to --fields (i.e., --fields title text).
+   * mrtydi uses '\n\n' as the delimiter
+ ```
+ {"id": "5#1", "contents": "지미 카터\n\n지미 카터는 조지아주 섬터 카운티 플레인스 마을에서 태어났다. 조지아 공과대학교를 졸업하였다. 그 후 해군에 들어가 전함·원자력·잠수함의 승무원으로 일하였다. 1953년 미국 해군 대위로 예편하였고 이후 땅콩·면화 등을 가꿔 많은 돈을 벌었다. 그의 별명이 \"땅콩 농부\" (Peanut Farmer)로 알려졌다."}
+ ```
+
+ * MAX_LENGTH: maximum length of the positional embeddings (e.g., BERT: 512, DPR: 256)
+
+ * Outputs (dir: /path/to/indexing; a sketch for sanity-checking them follows below)
+   - docid: set of passage ids
+   - index: concatenation of (compressed) index vectors, binary file
+
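A minimal sketch for inspecting the two dense-index artifacts, assuming pyserini's on-disk layout (files named `index` and `docid` inside the embeddings directory; the path is a placeholder):

```python
import faiss  # faiss-cpu, installed in the Docker image above

# Load the binary faiss index and the parallel list of passage ids.
index = faiss.read_index("/path/to/indexing/dense/index")  # placeholder path
with open("/path/to/indexing/dense/docid") as f:
    docids = [line.strip() for line in f]

# The number of stored vectors should match the number of docids.
print(index.ntotal, len(docids))
```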
+ ### 2. Search
+ * Ranks the passages of an indexed collection for a given query
+
+ #### online
+ * with sparse indexing (a Python sketch of the same lookup follows the example results below)
+ ```
+ export QUERY="최초로 전기 자동차를 개발한 기업은 어디인가?"
+ python search_online.py --index_type sparse --index /path/to/indexing/sparse --query "$QUERY" --lang_abbr $LANG_CODE
+ ```
+ <details>
+ <summary>Example results</summary>
+ <pre><code>
+ 1 1830196#0 21.52590
+ {
+   "id" : "1830196#0",
+   "contents" : "창안 자동차(, )는 중화인민공화국의 자동차 제조 기업이다. 본사는 충칭 시에 있다. 디이 자동차, 둥펑 자동차, 상하이 자동차, 체리 자동차와 함께 중화인민공화국의 5대 자동차 제조 기업으로 여겨진다. 중화인민공화국의 자동차 제조 및 판매, 자동차 엔진 제품 제조 업체이다. 1862년 상하이 시에서 이홍장에 의해 설립되었으며 1950년대 말에 지프를 최초로 생산하면서 자동차 제조 기업이 되었다. 1996년 10월 31일 법인설립되었고 대표자는 장 바오린이다. 1984년에는 일본의 자동차 제조 기업인 스즈키와 제휴 관계를 수립했고 2001년에는 포드 모터 컴퍼니를 합병하면서 창안 포드 자동차(長安福特汽車)가 설립되었다. 2009년에는 하페이 자동차(哈飛汽車), 창허 자동차(昌河汽車)를 합병했다. 충칭 자동차 생산의 태반은 창안자동차가 담당하고 있다. 창안은 1959년 이후 차를 만들어온 국유기업으로 2차대전의 미군용 지프를 본떠 만든 군용트럭이 시발점이었다. 오늘날 라인업은 전기차 하나를 비롯한 17개 모델로 확대됐다. 7개 조립공장과 1개 엔진공장을 통해 한해 약 100만 대를 만든다. 여기에다가 창안은 포드, 푸조와 스즈키와도 합작하고 있어 한해 생산량은 300만 대에 이른다. 창안자동차는 글로벌연구개발시스템을 가동중에 있다. 현재 충칭, 베이징, 허베이, 허페이, 이탈리아 토리노, 일본 요코하마, 영국 버밍엄, 미국 디트로이트 등지에 연구개발센터를 설립하였다. 우리나라 한온시스템은 독일 폴크스바겐, 중국 창안자동차 등에 친환경차용 전동식 컴프레셔를 납품하고 있다."
+ }
+ 2 128660#8 19.02320
+ {
+   "id" : "128660#8",
+   "contents" : "1990년대에 들어선 직후 가솔린자동차에 의한 환경오염문제가 대두되었다. 1996년 제너럴 모터스(GM)사는 양산 전기차 1호로 볼 수 있는 'EV1' 전기자동차를 개발한다. 이 전기자동차는 미국 캘리포니아 지역에서 임대형식으로 보급된다. 그러나 GM사는 수요가 크지 않아 수익성이 낮다는 이유로 1년만에 전기자동차 'EV1'의 조립라인을 폐쇄한다."
+ }
+ 3 320611#0 18.99790
+ {
+   "id" : "320611#0",
+   "contents" : "기아 그랜토(Kia Granto) 또는 아시아 그랜토(Asia Granto)는 1995년에 아시아자동차가 생산한 대형 트럭이다. 기아차가 일본 히노 자동차와 기술 제휴해서 히노 프로피아의 차체로 개발한 대형 트럭이다. 기존의 AM 트럭의 후속 차종으로 개발한 트럭으로, 아시아자동차가 창사 30주년을 기념해서 개발한 트럭이다. 선택 사양으로 ABS 브레이크, 속도 제한 장치, 브레이크 라이닝 간극 자동 조정 장치, 오토 그리스, 튜브형 브레이크 파이프, 전기식 변속기 전환 장치 등을 탑재하였다. 1997년에 대한민국산 트럭 최초로 U자형 적재함을 탑재하였으며, 최고 출력 430마력의 FY(8×4) 23톤 덤프 트럭을 출시하였다. 1999년에 아시아자동차가 기아자동차에게 흡수 합병되었으며, 이후 기아자동차에서 생산하다가 2000년 8월에 배기 가스 규제를 충족시키지 못하여 후속 차종 없이 단종되면서, 기아자동차는 대형 트럭 사업을 스카니아 코리아에 양도함에 따라 대형 트럭의 시장에서 완전히 철수하였다."
+ }
+ 4 1226703#1 18.78540
+ {
+   "id" : "1226703#1",
+   "contents" : "1845년에 회사를 창립 했으며 독일의 전지형 기중기 생산하는 기업 중 가장 오래되었다. 1868년에 말이 끄는 소방차를 개발했으며 1890년에 최초로 증기 소방 차량을 생산했다. 1914년에 최초로 트럭과 특수 차량을 제작했다. 1918년에 안스바흐 자동차 공장과 뉘르베르크 자동차 공장을 합병했다. 1937년에 3축 트럭을 생산 했으며 1943년에 제2차 세계대전으로 기존 공장이 파괴되면서 새로운 공장을 건설했다. 1956년에 군사 목적을 위해 대형 트력과 장비를 개발했다. 1960년대에 최초로 기중기를 제작하기 시작 했으며 1970년대부터 1980년대까지 개발했다. 1985년에 최대 50톤 용량의 가진 전지형 기중기를 개발했다. 1990년 일본의 기중기 회사였던 타다노에 인수 되었다. 1991년에 일본 수출을 위해 전지형 기중기를 생산했다. 1995년에 회사 창립 150주년이 되었다. 2004년에 최초로 험지형 기중기를 제작한데 이어 2009년에 트럭 기중기를 제작했다. 2013년에 공장을 확장 및 이전하면서 현재에 이르고 있다."
+ }
+ 5 1045157#14 18.30410
+ {
+   "id" : "1045157#14",
+   "contents" : "2010년 3월 세계최초의 2000cc급 자동차를 위한 15Kw급 BLDC발전기 개발, 전기자동차의 주행거리 제한 극복 세계최초의 동급 내연이륜차의 성능을 능가하는 전기스쿠터 힐리스 모델출시 및 신차발표회 EV전시장 오픈"
+ }
+ 6 128661#7 17.92510
+ {
+   "id" : "128661#7",
+   "contents" : "1991년 11월 21일 현대자동차는 한국내에서는 최초의 전기자동차를 독자개발했다고 발표했다."
+ }
+ 7 1312657#1 17.78780
+ {
+   "id" : "1312657#1",
+   "contents" : "1939년에 이탈리아 나폴리 출신인 빈센조 앙헬레스 게르바지오()와 타예레스 나폴리()에 의해 설립했다. 제2차 세계대전 당시 스페인에서 트럭을 생산하기 위해 차체 및 용접을 했으나, 이후 샤시에 특장 트럭 캡 디자인을 개발했다. 1958년에 최초로 공장이 이전되면서 버스를 생산하기 시작했다. 1960년에 세계 최초로 2층 버스를 생산했다. 1962년에 생산 공장이 재이전 되면서 팩토리아스 나폴리스 SA()에 인수되었다. 이 회사는 상용차를 생산한 업체로 주로 버스와 트럭을 생산했다. 1966년에 바헤이로스 디젤 SA()에 매각했다. 1969년에 다시 크라이슬러에 마각이 되었지만 버스 제조 부문의 경우 1971년에 벨기에의 자동차 제조 기업인 반호르에 매각되었다. 1983년에 반호르가 최대 주주가 되었고 인수 최기에 반호르의 브랜드로 차량 생산을 했지만 이후 이스파노 카로세라 SAL()로 사명이 변경되었다. 1997년에 이탈리아의 자동차 제조 기업인 피닌파라나()와 제휴를 맺고 시내버스 모델인 아비토와 고속버스 모델인 디보를 개발하기 시작했다. 2000년 9월에 모로코의 수도 카사블랑카에 공장을 설립했다. 2005년에 인도의 자동차 제조 기업인 타타자동차가 21%의 지분을 획득한데 이어 2009년에 지분 79%를 인수하면서 자회사가 되었다. 2010년에 현재의 사명으로 변경이 되었다. 2013년 9월에 타타자동차는 사라고사 공장 폐쇄를 발표했다. 매출 하락과 미래 전망이 불투명으로 폐쇄 결정을 내렸다."
+ }
+ 8 128660#63 17.71300
+ {
+   "id" : "128660#63",
+   "contents" : "후지중공업과 마츠비시 자동차는 2005년 8월에 전기자동차의 개발 계획을 발표하였다. 이 2개 회사가 거의 중지 상태였던 전기자동차의 개발을 재개하고 있다. 2008년에 들어 닛산-르노 연합이 전기자동차로 본격 참여 방침을 표명하였고, 도요타도 2010년대 초반에 전기자동차를 출시하기로 발표하는 등 전기 자동차가 활성화 조짐을 보이고 있다."
+ }
+ 9 126891#2 17.63640
+ {
+   "id" : "126891#2",
+   "contents" : "2007년, 스웨덴의 대표 자동차 메이커인 볼보는 세계 최초로 에탄올 자동차를 제작해서 자동차 경주에 참가했다. 스웨덴에서는 가솔린 자동차의 도시내 사용을 줄이고, 시민들이 자전거로 생활할 수 있게끔 유도하고 있다. 또한 볼보에서 친환경 자동차를 적극적으로 개발하게 하고, 시민들에게는 친환경 자동차 구입비에 150만 원의 보조금을 지급하며, 연료비는 가솔린의 70% 가격에 주유할 수 있게 하는 등 적극적인 탈석유 정책을 시행하고 있다."
+ }
+ 10 128660#3 17.29680
+ {
+   "id" : "128660#3",
+   "contents" : "전기자동차는 디젤 엔진, 가솔린 엔진을 사용하는 오토사이클(정적사이클)방식의 자동차보다 먼저 고안 되었다. 1830년부터 1840년 사이에 영국 스코틀랜드의 사업가 앤더슨이 전기자동차의 시초라고 할 수 있는 세계 최초의 원유전기마차를 발명한다. 1835년에 네덜란드 크리스토퍼 베커는 작은 크기의 전기자동차를 만든다."
+ }
+ </code></pre>
+
+ </details>
+
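The same sparse lookup can also be done directly from Python. A minimal sketch using pyserini's `LuceneSearcher` (the index path and `k` are placeholders; calling `set_language` to match the language used at indexing time is an assumption based on the `--language` flag above):

```python
from pyserini.search.lucene import LuceneSearcher

searcher = LuceneSearcher("/path/to/indexing/sparse")  # placeholder path
searcher.set_language("ko")  # assumed: use the same analyzer language as at indexing time

hits = searcher.search("최초로 전기 자동차를 개발한 기업은 어디인가?", k=10)
for rank, hit in enumerate(hits, start=1):
    print(rank, hit.docid, f"{hit.score:.5f}")
```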
+ * with dense indexing (see the Python sketch below)
+ ```
+ python search_online.py --index_type dense --index /path/to/indexing/dense --query "$QUERY" --encoder huggingface_model_name_or_checkpoint_path --device $DEVICE
+ ```
+ * DEVICE: 'cpu' or 'cuda:$GPU_ID'
+ * Search currently supports a single GPU only; to use multiple GPUs, launch individual processes and run them in parallel.
+
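Dense search has an analogous Python path. A minimal sketch, assuming pyserini's `FaissSearcher`/`AutoQueryEncoder` API (the index path and encoder name are placeholders):

```python
from pyserini.search.faiss import FaissSearcher, AutoQueryEncoder

# The query encoder must match the one used for passage encoding (tied vs. split; see above).
encoder = AutoQueryEncoder("castorini/mdpr-question-nq", device="cpu")
searcher = FaissSearcher("/path/to/indexing/dense", encoder)  # placeholder path

hits = searcher.search("최초로 전기 자동차를 개발한 기업은 어디인가?", k=10)
for rank, hit in enumerate(hits, start=1):
    print(rank, hit.docid, f"{hit.score:.5f}")
```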
+ * with hybrid (first-pass: sparse, reranking: dense) indexing
+ ```
+ python search_online.py --index_type hybrid --index /path/to/indexing/sparse,/path/to/indexing/dense --query "$QUERY" --encoder huggingface_model_name_or_checkpoint_path --device $DEVICE --alpha $ALPHA_MULTIPLIED_ON_SPARSE_SCORE --normalization --lang_abbr $LANG_CODE
+ ```
+ * ALPHA_MULTIPLIED_ON_SPARSE_SCORE is tuned by a line search over (0, 2); the default is 0.5 (see the fusion sketch below).
+
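The fusion itself reduces to a weighted sum per passage, `score(d) = alpha * sparse(d) + dense(d)`. A minimal sketch of that rule, assuming min-max normalization of each score list (an assumption suggested by the `--normalization` flag above; missing scores count as 0):

```python
def minmax(scores):
    # Normalize a {docid: score} dict to [0, 1]; guard against a constant score list.
    lo, hi = min(scores.values()), max(scores.values())
    span = (hi - lo) or 1.0
    return {d: (s - lo) / span for d, s in scores.items()}

def hybrid_fuse(sparse, dense, alpha=0.5):
    # score(d) = alpha * sparse(d) + dense(d), over the union of retrieved docids.
    sparse, dense = minmax(sparse), minmax(dense)
    docs = set(sparse) | set(dense)
    fused = {d: alpha * sparse.get(d, 0.0) + dense.get(d, 0.0) for d in docs}
    return sorted(fused.items(), key=lambda x: -x[1])

# Hypothetical scores for illustration only.
print(hybrid_fuse({"p1": 21.5, "p2": 19.0}, {"p1": 0.93, "p3": 0.88}, alpha=0.5))
```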
+ #### batch
+
+ * with dense indexing
+ ```
+ python -m pyserini.search.faiss \
+   --encoder huggingface_model_name_or_checkpoint_path \
+   --index /path/to/indexing_dense \
+   --topics datasets/dataset_name/queries.tsv \
+   --output /path/to/runfile --batch $BATCH_SIZE --threads $NUM_THREADS \
+   --hits $TOPK --remove-query --device $DEVICE
+ ```
+ * BATCH_SIZE and NUM_THREADS default to 64 and 16, respectively.
+
+ * with sparse indexing
+ ```
+ python -m pyserini.search.lucene --bm25 \
+   --topics datasets/dataset_name/queries.tsv \
+   --index /path/to/indexing_sparse \
+   --hits $TOPK \
+   --language $LANG_CODE \
+   --output /path/to/runfile
+ ```
+
+ * hybrid model
+ ```
+ python -m pyserini.search.hybrid \
+   dense  --index /path/to/indexing_dense \
+          --encoder huggingface_model_name_or_checkpoint_path \
+          --device $DEVICE \
+   sparse --index /path/to/indexing_sparse \
+   fusion --alpha $ALPHA_MULTIPLIED_ON_SPARSE_SCORE \
+   run    --topics datasets/dataset_name/queries.tsv \
+          --output /path/to/runfile \
+          --threads $NUM_THREADS \
+          --batch-size $BATCH_SIZE \
+          --hits $TOPK
+ ```
+
+ * Output (dir: /path/to/runfile)
+   format: qid q_type pid topK score retrieval_type
+   example:
+ ```
+ 46 Q0 271267 1 2.134944 Faiss
+ 46 Q0 63734 2 2.118700 Faiss
+ 46 Q0 174045 3 2.110519 Faiss
+ ...
+ ```
+
+ ### 3. Evaluation (optional)
+ * The **qrels** file is the ground truth for quantitative evaluation, formatted as qid q_type pid relevance_score.
+ * The **runfile** is the batch search result, formatted as qid q_type pid topK score retrieval_type.
+ * The script below compares the qrels file against the runfile and computes metrics such as nDCG@10, MRR@100, and Recall@100 (a Python sketch of the same comparison follows the example output).
+ ```
+ python -m pyserini.eval.trec_eval -c -mndcg_cut.10 -mrecip_rank -mrecall.100 /path/to/qrels /path/to/runfile
+
+ recip_rank all 0.3628
+ recall_100 all 0.7158
+ ndcg_cut_10 all 0.3805
+ ```
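For reference, the comparison that trec_eval performs over these two file formats can be approximated in a few lines. A minimal sketch computing MRR and Recall@100 from the qrels and runfile described above (binary relevance is assumed; the paths are placeholders):

```python
from collections import defaultdict

def read_qrels(path):
    # qid q_type pid relevance_score -> {qid: {pid, ...}} for relevance > 0
    relevant = defaultdict(set)
    with open(path) as f:
        for line in f:
            qid, _, pid, rel = line.split()
            if int(rel) > 0:
                relevant[qid].add(pid)
    return relevant

def read_run(path):
    # qid q_type pid rank score retrieval_type -> {qid: [pid, ...]} in rank order
    ranked = defaultdict(list)
    with open(path) as f:
        for line in f:
            qid, _, pid, rank, score, tag = line.split()
            ranked[qid].append((int(rank), pid))
    return {q: [p for _, p in sorted(v)] for q, v in ranked.items()}

def mrr_and_recall(qrels, run, k=100):
    rr, rec = [], []
    for qid, rel in qrels.items():
        pids = run.get(qid, [])[:k]
        # reciprocal rank of the first relevant passage (0 if none retrieved)
        rr.append(next((1.0 / (i + 1) for i, p in enumerate(pids) if p in rel), 0.0))
        rec.append(len(rel & set(pids)) / len(rel))
    n = len(qrels)
    return sum(rr) / n, sum(rec) / n

# usage: mrr, recall = mrr_and_recall(read_qrels("/path/to/qrels"), read_run("/path/to/runfile"))
```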
 
 
__pycache__/search_online.cpython-38.pyc ADDED
Binary file (4.58 kB).
 
analyze_answer_inclusion_in_retrieval.py ADDED
@@ -0,0 +1,100 @@
+ import json
+ from pyserini.search.lucene import LuceneSearcher
+
+ from tqdm import tqdm
+
+ def convert_unicode_to_normal(data):
+     if isinstance(data, str):
+         return data.encode('utf-8').decode('utf-8')
+     elif isinstance(data, list):
+         assert isinstance(data[0], str)
+         return [sample.encode('utf-8').decode('utf-8') for sample in data]
+     else:
+         raise ValueError
+
+ K = 30
+ index_dir = "/root/indexes/index-wikipedia-dpr-20210120"  # lucene
+ runfile_path = f"runs/q=NQtest_c=wikidpr_m=bm25_k={K}.run"  # bm25
+ qafile_path = "/root/nota-fairseq/examples/information_retrieval/open_domain_data/NQ/qa_pairs/test.jsonl"
+ logging_path = "logging_q=NQ_c=wiki_including_ans.jsonl"
+
+ # define searcher over a pre-built index
+ searcher = LuceneSearcher(index_dir=index_dir)
+
+ # v2: read the QA pairs first (the runfile is sorted by query name)
+ print("read qa file")
+ pair_by_qid = {}
+ with open(qafile_path, 'r') as fr_qa:
+     for pair in tqdm(fr_qa):
+         pair_data = json.loads(pair)
+         qid, query, answers = pair_data["qid"], pair_data["query"], pair_data["answers"]  # str, str, list
+         pair_by_qid[qid] = {'query': query, 'answers': answers}
+
+ print("check whether retrieved passages include an answer")
+ qid_with_ans_in_retrieval = []
+ with open(runfile_path, 'r') as fr_run, open(logging_path, 'w') as fw_log:
+     for result in tqdm(fr_run):
+         fields = result.split(' ')
+         assert len(fields) == 6  # qid q_type pid k score engine
+
+         qid_, pid = fields[0], fields[2]
+         assert qid_ in pair_by_qid
+         query, answers = pair_by_qid[qid_]['query'], pair_by_qid[qid_]['answers']
+
+         # fetch the passage text for this pid
+         psg_txt = searcher.doc(pid)
+         psg_txt = psg_txt.raw()
+         psg_txt = json.loads(psg_txt)
+         psg_txt = psg_txt['contents'].strip()
+         psg_txt = convert_unicode_to_normal(psg_txt)
+
+         # check if the passage contains an answer
+         #if any([ans in psg_txt for ans in answers]):
+         for ans in answers:
+             if ans in psg_txt:
+                 log_w = {
+                     "qid": qid_,
+                     "pid": pid,
+                     "query": query,
+                     "answer": ans,
+                     "passage": psg_txt
+                 }
+                 fw_log.write(json.dumps(log_w, ensure_ascii=False) + '\n')
+
+                 if qid_ not in qid_with_ans_in_retrieval:
+                     qid_with_ans_in_retrieval.append(qid_)
+                 break  # no need to check for further answers in the same passage
+
+
+ print(f"#qid in test set: {len(pair_by_qid)}, #qid having an answer in retrieval (BM25, K={K}): {len(qid_with_ans_in_retrieval)}, Recall = {len(qid_with_ans_in_retrieval) / len(pair_by_qid) * 100}")
+
+ # v1
+ """
+ with open(runfile_path, 'r') as fr_run, open(qafile_path, 'r') as fr_qa:
+     for pair in tqdm(fr_qa):
+         pair_data = json.loads(pair)
+         qid, query, answers = pair_data["qid"], pair_data["query"], pair_data["answers"]  # str, str, list
+
+         for k in range(K):
+             result = fr_run.readline()
+             print(result)
+
+             fields = result.split(' ')
+             assert len(fields) == 6  # qid q_type pid k score engine
+
+             qid_, pid = fields[0], fields[2]
+
+             assert qid == qid_, f"qid={qid}, qid_={qid_} should be same"
+
+             # fetch the passage text for this pid
+             psg_txt = searcher.doc(pid)
+             psg_txt = psg_txt.raw()
+             psg_txt = json.loads(psg_txt)
+             psg_txt = psg_txt['contents'].strip()
+             psg_txt = convert_unicode_to_normal(psg_txt)
+
+             # check if the passage contains an answer
+             if any([ans in psg_txt for ans in answers]):
+                 import pdb
+                 pdb.set_trace()
+ """
collections/.gitkeep ADDED
@@ -0,0 +1 @@
+ # This is the default directory for document collections. Placeholder so that directory is kept in git.
convert_cast22_eval_json_to_queries_tsv.py ADDED
@@ -0,0 +1,24 @@
+ import json
+
+ from tqdm import tqdm
+
+ ### read: CAsT 22 eval json
+ eval_json_path = "/root/Corpus/CAsT22_eval_queries/cqr_inferred_results.json"
+
+ # read data
+ with open(eval_json_path, 'r') as fr:
+     data = json.load(fr)
+
+ # write: queries.tsv (format: {qid}\t{query})
+ eval_queries_path = "/root/Corpus/CAsT22_eval_queries/queries_cqr_result.tsv"
+ with open(eval_queries_path, 'w') as fw:
+     for sample in tqdm(data):
+         conv_id = sample['number']
+         for turn in sample['turn']:
+             turn_id = turn['number']
+             automatic_rewritten_utterance = turn['automatic_rewritten_utterance']
+
+             q_id = f"{conv_id}_{turn_id}"
+
+             fw.write(f"{q_id}\t{automatic_rewritten_utterance}\n")
download_oracle_jdk.sh ADDED
@@ -0,0 +1,15 @@
+ #!/bin/bash
+
+ ARCH=$(uname -m)
+ echo "Architecture: ${ARCH}"
+
+ if [ "$ARCH" = "x86_64" ]
+ then
+     JDK_URL="https://download.oracle.com/java/20/latest/jdk-20_linux-x64_bin.tar.gz"
+ else
+     JDK_URL="https://download.oracle.com/java/20/latest/jdk-20_linux-aarch64_bin.tar.gz"
+ fi
+
+ wget --no-check-certificate --no-cookies --header "Cookie: oraclelicense=accept-securebackup-cookie" "${JDK_URL}"
+ tar -xzf jdk-*.tar.gz
+ rm jdk-*.tar.gz
encode_8shards_cuda0.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+
+ #SHARD_NUM=8
+ SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
+ SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"
+
+ START_IDX=0
+ END_IDX=1
+ SHARD_NUM=8
+ GPU_ID=0
+
+ for SHARD_IDX in $(seq $START_IDX $END_IDX); do
+     echo "encoding for SHARD_IDX = $SHARD_IDX"
+     python -m pyserini.encode \
+         input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
+                 --fields text \
+                 --shard-id $SHARD_IDX \
+                 --shard-num $SHARD_NUM \
+         output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
+                 --to-faiss \
+         encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
+                 --fields text \
+                 --device cuda:$GPU_ID \
+                 --batch 128 \
+                 --fp16
+ done
encode_8shards_cuda1.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+
+ #SHARD_NUM=8
+ SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
+ SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"
+
+ START_IDX=2
+ END_IDX=3
+ SHARD_NUM=8
+ GPU_ID=1
+
+ for SHARD_IDX in $(seq $START_IDX $END_IDX); do
+     echo "encoding for SHARD_IDX = $SHARD_IDX"
+     python -m pyserini.encode \
+         input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
+                 --fields text \
+                 --shard-id $SHARD_IDX \
+                 --shard-num $SHARD_NUM \
+         output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
+                 --to-faiss \
+         encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
+                 --fields text \
+                 --device cuda:$GPU_ID \
+                 --batch 128 \
+                 --fp16
+ done
encode_8shards_cuda2.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+
+ #SHARD_NUM=8
+ SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
+ SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"
+
+ START_IDX=4
+ END_IDX=5
+ SHARD_NUM=8
+ GPU_ID=2
+
+ for SHARD_IDX in $(seq $START_IDX $END_IDX); do
+     echo "encoding for SHARD_IDX = $SHARD_IDX"
+     python -m pyserini.encode \
+         input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
+                 --fields text \
+                 --shard-id $SHARD_IDX \
+                 --shard-num $SHARD_NUM \
+         output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
+                 --to-faiss \
+         encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
+                 --fields text \
+                 --device cuda:$GPU_ID \
+                 --batch 128 \
+                 --fp16
+ done
encode_8shards_cuda3.sh ADDED
@@ -0,0 +1,26 @@
+ #!/bin/bash
+
+ #SHARD_NUM=8
+ SHARDED_CORPUS_PREFIX="/root/Corpus/CAsT22_msmarcov2_kilt_flattened_8shards"
+ SHARDED_EMBEDDING_PREFIX="/ssd3/geonminkim/indexes/CAsT_21_22_msmarcov2_kilt/dense"
+
+ START_IDX=6
+ END_IDX=7
+ SHARD_NUM=8
+ GPU_ID=3
+
+ for SHARD_IDX in $(seq $START_IDX $END_IDX); do
+     echo "encoding for SHARD_IDX = $SHARD_IDX"
+     python -m pyserini.encode \
+         input   --corpus $SHARDED_CORPUS_PREFIX/shard_$SHARD_IDX \
+                 --fields text \
+                 --shard-id $SHARD_IDX \
+                 --shard-num $SHARD_NUM \
+         output  --embeddings $SHARDED_EMBEDDING_PREFIX/shard_$SHARD_IDX \
+                 --to-faiss \
+         encoder --encoder castorini/tct_colbert-v2-msmarco-cqe \
+                 --fields text \
+                 --device cuda:$GPU_ID \
+                 --batch 128 \
+                 --fp16
+ done
entrypoint.sh ADDED
@@ -0,0 +1,23 @@
+ #!/bin/bash
+
+ HOME=/root
+
+ # check path
+ echo "check path for JAVA_HOME"
+ echo $JAVA_HOME
+
+ # additional environment variable
+ export PYSERINI_CACHE=/root/indexes
+
+ # APP_PATH: assumed to be volume-mounted with the host
+ APP_PATH=$HOME/np_app_text_retrieval_inference
+
+ # EXTERNAL_PATH: holds system-specific files; not volume-mounted with the host
+ EXTERNAL_PATH=/workspace/external
+
+ # copy jar file
+ cp $EXTERNAL_PATH/anserini/target/anserini-*-fatjar.jar $APP_PATH/pyserini/resources/jars
+
+ # launch /bin/bash
+ cd $APP_PATH
+ exec /bin/bash
flagged/log.csv ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 질의,검색 결과,flag,username,timestamp
2
+ "몸에 반점이 있다면 어떤 질병일 가능성이 있을까?
3
+ ","문서 1
4
+ 정확한 원인은 밝혀져 있지 않으나 유전적, 환경적 요인이 복합되어 작용하는 것으로 짐작된다. 이 질병을 앓는 가족 구성원이 있다면 본인에게도 영향을 받을 가능성이 있다. 영향을 받은 일란성 쌍둥이가 있다면 30%의 확률로 다른 쌍둥이도 이 질병을 가지고 있을 가능성이 있다. 이 질병은 스트레스, 감염, 출생 시에 발병할 수 있다. 제1형 당뇨병, 류마티스 관절염과 같은 자가면역 질환을 가진 환자들이 영향을 받을 가능성이 있다. 흡연은 이 질병의 위험성을 높이며 안구 문제를 더 악화시킬 수 있다. 이 질병은 TSI라는 항체에서 비롯하며 이는 갑상샘 자극 호르몬과 영향도가 비슷하다. 이러한 항체들은 갑상샘이 갑상샘 호르몬을 과도하게 생산하도록 유도한다. 질병 확인을 위한 혈액 검사, 방사성 요오드 섭취를 통한 증상에 기반하여 진단한다.
5
+
6
+ 문서 2
7
+ 1603년 37세의 나이로 사망하였고, 아들 아흐메트 1세가 왕위를 이어받았다. 사망 원인은 정확하게 알려져 있지 않으며, 자연사나 질병일 가능성이 있다.
8
+
9
+ 문서 3
10
+ 이상의 연구 결과들은 오직 가스 행성 및 갈색 왜성이 존재하지 않음을 밝혀줬을 뿐 지구나 금성처럼 작은 행성이 존재할 가능성은 아직 남아 있다. 만약 뜨거운 목성이 항성 가까이 있었다면 이들은 항성 근처를 돌던 지구형 행성의 궤도를 망가뜨려 생명체 발생 가능성을 낮췄을 것이다. 따라서 가스 행성이 없다는 것은 지구 비슷한 행성이 존재할 가능성을 높여 주는 사실이 된다. 통상적으로 밝혀진 연구에 따르면 중원소 함량이 높은 별 주위에는 행성이 있을 확률이 높고 그렇지 않은 별 주위에는 행성이 있을 확률이 적은 것으로 밝혀졌다. 두꺼운 먼지 원반이 있다는 사실은 항성 가까이에 암석 행성이 존재할 가능성을 높이는 것은 사실이다. 그러나 이는 동시에 그 암석 행성이 폭격을 받을 가능성이 높음을 알려주는 사실이기도 하다. 만약 행성이 발견된다면 이후의 연구 방향은 이 행성에 생명체가 살 수 있는 물과 대기가 존재하는지를 충분한 해상도의 망원경을 통해 알아내는 것이 된다. 지구의 산소가 생명체 존재를 나타내는 척도가 되는 것처럼 무기성의 대기 조성은 원시 생명체가 존재함을 나타내는 징표가 될 수 있다.
11
+
12
+ 문서 4
13
+ 치매나 백혈병, 당뇨, 파킨슨병과 같은 난치병들 중에는 세포의 변이나 사멸로 인한 질병이 대다수이다. 이러한 퇴행성 질병의 경우 세포 치료법을 이용해여 치료하는 경우가 많다. 특이점주의자들에 따르면 줄기세포 연구와 같은 세포 연구는 생명 공학 연구의 일부분이며 유전자 DNA 지도를 완벽하게 구조화할 수 있다면 세포분화 치료 혹은 세포복제 치료를 통해 환자 자신의 DNA를 지니고 텔로미어가 연장된 세포를 공급할 수 있을 것이라고 본다. 예컨데 현재 당뇨병 치료에 쓰이는 거부반응 제어제가 위험한 부작용을 일으킬 가능성이 있는 반면 이러한 세포 치료는 부작용 가능성을 급격히 낮출 수 있다. 이러한 세포 치료는 단순히 난치병의 치료에만 그치는 것이 아니라, 교차분화 기술을 이용한 새로운 신체 기관으로의 교체를 가능하게 한다.
14
+
15
+ 문서 5
16
+ 컴퓨터에 의한 시뮬레이션에는, 보이드로 불리는 틈새나 버그가 있고, 내부로부터도 아는 경우가 있을 지도 모르다. 그러한 것을 찾아내 검증할 수 있다면, 거기에 따라 모의현실의 내부에 있는 것을 증명할 수 있을 가능성이 있다. 그러나, 물리 법칙에 반하는 일은, 그 밖에도 설명할 수 있는 가설을 생각할 수 있다(신 등). 영화 《매트릭스》로 그려진 것처럼, 기시감 등의 일상적인 기묘한 체험도 어떠한 버그로서 설명할 수 있을 가능성이 있다.
17
+
18
+ 문서 6
19
+ 예를 들면 대기가 없는 어떤 행성 깊은 단층 속 그늘진 곳이나 화산 동굴 속에 생태계가 형성되어 있을 가능성이 있다. 저명했던 천문학자 칼 세이건은 이 개념을 태양계에 적용하여, 1976년 논문을 통해 목성의 대기 상층부에 유기체가 살고 있을 가능성을 피력했다. 그러나 목성에는 딱딱한 표면이 없기 때문에 생명체가 존재할 가능성은 거의 없다.
20
+
21
+ 문서 7
22
+ 널리 알려져 있다는 사실이 반드시 참임을 보증하는 것은 아니므로 이런 주장은 논리적으로 오류이다. 개인의 신념이 잘못되어 있을 가능성이 있다면 다수의 인간의 신념도 잘못되어 있을 가능성이 있다. 예를 들면, 75%의 사람이 A라고 대답했다고 해도 25%의 확률로 A가 아닐 가능성도 있다. 이 확률이 어떻게 되든 다수가 옳다는 것은 논리적이라고 할 수 없다. 만약 만장일치라고 해도, 표본의 수가 불충분할지도 모르고, 혹은 그 사람들이 모르는 사실이 존재하고 있어서 그 사실을 알면 결과가 바뀔지도 모른다.
23
+
24
+ 문서 8
25
+ ""(a and b)""와 같은 논리식을 계산한다고 하면 ""a""항이 거짓인 경우에, ""b""항을 계산하지 않아도 전체 식의 답을 알 수 있다. ""(a or b)""에서 ""a""항이 참인 경우에도 마찬가지이다. 여기서 항이 복잡한 식이면 이점이 많고, 식에서 결과가 참이나 거짓일 가능성과 계산의 비용에 따라 어떤 항이 먼저 계산되어야 좋은지 알 수 있다. 따라서 ""(a or b or c)""와 같은 식에서 ""a""항이 참값을 가질 가능성이 많다면, 전체 식을 쉽게 계산할 수 있다. 이런 가능성을 보장하기 위해, 컴파일러는 더 계산해야 할 것인지, 다른 항을 지름길 계산 해야 할 것인지를 검사하기도 한다. 이런 검사는 계산을 줄이는 것을 실패할 경우나 꼭 필요한 경우 무조건 전체 식을 계산해야 할 때 시간이 더 많이 걸리게 된다.
26
+
27
+ 문서 9
28
+ 생물학적으로는 인간의 뇌에 필요한 유전 정보를 가지는 인공적인 게놈을 적당한 숙주의 세포에 짜넣는 것으로 인공적으로 생명을 만드는 것도 가능할지도 모른다고 생각되며, 그러한 인공생명체는 의식을 가질 가능성이 높다. 그렇지만 그 생명체 안의 어떤 속성이 의식을 낳는 것일까? 비슷한 것을 비생물학적인 부품에서 만들 수 없는 것인지? 컴퓨터를 설계하기 위한 기술로 그러한 의식체를 낳을 수 없을까? 그러한 행위는 윤리적으로 문제 없을까?와 같은 여러 문제를 안고 있다.
29
+
30
+ 문서 10
31
+ 3부 리그",,,2023-06-16 00:57:35.174375
32
+ "모델 경량화 및 최적화와 관련하여 Netspresso bot에게 물어보세요.
33
+ 예를들어
34
+
35
+ - Why do I need to use Netspresso?
36
+ - Summarize how to compress the model with netspresso.
37
+ - Tell me what the pruning is.
38
+ - What kinds of hardware can I use with this toolkit?
39
+ - Can I use YOLOv8 with this tool? If so, tell me the examples.","1. Why do I need to use Netspresso?
40
+ Netspresso is a machine learning pipeline that provides tools and methods for model compression and optimization. By using Netspresso, you can reduce the size and computational cost of your models, making them more efficient for deployment on resource-constrained devices. It also allows you to fine-tune and optimize your models for better performance. Overall, Netspresso helps you achieve lightweight and optimized models for real-time applications.
41
+
42
+ 2. Summarize how to compress the model with Netspresso.
43
+ To compress a model with Netspresso, you can follow these steps:
44
+ - Step 1: Prepare your model.
45
+ - Step 2: Upload your model to Netspresso.
46
+ - Step 3: Choose the compression method (e.g., structured pruning, filter decomposition).
47
+ - Step 4: Configure the compression settings (e.g., pruning ratio, compression algorithm).
48
+ - Step 5: Start the compression process.
49
+ - Step 6: Check the compression result and download the compressed model.
50
+
51
+ 3. Tell me what pruning is.
52
+ Pruning is a model compression technique that involves removing unnecessary parameters or structures from a complex model to make it more compact and efficient. In the context of Netspresso, pruning can be done using structured pruning, which removes entire neurons, filters, or channels from the model. This results in a simplified model that can be deployed without any specific hardware or software requirements for acceleration.
53
+
54
+ 4. What kinds of hardware can I use with this toolkit?
55
+ Netspresso supports various hardware platforms for model deployment. Some of the supported hardware includes NVIDIA Jetson family devices, which can be used with TensorRT for accelerated inference. Other supported platforms include TensorFlow Lite and OpenVINO, which provide optimization for different hardware architectures. The specific hardware compatibility may depend on the compression method and the target framework used in Netspresso.
56
+
57
+ 5. Can I use YOLOv8 with this tool? If so, tell me the examples.
58
+ Yes, you can use YOLOv8 with Netspresso for model compression and optimization. YOLOv8 is a popular object detection model, and Netspresso provides support for object detection models. You can follow the steps in the Netspresso documentation to prepare, upload, compress, and optimize your YOLOv8 model using the available methods and tools in Netspresso. The documentation provides examples and guidelines for using Netspresso with different models, including YOLOv8.","Source 0
59
+ Step 4: Package model (beta)
60
+ Jump to ContentDocumentationHomeDocumentationv1.3.0DocumentationLog InLog InMoon (Dark Mode)Sun (Light Mode)v1.3.0HomeDocumentationSearchINTRODUCTIONWelcomeWhat is NetsPresso?Machine Learning Pipeline with NetsPressoAdvantages of NetsPressoBENCHMARKSModel Searcher- Object DetectionModel Compressor- Image Classification- Object Detection- Semantic Segmentation- Super ResolutionUSE CASESFree People Detection ModelsFree Vehicle Detection ModelsNETSPRESSO MODEL SEARCHERFeatures & Scope of supportStep 1: Prepare datasetStep 2: Upload datasetStep 3: Create project (Quick Search)Step 3: Create project (Retraining)Step 4: Check the project result and download a modelNETSPRESSO MODEL COMPRESSORFeatures & Scope of supportMethod: Structured PruningMethod: Filter DecompositionSupported modelsSupported ONNX operatorsStep 1: Prepare modelStep 2: Upload modelStep 3: Compress model (Automatic Compression)Step 3: Compress model (Advanced Compression)Step 4: Check the compression result and download a modelStep 5: Retrain the compressed modelNETSPRESSO MODEL LAUNCHERFeatures & Scope of supportStep 1: Prepare modelStep 2: Upload modelStep 3: Convert model (beta)Step 4: Package model (beta)Personal serverRequirementsConnect Personal ServerRELEASE NOTESNetsPresso ReleasesFAQAbout Credit?Powered by Step 4: Package model (beta)Suggest EditsPackage the model so that it can be deployed directly to the device.
61
+ 1. Go to Package page
62
+ Select New package at the drop-down menu that appears when you click Download button.
63
+ 2. Package the model
64
+ Enter the package name and select a base model to package.
65
+ Please note that the package name will be the library name and the name cannot be changed after packaging.
66
+ You can include pre/post processing codes (.py) with the model for the package (optional).
67
+ Download the pre/post processing code example and modify for your use cases.
68
+ 3. Download package file and run the package
69
+ Packaged file will be automatically downloaded.
70
+ To run the package, use the code written below. {package_name} must be changed to your package name.
71
+ Pythonfrom np_{package_name}.models.model import NPModel
72
+ NPModel.initialize(num_threads=1) # Initialize
73
+ npmodel = NPModel()
74
+ image_path = ""./test.jpg"" #Image path
75
+ print(npmodel.run(image_path)) # Inference
76
+ NPModel.finalize() # Memory management
77
+ Updated about 1 month ago Table of Contents
78
+ 1. Go to Package page
79
+ 2. Package the model
80
+ 3. Download package file and run the package
81
+
82
+ ######################################################
83
+
84
+ Source 1
85
+ Step 3: Convert model (beta)
86
+ Jump to ContentDocumentationHomeDocumentationv1.3.0DocumentationLog InLog InMoon (Dark Mode)Sun (Light Mode)v1.3.0HomeDocumentationSearchINTRODUCTIONWelcomeWhat is NetsPresso?Machine Learning Pipeline with NetsPressoAdvantages of NetsPressoBENCHMARKSModel Searcher- Object DetectionModel Compressor- Image Classification- Object Detection- Semantic Segmentation- Super ResolutionUSE CASESFree People Detection ModelsFree Vehicle Detection ModelsNETSPRESSO MODEL SEARCHERFeatures & Scope of supportStep 1: Prepare datasetStep 2: Upload datasetStep 3: Create project (Quick Search)Step 3: Create project (Retraining)Step 4: Check the project result and download a modelNETSPRESSO MODEL COMPRESSORFeatures & Scope of supportMethod: Structured PruningMethod: Filter DecompositionSupported modelsSupported ONNX operatorsStep 1: Prepare modelStep 2: Upload modelStep 3: Compress model (Automatic Compression)Step 3: Compress model (Advanced Compression)Step 4: Check the compression result and download a modelStep 5: Retrain the compressed modelNETSPRESSO MODEL LAUNCHERFeatures & Scope of supportStep 1: Prepare modelStep 2: Upload modelStep 3: Convert model (beta)Step 4: Package model (beta)Personal serverRequirementsConnect Personal ServerRELEASE NOTESNetsPresso ReleasesFAQAbout Credit?Powered by Step 3: Convert model (beta)Suggest EditsConvert the model format according to the target device.
87
+ 1. Go to Convert page
88
+ Click the Convert button on Models page.
89
+ 2. Covert model
90
+ Enter the name and memo for the converted model. Select a base model to be converted and the target hardware to benchmark the model.
91
+ Depending on the framework of the base model, the options available for converting are different.
92
+ Models built with Model Searcher → TensorRT, TensorFlow Lite, OpenVINO
93
+ Custom models
94
+ ONNX → TensorRT, TensorFlow Lite, OpenVINO
95
+ Click the Start converting button to convert the model. (Converting for the NVIDIA Jetson family (TensorRT) may take up to 1 hour.)
96
+ 3. Check the converting result
97
+ Converted model will be displayed on the Models page with performance benchmarks on the selected target hardware.Updated 6 months ago Table of Contents
98
+ 1. Go to Convert page
99
+ 2. Covert model
100
+ 3. Check the converting result
101
+
102
+ ######################################################
103
+
104
+ Source 2
105
+ Method: Structured Pruning
106
+ Jump to ContentDocumentationHomeDocumentationv1.3.0DocumentationLog InLog InMoon (Dark Mode)Sun (Light Mode)v1.3.0HomeDocumentationSearchINTRODUCTIONWelcomeWhat is NetsPresso?Machine Learning Pipeline with NetsPressoAdvantages of NetsPressoBENCHMARKSModel Searcher- Object DetectionModel Compressor- Image Classification- Object Detection- Semantic Segmentation- Super ResolutionUSE CASESFree People Detection ModelsFree Vehicle Detection ModelsNETSPRESSO MODEL SEARCHERFeatures & Scope of supportStep 1: Prepare datasetStep 2: Upload datasetStep 3: Create project (Quick Search)Step 3: Create project (Retraining)Step 4: Check the project result and download a modelNETSPRESSO MODEL COMPRESSORFeatures & Scope of supportMethod: Structured PruningMethod: Filter DecompositionSupported modelsSupported ONNX operatorsStep 1: Prepare modelStep 2: Upload modelStep 3: Compress model (Automatic Compression)Step 3: Compress model (Advanced Compression)Step 4: Check the compression result and download a modelStep 5: Retrain the compressed modelNETSPRESSO MODEL LAUNCHERFeatures & Scope of supportStep 1: Prepare modelStep 2: Upload modelStep 3: Convert model (beta)Step 4: Package model (beta)Personal serverRequirementsConnect Personal ServerRELEASE NOTESNetsPresso ReleasesFAQAbout Credit?Powered by Method: Structured PruningSuggest EditsModel Compression
107
+ The goal of model compression is to achieve a model that is simplified from the original without performance deterioration. By compressing the large model, the user can reduce the storage and computational cost and allow to use in real-time applications.
108
+ NetsPresso supports the following compression methods.
109
+ Structured Pruning
110
+ Filter Decomposition
111
+ This page describes for Structured Pruning.
112
+ What is ""Pruning""?
113
+ Pruning is the process of removing individual or groups of parameters from a complex model to make it faster and more compact. This compressing procedure is divided into unstructured pruning and structured pruning by the pruning objects.
114
+ Unstructured Pruning
115
+
116
+ : Removes individual parameters and returns a sparse model, which requires an additional device to be accelerated.
117
+
118
+ Structured Pruning
119
+
120
+ : Removes entire neurons, filters, or channels and returns a model, which does not require any particular hardware or software to be accelerated.
121
+
122
+ The goal of pruning is to reduce the computational resources and accelerate the model by removing unnecessary filters (Model Compressor only supports structured pruning. Unstructured pruning will be published in near future.).
123
+   However, the fine-tuning process is necessary to compensate for the loss of accuracy.
124
+ Structured Pruning
125
+ Supported functions
126
+ Pruning in Model Compressor provides two pruning functions (Pruning by Channel Index / Criteria) and one recommendation (SLAMP) to fulfill the user's demand on model compression.
127
+ Pruning by Channel Index
128
+
129
+   Removes the filters that a user wants to. If the selected filters are redundant or less important, it will return a better performing model.
130
+
131
+ Pruning by Criteria
132
+
133
+ L2 Norm
134
+
135
+ : L2-Norm is used to represent the importance of the corresponding filter. In other words, this method prunes filters based on the magnitude of weights.
136
+
137
+ Nuclear Norm
138
+
139
+ : The nuclear norm is the sum of the singular values representing the energy. It computes the nuclear norm on the feature map to determine the filter's relevance. For this reason, a portion of the dataset is needed. For more detail, please refer to the following paper.
140
+
141
+ Seul-Ki Yeom, Kyung-Hwan Shim, and Jee-Hyun Hwang. Toward compact deep neural networks via energy-aware pruning. arXiv preprint, 2021.
142
+
143
+ Geometric Median
144
+
145
+ : Geometric Median is used to measure the redundancy of the corresponding filter and remove redundant filters. For more detail, please refer to the following paper.
146
+
147
+ Yang He, Ping Liu, Ziwei Wang, Zhilan Hu, and Yi Yang. Filter pruning via geometric median for deep convolutional neural networks acceleration. In CVPR, 2019.
148
+
149
+ Normalization
150
+
151
+ The distribution and magnitude of the layers are varied, it is vital to compare those different distributions from the same perspective. For this reason, all of the criterion values are normalized by layer as follows.
152
+
153
+ ""Recommendation"" in Model Compressor
154
+
155
+   The ""Recommendation"" enables a so-called global pruning, which allocates the pruning ratio for each layer at ease. Current version only supports SLAMP.
156
+
157
+ SLAMP (Structured Layer-adaptive Sparsity for the Magnitude-based Pruning)
158
+
159
+ SLAMP is inspired by "Layer-adaptive Sparsity for the Magnitude-based Pruning" (LAMP) from ICLR 2021.
160
+
161
+ LAMP itself is an unstructured pruning method; here it is adapted to measure layer-wise importance for structured pruning.
162
+
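+ A minimal sketch of that adaptation, assuming the LAMP score from the ICLR 2021 paper is applied to per-filter norms instead of individual weights (an assumption on our part; the exact SLAMP formulation is not spelled out on this page, and lamp_scores is an illustrative name):
+
+ import torch
+
+ def lamp_scores(filter_norms: torch.Tensor) -> torch.Tensor:
+     # LAMP score of a magnitude w: w^2 divided by the sum of w'^2 over
+     # all magnitudes w' >= w, computed within one layer.
+     sq = filter_norms.pow(2)
+     order = torch.argsort(sq)  # ascending
+     sorted_sq = sq[order]
+     # tail[i] = sum of sorted_sq[i:], via a reversed cumulative sum
+     tail = torch.flip(torch.cumsum(torch.flip(sorted_sq, [0]), 0), [0])
+     scores = torch.empty_like(sq)
+     scores[order] = sorted_sq / tail
+     return scores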
163
+ Normalization function
164
+
165
+ The following normalization function is applied to the criterion values above.
166
+
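+ The normalization formula itself appears as an image on the original page and is not reproduced in this dump; a common per-layer choice, stated here only as an assumption rather than the exact function used, is min-max scaling:
+
+ import torch
+
+ def normalize_per_layer(scores: torch.Tensor) -> torch.Tensor:
+     # Map one layer's criterion values into [0, 1] so that scores from
+     # layers with different magnitudes become comparable.
+     lo, hi = scores.min(), scores.max()
+     return (scores - lo) / (hi - lo + 1e-12)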
167
+ What you can do with Model Compressor
168
+ Choose either "Pruning by Channel Index" or "Pruning by Criteria" according to your purpose.
169
+
170
+ "Pruning by Channel Index" is recommended for experts who already know which filters are unnecessary.
171
+ "Pruning by Criteria" is recommended for users who want to prune specific layers by a certain ratio, and for beginners in model compression.
172
+
173
+ To use "Pruning by Channel Index"
174
+ Check "Pruning by Channel Index".
175
+ Check the layers to be pruned.
176
+ Insert the filter indices to be pruned (e.g., 105, 8, 9, 11-13); a parsing sketch follows below.
177
+
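+ A small sketch of how such an index string might be parsed (illustrative only, not NetsPresso code; parse_index_spec is a hypothetical helper):
+
+ def parse_index_spec(spec: str) -> list:
+     # "105, 8, 9, 11-13" -> [105, 8, 9, 11, 12, 13]
+     indices = []
+     for token in spec.split(','):
+         token = token.strip()
+         if '-' in token:
+             lo, hi = map(int, token.split('-'))
+             indices.extend(range(lo, hi + 1))
+         else:
+             indices.append(int(token))
+     return indices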
178
+ To use "Pruning by Criteria"
179
+
180
+ Check one of the criteria (e.g., L2 Norm) to calculate the importance score of each filter.
181
+ To prune specific layers by a certain ratio:
182
+
183
+ Check the layers to be pruned.
184
+ Insert the ratio to be pruned (e.g., 0.2).
185
+
186
+ To compress all layers automatically for a given overall ratio:
187
+
188
+ Press "Recommendation" and insert the ratio (e.g., 0.8) to compress the model (see the sketch below).
189
+
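+ To make the difference from per-layer pruning concrete, here is a toy sketch of global pruning for a given overall ratio, assuming per-layer importance scores (e.g., the SLAMP scores above) are already computed; global_keep_mask is an illustrative name, not the NetsPresso implementation:
+
+ import torch
+
+ def global_keep_mask(layer_scores: dict, ratio: float) -> dict:
+     # Prune the globally lowest-scoring fraction `ratio` of all filters,
+     # which implicitly allocates a different pruning ratio to each layer.
+     all_scores = torch.cat(list(layer_scores.values()))
+     threshold = torch.quantile(all_scores, ratio)
+     return {name: s > threshold for name, s in layer_scores.items()}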
190
+ Verification of the Structured Pruning
191
+ Pruning in Model Compressor removes the specified filters while preserving the model's previous knowledge. To verify the pruning method in Model Compressor, we assess the categorical accuracy of the returned compressed model and of its fine-tuned counterpart.
192
+
193
+ CIFAR-100 Accuracy of the Compressed Model and Fine-tuned Model
194
+ The compressed models are produced by the following strategies: L2 Norm, SLAMP, and Intersection. (Original models are from here.)
195
+ The dashed and solid lines in the referenced figure indicate the compressed model from Model Compressor and the model after fine-tuning, respectively, for the given pruning ratio.
196
+ 🚧 Not Supported Layers
197
+ Group convolution layers are currently not supported; support will be added in the near future.
198
205
+ ",,,2023-06-28 07:12:14.309344
206
+ What is NetsPresso?,"NetsPresso is a machine learning pipeline that provides various tools and functionalities for model development and optimization. It includes components such as Model Searcher, Model Compressor, and Model Launcher. NetsPresso aims to simplify the process of developing and deploying machine learning models by offering features like dataset preparation, model compression, and model deployment. It supports tasks like image classification, object detection, semantic segmentation, and super resolution. NetsPresso also provides benchmark results and best practices for different models and datasets.","Source 0
207
+ Model Compressor
208
+ Using NetsPresso Model Compressor on a variety of well-known public datasets, this article shows users how to get outstanding compression results.
209
+ A fine-tuning procedure is necessary after each compression. It usually follows the original model's training configuration, except for the learning rate. After a few training batches, the learning rate is optimized by checking whether the loss has converged.
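+ A toy sketch of such a convergence check (an illustration of the idea only, not the actual NetsPresso fine-tuning logic; adjust_lr and its defaults are assumptions):
+
+ def adjust_lr(lr: float, losses: list, patience: int = 3, factor: float = 0.5) -> float:
+     # Reduce the learning rate once the last `patience` losses stop improving.
+     if len(losses) > patience and min(losses[-patience:]) >= losses[-patience - 1]:
+         return lr * factor
+     return lr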
210
+ All of the original and compressed models can be downloaded from the Model Compressor Model Zoo.
211
+ See Image Classification Results
212
+ See Object Detection Results
213
+ See Semantic Segmentation Results
214
+ See Super Resolution Results
219
+
220
+ ######################################################
221
+
222
+ Source 1
223
+ Connect Personal Server
224
+ To connect a personal server, start by clicking the 'New Server' button on the screen below.
225
+ Location: My Account > Resources
226
+ A personal server is connected to NetsPresso by installing an agent on the server.
227
+ The process is as follows.
228
+ Step 1. Create Server
229
+ Specify an identifiable name and the path where the agent will be installed.
230
+ Step 2. Set Server
231
+ Copy the script that pre-checks the server environment and receives server information.
232
+ Save the copied script as a .sh file and run it in the shell.
233
+ Running the script prints the server information in JSON format, as shown below.
234
+ Copy the JSON server information and paste it into the result input field.
235
+ Step 3. Connect Server
236
+ Check the server information shown below and copy the connect script that connects the server.
237
+ Save the copied script as a .sh file and run it in the shell.
238
+ As a result of execution, the server is connected as shown below.
239
+ You can check the server you have connected to on the Resources page.
240
244
+
245
+ ######################################################
246
+
247
+ Source 2
248
+ - Object Detection
249
+ All of the original and compressed models can be downloaded easily on the Model Compressor Model Zoo.
250
+ You can get Compressed results with Automatic Compression and Compressed (Adv.) results with Advanced Compression.
251
+ PyTorch
252
+ Model | Best Practice | Type | Dataset | mAP(0.5) (%) | mAP(0.5:0.95) (%) | FLOPs (M) | Params (M) | Latency (ms) | Model Size (MB)
+ YOLOX | - | Original | COCO | 68.0 | 49.7 | 156006.20 | 54.21 | 12239.46 | 207.37
+ YOLOX | Google Colab | Compressed-1 | COCO | 67.16 (-0.84) | 48.64 (-1.06) | 101804.06 (1.53x) | 19.96 (2.7x) | 8502.72 (1.44x) | 76.61 (2.7x)
+ YOLOX | Google Colab | Compressed-2 | COCO | 61.43 (-6.57) | 43.23 (-5.47) | 38607.03 (4.04x) | 4.93 (11.0x) | 4235.37 (2.89x) | 19.17 (10.80x)
253
+ The model’s latency is measured on Raspberry Pi 4B (1.5GHz ARM Cortex).
254
+ Options: FP32, ONNX Runtime
255
+ TensorFlow-Keras
256
+ Model | Best Practice | Type | Dataset | mAP(0.5) (%) | mAP(0.5:0.95) (%) | FLOPs (M) | Params (M) | Latency (ms) | Model Size (MB)
+ YOLOv4 | - | Original | PASCAL VOC | 82.22 | - | 61871.82 | 65.32 | 64318.70 | 262.90
+ YOLOv4 | Google Colab | Compressed-1 | PASCAL VOC | 87.23 (+5.01) | - | 11459.69 (5.4x) | 10.59 (6.17x) | 28651.70 (2.16x) | 44.12 (5.96x)
+ YOLOv4 | Google Colab | Compressed-2 | PASCAL VOC | 87.91 (+5.69) | - | 14442.96 (4.28x) | 10.71 (6.1x) | 28976.40 (2.14x) | 44.36 (5.93x)
257
+ YOLOv4 model with an EfficientNet-B1-based backbone.
258
+ The model’s latency is measured on Raspberry Pi 4B (1.5GHz ARM Cortex).
259
+ Options: FP32, TFLite
260
263
+
264
+ ######################################################
265
+
266
+ Source 3
267
+ - Object Detection
268
+ Quick Search supports training a model based on open-source models; NPNets will be available with Advanced Search.
269
+ YOLOv5 and YOLOv5-NPNets
270
+ Dataset | Model | mAP(0.5) (%) | mAP(0.5:0.95) (%) | GFLOPs | Parameters (M) | Model Size (MB)
+ PASCAL VOC | YOLOv5n | 72.2 | 44 | 4.3 | 1.79 | 3.62
+ PASCAL VOC | YOLOv5n-NPNet | 73.4 | 46.3 | 3.5 | 1.18 | 2.49
+ PASCAL VOC | YOLOv5s | 77.9 | 52.2 | 16 | 7.07 | 13.7
+ PASCAL VOC | YOLOv5s-NPNet | 80.2 | 56 | 12.8 | 4.61 | 9.05
+ PASCAL VOC | YOLOv5m | 82.1 | 59.3 | 48.3 | 20.95 | 40.2
+ PASCAL VOC | YOLOv5m-NPNet | 83.4 | 60.6 | 37 | 12.26 | 23.7
+ PASCAL VOC | YOLOv5l | 82.9 | 61 | 108.3 | 46.24 | 88.6
+ PASCAL VOC | YOLOv5l-NPNet | 85.1 | 63.8 | 81 | 25.19 | 48.5
272
+ ",,,2023-08-31 13:29:07.271798
integrations/clprf/test_clprf.py ADDED
@@ -0,0 +1,951 @@
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import os
18
+ import shutil
19
+ import unittest
20
+ from random import randint
21
+
22
+ from integrations.lucenesearcher_score_checker import LuceneSearcherScoreChecker
23
+ from integrations.utils import run_command, parse_score
24
+
25
+
26
+ class TestSearchIntegration(unittest.TestCase):
27
+ def setUp(self):
28
+ # The current directory depends on if you're running inside an IDE or from command line.
29
+ curdir = os.getcwd()
30
+ if curdir.endswith('clprf'):
31
+ self.pyserini_root = '../..'
32
+ self.anserini_root = '../../../anserini'
33
+ else:
34
+ self.pyserini_root = '.'
35
+ self.anserini_root = '../anserini'
36
+
37
+ self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
38
+
39
+ if os.path.exists(self.tmp):
40
+ shutil.rmtree(self.tmp)
41
+
42
+ os.mkdir(self.tmp)
43
+
44
+ self.pyserini_search_cmd = 'python -m pyserini.search.lucene'
45
+ self.pyserini_fusion_cmd = 'python -m pyserini.fusion'
46
+ self.pyserini_eval_cmd = 'python -m pyserini.eval.trec_eval'
47
+
48
+ self.core17_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.nyt')
49
+ self.core17_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt')
50
+
51
+ self.core18_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.wapo.v2')
52
+ self.core18_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt')
53
+
54
+ self.robust04_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.disk45')
55
+ self.robust04_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt')
56
+
57
+ self.robust05_index_path = os.path.join(self.anserini_root, 'indexes/lucene-index.robust05')
58
+ self.robust05_qrels_path = os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt')
59
+
60
+ self.core17_checker = LuceneSearcherScoreChecker(
61
+ index=self.core17_index_path,
62
+ topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'),
63
+ pyserini_topics='core17',
64
+ qrels=self.core17_qrels_path,
65
+ eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
66
+
67
+ self.core18_checker = LuceneSearcherScoreChecker(
68
+ index=self.core18_index_path,
69
+ topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'),
70
+ pyserini_topics='core18',
71
+ qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'),
72
+ eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
73
+
74
+ self.robust04_checker = LuceneSearcherScoreChecker(
75
+ index=self.robust04_index_path,
76
+ topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'),
77
+ pyserini_topics='robust04',
78
+ qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'),
79
+ eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
80
+
81
+ self.robust05_checker = LuceneSearcherScoreChecker(
82
+ index=self.robust05_index_path,
83
+ topics=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'),
84
+ pyserini_topics='robust05',
85
+ qrels=os.path.join(self.pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'),
86
+ eval=f'{self.pyserini_eval_cmd} -m map -m P.30')
87
+
88
+ def test_cross_validation(self):
89
+ pyserini_topics = 'core17'
90
+ os.mkdir(f'{self.tmp}/core17')
91
+ for alpha in [x / 10.0 for x in range(0, 11)]:
92
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
93
+ --topics {pyserini_topics} --output {self.tmp}/core17/core17_lr_A{alpha}_bm25.txt \
94
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha {alpha}'
95
+
96
+ status = os.system(run_file_cmd)
97
+ self.assertEqual(status, 0)
98
+ os.system(f'python {self.pyserini_root}/scripts/classifier_prf/cross_validate.py \
99
+ --anserini {self.anserini_root} --run_file {self.tmp} --pyserini {self.pyserini_root} \
100
+ --collection core17 --output {self.tmp}/core17_lr.txt --classifier lr ')
101
+
102
+ cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
103
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
104
+ {self.tmp}/core17_lr.txt'
105
+
106
+ status = os.system(cmd)
107
+ stdout, stderr = run_command(cmd)
108
+ score = parse_score(stdout, 'map')
109
+
110
+ self.assertEqual(status, 0)
111
+ self.assertEqual(stderr, '')
112
+ self.assertAlmostEqual(score, 0.2462, delta=0.0001)
113
+
114
+ def test_core17(self):
115
+ self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25', 0.2087))
116
+
117
+ def test_core17_rm3(self):
118
+ self.assertTrue(self.core17_checker.run('core17_bm25', '--bm25 --rm3', 0.2798))
119
+
120
+ def test_core17_lr(self):
121
+ pyserini_topics = 'core17'
122
+
123
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
124
+ --topics {pyserini_topics} --output {self.tmp}/core17_lr.txt \
125
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
126
+
127
+ status = os.system(run_file_cmd)
128
+ self.assertEqual(status, 0)
129
+
130
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
131
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
132
+ {self.tmp}/core17_lr.txt'
133
+
134
+ status = os.system(score_cmd)
135
+ stdout, stderr = run_command(score_cmd)
136
+ score = parse_score(stdout, 'map')
137
+
138
+ self.assertEqual(status, 0)
139
+ self.assertEqual(stderr, '')
140
+ self.assertAlmostEqual(score, 0.2473, delta=0.0001)
141
+
142
+ def test_core17_lr_rm3(self):
143
+ pyserini_topics = 'core17'
144
+
145
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
146
+ --topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt \
147
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
148
+
149
+ status = os.system(run_file_cmd)
150
+ self.assertEqual(status, 0)
151
+
152
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
153
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
154
+ {self.tmp}/core17_lr_rm3.txt'
155
+
156
+ status = os.system(score_cmd)
157
+ stdout, stderr = run_command(score_cmd)
158
+ score = parse_score(stdout, 'map')
159
+
160
+ self.assertEqual(status, 0)
161
+ self.assertEqual(stderr, '')
162
+ self.assertAlmostEqual(score, 0.2926, delta=0.0001)
163
+
164
+ def test_core17_svm(self):
165
+ pyserini_topics = 'core17'
166
+
167
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
168
+ --topics {pyserini_topics} --output {self.tmp}/core17_svm.txt \
169
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
170
+
171
+ status = os.system(run_file_cmd)
172
+ self.assertEqual(status, 0)
173
+
174
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
175
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
176
+ {self.tmp}/core17_svm.txt'
177
+
178
+ status = os.system(score_cmd)
179
+ stdout, stderr = run_command(score_cmd)
180
+ score = parse_score(stdout, 'map')
181
+
182
+ self.assertEqual(status, 0)
183
+ self.assertEqual(stderr, '')
184
+ self.assertAlmostEqual(score, 0.2385, delta=0.0001)
185
+
186
+ def test_core17_svm_rm3(self):
187
+ pyserini_topics = 'core17'
188
+
189
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
190
+ --topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt \
191
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
192
+
193
+ status = os.system(run_file_cmd)
194
+ self.assertEqual(status, 0)
195
+
196
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
197
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
198
+ {self.tmp}/core17_svm_rm3.txt'
199
+
200
+ status = os.system(score_cmd)
201
+ stdout, stderr = run_command(score_cmd)
202
+ score = parse_score(stdout, 'map')
203
+
204
+ self.assertEqual(status, 0)
205
+ self.assertEqual(stderr, '')
206
+ self.assertAlmostEqual(score, 0.2956, delta=0.0001)
207
+
208
+ def test_core17_avg(self):
209
+ pyserini_topics = 'core17'
210
+
211
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
212
+ --topics {pyserini_topics} --output {self.tmp}/core17_avg.txt \
213
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
214
+
215
+ status = os.system(run_file_cmd)
216
+ self.assertEqual(status, 0)
217
+
218
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
219
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
220
+ {self.tmp}/core17_avg.txt'
221
+
222
+ status = os.system(score_cmd)
223
+ stdout, stderr = run_command(score_cmd)
224
+ score = parse_score(stdout, 'map')
225
+
226
+ self.assertEqual(status, 0)
227
+ self.assertEqual(stderr, '')
228
+ self.assertAlmostEqual(score, 0.2442, delta=0.0001)
229
+
230
+ def test_core17_avg_rm3(self):
231
+ pyserini_topics = 'core17'
232
+
233
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
234
+ --topics {pyserini_topics} --output {self.tmp}/core17_avg_rm3.txt \
235
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
236
+
237
+ status = os.system(run_file_cmd)
238
+ self.assertEqual(status, 0)
239
+
240
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
241
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
242
+ {self.tmp}/core17_avg_rm3.txt'
243
+
244
+ status = os.system(score_cmd)
245
+ stdout, stderr = run_command(score_cmd)
246
+ score = parse_score(stdout, 'map')
247
+
248
+ self.assertEqual(status, 0)
249
+ self.assertEqual(stderr, '')
250
+ self.assertAlmostEqual(score, 0.2950, delta=0.0001)
251
+
252
+ def test_core17_rrf(self):
253
+ pyserini_topics = 'core17'
254
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
255
+ --topics {pyserini_topics} --output {self.tmp}/core17_lr.txt \
256
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
257
+
258
+ status = os.system(lr_cmd)
259
+ self.assertEqual(status, 0)
260
+
261
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
262
+ --topics {pyserini_topics} --output {self.tmp}/core17_svm.txt \
263
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.7'
264
+
265
+ status = os.system(svm_cmd)
266
+ self.assertEqual(status, 0)
267
+
268
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
269
+ --runs {self.tmp}/core17_lr.txt {self.tmp}/core17_svm.txt \
270
+ --output {self.tmp}/core17_rrf.txt --resort'
271
+
272
+ status = os.system(rrf_cmd)
273
+ self.assertEqual(status, 0)
274
+
275
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
276
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
277
+ {self.tmp}/core17_rrf.txt'
278
+
279
+ status = os.system(score_cmd)
280
+ stdout, stderr = run_command(score_cmd)
281
+ score = parse_score(stdout, 'map')
282
+
283
+ self.assertEqual(status, 0)
284
+ self.assertEqual(stderr, '')
285
+ self.assertAlmostEqual(score, 0.2446, delta=0.0001)
286
+
287
+ def test_core17_rrf_rm3(self):
288
+ pyserini_topics = 'core17'
289
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
290
+ --topics {pyserini_topics} --output {self.tmp}/core17_lr_rm3.txt \
291
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
292
+
293
+ status = os.system(lr_cmd)
294
+ self.assertEqual(status, 0)
295
+
296
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.core17_index_path} \
297
+ --topics {pyserini_topics} --output {self.tmp}/core17_svm_rm3.txt \
298
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
299
+
300
+ status = os.system(svm_cmd)
301
+ self.assertEqual(status, 0)
302
+
303
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
304
+ --runs {self.tmp}/core17_lr_rm3.txt {self.tmp}/core17_svm_rm3.txt \
305
+ --output {self.tmp}/core17_rrf_rm3.txt --resort'
306
+
307
+ status = os.system(rrf_cmd)
308
+ self.assertEqual(status, 0)
309
+
310
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
311
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core17.txt \
312
+ {self.tmp}/core17_rrf_rm3.txt'
313
+
314
+ status = os.system(score_cmd)
315
+ stdout, stderr = run_command(score_cmd)
316
+ score = parse_score(stdout, 'map')
317
+
318
+ self.assertEqual(status, 0)
319
+ self.assertEqual(stderr, '')
320
+ self.assertAlmostEqual(score, 0.2957, delta=0.0001)
321
+
322
+ def test_core18(self):
323
+ self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25', 0.2496))
324
+
325
+ def test_core18_rm3(self):
326
+ self.assertTrue(self.core18_checker.run('core18_bm25', '--bm25 --rm3', 0.3129))
327
+
328
+ def test_core18_lr(self):
329
+ pyserini_topics = 'core18'
330
+
331
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
332
+ --topics {pyserini_topics} --output {self.tmp}/core18_lr.txt \
333
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
334
+
335
+ status = os.system(run_file_cmd)
336
+ self.assertEqual(status, 0)
337
+
338
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
339
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
340
+ {self.tmp}/core18_lr.txt'
341
+
342
+ status = os.system(score_cmd)
343
+ stdout, stderr = run_command(score_cmd)
344
+ score = parse_score(stdout, 'map')
345
+
346
+ self.assertEqual(status, 0)
347
+ self.assertEqual(stderr, '')
348
+ self.assertAlmostEqual(score, 0.2839, delta=0.0001)
349
+
350
+ def test_core18_lr_rm3(self):
351
+ pyserini_topics = 'core18'
352
+
353
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
354
+ --topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt \
355
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
356
+
357
+ status = os.system(run_file_cmd)
358
+ self.assertEqual(status, 0)
359
+
360
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
361
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
362
+ {self.tmp}/core18_lr_rm3.txt'
363
+
364
+ status = os.system(score_cmd)
365
+ stdout, stderr = run_command(score_cmd)
366
+ score = parse_score(stdout, 'map')
367
+
368
+ self.assertEqual(status, 0)
369
+ self.assertEqual(stderr, '')
370
+ self.assertAlmostEqual(score, 0.3216, delta=0.0001)
371
+
372
+ def test_core18_svm(self):
373
+ pyserini_topics = 'core18'
374
+
375
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
376
+ --topics {pyserini_topics} --output {self.tmp}/core18_svm.txt \
377
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
378
+
379
+ status = os.system(run_file_cmd)
380
+ self.assertEqual(status, 0)
381
+
382
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
383
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
384
+ {self.tmp}/core18_svm.txt'
385
+
386
+ status = os.system(score_cmd)
387
+ stdout, stderr = run_command(score_cmd)
388
+ score = parse_score(stdout, 'map')
389
+
390
+ self.assertEqual(status, 0)
391
+ self.assertEqual(stderr, '')
392
+ self.assertAlmostEqual(score, 0.2841, delta=0.0001)
393
+
394
+ def test_core18_svm_rm3(self):
395
+ pyserini_topics = 'core18'
396
+
397
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
398
+ --topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt \
399
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
400
+
401
+ status = os.system(run_file_cmd)
402
+ self.assertEqual(status, 0)
403
+
404
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
405
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
406
+ {self.tmp}/core18_svm_rm3.txt'
407
+
408
+ status = os.system(score_cmd)
409
+ stdout, stderr = run_command(score_cmd)
410
+ score = parse_score(stdout, 'map')
411
+
412
+ self.assertEqual(status, 0)
413
+ self.assertEqual(stderr, '')
414
+ self.assertAlmostEqual(score, 0.3200, delta=0.0001)
415
+
416
+ def test_core18_avg(self):
417
+ pyserini_topics = 'core18'
418
+
419
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
420
+ --topics {pyserini_topics} --output {self.tmp}/core18_avg.txt \
421
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4'
422
+
423
+ status = os.system(run_file_cmd)
424
+ self.assertEqual(status, 0)
425
+
426
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
427
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
428
+ {self.tmp}/core18_avg.txt'
429
+
430
+ status = os.system(score_cmd)
431
+ stdout, stderr = run_command(score_cmd)
432
+ score = parse_score(stdout, 'map')
433
+
434
+ self.assertEqual(status, 0)
435
+ self.assertEqual(stderr, '')
436
+ self.assertAlmostEqual(score, 0.2860, delta=0.0001)
437
+
438
+ def test_core18_avg_rm3(self):
439
+ pyserini_topics = 'core18'
440
+
441
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
442
+ --topics {pyserini_topics} --output {self.tmp}/core18_avg_rm3.txt \
443
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.4 --rm3'
444
+
445
+ status = os.system(run_file_cmd)
446
+ self.assertEqual(status, 0)
447
+
448
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
449
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
450
+ {self.tmp}/core18_avg_rm3.txt'
451
+
452
+ status = os.system(score_cmd)
453
+ stdout, stderr = run_command(score_cmd)
454
+ score = parse_score(stdout, 'map')
455
+
456
+ self.assertEqual(status, 0)
457
+ self.assertEqual(stderr, '')
458
+ self.assertAlmostEqual(score, 0.3215, delta=0.0001)
459
+
460
+ def test_core18_rrf(self):
461
+ pyserini_topics = 'core18'
462
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
463
+ --topics {pyserini_topics} --output {self.tmp}/core18_lr.txt \
464
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
465
+
466
+ status = os.system(lr_cmd)
467
+ self.assertEqual(status, 0)
468
+
469
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
470
+ --topics {pyserini_topics} --output {self.tmp}/core18_svm.txt \
471
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6'
472
+
473
+ status = os.system(svm_cmd)
474
+ self.assertEqual(status, 0)
475
+
476
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
477
+ --runs {self.tmp}/core18_lr.txt {self.tmp}/core18_svm.txt \
478
+ --output {self.tmp}/core18_rrf.txt --resort'
479
+
480
+ status = os.system(rrf_cmd)
481
+ self.assertEqual(status, 0)
482
+
483
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
484
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
485
+ {self.tmp}/core18_rrf.txt'
486
+
487
+ status = os.system(score_cmd)
488
+ stdout, stderr = run_command(score_cmd)
489
+ score = parse_score(stdout, 'map')
490
+
491
+ self.assertEqual(status, 0)
492
+ self.assertEqual(stderr, '')
493
+ self.assertAlmostEqual(score, 0.2881, delta=0.0001)
494
+
495
+ def test_core18_rrf_rm3(self):
496
+ pyserini_topics = 'core18'
497
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
498
+ --topics {pyserini_topics} --output {self.tmp}/core18_lr_rm3.txt \
499
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
500
+
501
+ status = os.system(lr_cmd)
502
+ self.assertEqual(status, 0)
503
+
504
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.core18_index_path} \
505
+ --topics {pyserini_topics} --output {self.tmp}/core18_svm_rm3.txt \
506
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5 --rm3'
507
+
508
+ status = os.system(svm_cmd)
509
+ self.assertEqual(status, 0)
510
+
511
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
512
+ --runs {self.tmp}/core18_lr_rm3.txt {self.tmp}/core18_svm_rm3.txt \
513
+ --output {self.tmp}/core18_rrf_rm3.txt --resort'
514
+
515
+ status = os.system(rrf_cmd)
516
+ self.assertEqual(status, 0)
517
+
518
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
519
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.core18.txt \
520
+ {self.tmp}/core18_rrf_rm3.txt'
521
+
522
+ status = os.system(score_cmd)
523
+ stdout, stderr = run_command(score_cmd)
524
+ score = parse_score(stdout, 'map')
525
+
526
+ self.assertEqual(status, 0)
527
+ self.assertEqual(stderr, '')
528
+ self.assertAlmostEqual(score, 0.3205, delta=0.0001)
529
+
530
+ def test_robust04(self):
531
+ self.assertTrue(self.robust04_checker.run('robust04_bm25', '--bm25', 0.2531))
532
+
533
+ def test_robust04_rm3(self):
534
+ self.assertTrue(self.robust04_checker.run('robust04_bm25_rm3', '--bm25 --rm3', 0.2908))
535
+
536
+ def test_robust04_lr(self):
537
+ pyserini_topics = 'robust04'
538
+
539
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
540
+ --topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt \
541
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
542
+
543
+ status = os.system(run_file_cmd)
544
+ self.assertEqual(status, 0)
545
+
546
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
547
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
548
+ {self.tmp}/robust04_lr.txt'
549
+
550
+ status = os.system(score_cmd)
551
+ stdout, stderr = run_command(score_cmd)
552
+ score = parse_score(stdout, 'map')
553
+
554
+ self.assertEqual(status, 0)
555
+ self.assertEqual(stderr, '')
556
+ self.assertAlmostEqual(score, 0.2747, delta=0.0001)
557
+
558
+ def test_robust04_lr_rm3(self):
559
+ pyserini_topics = 'robust04'
560
+
561
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
562
+ --topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt \
563
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
564
+
565
+ status = os.system(run_file_cmd)
566
+ self.assertEqual(status, 0)
567
+
568
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
569
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
570
+ {self.tmp}/robust04_lr_rm3.txt'
571
+
572
+ status = os.system(score_cmd)
573
+ stdout, stderr = run_command(score_cmd)
574
+ score = parse_score(stdout, 'map')
575
+
576
+ self.assertEqual(status, 0)
577
+ self.assertEqual(stderr, '')
578
+ self.assertAlmostEqual(score, 0.2969, delta=0.0001)
579
+
580
+ def test_robust04_svm(self):
581
+ pyserini_topics = 'robust04'
582
+
583
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
584
+ --topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt \
585
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
586
+
587
+ status = os.system(run_file_cmd)
588
+ self.assertEqual(status, 0)
589
+
590
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
591
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
592
+ {self.tmp}/robust04_svm.txt'
593
+
594
+ status = os.system(score_cmd)
595
+ stdout, stderr = run_command(score_cmd)
596
+ score = parse_score(stdout, 'map')
597
+
598
+ self.assertEqual(status, 0)
599
+ self.assertEqual(stderr, '')
600
+ self.assertAlmostEqual(score, 0.2726, delta=0.0001)
601
+
602
+ def test_robust04_svm_rm3(self):
603
+ pyserini_topics = 'robust04'
604
+
605
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
606
+ --topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt \
607
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
608
+
609
+ status = os.system(run_file_cmd)
610
+ self.assertEqual(status, 0)
611
+
612
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
613
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
614
+ {self.tmp}/robust04_svm_rm3.txt'
615
+
616
+ status = os.system(score_cmd)
617
+ stdout, stderr = run_command(score_cmd)
618
+ score = parse_score(stdout, 'map')
619
+
620
+ self.assertEqual(status, 0)
621
+ self.assertEqual(stderr, '')
622
+ self.assertAlmostEqual(score, 0.2972, delta=0.0001)
623
+
624
+ def test_robust04_avg(self):
625
+ pyserini_topics = 'robust04'
626
+
627
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
628
+ --topics {pyserini_topics} --output {self.tmp}/robust04_avg.txt \
629
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
630
+
631
+ status = os.system(run_file_cmd)
632
+ self.assertEqual(status, 0)
633
+
634
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
635
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
636
+ {self.tmp}/robust04_avg.txt'
637
+
638
+ status = os.system(score_cmd)
639
+ stdout, stderr = run_command(score_cmd)
640
+ score = parse_score(stdout, 'map')
641
+
642
+ self.assertEqual(status, 0)
643
+ self.assertEqual(stderr, '')
644
+ self.assertAlmostEqual(score, 0.276, delta=0.0001)
645
+
646
+ def test_robust04_avg_rm3(self):
647
+ pyserini_topics = 'robust04'
648
+
649
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
650
+ --topics {pyserini_topics} --output {self.tmp}/robust04_avg_rm3.txt \
651
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
652
+
653
+ status = os.system(run_file_cmd)
654
+ self.assertEqual(status, 0)
655
+
656
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
657
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
658
+ {self.tmp}/robust04_avg_rm3.txt'
659
+
660
+ status = os.system(score_cmd)
661
+ stdout, stderr = run_command(score_cmd)
662
+ score = parse_score(stdout, 'map')
663
+
664
+ self.assertEqual(status, 0)
665
+ self.assertEqual(stderr, '')
666
+ self.assertAlmostEqual(score, 0.2980, delta=0.0001)
667
+
668
+ def test_robust04_rrf(self):
669
+ pyserini_topics = 'robust04'
670
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
671
+ --topics {pyserini_topics} --output {self.tmp}/robust04_lr.txt \
672
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
673
+
674
+ status = os.system(lr_cmd)
675
+ self.assertEqual(status, 0)
676
+
677
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
678
+ --topics {pyserini_topics} --output {self.tmp}/robust04_svm.txt \
679
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
680
+
681
+ status = os.system(svm_cmd)
682
+ self.assertEqual(status, 0)
683
+
684
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
685
+ --runs {self.tmp}/robust04_lr.txt {self.tmp}/robust04_svm.txt \
686
+ --output {self.tmp}/robust04_rrf.txt --resort'
687
+
688
+ status = os.system(rrf_cmd)
689
+ self.assertEqual(status, 0)
690
+
691
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
692
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
693
+ {self.tmp}/robust04_rrf.txt'
694
+
695
+ status = os.system(score_cmd)
696
+ stdout, stderr = run_command(score_cmd)
697
+ score = parse_score(stdout, 'map')
698
+
699
+ self.assertEqual(status, 0)
700
+ self.assertEqual(stderr, '')
701
+ self.assertAlmostEqual(score, 0.275, delta=0.0001)
702
+
703
+ def test_robust04_rrf_rm3(self):
704
+ pyserini_topics = 'robust04'
705
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
706
+ --topics {pyserini_topics} --output {self.tmp}/robust04_lr_rm3.txt \
707
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
708
+
709
+ status = os.system(lr_cmd)
710
+ self.assertEqual(status, 0)
711
+
712
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust04_index_path} \
713
+ --topics {pyserini_topics} --output {self.tmp}/robust04_svm_rm3.txt \
714
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
715
+
716
+ status = os.system(svm_cmd)
717
+ self.assertEqual(status, 0)
718
+
719
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
720
+ --runs {self.tmp}/robust04_lr_rm3.txt {self.tmp}/robust04_svm_rm3.txt \
721
+ --output {self.tmp}/robust04_rrf_rm3.txt --resort'
722
+
723
+ status = os.system(rrf_cmd)
724
+ self.assertEqual(status, 0)
725
+
726
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
727
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust04.txt \
728
+ {self.tmp}/robust04_rrf_rm3.txt'
729
+
730
+ status = os.system(score_cmd)
731
+ stdout, stderr = run_command(score_cmd)
732
+ score = parse_score(stdout, 'map')
733
+
734
+ self.assertEqual(status, 0)
735
+ self.assertEqual(stderr, '')
736
+ self.assertAlmostEqual(score, 0.2977, delta=0.0001)
737
+
738
+ def test_robust05(self):
739
+ self.assertTrue(self.robust05_checker.run('robust05_bm25', '--bm25', 0.2032))
740
+
741
+ def test_robust05_rm3(self):
742
+ self.assertTrue(self.robust05_checker.run('robust05_bm25_rm3', '--bm25 --rm3', 0.2624))
743
+
744
+ def test_robust05_lr(self):
745
+ pyserini_topics = 'robust05'
746
+
747
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
748
+ --topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt \
749
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'
750
+
751
+ status = os.system(run_file_cmd)
752
+ self.assertEqual(status, 0)
753
+
754
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
755
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
756
+ {self.tmp}/robust05_lr.txt'
757
+
758
+ status = os.system(score_cmd)
759
+ stdout, stderr = run_command(score_cmd)
760
+ score = parse_score(stdout, 'map')
761
+
762
+ self.assertEqual(status, 0)
763
+ self.assertEqual(stderr, '')
764
+ self.assertAlmostEqual(score, 0.2476, delta=0.0001)
765
+
766
+ def test_robust05_lr_rm3(self):
767
+ pyserini_topics = 'robust05'
768
+
769
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
770
+ --topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt \
771
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'
772
+
773
+ status = os.system(run_file_cmd)
774
+ self.assertEqual(status, 0)
775
+
776
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
777
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
778
+ {self.tmp}/robust05_lr_rm3.txt'
779
+
780
+ status = os.system(score_cmd)
781
+ stdout, stderr = run_command(score_cmd)
782
+ score = parse_score(stdout, 'map')
783
+
784
+ self.assertEqual(status, 0)
785
+ self.assertEqual(stderr, '')
786
+ self.assertAlmostEqual(score, 0.2872, delta=0.0001)
787
+
788
+ def test_robust05_svm(self):
789
+ pyserini_topics = 'robust05'
790
+
791
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
792
+ --topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt \
793
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'
794
+
795
+ status = os.system(run_file_cmd)
796
+ self.assertEqual(status, 0)
797
+
798
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
799
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
800
+ {self.tmp}/robust05_svm.txt'
801
+
802
+ status = os.system(score_cmd)
803
+ stdout, stderr = run_command(score_cmd)
804
+ score = parse_score(stdout, 'map')
805
+
806
+ self.assertEqual(status, 0)
807
+ self.assertEqual(stderr, '')
808
+ self.assertAlmostEqual(score, 0.2486, delta=0.0001)
809
+
810
+ def test_robust05_svm_rm3(self):
811
+ pyserini_topics = 'robust05'
812
+
813
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
814
+ --topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt \
815
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'
816
+
817
+ status = os.system(run_file_cmd)
818
+ self.assertEqual(status, 0)
819
+
820
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
821
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
822
+ {self.tmp}/robust05_svm_rm3.txt'
823
+
824
+ status = os.system(score_cmd)
825
+ stdout, stderr = run_command(score_cmd)
826
+ score = parse_score(stdout, 'map')
827
+
828
+ self.assertEqual(status, 0)
829
+ self.assertEqual(stderr, '')
830
+ self.assertAlmostEqual(score, 0.2871, delta=0.0001)
831
+
832
+ def test_robust05_avg(self):
833
+ pyserini_topics = 'robust05'
834
+
835
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
836
+ --topics {pyserini_topics} --output {self.tmp}/robust05_avg.txt \
837
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.8'
838
+
839
+ status = os.system(run_file_cmd)
840
+ self.assertEqual(status, 0)
841
+
842
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
843
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
844
+ {self.tmp}/robust05_avg.txt'
845
+
846
+ status = os.system(score_cmd)
847
+ stdout, stderr = run_command(score_cmd)
848
+ score = parse_score(stdout, 'map')
849
+
850
+ self.assertEqual(status, 0)
851
+ self.assertEqual(stderr, '')
852
+ self.assertAlmostEqual(score, 0.2485, delta=0.0001)
853
+
854
+ def test_robust05_avg_rm3(self):
855
+ pyserini_topics = 'robust05'
856
+
857
+ run_file_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
858
+ --topics {pyserini_topics} --output {self.tmp}/robust05_avg_rm3.txt \
859
+ --prcl lr svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.6 --rm3'
860
+
861
+ status = os.system(run_file_cmd)
862
+ self.assertEqual(status, 0)
863
+
864
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
865
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
866
+ {self.tmp}/robust05_avg_rm3.txt'
867
+
868
+ status = os.system(score_cmd)
869
+ stdout, stderr = run_command(score_cmd)
870
+ score = parse_score(stdout, 'map')
871
+
872
+ self.assertEqual(status, 0)
873
+ self.assertEqual(stderr, '')
874
+ self.assertAlmostEqual(score, 0.2880, delta=0.0001)
875
+
876
+ def test_robust05_rrf(self):
877
+ pyserini_topics = 'robust05'
878
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
879
+ --topics {pyserini_topics} --output {self.tmp}/robust05_lr.txt \
880
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
881
+
882
+ status = os.system(lr_cmd)
883
+ self.assertEqual(status, 0)
884
+
885
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
886
+ --topics {pyserini_topics} --output {self.tmp}/robust05_svm.txt \
887
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.5'
888
+
889
+ status = os.system(svm_cmd)
890
+ self.assertEqual(status, 0)
891
+
892
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
893
+ --runs {self.tmp}/robust05_lr.txt {self.tmp}/robust05_svm.txt \
894
+ --output {self.tmp}/robust05_rrf.txt --resort'
895
+
896
+ status = os.system(rrf_cmd)
897
+ self.assertEqual(status, 0)
898
+
899
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
900
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
901
+ {self.tmp}/robust05_rrf.txt'
902
+
903
+ status = os.system(score_cmd)
904
+ stdout, stderr = run_command(score_cmd)
905
+ score = parse_score(stdout, 'map')
906
+
907
+ self.assertEqual(status, 0)
908
+ self.assertEqual(stderr, '')
909
+ self.assertAlmostEqual(score, 0.2401, delta=0.0001)
910
+
911
+ def test_robust05_rrf_rm3(self):
912
+ pyserini_topics = 'robust05'
913
+ lr_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
914
+ --topics {pyserini_topics} --output {self.tmp}/robust05_lr_rm3.txt \
915
+ --prcl lr --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
916
+
917
+ status = os.system(lr_cmd)
918
+ self.assertEqual(status, 0)
919
+
920
+ svm_cmd = f'{self.pyserini_search_cmd} --index {self.robust05_index_path} \
921
+ --topics {pyserini_topics} --output {self.tmp}/robust05_svm_rm3.txt \
922
+ --prcl svm --prcl.vectorizer TfidfVectorizer --prcl.alpha 0.3 --rm3'
923
+
924
+ status = os.system(svm_cmd)
925
+ self.assertEqual(status, 0)
926
+
927
+ rrf_cmd = f'{self.pyserini_fusion_cmd} \
928
+ --runs {self.tmp}/robust05_lr_rm3.txt {self.tmp}/robust05_svm_rm3.txt \
929
+ --output {self.tmp}/robust05_rrf_rm3.txt --resort'
930
+
931
+ status = os.system(rrf_cmd)
932
+ self.assertEqual(status, 0)
933
+
934
+ score_cmd = f'{self.pyserini_eval_cmd} -m map -m P.30 \
935
+ {self.anserini_root}/src/main/resources/topics-and-qrels/qrels.robust05.txt \
936
+ {self.tmp}/robust05_rrf_rm3.txt'
937
+
938
+ status = os.system(score_cmd)
939
+ stdout, stderr = run_command(score_cmd)
940
+ score = parse_score(stdout, 'map')
941
+
942
+ self.assertEqual(status, 0)
943
+ self.assertEqual(stderr, '')
944
+ self.assertAlmostEqual(score, 0.2808, delta=0.0001)
945
+
946
+ def tearDown(self):
947
+ shutil.rmtree(f'{self.tmp}')
948
+
949
+
950
+ if __name__ == '__main__':
951
+ unittest.main()
integrations/clprf/test_trec_covid_r3.py ADDED
@@ -0,0 +1,85 @@
1
+ #
2
+ # Pyserini: Reproducible IR research with sparse and dense representations
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ #
16
+
17
+ import json
18
+ import os
19
+ import re
20
+ import shutil
21
+ import unittest
22
+ from random import randint
23
+
24
+ from pyserini.util import download_url, download_prebuilt_index
25
+
26
+
27
+ class TestSearchIntegration(unittest.TestCase):
28
+ def setUp(self):
29
+
30
+ curdir = os.getcwd()
31
+ if curdir.endswith('clprf'):
32
+ self.pyserini_root = '../..'
33
+ else:
34
+ self.pyserini_root = '.'
35
+
36
+ self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
37
+
38
+ # In the rare event there's a collision
39
+ if os.path.exists(self.tmp):
40
+ shutil.rmtree(self.tmp)
41
+
42
+ os.mkdir(self.tmp)
43
+ os.mkdir(f'{self.tmp}/runs')
44
+
45
+ self.round3_runs = {
46
+ 'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round3-cumulative.txt':
47
+ 'dfccc32efd58a8284ae411e5c6b27ce9',
48
+ }
49
+
50
+ download_url('https://ir.nist.gov/covidSubmit/archive/round3/covidex.r3.monot5',
51
+ f'{self.tmp}/runs')
52
+
53
+ for url in self.round3_runs:
54
+ print(f'Verifying stored run at {url}...')
55
+ filename = url.split('/')[-1]
56
+ filename = re.sub('\\?dl=1$', '', filename) # Remove the Dropbox 'force download' parameter
57
+
58
+ download_url(url, self.tmp, md5=self.round3_runs[url], force=True)
59
+ self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
60
+
61
+ def test_bm25(self):
62
+ tmp_folder_name = self.tmp.split('/')[-1]
63
+ prebuilt_index_path = download_prebuilt_index('trec-covid-r3-abstract')
64
+ os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
65
+ -alpha 0.5 \
66
+ -clf lr \
67
+ -vectorizer tfidf \
68
+ -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3.txt \
69
+ -base {self.tmp}/runs/covidex.r3.monot5 \
70
+ -tmp_base {tmp_folder_name} \
71
+ -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round2-cumulative.txt \
72
+ -index {prebuilt_index_path} \
73
+ -tag covidex.r3.t5.lr \
74
+ -output {self.tmp}/output.json')
75
+ with open(f'{self.tmp}/output.json') as json_file:
76
+ data = json.load(json_file)
77
+ self.assertEqual("0.3333", data['map'])
78
+ self.assertEqual("0.6916", data['ndcg'])
79
+
80
+ def tearDown(self):
81
+ shutil.rmtree(self.tmp)
82
+
83
+
84
+ if __name__ == '__main__':
85
+ unittest.main()
integrations/clprf/test_trec_covid_r4.py ADDED
@@ -0,0 +1,91 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import gzip
+ import json
+ import os
+ import re
+ import shutil
+ import unittest
+ from random import randint
+
+ from pyserini.util import download_url, download_prebuilt_index
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+
+         curdir = os.getcwd()
+         if curdir.endswith('clprf'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+
+         self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
+
+         # In the rare event there's a collision
+         if os.path.exists(self.tmp):
+             shutil.rmtree(self.tmp)
+
+         os.mkdir(self.tmp)
+         os.mkdir(f'{self.tmp}/runs')
+
+         self.round4_runs = {
+             'https://raw.githubusercontent.com/castorini/anserini/master/src/main/resources/topics-and-qrels/qrels.covid-round4-cumulative.txt':
+                 '7a5c27e8e052c49ff72d557051825973',
+         }
+
+         download_url('https://ir.nist.gov/covidSubmit/archive/round4/covidex.r4.d2q.duot5.gz',
+                      f'{self.tmp}/runs')
+
+         with gzip.open(f'{self.tmp}/runs/covidex.r4.d2q.duot5.gz', 'rb') as f_in:
+             with open(f'{self.tmp}/runs/covidex.r4.d2q.duot5', 'wb') as f_out:
+                 shutil.copyfileobj(f_in, f_out)
+
+         for url in self.round4_runs:
+             print(f'Verifying stored run at {url}...')
+             filename = url.split('/')[-1]
+             filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter
+
+             download_url(url, self.tmp, md5=self.round4_runs[url], force=True)
+             self.assertTrue(os.path.exists(os.path.join(self.tmp, filename)))
+             print('')
+
+     def test_bm25(self):
+         tmp_folder_name = self.tmp.split('/')[-1]
+         prebuilt_index_path = download_prebuilt_index('trec-covid-r4-abstract')
+         os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
+                     -alpha 0.6 \
+                     -clf lr \
+                     -vectorizer tfidf \
+                     -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4.txt \
+                     -base {self.tmp}/runs/covidex.r4.d2q.duot5 \
+                     -tmp_base {tmp_folder_name} \
+                     -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round3-cumulative.txt \
+                     -index {prebuilt_index_path} \
+                     -tag covidex.r4.d2q.duot5.lr \
+                     -output {self.tmp}/output.json')
+         with open(f'{self.tmp}/output.json') as json_file:
+             data = json.load(json_file)
+         self.assertEqual("0.3846", data['map'])
+         self.assertEqual("0.7745", data['ndcg'])
+
+     def tearDown(self):
+         shutil.rmtree(self.tmp)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/clprf/test_trec_covid_r5.py ADDED
@@ -0,0 +1,141 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import re
+ import shutil
+ import unittest
+ import json
+ import gzip
+ from random import randint
+ from pyserini.util import download_url, download_prebuilt_index
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+
+         curdir = os.getcwd()
+         if curdir.endswith('clprf'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+
+         self.tmp = f'{self.pyserini_root}/integrations/tmp{randint(0, 10000)}'
+
+         # In the rare event there's a collision
+         if os.path.exists(self.tmp):
+             shutil.rmtree(self.tmp)
+
+         os.mkdir(self.tmp)
+         os.mkdir(f'{self.tmp}/runs')
+
+         self.round5_runs = {
+             'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.1s.gz':
+                 '2181ae5b7fe8bafbd3b41700f3ccde02',
+             'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.d2q.2s.gz':
+                 'e61f9b6de5ffbe1b5b82d35216968154',
+             'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.2s.gz':
+                 '6e517a5e044d8b7ce983f7e165cf4aeb',
+             'https://ir.nist.gov/covidSubmit/archive/round5/covidex.r5.1s.gz':
+                 'dc9b4b45494294a8448cf0693f07f7fd'
+         }
+
+         for url in self.round5_runs:
+             print(f'Verifying stored run at {url}...')
+             filename = url.split('/')[-1]
+             filename = re.sub('\\?dl=1$', '', filename)  # Remove the Dropbox 'force download' parameter
+             gzip_filename = '.'.join(filename.split('.')[:-1])
+
+             download_url(url, f'{self.tmp}/runs/', md5=self.round5_runs[url], force=True)
+             self.assertTrue(os.path.exists(os.path.join(f'{self.tmp}/runs/', filename)))
+             with gzip.open(f'{self.tmp}/runs/{filename}', 'rb') as f_in:
+                 with open(f'{self.tmp}/runs/{gzip_filename}', 'wb') as f_out:
+                     shutil.copyfileobj(f_in, f_out)
+
+     def test_round5(self):
+         tmp_folder_name = self.tmp.split('/')[-1]
+         prebuilt_index_path = download_prebuilt_index('trec-covid-r5-abstract')
+
+         os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
+                     -alpha 0.6 \
+                     -clf lr \
+                     -vectorizer tfidf \
+                     -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
+                     -base {self.tmp}/runs/covidex.r5.d2q.1s \
+                     -tmp_base {tmp_folder_name} \
+                     -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
+                     -index {prebuilt_index_path} \
+                     -tag covidex.r5.d2q.1s \
+                     -output {self.tmp}/output.json')
+         with open(f'{self.tmp}/output.json') as json_file:
+             data = json.load(json_file)
+         self.assertEqual("0.3859", data['map'])
+         self.assertEqual("0.8221", data['ndcg'])
+
+         os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
+                     -alpha 0.6 \
+                     -clf lr \
+                     -vectorizer tfidf \
+                     -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
+                     -base {self.tmp}/runs/covidex.r5.d2q.2s \
+                     -tmp_base {tmp_folder_name} \
+                     -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
+                     -index {prebuilt_index_path} \
+                     -tag covidex.r5.d2q.2s \
+                     -output {self.tmp}/output.json')
+         with open(f'{self.tmp}/output.json') as json_file:
+             data = json.load(json_file)
+         self.assertEqual("0.3875", data['map'])
+         self.assertEqual("0.8304", data['ndcg'])
+
+         os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
+                     -alpha 0.6 \
+                     -clf lr \
+                     -vectorizer tfidf \
+                     -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
+                     -base {self.tmp}/runs/covidex.r5.1s \
+                     -tmp_base {tmp_folder_name} \
+                     -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
+                     -index {prebuilt_index_path} \
+                     -tag covidex.r5.1s \
+                     -output {self.tmp}/output.json')
+         with open(f'{self.tmp}/output.json') as json_file:
+             data = json.load(json_file)
+         self.assertEqual("0.3885", data['map'])
+         self.assertEqual("0.8135", data['ndcg'])
+
+         os.system(f'python {self.pyserini_root}/scripts/classifier_prf/rank_trec_covid.py \
+                     -alpha 0.6 \
+                     -clf lr \
+                     -vectorizer tfidf \
+                     -new_qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round5.txt \
+                     -base {self.tmp}/runs/covidex.r5.2s \
+                     -tmp_base {tmp_folder_name} \
+                     -qrels {self.pyserini_root}/tools/topics-and-qrels/qrels.covid-round4-cumulative.txt \
+                     -index {prebuilt_index_path} \
+                     -tag covidex.r5.2s \
+                     -output {self.tmp}/output.json')
+         with open(f'{self.tmp}/output.json') as json_file:
+             data = json.load(json_file)
+         self.assertEqual("0.3922", data['map'])
+         self.assertEqual("0.8311", data['ndcg'])
+
+     def tearDown(self):
+         shutil.rmtree(self.tmp)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/dense/test_ance.py ADDED
@@ -0,0 +1,185 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for ANCE and ANCE PRF using on-the-fly query encoding."""
+
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score, parse_score_qa, parse_score_msmarco
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+         self.rocchio_alpha = 0.4
+         self.rocchio_beta = 0.6
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_ance_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('ance-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('ance-dl19-passage')
+         topics = get_topics('dl19-passage')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('ance-dl20')
+         topics = get_topics('dl20')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def test_msmarco_passage_ance_avg_prf_otf(self):
+         output_file = 'test_run.dl2019.ance.avg-prf.otf.trec'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics dl19-passage \
+                   --index msmarco-passage-ance-bf \
+                   --encoder castorini/ance-msmarco-passage \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --prf-depth 3 \
+                   --prf-method avg'
+         cmd2 = f'python -m pyserini.eval.trec_eval -l 2 -m map dl19-passage {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, 'map')
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.4247, delta=0.0001)
+
+     def test_msmarco_passage_ance_rocchio_prf_otf(self):
+         output_file = 'test_run.dl2019.ance.rocchio-prf.otf.trec'
+         self.temp_files.append(output_file)
+         # Note: the --threads flag was previously passed twice; the duplicate has been dropped.
+         cmd1 = f'python -m pyserini.search.faiss --topics dl19-passage \
+                   --index msmarco-passage-ance-bf \
+                   --encoder castorini/ance-msmarco-passage \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --prf-depth 5 \
+                   --prf-method rocchio \
+                   --rocchio-topk 5 \
+                   --rocchio-alpha {self.rocchio_alpha} \
+                   --rocchio-beta {self.rocchio_beta}'
+         cmd2 = f'python -m pyserini.eval.trec_eval -l 2 -m map dl19-passage {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, 'map')
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.4211, delta=0.0001)
+
+     def test_msmarco_doc_ance_bf_otf(self):
+         output_file = 'test_run.msmarco-doc.passage.ance-maxp.otf.txt'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-doc-dev \
+                   --index msmarco-doc-ance-maxp-bf \
+                   --encoder castorini/ance-msmarco-doc-maxp \
+                   --output {output_file} \
+                   --hits 1000 \
+                   --max-passage \
+                   --max-passage-hits 100 \
+                   --output-format msmarco \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score_msmarco(stdout, 'MRR @100')
+         self.assertEqual(status, 0)
+         # We get a small difference, 0.3794 on macOS.
+         self.assertAlmostEqual(score, 0.3796, delta=0.0002)
+
+     def test_msmarco_doc_ance_bf_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('ance_maxp-msmarco-doc-dev')
+         topics = get_topics('msmarco-doc-dev')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_nq_test_ance_bf_otf(self):
+         output_file = 'test_run.ance.nq-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.ance.nq-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-nq-test \
+                   --index wikipedia-ance-multi-bf \
+                   --encoder castorini/ance-dpr-question-multi \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.8224, places=4)
+
+     def test_nq_test_ance_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-nq-test')
+         topics = get_topics('dpr-nq-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_trivia_test_ance_bf_otf(self):
+         output_file = 'test_run.ance.trivia-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.ance.trivia-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-trivia-test \
+                   --index wikipedia-ance-multi-bf \
+                   --encoder castorini/ance-dpr-question-multi \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.8010, places=4)
+
+     def test_trivia_test_ance_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-trivia-test')
+         topics = get_topics('dpr-trivia-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
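The --prf-method avg and --prf-method rocchio options exercised above rewrite the query embedding using the top retrieved passages before a second retrieval pass. A minimal numpy sketch of the two updates, assuming avg means the query vector is averaged together with the top-k passage vectors and rocchio means the classic weighted combination (alpha and beta correspond to --rocchio-alpha and --rocchio-beta); this illustrates the idea, not Pyserini's internal implementation:

import numpy as np

def prf_avg(query_vec: np.ndarray, doc_vecs: np.ndarray) -> np.ndarray:
    # Average the query embedding with the top-k passage embeddings.
    return np.mean(np.vstack([query_vec[None, :], doc_vecs]), axis=0)

def prf_rocchio(query_vec: np.ndarray, doc_vecs: np.ndarray,
                alpha: float = 0.4, beta: float = 0.6) -> np.ndarray:
    # Classic Rocchio update without the negative-feedback term.
    return alpha * query_vec + beta * np.mean(doc_vecs, axis=0)

q = np.random.rand(768).astype(np.float32)        # e.g., an ANCE query embedding
docs = np.random.rand(3, 768).astype(np.float32)  # top passages at --prf-depth 3
q_avg = prf_avg(q, docs)
q_roc = prf_rocchio(q, docs)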
integrations/dense/test_distilbert_kd.py ADDED
@@ -0,0 +1,45 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for DistilBERT KD."""
+
+ import unittest
+
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     # Note that we test actual retrieval in 2CR, so no need to test here.
+     def test_distilbert_kd_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('distilbert_kd-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('distilbert_kd-dl19-passage')
+         topics = get_topics('dl19-passage')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('distilbert_kd-dl20')
+         topics = get_topics('dl20')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+
+ if __name__ == '__main__':
+     unittest.main()
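The encoded-queries tests above only verify that every topic has a pre-computed embedding. At search time, a loaded QueryEncoder can be passed to FaissSearcher in place of a Transformer model name, so no query encoding has to run at all. A minimal sketch; the prebuilt index name below is an assumption for illustration:

from pyserini.search import QueryEncoder, get_topics
from pyserini.search.faiss import FaissSearcher

# Load pre-computed query embeddings instead of instantiating the encoder model.
encoder = QueryEncoder.load_encoded_queries('distilbert_kd-msmarco-passage-dev-subset')
topics = get_topics('msmarco-passage-dev-subset')

# Assumed prebuilt index name, for illustration only.
searcher = FaissSearcher.from_prebuilt_index('msmarco-passage-distilbert-dot-margin_mse-T2-bf', encoder)
query = next(iter(topics.values()))['title']
hits = searcher.search(query, k=10)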
integrations/dense/test_distilbert_tasb.py ADDED
@@ -0,0 +1,44 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for DistilBERT TAS-B."""
+
+ import unittest
+
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def test_distilbert_kd_tas_b_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-dl19-passage')
+         topics = get_topics('dl19-passage')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+         encoded = QueryEncoder.load_encoded_queries('distilbert_tas_b-dl20')
+         topics = get_topics('dl20')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/dense/test_dpr.py ADDED
@@ -0,0 +1,339 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for DPR models, using both on-the-fly query encoding and pre-encoded queries."""
+
+ import json
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score_qa
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_dpr_nq_test_bf_otf(self):
+         output_file = 'test_run.dpr.nq-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.dpr.nq-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-nq-test \
+                   --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.7947, places=4)
+
+     def test_dpr_nq_test_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.dpr.nq-test.multi.bf.otf.bm25.trec'
+         retrieval_file = 'test_run.dpr.nq-test.multi.bf.otf.bm25.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   sparse --index wikipedia-dpr \
+                   fusion --alpha 1.3 \
+                   run --topics dpr-nq-test \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output {output_file}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-nq-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.8260, places=4)
+
+     def test_dpr_nq_test_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-nq-test')
+         topics = get_topics('dpr-nq-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_dpr_trivia_test_bf_otf(self):
+         output_file = 'test_run.dpr.trivia-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.dpr.trivia-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-trivia-test \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   --index wikipedia-dpr-multi-bf \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.7887, places=4)
+
+     def test_dpr_trivia_test_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.dpr.trivia-test.multi.bf.otf.bm25.trec'
+         retrieval_file = 'test_run.dpr.trivia-test.multi.bf.otf.bm25.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   sparse --index wikipedia-dpr \
+                   fusion --alpha 0.95 \
+                   run --topics dpr-trivia-test \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output {output_file}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-trivia-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.8264, places=4)
+
+     def test_dpr_trivia_test_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-trivia-test')
+         topics = get_topics('dpr-trivia-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_dpr_wq_test_bf_otf(self):
+         output_file = 'test_run.dpr.wq-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.dpr.wq-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-wq-test \
+                   --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-wq-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.7505, places=4)
+
+     def test_dpr_wq_test_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.dpr.wq-test.multi.bf.otf.bm25.trec'
+         retrieval_file = 'test_run.dpr.wq-test.multi.bf.otf.bm25.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   sparse --index wikipedia-dpr \
+                   fusion --alpha 0.95 \
+                   run --topics dpr-wq-test \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output {output_file}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-wq-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.7712, places=4)
+
+     def test_dpr_wq_test_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-wq-test')
+         topics = get_topics('dpr-wq-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_dpr_curated_test_bf_otf(self):
+         output_file = 'test_run.dpr.curated-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.dpr.curated-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-curated-test \
+                   --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-curated-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file} \
+                   --regex'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20 --regex'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.8876, places=4)
+
+     def test_dpr_curated_test_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.dpr.curated-test.multi.bf.otf.bm25.trec'
+         retrieval_file = 'test_run.dpr.curated-test.multi.bf.otf.bm25.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   sparse --index wikipedia-dpr \
+                   fusion --alpha 1.05 \
+                   run --topics dpr-curated-test \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output {output_file}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-curated-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file} \
+                   --regex'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20 --regex'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.9006, places=4)
+
+     def test_dpr_curated_test_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-curated-test')
+         topics = get_topics('dpr-curated-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_dpr_squad_test_bf_otf(self):
+         output_file = 'test_run.dpr.squad-test.multi.bf.otf.trec'
+         retrieval_file = 'test_run.dpr.squad-test.multi.bf.otf.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.faiss --topics dpr-squad-test \
+                   --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-squad-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         self.assertAlmostEqual(score, 0.5199, places=4)
+
+     def test_dpr_squad_test_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.dpr.squad-test.multi.bf.otf.bm25.trec'
+         retrieval_file = 'test_run.dpr.squad-test.multi.bf.otf.bm25.json'
+         self.temp_files.extend([output_file, retrieval_file])
+         cmd1 = f'python -m pyserini.search.hybrid dense --index wikipedia-dpr-multi-bf \
+                   --encoder facebook/dpr-question_encoder-multiset-base \
+                   sparse --index wikipedia-dpr \
+                   fusion --alpha 2.0 \
+                   run --topics dpr-squad-test \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output {output_file}'
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics dpr-squad-test \
+                   --index wikipedia-dpr \
+                   --input {output_file} \
+                   --output {retrieval_file}'
+         cmd3 = f'python -m pyserini.eval.evaluate_dpr_retrieval --retrieval {retrieval_file} --topk 20'
+         status1 = os.system(cmd1)
+         status2 = os.system(cmd2)
+         stdout, stderr = run_command(cmd3)
+         score = parse_score_qa(stdout, 'Top20')
+         self.assertEqual(status1, 0)
+         self.assertEqual(status2, 0)
+         # This appears to be a flaky test case; previously, we were getting a score of 0.7511, per
+         # https://github.com/castorini/pyserini/pull/1273/files#diff-799c2c339e1d7defa31fa1e82f9b16886269b37805376ef93f7c8afedcee574e
+         # Sometimes we get 0.7512, so we loosen the tolerance here.
+         self.assertAlmostEqual(score, 0.7514, places=3)
+
+     def test_dpr_squad_test_encoded_queries(self):
+         encoder = QueryEncoder.load_encoded_queries('dpr_multi-squad-test')
+         topics = get_topics('dpr-squad-test')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoder.embedding)
+
+     def test_convert_trec_run_to_dpr_retrieval_run(self):
+         trec_run_file = 'tests/resources/simple_test_run_convert_trec_run_dpr.trec'
+         topics_file = 'tests/resources/simple_topics_dpr.txt'
+         dpr_run_file = 'test_run.convert.trec_run.dpr.json'
+         collection_path = 'tests/resources/sample_collection_dense'
+         topic_reader = 'io.anserini.search.topicreader.DprNqTopicReader'
+         index_dir = 'temp_index'
+
+         self.temp_files.extend([dpr_run_file, index_dir])
+         cmd1 = f'python -m pyserini.index.lucene -collection JsonCollection ' + \
+                f'-generator DefaultLuceneDocumentGenerator ' + \
+                f'-threads 1 -input {collection_path} -index {index_dir} -storeRaw'
+
+         cmd2 = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run --topics-file {topics_file} \
+                   --topics-reader {topic_reader} \
+                   --index {index_dir} \
+                   --input {trec_run_file} \
+                   --output {dpr_run_file}'
+         _ = os.system(cmd1)
+         _ = os.system(cmd2)
+
+         with open(dpr_run_file) as f:
+             topic_data = json.load(f)
+
+         self.assertEqual(topic_data["0"]["answers"], ['text'])
+         self.assertEqual(topic_data["0"]["question"], "what is in document three")
+         self.assertEqual(topic_data["1"]["answers"], ['contents'])
+         self.assertEqual(topic_data["1"]["question"], "what is document two")
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
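The pyserini.search.hybrid invocations above fuse a dense DPR run with a sparse BM25 run, weighted by --alpha. A minimal sketch of this style of score interpolation, assuming the common dense-plus-alpha-times-sparse convention with documents missing from one run backed off to that run's minimum score (illustrative only; the actual fusion logic lives in pyserini.search.hybrid):

from typing import Dict

def hybrid_fuse(dense: Dict[str, float], sparse: Dict[str, float],
                alpha: float = 1.3) -> Dict[str, float]:
    # Interpolate the two runs; docs absent from one run fall back to that
    # run's minimum observed score. alpha=1.3 mirrors the NQ test above.
    min_dense, min_sparse = min(dense.values()), min(sparse.values())
    return {d: dense.get(d, min_dense) + alpha * sparse.get(d, min_sparse)
            for d in set(dense) | set(sparse)}

fused = hybrid_fuse({'d1': 88.2, 'd2': 87.9}, {'d1': 7.1, 'd3': 9.4})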
integrations/dense/test_encode.py ADDED
@@ -0,0 +1,137 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for creating dense indexes."""
+
+ import os
+ import shutil
+ import unittest
+ from urllib.request import urlretrieve
+
+ import faiss
+
+ from pyserini.search.faiss import FaissSearcher
+ from pyserini.search.lucene import LuceneImpactSearcher
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         curdir = os.getcwd()
+         if curdir.endswith('dense'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+         self.temp_folders = []
+         self.corpus_url = 'https://github.com/castorini/anserini-data/raw/master/CACM/corpus/jsonl/cacm.json'
+         self.corpus_path = f'{self.pyserini_root}/integrations/dense/temp_cacm/'
+         os.makedirs(self.corpus_path, exist_ok=True)
+         self.temp_folders.append(self.corpus_path)
+         urlretrieve(self.corpus_url, os.path.join(self.corpus_path, 'cacm.json'))
+
+     def test_dpr_encode_as_faiss(self):
+         index_dir = f'{self.pyserini_root}/temp_index'
+         self.temp_folders.append(index_dir)
+         cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
+                   --fields text \
+                   output --embeddings {index_dir} --to-faiss \
+                   encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                   --fields text \
+                   --batch 4 \
+                   --device cpu'
+         _ = os.system(cmd1)
+         searcher = FaissSearcher(
+             index_dir,
+             'facebook/dpr-question_encoder-multiset-base'
+         )
+         q_emb, hit = searcher.search('What is the solution of separable closed queueing networks?', k=1, return_vector=True)
+         self.assertEqual(hit[0].docid, 'CACM-2445')
+         self.assertAlmostEqual(hit[0].vectors[0], -6.88267112e-01, places=4)
+         self.assertEqual(searcher.num_docs, 3204)
+
+     def test_dpr_encode_as_faiss_search_with_partitions(self):
+         # Create two partitions of the CACM index, search them individually, and merge results to compute top hit
+         index_dir = f'{self.pyserini_root}/temp_index'
+         os.makedirs(os.path.join(index_dir, 'partition1'), exist_ok=True)
+         os.makedirs(os.path.join(index_dir, 'partition2'), exist_ok=True)
+         self.temp_folders.append(index_dir)
+         cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
+                   --fields text \
+                   output --embeddings {index_dir} --to-faiss \
+                   encoder --encoder facebook/dpr-ctx_encoder-multiset-base \
+                   --fields text \
+                   --batch 4 \
+                   --device cpu'
+         _ = os.system(cmd1)
+         index = faiss.read_index(os.path.join(index_dir, 'index'))
+         new_index_partition1 = faiss.IndexFlatIP(index.d)
+         new_index_partition2 = faiss.IndexFlatIP(index.d)
+         vectors_partition1 = index.reconstruct_n(0, index.ntotal // 2)
+         vectors_partition2 = index.reconstruct_n(index.ntotal // 2, index.ntotal - index.ntotal // 2)
+         new_index_partition1.add(vectors_partition1)
+         new_index_partition2.add(vectors_partition2)
+
+         faiss.write_index(new_index_partition1, os.path.join(index_dir, 'partition1/index'))
+         faiss.write_index(new_index_partition2, os.path.join(index_dir, 'partition2/index'))
+
+         with open(os.path.join(index_dir, 'partition1/docid'), 'w') as docid1, open(os.path.join(index_dir, 'partition2/docid'), 'w') as docid2:
+             with open(os.path.join(index_dir, 'docid'), 'r') as file:
+                 for i in range(index.ntotal):
+                     line = next(file)
+                     if i < (index.ntotal // 2):
+                         docid1.write(line)
+                     else:
+                         docid2.write(line)
+
+         searcher_partition1 = FaissSearcher(index_dir + '/partition1', 'facebook/dpr-question_encoder-multiset-base')
+         searcher_partition2 = FaissSearcher(index_dir + '/partition2', 'facebook/dpr-question_encoder-multiset-base')
+         q_emb, hit1 = searcher_partition1.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True)
+         q_emb, hit2 = searcher_partition2.search('What is the solution of separable closed queueing networks?', k=2, return_vector=True)
+         merged_hits = hit1 + hit2
+         merged_hits.sort(key=lambda x: x.score, reverse=True)
+
+         self.assertEqual(merged_hits[0].docid, 'CACM-2445')
+         self.assertAlmostEqual(merged_hits[0].vectors[0], -6.88267112e-01, places=4)
+         self.assertEqual(searcher_partition1.num_docs, 1602)
+         self.assertEqual(searcher_partition2.num_docs, 1602)
+
+     def test_unicoil_encode_as_jsonl(self):
+         embedding_dir = f'{self.pyserini_root}/temp_embeddings'
+         self.temp_folders.append(embedding_dir)
+         cmd1 = f'python -m pyserini.encode input --corpus {self.corpus_path} \
+                   --fields text \
+                   output --embeddings {embedding_dir} \
+                   encoder --encoder castorini/unicoil-msmarco-passage \
+                   --fields text \
+                   --batch 4 \
+                   --device cpu'
+         _ = os.system(cmd1)
+         index_dir = f'{self.pyserini_root}/temp_lucene'
+         self.temp_folders.append(index_dir)
+         cmd2 = f'python -m pyserini.index -collection JsonVectorCollection \
+                   -input {embedding_dir} \
+                   -index {index_dir} \
+                   -generator DefaultLuceneDocumentGenerator \
+                   -impact -pretokenized -threads 12 -storeRaw'
+         _ = os.system(cmd2)
+         searcher = LuceneImpactSearcher(index_dir, query_encoder='castorini/unicoil-msmarco-passage')
+         hits = searcher.search('What is the solution of separable closed queueing networks?', k=1)
+         hit = hits[0]
+         self.assertEqual(hit.docid, 'CACM-2712')
+         self.assertAlmostEqual(hit.score, 18.402, places=3)
+
+     def tearDown(self):
+         for f in self.temp_folders:
+             shutil.rmtree(f)
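For reference, the uniCOIL test above first writes JSONL that Anserini's JsonVectorCollection indexes with -impact -pretokenized. Each record pairs the raw text with a term-to-weight map; a representative record might look like the sketch below, assuming the weights are quantized term impacts (the values here are made up):

# One JSONL record for JsonVectorCollection; term weights are invented here.
record = {
    'id': 'CACM-2712',
    'contents': 'The solution of separable closed queueing networks ...',
    'vector': {'solution': 87, 'separable': 112, 'queueing': 134, 'networks': 59},
}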
integrations/dense/test_kilt.py ADDED
@@ -0,0 +1,69 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for KILT retrieval."""
+
+ import os
+ import re
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command
+
+
+ def parse_kilt_score(output, metric, digits=4):
+     pattern = re.compile(r"[0-1]\.[0-9]*")
+     for line in output.split('\n')[::-1]:
+         if metric in line:
+             score = float(pattern.search(line).group(0))
+             return round(score, digits)
+     return None
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_kilt_search(self):
+         run_file = 'test_run.fever-dev-kilt.jsonl'
+         self.temp_files.append(run_file)
+         cmd1 = f'python -m pyserini.search --topics fever-dev-kilt \
+                   --topics-format kilt \
+                   --index wikipedia-kilt-doc \
+                   --output {run_file} \
+                   --output-format kilt \
+                   --threads {self.threads} \
+                   --batch-size {self.batch_size}'
+         status = os.system(cmd1)
+         self.assertEqual(status, 0)
+         cmd2 = f'python -m pyserini.eval.evaluate_kilt_retrieval {run_file} fever-dev-kilt --ks 1,100'
+         stdout, stderr = run_command(cmd2)
+         score = parse_kilt_score(stdout, "Rprec")
+         self.assertAlmostEqual(score, 0.3821, delta=0.0001)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
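As a quick illustration of the parse_kilt_score helper defined above: it scans the evaluator's stdout bottom-up and pulls the first [0, 1]-valued number on the line naming the metric (the sample output string below is made up):

sample_output = 'Rprec: 0.3821\nrecall@5: 0.5513'
assert parse_kilt_score(sample_output, 'Rprec') == 0.3821
# The pattern [0-1]\.[0-9]* only matches values in [0, 1], which suits Rprec and recall.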
integrations/dense/test_sbert.py ADDED
@@ -0,0 +1,67 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for the SBERT model using on-the-fly query encoding."""
+
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_msmarco_passage_sbert_bf_otf(self):
+         output_file = 'test_run.msmarco-passage.sbert.bf.otf.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
+                   --index msmarco-passage-sbert-bf \
+                   --encoder sentence-transformers/msmarco-distilbert-base-v3 \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3314, delta=0.0001)
+
+     def test_msmarco_passage_sbert_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('sbert-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/dense/test_tct_colbert-v2.py ADDED
@@ -0,0 +1,132 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for TCT-ColBERTv2 models using on-the-fly query encoding."""
+
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_msmarco_passage_tct_colbert_v2_bf_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert-v2.bf-otf.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
+                   --index msmarco-passage-tct_colbert-v2-bf \
+                   --encoder castorini/tct_colbert-v2-msmarco \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3440, delta=0.0001)
+
+     def test_msmarco_passage_tct_colbert_v2_hn_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert-v2-hn.bf-otf.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
+                   --index msmarco-passage-tct_colbert-v2-hn-bf \
+                   --encoder castorini/tct_colbert-v2-hn-msmarco \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3543, delta=0.0001)
+
+     def test_msmarco_passage_tct_colbert_v2_hnp_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert-v2-hnp.bf-otf.bm25.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-v2-hnp-bf \
+                   --encoder castorini/tct_colbert-v2-hnp-msmarco \
+                   sparse --index msmarco-passage \
+                   fusion --alpha 0.06 \
+                   run --topics msmarco-passage-dev-subset \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3682, delta=0.0001)
+
+     def test_msmarco_passage_tct_colbert_v2_hnp_bf_d2q_hybrid_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert-v2-hnp.bf-otf.doc2queryT5.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-v2-hnp-bf \
+                   --encoder castorini/tct_colbert-v2-hnp-msmarco \
+                   sparse --index msmarco-passage-expanded \
+                   fusion --alpha 0.1 \
+                   run --topics msmarco-passage-dev-subset \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3731, delta=0.0001)
+
+     def test_msmarco_passage_tct_colbert_v2_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def test_msmarco_passage_tct_colbert_v2_hn_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-hn-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def test_msmarco_passage_tct_colbert_v2_hnp_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('tct_colbert-v2-hnp-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/dense/test_tct_colbert.py ADDED
@@ -0,0 +1,183 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for TCT-ColBERTv1 models using on-the-fly query encoding."""
+
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score
+ from pyserini.search import QueryEncoder
+ from pyserini.search import get_topics
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 16
+         self.batch_size = 256
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_msmarco_passage_tct_colbert_bf_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
+                   --index msmarco-passage-tct_colbert-bf \
+                   --encoder castorini/tct_colbert-msmarco \
+                   --batch-size {self.batch_size} \
+                   --threads {self.threads} \
+                   --output {output_file} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         # We get a small difference in scores on macOS vs. Linux, so check with a tolerance:
+         self.assertAlmostEqual(score, 0.3350, delta=0.0002)
+
+     def test_msmarco_passage_tct_colbert_hnsw_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert.hnsw-otf.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.faiss --topics msmarco-passage-dev-subset \
+                   --index msmarco-passage-tct_colbert-hnsw \
+                   --encoder castorini/tct_colbert-msmarco \
+                   --output {output_file} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3345, delta=0.0002)
+
+     def test_msmarco_passage_tct_colbert_bf_bm25_hybrid_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.bm25.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-bf \
+                   --encoder castorini/tct_colbert-msmarco \
+                   sparse --index msmarco-passage \
+                   fusion --alpha 0.12 \
+                   run --topics msmarco-passage-dev-subset \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3529, places=4)
+
+     def test_msmarco_passage_tct_colbert_bf_d2q_hybrid_otf(self):
+         output_file = 'test_run.msmarco-passage.tct_colbert.bf-otf.doc2queryT5.tsv'
+         self.temp_files.append(output_file)
+         cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-passage-tct_colbert-bf \
+                   --encoder castorini/tct_colbert-msmarco \
+                   sparse --index msmarco-passage-expanded \
+                   fusion --alpha 0.22 \
+                   run --topics msmarco-passage-dev-subset \
+                   --output {output_file} \
+                   --batch-size {self.batch_size} --threads {self.threads} \
+                   --output-format msmarco'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.3647, places=4)
+
+     def test_msmarco_passage_tct_colbert_encoded_queries(self):
+         encoded = QueryEncoder.load_encoded_queries('tct_colbert-msmarco-passage-dev-subset')
+         topics = get_topics('msmarco-passage-dev-subset')
+         for t in topics:
+             self.assertTrue(topics[t]['title'] in encoded.embedding)
+
+     def test_msmarco_doc_tct_colbert_bf_otf(self):
+         output_file = 'test_run.msmarco-doc.passage.tct_colbert-otf.txt'
116
+ self.temp_files.append(output_file)
117
+ cmd1 = f'python -m pyserini.search.faiss --topics msmarco-doc-dev \
118
+ --index msmarco-doc-tct_colbert-bf \
119
+ --encoder castorini/tct_colbert-msmarco \
120
+ --output {output_file} \
121
+ --hits 1000 \
122
+ --max-passage \
123
+ --max-passage-hits 100 \
124
+ --output-format msmarco \
125
+ --batch-size {self.batch_size} \
126
+ --threads {self.threads}'
127
+ cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
128
+ status = os.system(cmd1)
129
+ stdout, stderr = run_command(cmd2)
130
+ score = parse_score(stdout, "MRR @100")
131
+ self.assertEqual(status, 0)
132
+ self.assertAlmostEqual(score, 0.3323, places=4)
133
+
134
+ def test_msmarco_doc_tct_colbert_bf_bm25_hybrid_otf(self):
135
+ output_file = 'test_run.msmarco-doc.tct_colbert.bf-otf.bm25.tsv'
136
+ self.temp_files.append(output_file)
137
+ cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-doc-tct_colbert-bf \
138
+ --encoder castorini/tct_colbert-msmarco \
139
+ sparse --index msmarco-doc-per-passage \
140
+ fusion --alpha 0.25 \
141
+ run --topics msmarco-doc-dev \
142
+ --output {output_file} \
143
+ --hits 1000 --max-passage --max-passage-hits 100 \
144
+ --batch-size {self.batch_size} --threads {self.threads} \
145
+ --output-format msmarco'
146
+ cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
147
+ status = os.system(cmd1)
148
+ stdout, stderr = run_command(cmd2)
149
+ score = parse_score(stdout, "MRR @100")
150
+ self.assertEqual(status, 0)
151
+ self.assertAlmostEqual(score, 0.3701, places=4)
152
+
153
+ def test_msmarco_doc_tct_colbert_bf_d2q_hybrid_otf(self):
154
+ output_file = 'test_run.msmarco-doc.tct_colbert.bf-otf.doc2queryT5.tsv'
155
+ self.temp_files.append(output_file)
156
+ cmd1 = f'python -m pyserini.search.hybrid dense --index msmarco-doc-tct_colbert-bf \
157
+ --encoder castorini/tct_colbert-msmarco \
158
+ sparse --index msmarco-doc-expanded-per-passage \
159
+ fusion --alpha 0.32 \
160
+ run --topics msmarco-doc-dev \
161
+ --output {output_file} \
162
+ --hits 1000 --max-passage --max-passage-hits 100 \
163
+ --batch-size {self.batch_size} --threads {self.threads} \
164
+ --output-format msmarco'
165
+ cmd2 = f'python -m pyserini.eval.msmarco_doc_eval --judgments msmarco-doc-dev --run {output_file}'
166
+ status = os.system(cmd1)
167
+ stdout, stderr = run_command(cmd2)
168
+ score = parse_score(stdout, "MRR @100")
169
+ self.assertEqual(status, 0)
170
+ self.assertAlmostEqual(score, 0.3784, places=4)
171
+
172
+ def test_msmarco_doc_tct_colbert_encoded_queries(self):
173
+ encoded = QueryEncoder.load_encoded_queries('tct_colbert-msmarco-doc-dev')
174
+ topics = get_topics('msmarco-doc-dev')
175
+ for t in topics:
176
+ self.assertTrue(topics[t]['title'] in encoded.embedding)
177
+
178
+ def tearDown(self):
179
+ clean_files(self.temp_files)
180
+
181
+
182
+ if __name__ == '__main__':
183
+ unittest.main()
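Every test above follows the same pattern: run the search command, then capture the evaluator's stdout and extract one metric. A hypothetical sketch of what a parse_score-style helper does; the real integrations.utils.parse_score may differ in detail:

    # Hypothetical sketch: scan evaluator output for the metric label and
    # return the trailing float (not the actual integrations.utils code).
    def parse_score_sketch(stdout: str, metric: str) -> float:
        for line in stdout.splitlines():
            if metric in line:
                return round(float(line.split()[-1]), 4)
        raise ValueError(f'metric {metric!r} not found in evaluator output')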
integrations/lucenesearcher_anserini_checker.py ADDED
@@ -0,0 +1,78 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import filecmp
+ import os
+ from typing import List
+
+
+ class LuceneSearcherAnseriniMatchChecker:
+     def __init__(self, anserini_root: str, index: str, topics: str, pyserini_topics: str, qrels: str, eval_root: str):
+         self.anserini_root = anserini_root
+         self.index_path = index
+         self.topics = topics
+         self.qrels = qrels
+         self.pyserini_topics = pyserini_topics
+
+         self.anserini_base_cmd = os.path.join(self.anserini_root,
+                                               'target/appassembler/bin/SearchCollection -topicreader Trec')
+         self.pyserini_base_cmd = 'python -m pyserini.search.lucene'
+
+         self.eval_base_cmd = os.path.join(eval_root, 'tools/eval/trec_eval.9.0.4/trec_eval -m map -m P.30')
+
+     @staticmethod
+     def _cleanup(files: List[str]):
+         for file in files:
+             if os.path.exists(file):
+                 os.remove(file)
+
+     def run(self, runtag: str, anserini_extras: str, pyserini_extras: str):
+         print('-------------------------')
+         print(f'Running {runtag}:')
+         print('-------------------------')
+
+         anserini_output = f'verify.anserini.{runtag}.txt'
+         pyserini_output = f'verify.pyserini.{runtag}.txt'
+
+         anserini_cmd = f'{self.anserini_base_cmd} -index {self.index_path} ' \
+                        + f'-topics {self.topics} -output {anserini_output} {anserini_extras}'
+         pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \
+                        + f'--topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}'
+
+         status = os.system(anserini_cmd)
+         if not status == 0:
+             self._cleanup([anserini_output, pyserini_output])
+             return False
+         status = os.system(pyserini_cmd)
+         if not status == 0:
+             self._cleanup([anserini_output, pyserini_output])
+             return False
+
+         res = filecmp.cmp(anserini_output, pyserini_output)
+         if res is True:
+             eval_cmd = f'{self.eval_base_cmd} {self.qrels} {anserini_output}'
+             status = os.system(eval_cmd)
+             if not status == 0:
+                 print(f'[FAIL] {runtag} evaluation failure!')
+                 self._cleanup([anserini_output, pyserini_output])
+                 return False
+             print(f'[SUCCESS] {runtag} results verified!')
+             self._cleanup([anserini_output, pyserini_output])
+             return True
+         else:
+             print(f'[FAIL] {runtag} results do not match!')
+             self._cleanup([anserini_output, pyserini_output])
+             return False
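A usage sketch for this checker; the index and topic paths below are placeholders, and running it requires a local Anserini build:

    from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker

    # Placeholder paths; substitute a real Anserini checkout, index, and topics.
    checker = LuceneSearcherAnseriniMatchChecker(
        anserini_root='../anserini',
        index='../anserini/indexes/lucene-index.example',
        topics='tools/topics-and-qrels/topics.example.txt',
        pyserini_topics='example',
        qrels='tools/topics-and-qrels/qrels.example.txt',
        eval_root='.')
    assert checker.run('example_bm25', '-bm25', '--bm25')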
integrations/lucenesearcher_score_checker.py ADDED
@@ -0,0 +1,71 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ from typing import List
+
+ from integrations.utils import run_command, parse_score
+
+
+ class LuceneSearcherScoreChecker:
+     def __init__(self, index: str, topics: str, pyserini_topics: str, qrels: str, eval: str):
+         self.index_path = index
+         self.topics = topics
+         self.qrels = qrels
+         self.pyserini_topics = pyserini_topics
+
+         self.pyserini_base_cmd = 'python -m pyserini.search.lucene'
+
+         self.eval_base_cmd = eval
+
+     @staticmethod
+     def _cleanup(files: List[str]):
+         for file in files:
+             if os.path.exists(file):
+                 os.remove(file)
+
+     def run(self, runtag: str, pyserini_extras: str, actualscore: float, tokenizer=None):
+         print('-------------------------')
+         print(f'Running {runtag}:')
+         print('-------------------------')
+
+         pyserini_output = f'verify.pyserini.{runtag}.txt'
+
+         pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} \
+                 --topics {self.pyserini_topics} --output {pyserini_output} {pyserini_extras}'
+
+         if tokenizer is not None:
+             pyserini_cmd = pyserini_cmd + f' --tokenizer {tokenizer}'
+
+         status = os.system(pyserini_cmd)
+         if not status == 0:
+             return False
+
+         # The eval command runs twice: once to check the exit status with its
+         # output in the console, once to capture stdout for score parsing.
+         eval_cmd = f'{self.eval_base_cmd} {self.qrels} {pyserini_output}'
+         status = os.system(eval_cmd)
+         if not status == 0:
+             return False
+
+         stdout, stderr = run_command(eval_cmd)
+         score = parse_score(stdout, 'map')
+         self._cleanup([pyserini_output])
+
+         if actualscore != score:
+             return False
+
+         return True
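A usage sketch for this checker; the topics, qrels, trec_eval path, and the expected MAP value of 0.1234 are all placeholders:

    from integrations.lucenesearcher_score_checker import LuceneSearcherScoreChecker

    # Placeholder arguments; substitute real topic/qrels names and an expected score.
    checker = LuceneSearcherScoreChecker(
        index='msmarco-v1-passage',
        topics='msmarco-passage-dev-subset',
        pyserini_topics='msmarco-passage-dev-subset',
        qrels='tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt',
        eval='tools/eval/trec_eval.9.0.4/trec_eval -c -m map')
    assert checker.run('dev_bm25', '--bm25', 0.1234)  # 0.1234 is a placeholder MAP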
integrations/papers/test_ecir2023.py ADDED
@@ -0,0 +1,91 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for commands in Pradeep et al. resource paper at ECIR 2023."""
+
+ import os
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score, parse_score_qa
+
+
+ class TestECIR2023(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+
+     def test_section5_sub2_first(self):
+         """Sample code of the first command in Section 5.2."""
+         metrics = ["Top5", "Top20", "Top100"]
+         ground_truth = [73.8, 84.27, 89.34]
+
+         output_file = 'runs/run.nq-test.dkrr.trec'
+         json_file = 'runs/run.nq-test.dkrr.json'
+         self.temp_files.append(output_file)
+         self.temp_files.append(json_file)
+
+         # retrieval
+         run_cmd = f'python -m pyserini.search.faiss \
+                 --index wikipedia-dpr-dkrr-nq \
+                 --topics nq-test \
+                 --encoder castorini/dkrr-dpr-nq-retriever \
+                 --output {output_file} --query-prefix question: \
+                 --threads 72 --batch-size 72 \
+                 --hits 100'
+         status = os.system(run_cmd)
+         self.assertEqual(status, 0)
+
+         # conversion
+         convert_cmd = f'python -m pyserini.eval.convert_trec_run_to_dpr_retrieval_run \
+                 --topics nq-test \
+                 --index wikipedia-dpr \
+                 --input {output_file} \
+                 --output {json_file}'
+         status = os.system(convert_cmd)
+         self.assertEqual(status, 0)
+
+         # evaluation
+         eval_cmd = f'python -m pyserini.eval.evaluate_dpr_retrieval \
+                 --retrieval {json_file} \
+                 --topk 5 20 100'
+         stdout, stderr = run_command(eval_cmd)
+
+         scores = []
+         for mt in metrics:
+             scores.append(parse_score_qa(stdout, mt, 4) * 100)
+
+         for score in zip(scores, ground_truth):
+             self.assertAlmostEqual(score[0], score[1], delta=0.02)
+
+     def test_section5_sub2_second(self):
+         """Sample code of the second command in Section 5.2."""
+
+         cmd_nq = 'python scripts/repro_matrix/run_all_odqa.py --topics nq'
+         cmd_tqa = 'python scripts/repro_matrix/run_all_odqa.py --topics tqa'
+
+         # Run both commands and check that all tests passed (i.e., returned OK).
+         stdout_nq, stderr_nq = run_command(cmd_nq)
+         self.assertEqual(stdout_nq.count('[OK]'), 21)
+
+         stdout_tqa, stderr_tqa = run_command(cmd_tqa)
+         self.assertEqual(stdout_tqa.count('[OK]'), 21)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/papers/test_sigir2021.py ADDED
@@ -0,0 +1,180 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for commands in Lin et al. (SIGIR 2021) paper."""
+
+ import os
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score_msmarco
+ from pyserini.dsearch import SimpleDenseSearcher, TctColBertQueryEncoder
+ from pyserini.hsearch import HybridSearcher
+ from pyserini.index import IndexReader
+ from pyserini.search import SimpleSearcher
+ from pyserini.search import get_topics, get_qrels
+
+
+ class TestSIGIR2021(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+
+     def test_figure1(self):
+         """Sample code in Figure 1."""
+
+         searcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')
+         hits = searcher.search('what is a lobster roll?', 10)
+
+         self.assertAlmostEqual(hits[0].score, 11.00830, delta=0.0001)
+         self.assertEqual(hits[0].docid, '7157707')
+
+         self.assertAlmostEqual(hits[9].score, 9.92200, delta=0.0001)
+         self.assertEqual(hits[9].docid, '6234461')
+
+         self.assertEqual(len(hits), 10)
+
+     def test_figure2(self):
+         """Sample code in Figure 2."""
+
+         encoder = TctColBertQueryEncoder('castorini/tct_colbert-msmarco')
+         searcher = SimpleDenseSearcher.from_prebuilt_index('msmarco-passage-tct_colbert-hnsw', encoder)
+         hits = searcher.search('what is a lobster roll')
+
+         self.assertAlmostEqual(hits[0].score, 70.53741, delta=0.0001)
+         self.assertEqual(hits[0].docid, '7157710')
+
+         self.assertAlmostEqual(hits[9].score, 69.01737, delta=0.0001)
+         self.assertEqual(hits[9].docid, '2920399')
+
+         self.assertEqual(len(hits), 10)
+
+     def test_figure3(self):
+         """Sample code in Figure 3."""
+
+         ssearcher = SimpleSearcher.from_prebuilt_index('msmarco-passage')
+         encoder = TctColBertQueryEncoder('castorini/tct_colbert-msmarco')
+         dsearcher = SimpleDenseSearcher.from_prebuilt_index('msmarco-passage-tct_colbert-hnsw', encoder)
+         hsearcher = HybridSearcher(dsearcher, ssearcher)
+
+         hits = hsearcher.search('what is a lobster roll')
+
+         self.assertAlmostEqual(hits[0].score, 71.56023, delta=0.0001)
+         self.assertEqual(hits[0].docid, '7157715')
+
+         self.assertAlmostEqual(hits[9].score, 70.07635, delta=0.0001)
+         self.assertEqual(hits[9].docid, '7157708')
+
+         self.assertEqual(len(hits), 10)
+
+     def test_figure4(self):
+         """Sample code in Figure 4."""
+
+         topics = get_topics('msmarco-passage-dev-subset')
+         qrels = get_qrels('msmarco-passage-dev-subset')
+
+         self.assertEqual(len(topics), 6980)
+         self.assertEqual(len(qrels), 6980)
+
+         # Compute the average length of queries:
+         avg_qlen = sum([len(topics[t]['title'].split()) for t in topics]) / len(topics)
+
+         # Compute the average number of relevance judgments per query:
+         avg_qrels = sum([len(qrels[t]) for t in topics]) / len(topics)
+
+         self.assertAlmostEqual(avg_qlen, 5.925, delta=0.001)
+         self.assertAlmostEqual(avg_qrels, 1.065, delta=0.001)
+
+     def test_figure5(self):
+         """Sample code in Figure 5."""
+
+         # Initialize from a pre-built index:
+         reader = IndexReader.from_prebuilt_index('robust04')
+
+         terms = reader.terms()
+         term = next(terms)
+         self.assertEqual(term.term, '0')
+         self.assertEqual(term.df, 10826)
+         self.assertEqual(term.cf, 33491)
+
+         term = next(terms)
+
+         self.assertEqual(term.term, '0,0')
+         self.assertEqual(term.df, 2)
+         self.assertEqual(term.cf, 2)
+
+         # Analyze a term:
+         term = 'atomic'
+         analyzed = reader.analyze(term)
+         self.assertEqual(analyzed[0], 'atom')
+
+         # Directly fetch term statistics for a term:
+         df, cf = reader.get_term_counts(term)
+         self.assertEqual(df, 5219)
+         self.assertEqual(cf, 9144)
+
+         # Traverse postings for a term:
+         postings_list = reader.get_postings_list(term)
+         self.assertEqual(len(postings_list), 5219)
+         self.assertEqual(postings_list[0].docid, 432)
+         self.assertEqual(postings_list[0].tf, 1)
+         self.assertEqual(postings_list[0].positions, [137])
+         self.assertEqual(postings_list[5218].docid, 527779)
+         self.assertEqual(postings_list[5218].tf, 1)
+         self.assertEqual(postings_list[5218].positions, [21])
+
+         # Examples of manipulating document vectors:
+         tf = reader.get_document_vector('LA071090-0047')
+         tp = reader.get_term_positions('LA071090-0047')
+         df = {
+             term: (reader.get_term_counts(term, analyzer=None))[0]
+             for term in tf.keys()
+         }
+         bm25_vector = {
+             term: reader.compute_bm25_term_weight('LA071090-0047',
+                                                   term,
+                                                   analyzer=None)
+             for term in tf.keys()
+         }
+
+         self.assertEqual(tf['hubbl'], 12)
+         self.assertEqual(tp['caught'], [42, 624, 960])
+         self.assertEqual(df['problem'], 82225)
+         self.assertAlmostEqual(bm25_vector['hubbl'], 7.49397, delta=0.001)
+         self.assertAlmostEqual(bm25_vector['earth'], 2.64872, delta=0.001)
+
+     def test_section3_3(self):
+         """Sample code in Section 3.3."""
+
+         output_file = 'run.msmarco-passage.txt'
+         self.temp_files.append(output_file)
+         run_cmd = f'python -m pyserini.search --topics msmarco-passage-dev-subset \
+                 --index msmarco-passage --output {output_file} \
+                 --bm25 --output-format msmarco'
+         status = os.system(run_cmd)
+         self.assertEqual(status, 0)
+
+         eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval \
+                 msmarco-passage-dev-subset {output_file}'
+         stdout, stderr = run_command(eval_cmd)
+         score = parse_score_msmarco(stdout, "MRR @10")
+         # Temporary fix: this is Lucene 9 code running on a Lucene 8 prebuilt index.
+         self.assertAlmostEqual(score, 0.1872, delta=0.0001)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
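The compute_bm25_term_weight check in Figure 5 corresponds to the standard BM25 term weight. A hedged sketch of the textbook form; Lucene's implementation differs in detail (for example, newer versions drop the (k1 + 1) factor and encode document lengths approximately), so exact values will not match:

    import math

    # Hedged sketch of the classic BM25 term weight; Lucene's variant differs,
    # so treat this as the formula's shape, not Pyserini's exact computation.
    def bm25_weight(tf, df, doclen, avg_doclen, num_docs, k1=0.9, b=0.4):
        idf = math.log(1 + (num_docs - df + 0.5) / (df + 0.5))
        return idf * tf * (k1 + 1) / (tf + k1 * (1 - b + b * doclen / avg_doclen))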
integrations/papers/test_sigir2022.py ADDED
@@ -0,0 +1,100 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for commands in Ma et al. resource paper and Trotman et al. demo paper at SIGIR 2022."""
+
+ import os
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score, parse_score_msmarco
+
+
+ class TestSIGIR2022(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+
+     def test_Ma_etal_section4_1a(self):
+         """Sample code in Section 4.1 of Ma et al. resource paper."""
+
+         output_file = 'run.msmarco-passage.expanded.txt'
+         self.temp_files.append(output_file)
+         run_cmd = f'python -m pyserini.search.lucene \
+                 --index msmarco-v1-passage-d2q-t5 \
+                 --topics msmarco-passage-dev-subset \
+                 --output {output_file} \
+                 --output-format msmarco \
+                 --bm25'
+         status = os.system(run_cmd)
+         self.assertEqual(status, 0)
+
+         eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval \
+                 msmarco-passage-dev-subset {output_file}'
+         stdout, stderr = run_command(eval_cmd)
+         score = parse_score_msmarco(stdout, "MRR @10")
+         self.assertAlmostEqual(score, 0.2816, delta=0.0001)
+         # Note that this is the score with (k1=2.18, b=0.86); the score is 0.2723 with default (k1=0.9, b=0.4) parameters.
+
+     def test_Ma_etal_section4_1b(self):
+         """Sample code in Section 4.1 of Ma et al. resource paper."""
+
+         output_file = 'run.msmarco-v2-passage.unicoil.txt'
+         self.temp_files.append(output_file)
+         run_cmd = f'python -m pyserini.search.lucene \
+                 --index msmarco-v2-passage-unicoil-0shot \
+                 --topics msmarco-v2-passage-dev \
+                 --encoder castorini/unicoil-msmarco-passage \
+                 --output {output_file} \
+                 --batch 144 --threads 36 \
+                 --hits 1000 \
+                 --impact'
+         status = os.system(run_cmd)
+         self.assertEqual(status, 0)
+
+         eval_cmd = f'python -m pyserini.eval.trec_eval -c -M 100 -m map -m recip_rank msmarco-v2-passage-dev {output_file}'
+         stdout, stderr = run_command(eval_cmd)
+         score = parse_score(stdout, "recip_rank")
+         self.assertAlmostEqual(score, 0.1501, delta=0.0001)
+         # This is the score with on-the-fly query encoding; with pre-encoded queries, the score is 0.1499.
+
+     def test_Trotman_etal(self):
+         """Sample code in Trotman et al. demo paper."""
+
+         output_file = 'run.msmarco-passage.unicoil.tsv'
+         self.temp_files.append(output_file)
+         run_cmd = f'python -m pyserini.search.lucene \
+                 --index msmarco-passage-unicoil-d2q \
+                 --topics msmarco-passage-dev-subset-unicoil \
+                 --output {output_file} \
+                 --output-format msmarco \
+                 --batch 36 --threads 12 \
+                 --hits 1000 \
+                 --impact'
+         status = os.system(run_cmd)
+         self.assertEqual(status, 0)
+
+         eval_cmd = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         stdout, stderr = run_command(eval_cmd)
+         score = parse_score_msmarco(stdout, "MRR @10", digits=3)
+         self.assertAlmostEqual(score, 0.352, delta=0.0005)
+
+         # TODO: There's corresponding test code with JASS that's also in the demo paper. We should add that as well.
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
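Both uniCOIL runs above pass --impact, which replaces BM25 scoring with summed learned term impacts. A minimal sketch of impact scoring, with quantization and tokenization details omitted:

    # Minimal sketch: an impact index stores a learned integer weight per term
    # per document; a document's score is the sum over matching query tokens.
    # No BM25-style document-length normalization is applied.
    def impact_score(query_tokens: list, doc_impacts: dict) -> int:
        return sum(doc_impacts.get(t, 0) for t in query_tokens)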
integrations/run_lucenesearcher.py ADDED
@@ -0,0 +1,51 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import hashlib
+ import os
+ from typing import List
+
+
+ class RunLuceneSearcher:
+     def __init__(self, index: str, topics: str):
+         self.index_path = index
+         self.topics = topics
+         self.pyserini_base_cmd = 'python -m pyserini.search.lucene'
+
+     @staticmethod
+     def _cleanup(files: List[str]):
+         for file in files:
+             if os.path.exists(file):
+                 os.remove(file)
+
+     def run(self, runtag: str, extras: str) -> str:
+         print('-------------------------')
+         print(f'Running {runtag}:')
+         print('-------------------------')
+
+         output = f'verify.pyserini.{runtag}.txt'
+         pyserini_cmd = f'{self.pyserini_base_cmd} --index {self.index_path} ' \
+                        + f'--topics {self.topics} --output {output} {extras}'
+
+         status = os.system(pyserini_cmd)
+         if not status == 0:
+             self._cleanup([output])
+             return ""
+
+         with open(output, 'rb') as f:
+             md5 = hashlib.md5(f.read()).hexdigest()
+         self._cleanup([output])
+         return md5
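The multithreading tests later in this commit use this runner to verify determinism: the same configuration run with different thread counts should produce byte-identical run files, hence equal MD5 digests. A usage sketch:

    from integrations.run_lucenesearcher import RunLuceneSearcher

    # Run one configuration under two thread counts; equal digests mean the
    # output is deterministic regardless of parallelism.
    runner = RunLuceneSearcher(index='msmarco-v1-passage', topics='msmarco-passage-dev-subset')
    md5_single = runner.run('determinism-0', '--threads 1 --batch-size 64 --output-format msmarco')
    md5_multi = runner.run('determinism-1', '--threads 4 --batch-size 64 --output-format msmarco')
    assert md5_single == md5_multi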
integrations/sparse/test_lucenesearcher_check_core17.py ADDED
@@ -0,0 +1,59 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import unittest
+
+ from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         # The current directory depends on whether you're running inside an IDE or from the command line.
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             anserini_root = '../../../anserini'
+             pyserini_root = '../..'
+         else:
+             anserini_root = '../anserini'
+             pyserini_root = '.'
+
+         self.checker = LuceneSearcherAnseriniMatchChecker(
+             anserini_root=anserini_root,
+             index=os.path.join(anserini_root, 'indexes/lucene-index.nyt'),
+             topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core17.txt'),
+             pyserini_topics='core17',
+             qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.core17.txt'),
+             eval_root=pyserini_root)
+
+     def test_bm25(self):
+         self.assertTrue(self.checker.run('core17_bm25', '-bm25', '--bm25'))
+
+     def test_bm25_rm3(self):
+         self.assertTrue(self.checker.run('core17_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))
+
+     def test_qld(self):
+         self.assertTrue(self.checker.run('core17_qld', '-qld', '--qld'))
+
+     def test_qld_rm3(self):
+         self.assertTrue(self.checker.run('core17_qld_rm3', '-qld -rm3', '--qld --rm3'))
+
+     def tearDown(self):
+         pass
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_check_core18.py ADDED
@@ -0,0 +1,59 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import unittest
+
+ from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         # The current directory depends on whether you're running inside an IDE or from the command line.
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             anserini_root = '../../../anserini'
+             pyserini_root = '../..'
+         else:
+             anserini_root = '../anserini'
+             pyserini_root = '.'
+
+         self.checker = LuceneSearcherAnseriniMatchChecker(
+             anserini_root=anserini_root,
+             index=os.path.join(anserini_root, 'indexes/lucene-index.wapo.v2'),
+             topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.core18.txt'),
+             pyserini_topics='core18',
+             qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.core18.txt'),
+             eval_root=pyserini_root)
+
+     def test_bm25(self):
+         self.assertTrue(self.checker.run('core18_bm25', '-bm25', '--bm25'))
+
+     def test_bm25_rm3(self):
+         self.assertTrue(self.checker.run('core18_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))
+
+     def test_qld(self):
+         self.assertTrue(self.checker.run('core18_qld', '-qld', '--qld'))
+
+     def test_qld_rm3(self):
+         self.assertTrue(self.checker.run('core18_qld_rm3', '-qld -rm3', '--qld --rm3'))
+
+     def tearDown(self):
+         pass
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_check_irst.py ADDED
@@ -0,0 +1,355 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import unittest
+ from shutil import rmtree
+ from random import randint
+
+ from integrations.utils import run_command, parse_score
+
+
+ class TestMsmarcoPassageIrst(unittest.TestCase):
+     def setUp(self):
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+         self.tmp = f'tmp{randint(0, 10000)}'
+         if os.path.isdir(self.tmp):
+             rmtree(self.tmp)
+         os.mkdir(self.tmp)
+         self.dl19_pass = 'dl19-passage'
+         self.dl20 = 'dl20'
+
+     def test_sum_aggregation_dl19_passage(self):
+         # dl19 passage sum
+         topic = 'dl19-passage'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_pass} \
+                 --index msmarco-v1-passage \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --alpha 0.1')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3281)
+         self.assertEqual(ndcg_score, 0.5260)
+
+     def test_sum_aggregation_dl20_passage(self):
+         # dl20 passage sum
+         topic = 'dl20-passage'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-passage \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --alpha 0.1')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3520)
+         self.assertEqual(ndcg_score, 0.5578)
+
+     def test_max_aggregation_dl19(self):
+         # dl19 passage max
+         topic = 'dl19-passage'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_pass} \
+                 --index msmarco-v1-passage \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3286)
+         self.assertEqual(ndcg_score, 0.5371)
+
+     def test_max_aggregation_dl20_passage(self):
+         # dl20 passage max
+         topic = 'dl20-passage'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-passage \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -l 2 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3357)
+         self.assertEqual(ndcg_score, 0.5469)
+
+     def tearDown(self):
+         rmtree(self.tmp)
+
+
+ class TestMsmarcoDocumentIrst(unittest.TestCase):
+     def setUp(self):
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+         self.tmp = f'tmp{randint(0, 10000)}'
+         if os.path.isdir(self.tmp):
+             rmtree(self.tmp)
+         os.mkdir(self.tmp)
+         self.dl19_doc = 'dl19-doc'
+         self.dl20 = 'dl20'
+
+     def test_sum_aggregation_dl19_doc(self):
+         # dl19-doc-sum
+         topic = 'dl19-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_doc} \
+                 --index msmarco-v1-doc \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --alpha 0.3')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.2524)
+         self.assertEqual(ndcg_score, 0.5494)
+
+     def test_sum_aggregation_dl20_doc(self):
+         # dl20-doc-sum
+         topic = 'dl20-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-doc \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --alpha 0.3')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3825)
+         self.assertEqual(ndcg_score, 0.5559)
+
+     def test_max_aggregation_dl19_doc(self):
+         # dl19-doc-max
+         topic = 'dl19-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_doc} \
+                 --index msmarco-v1-doc \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.2205)
+         self.assertEqual(ndcg_score, 0.4917)
+
+     def test_max_aggregation_dl20_doc(self):
+         # dl20-doc-max
+         topic = 'dl20-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-doc \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3373)
+         self.assertEqual(ndcg_score, 0.5015)
+
+     def tearDown(self):
+         rmtree(self.tmp)
+
+
+ class TestMsmarcoDocumentSegIrst(unittest.TestCase):
+     def setUp(self):
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             self.pyserini_root = '../..'
+         else:
+             self.pyserini_root = '.'
+         self.tmp = f'tmp{randint(0, 10000)}'
+         if os.path.isdir(self.tmp):
+             rmtree(self.tmp)
+         os.mkdir(self.tmp)
+         self.dl19_doc = 'dl19-doc'
+         self.dl20 = 'dl20'
+
+     def test_sum_aggregation_dl19_doc_seg(self):
+         # dl19-doc-seg-sum
+         topic = 'dl19-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_doc} \
+                 --index msmarco-v1-doc-segmented \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --hits 10000 --segments \
+                 --alpha 0.3')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.2711)
+         self.assertEqual(ndcg_score, 0.5596)
+
+     def test_sum_aggregation_dl20_doc_seg(self):
+         # dl20-doc-seg-sum
+         topic = 'dl20-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-doc-segmented \
+                 --output {self.tmp}/regression_test_sum.{topic}.txt \
+                 --hits 10000 --segments \
+                 --alpha 0.3')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_sum.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3759)
+         self.assertEqual(ndcg_score, 0.5343)
+
+     def test_max_aggregation_dl19_doc_seg(self):
+         # dl19-doc-seg-max
+         topic = 'dl19-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl19_doc} \
+                 --index msmarco-v1-doc-segmented \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --hits 10000 --segments \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.2425)
+         self.assertEqual(ndcg_score, 0.5193)
+
+     def test_max_aggregation_dl20_doc_seg(self):
+         # dl20-doc-seg-max
+         topic = 'dl20-doc'
+         os.system(f'python -m pyserini.search.lucene.irst \
+                 --topics {self.dl20} \
+                 --index msmarco-v1-doc-segmented \
+                 --output {self.tmp}/regression_test_max.{topic}.txt \
+                 --alpha 0.3 \
+                 --hits 10000 --segments \
+                 --max-sim')
+
+         score_cmd = f'python -m pyserini.eval.trec_eval \
+                 -c -m map -m ndcg_cut.10 -M 100 {topic} {self.tmp}/regression_test_max.{topic}.txt'
+
+         status = os.system(score_cmd)
+         stdout, stderr = run_command(score_cmd)
+         map_score = parse_score(stdout, "map")
+         ndcg_score = parse_score(stdout, "ndcg")
+
+         self.assertEqual(status, 0)
+         self.assertEqual(stderr, '')
+         self.assertEqual(map_score, 0.3496)
+         self.assertEqual(ndcg_score, 0.5089)
+
+     def tearDown(self):
+         rmtree(self.tmp)
+
+
+ if __name__ == '__main__':
+     unittest.main()
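The IRST tests above toggle between two aggregation modes. A loose sketch of the difference, under the assumption that --max-sim switches the combination of per-term translation scores from a sum to a max; the actual scoring in pyserini.search.lucene.irst is more involved:

    # Assumption-labeled sketch only; real IRST scoring differs in detail.
    def aggregate_term_scores(term_scores, max_sim):
        return max(term_scores) if max_sim else sum(term_scores)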
integrations/sparse/test_lucenesearcher_check_ltr_msmarco_document.py ADDED
@@ -0,0 +1,60 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import subprocess
+ import sys
+ import unittest
+ from shutil import rmtree
+
+
+ class TestLtrMsmarcoDocument(unittest.TestCase):
+     def test_reranking(self):
+         if os.path.isdir('ltr_test'):
+             rmtree('ltr_test')
+         os.mkdir('ltr_test')
+         inp = 'run.msmarco-pass-doc.bm25.txt'
+         outp = 'run.ltr.msmarco-pass-doc.test.trec'
+         outp_tsv = 'run.ltr.msmarco-pass-doc.test.tsv'
+         # Pre-trained LTR model
+         model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-msmarco-passage-mrr-v1.tar.gz'
+         model_tar_name = 'model-ltr-msmarco-passage-mrr-v1.tar.gz'
+         os.system(f'wget {model_url} -P ltr_test/')
+         os.system(f'tar -xzvf ltr_test/{model_tar_name} -C ltr_test')
+
+         # IBM model
+         ibm_model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-ibm.tar.gz'
+         ibm_model_tar_name = 'model-ltr-ibm.tar.gz'
+         os.system(f'wget {ibm_model_url} -P ltr_test/')
+         os.system(f'tar -xzvf ltr_test/{ibm_model_tar_name} -C ltr_test')
+         os.system(f'python -m pyserini.search.lucene.ltr \
+                 --topic tools/topics-and-qrels/topics.msmarco-doc.dev.txt \
+                 --model ltr_test/msmarco-passage-ltr-mrr-v1/ \
+                 --qrel tools/topics-and-qrels/qrels.msmarco-doc.dev.txt \
+                 --index msmarco-doc-per-passage-ltr --ibm-model ltr_test/ibm_model/ \
+                 --granularity document --output ltr_test/{outp} --max-passage --hits 10000')
+
+         result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_doc_eval.py --judgments tools/topics-and-qrels/qrels.msmarco-doc.dev.txt --run ltr_test/{outp}', shell=True).decode(sys.stdout.encoding)
+         a, b = result.find('#####################\nMRR @100:'), result.find('\nQueriesRanked: 5193\n#####################\n')
+         mrr = result[a + 32:b]
+         # See:
+         # - https://github.com/castorini/pyserini/issues/951
+         # - https://github.com/castorini/pyserini/issues/1430
+         self.assertAlmostEqual(float(mrr), 0.3108, delta=0.0002)
+         rmtree('ltr_test')
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_check_ltr_msmarco_passage.py ADDED
@@ -0,0 +1,59 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import subprocess
+ import sys
+ import unittest
+ from shutil import rmtree
+
+ from pyserini.search.lucene import LuceneSearcher
+
+
+ class TestLtrMsmarcoPassage(unittest.TestCase):
+     def test_reranking(self):
+         if os.path.isdir('ltr_test'):
+             rmtree('ltr_test')
+         os.mkdir('ltr_test')
+         inp = 'run.msmarco-passage.bm25tuned.txt'
+         outp = 'run.ltr.msmarco-passage.test.tsv'
+         # Pre-trained LTR model
+         model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-msmarco-passage-mrr-v1.tar.gz'
+         model_tar_name = 'model-ltr-msmarco-passage-mrr-v1.tar.gz'
+         os.system(f'wget {model_url} -P ltr_test/')
+         os.system(f'tar -xzvf ltr_test/{model_tar_name} -C ltr_test')
+         # IBM model
+         ibm_model_url = 'https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-models/model-ltr-ibm.tar.gz'
+         ibm_model_tar_name = 'model-ltr-ibm.tar.gz'
+         os.system(f'wget {ibm_model_url} -P ltr_test/')
+         os.system(f'tar -xzvf ltr_test/{ibm_model_tar_name} -C ltr_test')
+         # Query processing and reranking
+         os.system(f'python -m pyserini.search.lucene.ltr \
+                 --model ltr_test/msmarco-passage-ltr-mrr-v1 \
+                 --topic tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+                 --qrel tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt \
+                 --index msmarco-passage-ltr --ibm-model ltr_test/ibm_model/ \
+                 --output-format tsv --output ltr_test/{outp}')
+         result = subprocess.check_output(f'python tools/scripts/msmarco/msmarco_passage_eval.py tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt ltr_test/{outp}', shell=True).decode(sys.stdout.encoding)
+         a, b = result.find('#####################\nMRR @10:'), result.find('\nQueriesRanked: 6980\n#####################\n')
+         mrr = result[a + 31:b]
+         # See https://github.com/castorini/pyserini/issues/951
+         self.assertAlmostEqual(float(mrr), 0.2472, delta=0.0001)
+         rmtree('ltr_test')
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_check_robust04.py ADDED
@@ -0,0 +1,59 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import unittest
+
+ from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         # The current directory depends on whether you're running inside an IDE or from the command line.
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             anserini_root = '../../../anserini'
+             pyserini_root = '../..'
+         else:
+             anserini_root = '../anserini'
+             pyserini_root = '.'
+
+         self.checker = LuceneSearcherAnseriniMatchChecker(
+             anserini_root=anserini_root,
+             index=os.path.join(anserini_root, 'indexes/lucene-index.disk45'),
+             topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust04.txt'),
+             pyserini_topics='robust04',
+             qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.robust04.txt'),
+             eval_root=pyserini_root)
+
+     def test_bm25(self):
+         self.assertTrue(self.checker.run('robust04_bm25', '-bm25', '--bm25'))
+
+     def test_bm25_rm3(self):
+         self.assertTrue(self.checker.run('robust04_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))
+
+     def test_qld(self):
+         self.assertTrue(self.checker.run('robust04_qld', '-qld', '--qld'))
+
+     def test_qld_rm3(self):
+         self.assertTrue(self.checker.run('robust04_qld_rm3', '-qld -rm3', '--qld --rm3'))
+
+     def tearDown(self):
+         pass
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_check_robust05.py ADDED
@@ -0,0 +1,59 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import os
+ import unittest
+
+ from integrations.lucenesearcher_anserini_checker import LuceneSearcherAnseriniMatchChecker
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         # The current directory depends on whether you're running inside an IDE or from the command line.
+         curdir = os.getcwd()
+         if curdir.endswith('sparse'):
+             anserini_root = '../../../anserini'
+             pyserini_root = '../..'
+         else:
+             anserini_root = '../anserini'
+             pyserini_root = '.'
+
+         self.checker = LuceneSearcherAnseriniMatchChecker(
+             anserini_root=anserini_root,
+             index=os.path.join(anserini_root, 'indexes/lucene-index.robust05'),
+             topics=os.path.join(pyserini_root, 'tools/topics-and-qrels/topics.robust05.txt'),
+             pyserini_topics='robust05',
+             qrels=os.path.join(pyserini_root, 'tools/topics-and-qrels/qrels.robust05.txt'),
+             eval_root=pyserini_root)
+
+     def test_bm25(self):
+         self.assertTrue(self.checker.run('robust05_bm25', '-bm25', '--bm25'))
+
+     def test_bm25_rm3(self):
+         self.assertTrue(self.checker.run('robust05_bm25_rm3', '-bm25 -rm3', '--bm25 --rm3'))
+
+     def test_qld(self):
+         self.assertTrue(self.checker.run('robust05_qld', '-qld', '--qld'))
+
+     def test_qld_rm3(self):
+         self.assertTrue(self.checker.run('robust05_qld_rm3', '-qld -rm3', '--qld --rm3'))
+
+     def tearDown(self):
+         pass
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_lucenesearcher_multithread.py ADDED
@@ -0,0 +1,79 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ import unittest
+
+ from integrations.run_lucenesearcher import RunLuceneSearcher
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.test_threads = ['--threads 1 --batch-size 64', '--threads 4 --batch-size 64']
+
+     def check_equal(self, runner: RunLuceneSearcher, runtag: str, extras: str) -> bool:
+         checksums = []
+         for i, config in enumerate(self.test_threads):
+             checksum = runner.run(runtag=f'{runtag}-{i}', extras=f'{config} {extras}')
+             if len(checksum) == 0:
+                 print(f'[FAIL] {runtag} {config} failed to run!')
+                 return False
+             checksums.append(checksum)
+         equal = all(x == checksums[0] for x in checksums)
+         if equal:
+             print(f'[SUCCESS] {runtag} results match!')
+         else:
+             print(f'[FAIL] {runtag} results do not match!')
+         return equal
+
+     def test_robust04(self):
+         checker = RunLuceneSearcher(index='robust04', topics='robust04')
+         self.assertTrue(self.check_equal(checker, 'robust04', extras=''))
+
+     def test_msmarco_passage(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-passage', topics='msmarco-passage-dev-subset')
+         self.assertTrue(self.check_equal(checker, 'msmarco-v1-passage', extras='--output-format msmarco'))
+
+     def test_msmarco_passage_docTTTTTquery(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-passage-d2q-t5', topics='msmarco-passage-dev-subset')
+         self.assertTrue(self.check_equal(checker, 'msmarco-v1-passage-d2q-t5', extras='--output-format msmarco'))
+
+     def test_msmarco_doc(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-doc', topics='msmarco-doc-dev')
+         self.assertTrue(self.check_equal(checker, 'msmarco-v1-doc', extras='--hits 100 --output-format msmarco'))
+
+     def test_msmarco_doc_docTTTTTquery(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-doc-d2q-t5', topics='msmarco-doc-dev')
+         self.assertTrue(self.check_equal(checker, 'msmarco-v1-doc-d2q-t5',
+                                          extras='--hits 100 --output-format msmarco'))
+
+     def test_msmarco_doc_per_passage(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-doc-segmented', topics='msmarco-doc-dev')
+         self.assertTrue(
+             self.check_equal(checker, 'msmarco-v1-doc-segmented',
+                              extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))
+
+     def test_msmarco_doc_docTTTTTquery_passage(self):
+         checker = RunLuceneSearcher(index='msmarco-v1-doc-segmented-d2q-t5', topics='msmarco-doc-dev')
+         self.assertTrue(
+             self.check_equal(checker, 'msmarco-v1-doc-segmented-d2q-t5',
+                              extras='--hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco'))
+
+     def tearDown(self):
+         pass
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_nmslib.py ADDED
@@ -0,0 +1,66 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for nmslib index search"""
+
+ import os
+ import socket
+ import unittest
+
+ from integrations.utils import clean_files, run_command, parse_score
+
+
+ class TestSearchIntegration(unittest.TestCase):
+     def setUp(self):
+         self.temp_files = []
+         self.threads = 12
+         self.batch_size = 36
+
+         # Hard-code larger values for internal servers
+         if socket.gethostname().startswith('damiano') or socket.gethostname().startswith('orca'):
+             self.threads = 36
+             self.batch_size = 144
+
+     def test_msmarco_passage_deepimpact_nmslib_hnsw(self):
+         output_file = 'test_run.msmarco-passage.deepimpact.nmslib.tsv'
+         self.temp_files.append(output_file)
+         cmd = 'wget https://raw.githubusercontent.com/castorini/pyserini-data/main/encoded-queries/deepimpact_msmarco_passage_dev_topic.jsonl'
+         status = os.system(cmd)
+         cmd = 'wget https://rgw.cs.uwaterloo.ca/JIMMYLIN-bucket0/pyserini-indexes/nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz'
+         status = os.system(cmd)
+         cmd = 'tar -xvf nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz'
+         status = os.system(cmd)
+         self.temp_files.append('deepimpact_msmarco_passage_dev_topic.jsonl')
+         self.temp_files.append('nmslib-index.msmarco-passage.deepimpact.20211012.58d286.tar.gz')
+         self.temp_files.append('nmslib-index.msmarco-passage.deepimpact.20211012.58d286')
+         cmd1 = f'python -m pyserini.search.nmslib --topics deepimpact_msmarco_passage_dev_topic.jsonl \
+                   --index nmslib-index.msmarco-passage.deepimpact.20211012.58d286 \
+                   --output {output_file} \
+                   --hits 1000 --batch {self.batch_size} --threads {self.threads} \
+                   --output-format msmarco --is-sparse --ef 1000'
+         cmd2 = f'python -m pyserini.eval.msmarco_passage_eval msmarco-passage-dev-subset {output_file}'
+         status = os.system(cmd1)
+         stdout, stderr = run_command(cmd2)
+         score = parse_score(stdout, "MRR @10")
+         self.assertEqual(status, 0)
+         self.assertAlmostEqual(score, 0.298, delta=0.001)
+
+     def tearDown(self):
+         clean_files(self.temp_files)
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_prebuilt_msmarco_v1_doc.py ADDED
@@ -0,0 +1,170 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for MS MARCO V1 doc corpora (full and segmented) using pre-built indexes."""
+
+ import unittest
+
+ from integrations.utils import run_retrieval_and_return_scores
+
+
+ class TestPrebuiltMsMarcoV1Doc(unittest.TestCase):
+     def setUp(self):
+         self.threads = 16
+         self.batch_size = 128
+
+     #
+     # doc "full" conditions
+     #
+
+     def test_doc_full_trec_output(self):
+         """Test case for MS MARCO V1 doc (full), dev queries, TREC output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-doc', 'msmarco-v1-doc-slim', 'msmarco-v1-doc-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-doc.trec.txt',
+                 f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-doc-dev --bm25 --hits 1000',
+                 'msmarco-doc-dev',
+                 'trec_eval',
+                 [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+             self.assertTrue('map' in scores)
+             self.assertTrue('recall.1000' in scores)
+             self.assertAlmostEqual(scores['map'], 0.2774, delta=0.0001)
+             self.assertAlmostEqual(scores['recall.1000'], 0.9357, delta=0.0001)
+
+     def test_doc_full_msmarco_output(self):
+         """Test case for MS MARCO V1 doc (full), dev queries, MS MARCO output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-doc', 'msmarco-v1-doc-slim', 'msmarco-v1-doc-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-doc.msmarco.txt',
+                 f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-doc-dev --bm25 --hits 100 --output-format msmarco',
+                 'msmarco-doc-dev',
+                 'msmarco_doc_string', [])
+
+             self.assertTrue('MRR@100' in scores)
+             self.assertEqual(scores['MRR@100'], '0.2766351807440808')
+
+     #
+     # doc segmented conditions
+     #
+
+     def test_doc_segmented_trec_output(self):
+         """Test case for MS MARCO V1 doc segmented, dev queries, TREC output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-doc-segmented', 'msmarco-v1-doc-segmented-slim', 'msmarco-v1-doc-segmented-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-doc-segmented.trec.txt',
+                 f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-doc-dev --bm25 --hits 10000 --max-passage --max-passage-hits 1000',
+                 'msmarco-doc-dev',
+                 'trec_eval',
+                 [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+             self.assertTrue('map' in scores)
+             self.assertTrue('recall.1000' in scores)
+             self.assertAlmostEqual(scores['map'], 0.2762, delta=0.0001)
+             self.assertAlmostEqual(scores['recall.1000'], 0.9311, delta=0.0001)
+
+     def test_doc_segmented_msmarco_output(self):
+         """Test case for MS MARCO V1 doc segmented, dev queries, MS MARCO output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-doc-segmented', 'msmarco-v1-doc-segmented-slim', 'msmarco-v1-doc-segmented-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-doc-segmented.msmarco.txt',
+                 f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-doc-dev \
+                     --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco',
+                 'msmarco-doc-dev',
+                 'msmarco_doc_string', [])
+
+             self.assertTrue('MRR@100' in scores)
+             self.assertEqual(scores['MRR@100'], '0.2755196341768384')
+
+     #
+     # doc2query conditions
+     #
+
+     def test_doc_full_expanded_trec_output(self):
+         """Test case for MS MARCO V1 doc (full) + doc2query-T5 expansions, dev queries, TREC output."""
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-doc.expanded.trec.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-doc-d2q-t5 --topics msmarco-doc-dev --bm25 --hits 1000',
+             'msmarco-doc-dev',
+             'trec_eval',
+             [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+         self.assertTrue('map' in scores)
+         self.assertTrue('recall.1000' in scores)
+         self.assertAlmostEqual(scores['map'], 0.3273, delta=0.0001)
+         self.assertAlmostEqual(scores['recall.1000'], 0.9553, delta=0.0001)
+
+     def test_doc_full_expanded_msmarco_output(self):
+         """Test case for MS MARCO V1 doc (full) + doc2query-T5 expansions, dev queries, MS MARCO output."""
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-doc.expanded.msmarco.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-doc-d2q-t5 --topics msmarco-doc-dev --bm25 --hits 100 --output-format msmarco',
+             'msmarco-doc-dev',
+             'msmarco_doc_string', [])
+
+         self.assertTrue('MRR@100' in scores)
+         self.assertEqual(scores['MRR@100'], '0.3268656233100833')
+
+     def test_doc_segmented_expanded_trec_output(self):
+         """Test case for MS MARCO V1 doc segmented + doc2query-T5 expansions, dev queries, TREC output."""
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-doc-segmented.expanded.trec.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-doc-segmented-d2q-t5 --topics msmarco-doc-dev \
+                 --bm25 --hits 10000 --max-passage --max-passage-hits 1000',
+             'msmarco-doc-dev',
+             'trec_eval',
+             [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+         self.assertTrue('map' in scores)
+         self.assertTrue('recall.1000' in scores)
+         self.assertAlmostEqual(scores['map'], 0.3213, delta=0.0001)
+         self.assertAlmostEqual(scores['recall.1000'], 0.9530, delta=0.0001)
+
+     def test_doc_segmented_expanded_msmarco_output(self):
+         """Test case for MS MARCO V1 doc segmented + doc2query-T5 expansions, dev queries, MS MARCO output."""
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-doc-segmented.expanded.msmarco.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-doc-segmented-d2q-t5 --topics msmarco-doc-dev \
+                 --bm25 --hits 1000 --max-passage --max-passage-hits 100 --output-format msmarco',
+             'msmarco-doc-dev',
+             'msmarco_doc_string', [])
+
+         self.assertTrue('MRR@100' in scores)
+         self.assertEqual(scores['MRR@100'], '0.320918438140918')
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_prebuilt_msmarco_v1_passage.py ADDED
@@ -0,0 +1,96 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for MS MARCO V1 passage corpus using pre-built indexes."""
+
+ import unittest
+
+ from integrations.utils import run_retrieval_and_return_scores
+
+
+ class TestPrebuiltMsMarcoV1Passage(unittest.TestCase):
+     def setUp(self):
+         self.threads = 16
+         self.batch_size = 128
+
+     def test_passage_trec_output(self):
+         """Test case for MS MARCO V1 passage, dev queries, TREC output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-passage', 'msmarco-v1-passage-slim', 'msmarco-v1-passage-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-passage.trec.txt',
+                 f'python -m pyserini.search.lucene \
+                     --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-passage-dev-subset --bm25',
+                 'msmarco-passage-dev-subset',
+                 'trec_eval',
+                 [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+             self.assertTrue('map' in scores)
+             self.assertTrue('recall.1000' in scores)
+             self.assertAlmostEqual(scores['map'], 0.1958, delta=0.0001)
+             self.assertAlmostEqual(scores['recall.1000'], 0.8573, delta=0.0001)
+
+     def test_passage_msmarco_output(self):
+         """Test case for MS MARCO V1 passage, dev queries, MS MARCO output
+         on all three pre-built indexes (base, slim, full)."""
+
+         # Loop over all three pre-built indexes.
+         for index in ['msmarco-v1-passage', 'msmarco-v1-passage-slim', 'msmarco-v1-passage-full']:
+             scores = run_retrieval_and_return_scores(
+                 'runs/test_run.msmarco-passage.msmarco.txt',
+                 f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                     --index {index} --topics msmarco-passage-dev-subset --bm25 --output-format msmarco',
+                 'msmarco-passage-dev-subset',
+                 'msmarco_passage_string', [])
+
+             self.assertTrue('MRR@10' in scores)
+             self.assertEqual(scores['MRR@10'], '0.18741227770955546')
+
+     def test_passage_expanded_trec_output(self):
+         """Test case for MS MARCO V1 passage w/ doc2query-T5 expansions, dev queries, TREC output."""
+
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-passage.expanded.trec.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-passage-d2q-t5 --topics msmarco-passage-dev-subset --bm25',
+             'msmarco-passage-dev-subset',
+             'trec_eval',
+             [['map', 'map'], ['recall.1000', 'recall_1000']])
+
+         self.assertTrue('map' in scores)
+         self.assertTrue('recall.1000' in scores)
+         self.assertAlmostEqual(scores['map'], 0.2893, delta=0.0001)
+         self.assertAlmostEqual(scores['recall.1000'], 0.9506, delta=0.0001)
+
+     def test_passage_expanded_msmarco_output(self):
+         """Test case for MS MARCO V1 passage w/ doc2query-T5 expansions, dev queries, MS MARCO output."""
+
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.msmarco-passage.expanded.msmarco.txt',
+             f'python -m pyserini.search.lucene --threads {self.threads} --batch-size {self.batch_size} \
+                 --index msmarco-v1-passage-d2q-t5 --topics msmarco-passage-dev-subset --bm25 --output-format msmarco',
+             'msmarco-passage-dev-subset',
+             'msmarco_passage_string', [])
+
+         self.assertTrue('MRR@10' in scores)
+         self.assertEqual(scores['MRR@10'], '0.281560751807885')
+
+
+ if __name__ == '__main__':
+     unittest.main()
integrations/sparse/test_prebuilt_robust04.py ADDED
@@ -0,0 +1,42 @@
+ #
+ # Pyserini: Reproducible IR research with sparse and dense representations
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ #
+
+ """Integration tests for Robust04 using pre-built indexes."""
+
+ import unittest
+
+ from integrations.utils import run_retrieval_and_return_scores
+
+
+ class TestPrebuiltRobust04(unittest.TestCase):
+     def test_robust04(self):
+         """Test case for Robust04."""
+
+         scores = run_retrieval_and_return_scores(
+             'runs/test_run.robust04.bm25.txt',
+             'python -m pyserini.search.lucene --topics robust04 --index robust04 --bm25',
+             'robust04',
+             'trec_eval',
+             [['map', 'map'], ['P.30', 'P_30']])
+
+         self.assertTrue('map' in scores)
+         self.assertTrue('P.30' in scores)
+         self.assertAlmostEqual(scores['map'], 0.2531, delta=0.0001)
+         self.assertAlmostEqual(scores['P.30'], 0.3102, delta=0.0001)
+
+
+ if __name__ == '__main__':
+     unittest.main()