lreining committed on
Commit
c2a9afa
·
1 Parent(s): b8fd633

Add codebase

Browse files
nlp4web-codebase/README.md ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # nlp4web
2
+ Codebase of teaching materials for NLP4Web.
nlp4web-codebase/nlp4web_codebase/__init__.py ADDED
File without changes
nlp4web-codebase/nlp4web_codebase/ir/__init__.py ADDED
File without changes
nlp4web-codebase/nlp4web_codebase/ir/analysis.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from typing import Dict, List, Optional, Protocol
3
+ import pandas as pd
4
+ import tqdm
5
+ import ujson
6
+ from nlp4web_codebase.ir.data_loaders import IRDataset
7
+
8
+
9
def round_dict(obj: Dict[str, float], ndigits: int = 4) -> Dict[str, float]:
    """Return a copy of ``obj`` whose values are rounded to ``ndigits`` decimals.

    Keys and their insertion order are kept; the input mapping is not modified.
    """
    rounded: Dict[str, float] = {}
    for key, value in obj.items():
        rounded[key] = round(value, ndigits=ndigits)
    return rounded
11
+
12
+
13
def sort_dict(obj: Dict[str, float], reverse: bool = True) -> Dict[str, float]:
    """Return a new dict with the items of ``obj`` ordered by value.

    With the default ``reverse=True`` the largest value comes first. Items
    with equal values keep their original relative order.
    """
    pairs = list(obj.items())
    pairs.sort(key=lambda item: item[1], reverse=reverse)
    return dict(pairs)
15
+
16
+
17
def save_ranking_results(
    output_dir: str,
    query_ids: List[str],
    rankings: List[Dict[str, float]],
    query_performances_lists: List[Dict[str, float]],
    cid2tweights_lists: Optional[List[Dict[str, Dict[str, float]]]] = None,
):
    """Write one JSON line per query to ``<output_dir>/ranking_results.jsonl``.

    Each record holds the query id, its ranking and its per-query performance
    values (both rounded via ``round_dict``), plus a ``cid2tweights`` mapping
    that stays empty unless ``cid2tweights_lists`` is given.

    The three per-query lists are consumed in lockstep with ``zip``, so
    iteration stops at the shortest of them. ``output_dir`` is created if it
    does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    records = []
    per_query = zip(query_ids, rankings, query_performances_lists)
    for position, (qid, scores, performances) in enumerate(per_query):
        rounded_scores = round_dict(scores)
        rounded_performances = round_dict(performances)
        if cid2tweights_lists is None:
            term_weights = {}
        else:
            term_weights = {
                cid: round_dict(weights)
                for cid, weights in cid2tweights_lists[position].items()
            }
        records.append(
            {
                "query_id": qid,
                "ranking": rounded_scores,
                "query_performances": rounded_performances,
                "cid2tweights": term_weights,
            }
        )
    frame = pd.DataFrame(records)
    frame.to_json(
        os.path.join(output_dir, "ranking_results.jsonl"),
        orient="records",
        lines=True,
    )
46
+
47
+
48
class TermWeightingFunction(Protocol):
    """Structural type for callables mapping a (query, document id) pair to weights."""

    def __call__(self, query: str, cid: str) -> Dict[str, float]:
        """Return a mapping from string keys to float weights for ``query`` and ``cid``."""
50
+
51
+
52
def compare(
    dataset: IRDataset,
    results_path1: str,
    results_path2: str,
    output_dir: str,
    main_metric: str = "recip_rank",
    system1: Optional[str] = None,
    system2: Optional[str] = None,
    term_weighting_fn1: Optional[TermWeightingFunction] = None,
    term_weighting_fn2: Optional[TermWeightingFunction] = None,
) -> None:
    """Write a per-query, side-by-side comparison of two ranking result files.

    Both result files are JSON-lines files with ``query_id``, ``ranking`` and
    ``query_performances`` columns. They are joined on ``query_id``; for every
    query one row is produced containing the query text, both rankings, the
    texts of all ranked documents, both performance dicts, the qrels, and the
    difference ``system 1 - system 2`` of ``main_metric``. Rows are sorted by
    that difference (largest first) and written as a TSV file named
    ``compare-{system1}_vs_{system2}.tsv`` inside ``output_dir``.

    When both term-weighting functions are given, every document ranked by
    either system is re-scored by each function (score = sum of the returned
    weights). The re-scored rankings replace ``ranking1``/``ranking2`` and the
    weights are added as ``cid2tweights1``/``cid2tweights2``.

    Args:
        dataset: Source of query texts, document texts and qrels.
        results_path1: Result file of system 1.
        results_path2: Result file of system 2.
        output_dir: Directory for the TSV file; created if missing.
        main_metric: Key inside ``query_performances`` used for the difference.
        system1: Name of system 1, used only in the output file name.
        system2: Name of system 2, used only in the output file name.
        term_weighting_fn1: Optional weighting function of system 1.
        term_weighting_fn2: Optional weighting function of system 2.

    Raises:
        ValueError: If the two result files do not cover the same queries.
    """
    os.makedirs(output_dir, exist_ok=True)
    df1 = pd.read_json(results_path1, orient="records", lines=True)
    df2 = pd.read_json(results_path2, orient="records", lines=True)
    # Validate explicitly: an `assert` disappears under `python -O`, and a pure
    # length check lets mismatched query ids through, which then surface as
    # NaN cells after the outer merge and crash further down.
    if len(df1) != len(df2) or set(df1["query_id"]) != set(df2["query_id"]):
        raise ValueError(
            f"{results_path1} and {results_path2} must contain results for "
            "the same set of queries"
        )
    all_qrels = {}
    for split in dataset.split2qrels:
        all_qrels.update(dataset.get_qrels_dict(split))
    qid2query = {query.query_id: query for query in dataset.queries}
    cid2doc = {doc.collection_id: doc for doc in dataset.corpus}
    diff_col = f"{main_metric}:qp1-qp2"
    # pandas suffixes the overlapping columns with `_x` (file 1) and `_y` (file 2).
    merged = pd.merge(df1, df2, on="query_id", how="outer")
    use_term_weights = term_weighting_fn1 is not None and term_weighting_fn2 is not None
    rows = []
    for _, example in tqdm.tqdm(merged.iterrows(), desc="Comparing", total=len(merged)):
        # Documents of ranking 1 first, then those only in ranking 2.
        docs = {cid: cid2doc[cid].text for cid in dict(example["ranking_x"])}
        docs.update({cid: cid2doc[cid].text for cid in dict(example["ranking_y"])})
        query_id = example["query_id"]
        query_text = qid2query[query_id].text
        row = {
            "query_id": query_id,
            "query": query_text,
            diff_col: example["query_performances_x"][main_metric]
            - example["query_performances_y"][main_metric],
            "ranking1": ujson.dumps(example["ranking_x"], indent=4),
            "ranking2": ujson.dumps(example["ranking_y"], indent=4),
            "docs": ujson.dumps(docs, indent=4),
            "query_performances1": ujson.dumps(
                example["query_performances_x"], indent=4
            ),
            "query_performances2": ujson.dumps(
                example["query_performances_y"], indent=4
            ),
            "qrels": ujson.dumps(all_qrels[query_id], indent=4),
        }
        if use_term_weights:
            cid2tweights1 = {}
            cid2tweights2 = {}
            ranking1 = {}
            ranking2 = {}
            # `docs` already holds the union of both rankings in a stable
            # order; iterating it (rather than a set) keeps the order of
            # tied scores reproducible across runs.
            for cid in docs:
                tweights1 = term_weighting_fn1(query=query_text, cid=cid)
                tweights2 = term_weighting_fn2(query=query_text, cid=cid)
                ranking1[cid] = sum(tweights1.values())
                ranking2[cid] = sum(tweights2.values())
                cid2tweights1[cid] = tweights1
                cid2tweights2[cid] = tweights2
            ranking1 = sort_dict(ranking1)
            ranking2 = sort_dict(ranking2)
            row["ranking1"] = ujson.dumps(ranking1, indent=4)
            row["ranking2"] = ujson.dumps(ranking2, indent=4)
            # Present the weights in the same order as the re-scored rankings.
            cid2tweights1 = {cid: cid2tweights1[cid] for cid in ranking1}
            cid2tweights2 = {cid: cid2tweights2[cid] for cid in ranking2}
            row["cid2tweights1"] = ujson.dumps(cid2tweights1, indent=4)
            row["cid2tweights2"] = ujson.dumps(cid2tweights2, indent=4)
        rows.append(row)
    table = pd.DataFrame(rows).sort_values(by=diff_col, ascending=False)
    output_path = os.path.join(output_dir, f"compare-{system1}_vs_{system2}.tsv")
    table.to_csv(output_path, sep="\t", index=False)
120
+
121
+
122
+ # if __name__ == "__main__":
123
+ # # python -m lecture2.bm25.analysis
124
+ # from nlp4web_codebase.ir.data_loaders.sciq import load_sciq
125
+ # from lecture2.bm25.bm25_retriever import BM25Retriever
126
+ # from lecture2.bm25.tfidf_retriever import TFIDFRetriever
127
+ # import numpy as np
128
+
129
+ # sciq = load_sciq()
130
+ # system1 = "bm25"
131
+ # system2 = "tfidf"
132
+ # results_path1 = f"output/sciq-{system1}/results/ranking_results.jsonl"
133
+ # results_path2 = f"output/sciq-{system2}/results/ranking_results.jsonl"
134
+ # index_dir1 = f"output/sciq-{system1}"
135
+ # index_dir2 = f"output/sciq-{system2}"
136
+ # compare(
137
+ # dataset=sciq,
138
+ # results_path1=results_path1,
139
+ # results_path2=results_path2,
140
+ # output_dir=f"output/sciq-{system1}_vs_{system2}",
141
+ # system1=system1,
142
+ # system2=system2,
143
+ # term_weighting_fn1=BM25Retriever(index_dir1).get_term_weights,
144
+ # term_weighting_fn2=TFIDFRetriever(index_dir2).get_term_weights,
145
+ # )
146
+
147
+ # # bias on #shared_terms of TFIDF:
148
+ # df1 = pd.read_json(results_path1, orient="records", lines=True)
149
+ # df2 = pd.read_json(results_path2, orient="records", lines=True)
150
+ # merged = pd.merge(df1, df2, on="query_id", how="outer")
151
+ # nterms1 = []
152
+ # nterms2 = []
153
+ # for _, row in merged.iterrows():
154
+ # nterms1.append(len(list(dict(row["cid2tweights_x"]).values())[0]))
155
+ # nterms2.append(len(list(dict(row["cid2tweights_y"]).values())[0]))
156
+ # percentiles = (5, 25, 50, 75, 95)
157
+ # print(system1, np.percentile(nterms1, percentiles), np.mean(nterms1).round(2))
158
+ # print(system2, np.percentile(nterms2, percentiles), np.mean(nterms2).round(2))
159
+ # # bm25 [ 3. 4. 5. 7. 11.] 5.64
160
+ # # tfidf [1. 2. 3. 5. 9.] 3.58
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/__init__.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from enum import Enum
3
+ from typing import Dict, List
4
+ from nlp4web_codebase.ir.data_loaders.dm import Document, Query, QRel
5
+
6
+
7
class Split(str, Enum):
    """Dataset partitions; each member is also a ``str`` equal to its value."""

    train = "train"
    dev = "dev"
    test = "test"
11
+
12
+
13
@dataclass
class IRDataset:
    """A retrieval dataset: documents, queries and relevance judgments per split."""

    # All documents of the collection.
    corpus: List[Document]
    # All queries, regardless of split.
    queries: List[Query]
    # Relevance judgments grouped by the split they belong to.
    split2qrels: Dict[Split, List[QRel]]

    def get_stats(self) -> Dict[str, int]:
        """Return the corpus size, the query count and the qrel count per split.

        Keys have the form ``|corpus|``, ``|queries|`` and ``|qrels-<split>|``.
        """
        stats = {"|corpus|": len(self.corpus), "|queries|": len(self.queries)}
        for split, qrels in self.split2qrels.items():
            # Since Python 3.11 an f-string renders a `(str, Enum)` member as
            # "Split.train" instead of "train". Use the plain value so the key
            # is the same on every Python version; non-enum keys pass through.
            split_name = getattr(split, "value", split)
            stats[f"|qrels-{split_name}|"] = len(qrels)
        return stats

    def get_qrels_dict(self, split: Split) -> Dict[str, Dict[str, int]]:
        """Return the qrels of ``split`` as ``{query_id: {collection_id: relevance}}``."""
        qrels_dict: Dict[str, Dict[str, int]] = {}
        for qrel in self.split2qrels[split]:
            qrels_dict.setdefault(qrel.query_id, {})[qrel.collection_id] = qrel.relevance
        return qrels_dict

    def get_split_queries(self, split: Split) -> List[Query]:
        """Return the queries having at least one qrel in ``split``, in dataset order."""
        qids = {qrel.query_id for qrel in self.split2qrels[split]}
        return [query for query in self.queries if query.query_id in qids]
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/dm.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import Optional
3
+
4
+
5
@dataclass
class Document:
    """A single document of the collection."""

    # Identifier of the document within the collection.
    collection_id: str
    # Content of the document.
    text: str
9
+
10
+
11
@dataclass
class Query:
    """A single query."""

    # Identifier of the query.
    query_id: str
    # Content of the query.
    text: str
15
+
16
+
17
@dataclass
class QRel:
    """A relevance judgment linking a query to a document."""

    # Identifier of the judged query.
    query_id: str
    # Identifier of the judged document.
    collection_id: str
    # Relevance label of the document for the query.
    relevance: int
    # Optional answer string attached to the judgment.
    answer: Optional[str] = None
nlp4web-codebase/nlp4web_codebase/ir/data_loaders/sciq.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, List
2
+
3
+ import joblib
4
+ from datasets import load_dataset
5
+ from nlp4web_codebase.ir.data_loaders import IRDataset, Split
6
+ from nlp4web_codebase.ir.data_loaders.dm import Document, QRel, Query
7
+
8
+
9
@(joblib.Memory(".cache").cache)
def load_sciq(verbose: bool = False) -> IRDataset:
    """Load the ``allenai/sciq`` dataset and convert it into an ``IRDataset``.

    Every kept example contributes one document (its ``support`` text), one
    query (its ``question``) and one qrel with relevance 1 that links the two
    and stores the ``correct_answer``. All three share the id
    ``<split>-<row index>``, e.g. ``train-0``. Examples are skipped when the
    support text is blank, or when the same support or question was already
    seen in an earlier row or split.

    The result is cached on disk under ``.cache`` by joblib.

    Args:
        verbose: If true, print the raw number of rows of every split.

    Returns:
        The assembled dataset with qrels grouped by split.
    """
    train = load_dataset("allenai/sciq", split="train")
    validation = load_dataset("allenai/sciq", split="validation")
    test = load_dataset("allenai/sciq", split="test")
    data = {Split.train: train, Split.dev: validation, Split.test: test}

    # Each duplicated record is the same to each other:
    # NOTE(review): `+` adds the frames element-wise (aligned on the row
    # index); it does not stack the three splits, so this check does not look
    # at the real records. It presumably was meant to be a row-wise
    # concatenation with a "one distinct value per question" assertion, but
    # that invariant cannot be confirmed from here. Left unchanged; verify
    # against the data before fixing.
    df = train.to_pandas() + validation.to_pandas() + test.to_pandas()
    for _, group in df.groupby("question"):
        assert len(set(group["support"].tolist())) == len(group)
        assert len(set(group["correct_answer"].tolist())) == len(group)

    # Build:
    corpus: List[Document] = []
    queries: List[Query] = []
    split2qrels: Dict[Split, List[QRel]] = {}
    question2id: Dict[str, str] = {}
    support2id: Dict[str, str] = {}
    for split, rows in data.items():
        # Since Python 3.11 an f-string renders a `(str, Enum)` member as
        # "Split.train"; use the value so ids stay "train-0" everywhere.
        split_name = split.value
        if verbose:
            print(f"|raw_{split_name}|", len(rows))
        split2qrels[split] = []
        for i, row in enumerate(rows):
            example_id = f"{split_name}-{i}"
            support: str = row["support"]
            if not support.strip():
                continue
            question = row["question"]
            # A support text is registered before the question is checked, so
            # a row dropped for its duplicate question still blocks later rows
            # with the same support.
            if support in support2id:
                continue
            support2id[support] = example_id
            if question in question2id:
                continue
            question2id[question] = example_id
            corpus.append(Document(collection_id=example_id, text=support))
            queries.append(Query(query_id=example_id, text=question))
            split2qrels[split].append(
                QRel(
                    query_id=example_id,
                    collection_id=example_id,
                    relevance=1,
                    answer=row["correct_answer"],
                )
            )

    # Assembly and return:
    return IRDataset(corpus=corpus, queries=queries, split2qrels=split2qrels)
62
+
63
+
64
if __name__ == "__main__":
    # Manual smoke test; run with:
    # python -m nlp4web_codebase.ir.data_loaders.sciq
    import time

    import ujson

    began = time.time()
    dataset = load_sciq(verbose=True)
    elapsed = time.time() - began
    print(f"Loading costs: {elapsed}s")
    stats = dataset.get_stats()
    print(ujson.dumps(stats, indent=4))
    # Output of a sample run:
    # ________________________________________________________________________________
    # [Memory] Calling __main__--home-kwang-research-nlp4web-ir-exercise-nlp4web-nlp4web-ir-data_loaders-sciq.load_sciq...
    # load_sciq(verbose=True)
    # |raw_train| 11679
    # |raw_dev| 1000
    # |raw_test| 1000
    # ________________________________________________________load_sciq - 7.3s, 0.1min
    # Loading costs: 7.260092735290527s
    # {
    # "|corpus|": 12160,
    # "|queries|": 12160,
    # "|qrels-train|": 10409,
    # "|qrels-dev|": 875,
    # "|qrels-test|": 876
    # }
nlp4web-codebase/nlp4web_codebase/ir/models/__init__.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from abc import ABC, abstractmethod
2
+ from typing import Any, Dict, Type
3
+
4
+
5
class BaseRetriever(ABC):
    """Abstract interface of a retriever.

    Subclasses must provide ``index_class``, ``score`` and ``retrieve``;
    ``get_term_weights`` is optional and raises ``NotImplementedError`` unless
    overridden.
    """

    @property
    @abstractmethod
    def index_class(self) -> Type[Any]:
        """Return the class of the index this retriever works with."""

    def get_term_weights(self, query: str, cid: str) -> Dict[str, float]:
        """Return a mapping of string keys to float weights for ``query`` and ``cid``.

        Raises:
            NotImplementedError: Always, unless a subclass overrides this method.
        """
        raise NotImplementedError

    @abstractmethod
    def score(self, query: str, cid: str) -> float:
        """Return the score of document ``cid`` for ``query``."""

    @abstractmethod
    def retrieve(self, query: str, topk: int = 10) -> Dict[str, float]:
        """Return a mapping from collection ids to scores for ``query``.

        ``topk`` defaults to 10.
        """
nlp4web-codebase/requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ .
nlp4web-codebase/setup.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
+
4
# The README doubles as the long description of the package.
with open("README.md", "r", encoding="utf-8") as fh:
    readme = fh.read()

setup(
    name="nlp4web-codebase",
    version="0.0.0",
    author="Kexin Wang",
    author_email="kexin.wang.2049@gmail.com",
    description="Codebase of teaching materials for NLP4Web.",
    long_description=readme,
    long_description_content_type="text/markdown",
    # The scheme was duplicated ("https://https://..."), which made the
    # project link invalid.
    url="https://github.com/kwang2049/nlp4web-codebase",
    project_urls={
        "Bug Tracker": "https://github.com/kwang2049/nlp4web-codebase/issues",
    },
    packages=find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
    python_requires=">=3.10",
    install_requires=[
        "nltk==3.8.1",
        "numpy==1.26.4",
        "scipy==1.13.1",
        "pandas==2.2.2",
        "tqdm==4.66.5",
        "ujson==5.10.0",
        "joblib==1.4.2",
        "datasets==3.0.1",
        "pytrec_eval==0.5",
    ],
)