shavarani commited on
Commit
c337225
1 Parent(s): 49abc85

SpEL files required to run the streamlit app copied from https://github.com/shavarani/SpEL

Browse files
aida.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import zipfile
3
+ from configuration import get_aida_yago_tsv_file_path, get_resources_dir
4
+
5
# Exact "-DOCSTART-" header lines that open each of the three dataset splits.
# AIDADataset.load_dataset advances to the next split whenever it reads one of them.
TRAIN_START_LINE = "-DOCSTART- (1 EU)"
TESTA_START_LINE = "-DOCSTART- (947testa CRICKET)"
TESTB_START_LINE = "-DOCSTART- (1163testb SOCCER)"

# Module-level placeholder; it is not referenced by the rest of this module.
CANONICAL_REDIRECTS = None
10
+
11
+
12
class AnnotationRecord:
    """One line of the AIDA-YAGO2 TSV file, split into its tab-separated columns."""

    # Attribute names in TSV column order (column 1 first).
    _COLUMN_FIELDS = ("token", "begin_inside_tag", "full_mention", "yago_entity", "wikipedia_url",
                      "wikipedia_id", "freebase_mid")

    def __init__(self, line):
        """
        Lines with tabs are tokens that are part of a mention:
            - column 1 is the token
            - column 2 is either B (beginning of a mention) or I (continuation of a mention)
            - column 3 is the full mention used to find entity candidates
            - column 4 is the corresponding YAGO2 entity (in YAGO encoding, i.e. unicode characters are backslash encoded and spaces are replaced by underscores, see also the tools on the YAGO2 website), OR --NME--, denoting that there is no matching entity in YAGO2 for this particular mention, or that we are missing the connection between the mention string and the YAGO2 entity.
            - column 5 is the corresponding Wikipedia URL of the entity (added for convenience when evaluating against a Wikipedia based method)
            - column 6 is the corresponding Wikipedia ID of the entity (added for convenience when evaluating against a Wikipedia based method - the ID refers to the dump used for annotation, 2010-08-17)
            - column 7 is the corresponding Freebase mid, if there is one (thanks to Massimiliano Ciaramita from Google Zürich for creating the mapping and making it available to us)

        Columns that are absent from ``line`` are left as ``None``; columns beyond the seventh are ignored.
        """
        for field_name in self._COLUMN_FIELDS:
            setattr(self, field_name, None)
        self.candidates = None
        # zip stops at the shorter sequence, so short lines only fill the leading attributes.
        for field_name, value in zip(self._COLUMN_FIELDS, line.split('\t')):
            setattr(self, field_name, value)

    def set_candidates(self, candidate_record):
        """Attach ``candidate_record`` to this token and consume one of its not-yet-matched words."""
        self.candidates = candidate_record
        self.candidates.non_considered_word_count -= 1

    def __str__(self):
        """Return the non-empty columns joined by ``|`` (an empty string when every column is empty)."""
        columns = [getattr(self, field_name) for field_name in self._COLUMN_FIELDS]
        # Joining avoids indexing the last character of a possibly empty result.
        return "|".join(column for column in columns if column)
66
+
67
+
68
class Document:
    """A single dataset document: its id plus the token annotations grouped into chunks."""

    def __init__(self, document_id):
        self.document_id = document_id
        # Completed chunks; each chunk is the list of AnnotationRecord objects gathered between flushes.
        self.annotations = []
        # Chunk currently being filled.
        self.current_annotation = []

    def add_annotation(self, line, candidates):
        """
        Consume one (already stripped) line of the document.

        An empty line closes the current chunk. Any other line becomes an AnnotationRecord which is linked to the
        first candidate record that still has unmatched words and whose original text equals the record's mention.
        """
        if not line:
            self.flush_current_annotation()
            return
        record = AnnotationRecord(line)
        matching = next(
            (c for c in candidates
             if c.non_considered_word_count >= 1 and c.orig_text == record.full_mention),
            None)
        if matching is not None:
            record.set_candidates(matching)
        self.current_annotation.append(record)

    def flush_current_annotation(self):
        """Move the chunk being filled into ``annotations`` and start a new, empty chunk."""
        finished, self.current_annotation = self.current_annotation, []
        self.annotations.append(finished)
90
+
91
+
92
class Candidate:
    """One ``CANDIDATE`` line of a PPRforNED candidate file."""

    # (column prefix, attribute name, converter) for every recognised column.
    _COLUMNS = (
        ('id:', 'id', str),
        ('inCount:', 'in_count', int),
        ('outCount:', 'out_count', int),
        ('links:', 'links', str),
        ('url:', 'url', str),
        ('name:', 'name', str),
        ('normalName:', 'normal_name', str),
        ('normalWikiTitle:', 'normal_wiki_title', str),
        ('predictedType:', 'predicted_type', str),
    )

    def __init__(self, candidate_line):
        """
        Parse the tab-separated ``prefix:value`` columns of ``candidate_line``.

        :raises ValueError: when a non-blank column carries none of the known prefixes.
        """
        self.id = ""
        self.in_count = 0
        self.out_count = 0
        self.links = 0
        self.url = ""
        self.name = ""
        self.normal_name = ""
        self.normal_wiki_title = ""
        self.predicted_type = ""
        for column in candidate_line.split('\t'):
            # The leading record tag and blank columns carry no data.
            if column == 'CANDIDATE' or not column.strip():
                continue
            for prefix, attribute, convert in self._COLUMNS:
                if column.startswith(prefix):
                    setattr(self, attribute, convert(column[len(prefix):]))
                    break
            else:
                raise ValueError(f"Undefined PPRforNED CANDIDATE column: {column}")

    def __str__(self):
        wiki_page = self.url.replace('http://en.wikipedia.org/wiki/', '')
        return f"id: {self.id}\twiki_page: {wiki_page}"
129
+
130
+
131
class CandidateRecord:
    """One ``ENTITY`` header line of a PPRforNED candidate file plus the candidates listed under it."""

    # Textual values of the ``q:`` column that are read as True (compared case-insensitively).
    _TRUE_VALUES = ("true", "1", "yes")

    def __init__(self, entity_header):
        """
        Parse the tab-separated ``prefix:value`` columns of ``entity_header``.

        :raises ValueError: when a non-blank column carries none of the known prefixes.
        """
        self.candidates = []
        self.text = ""
        self.normal_name = ""
        self.predicted_type = ""
        self.q = False
        self.qid = ""
        self.docid = -1
        self.orig_text = ""
        self.non_considered_word_count = 0
        self.url = ""
        for item in entity_header.split('\t'):
            # Skip the record tag; blank columns are skipped as well, matching Candidate's parsing.
            if item == 'ENTITY' or not item.strip():
                continue
            elif item.startswith('text:'):
                self.text = item[5:]
            elif item.startswith('normalName:'):
                self.normal_name = item[11:]
            elif item.startswith('predictedType:'):
                self.predicted_type = item[14:]
            elif item.startswith('q:'):
                # bool() on a string is True for any non-empty text (including "false"),
                # so the flag is parsed from its textual value instead.
                self.q = item[2:].strip().lower() in self._TRUE_VALUES
            elif item.startswith('qid:'):
                self.qid = item[4:]
            elif item.startswith('docId:'):
                # Stored zero-based: the file's docId is one larger than the stored value.
                self.docid = int(item[6:]) - 1
            elif item.startswith('origText:'):
                self.orig_text = item[9:]
                # Number of whitespace-separated words of the mention not yet matched to a token.
                self.non_considered_word_count = len(self.orig_text.split())
            elif item.startswith('url:'):
                self.url = item[4:]
            else:
                raise ValueError(f"Undefined PPRforNED column: {item}")

    def add_candidate(self, candidate_line):
        """Parse ``candidate_line`` into a Candidate and append it to this record."""
        self.candidates.append(Candidate(candidate_line))

    def __str__(self):
        cnds = '\n\t'.join([str(x) for x in self.candidates])
        return f"doc_id: {self.docid}\toriginal_text: {self.orig_text}\tcandidates:\n\t{cnds}"
172
+
173
+
174
def get_candidates(ppr_for_ned_candidates_zip, last_document_id):
    """
    Read the candidate records of one document from the opened PPRforNED archive.

    The archive member is named after ``last_document_id + 1``. Every ``ENTITY`` line starts a new CandidateRecord
    and every following ``CANDIDATE`` line is added to the most recent record.

    :raises ValueError: when a non-blank line starts with neither ``ENTITY`` nor ``CANDIDATE``.
    """
    member_name = str(last_document_id + 1)
    content = ppr_for_ned_candidates_zip.read(member_name).decode("utf-8")
    records = []
    for row in content.split("\n"):
        if not row.strip():
            continue
        if row.startswith("CANDIDATE"):
            # A candidate line must be preceded by the entity header it belongs to.
            assert len(records)
            records[-1].add_candidate(row)
        elif row.startswith("ENTITY"):
            records.append(CandidateRecord(row))
        else:
            raise ValueError("This must be unreachable!")
    return records
188
+
189
class AIDADataset:
    """Loads the AIDA-YAGO2 TSV file into ``self.dataset``, grouped into train / testa / testb documents."""

    def __init__(self):
        super(AIDADataset, self).__init__()
        self.dataset = None
        self.data_path = str(get_aida_yago_tsv_file_path().absolute())
        assert os.path.exists(self.data_path), f"The passed dataset address: {self.data_path} does not exist"
        self.load_dataset()

    def load_dataset(self):
        """
        Parse the TSV file line by line and fill ``self.dataset``.

        Every ``-DOCSTART-`` line starts a new Document (and loads its PPRforNED candidates); the three known
        start lines additionally advance to the next split. All other lines are handed to the current document.
        """
        annotations = [[], [], []]
        current_document = None
        current_document_candidates = None
        data_split_id = -1
        last_document_id = 0
        split_start_lines = (TRAIN_START_LINE, TESTA_START_LINE, TESTB_START_LINE)
        zip_path = get_resources_dir() / "data" / "PPRforNED.zip"
        # Both resources are context-managed so the archive is closed even when parsing raises.
        with zipfile.ZipFile(zip_path, "r") as ppr_for_ned_candidates_zip, \
                open(self.data_path, "r", encoding="utf-8") as data_file:
            for line in data_file:
                line = line.strip()
                if line.startswith("-DOCSTART-"):
                    if current_document:
                        annotations[data_split_id].append(current_document)
                        last_document_id += 1
                    if line in split_start_lines:
                        data_split_id += 1
                    current_document = Document(last_document_id)
                    current_document_candidates = get_candidates(ppr_for_ned_candidates_zip, last_document_id)
                else:
                    current_document.add_annotation(line, current_document_candidates)
        # NOTE(review): tokens gathered after the document's last blank line are not flushed here - confirm
        # whether the data file always ends its documents with a blank line.
        if current_document:
            annotations[data_split_id].append(current_document)
        self.dataset = {"train": annotations[0], "testa": annotations[1], "testb": annotations[2]}
app.py ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The SpEL annotation visualization script. You can use this script as a playground to explore the capabilities and
3
+ limitations of the SpEL framework.
4
+ """
5
+ import torch
6
+ from model import SpELAnnotator
7
+ from data_loader import dl_sa
8
+ from utils import chunk_annotate_and_merge_to_phrase
9
+ from candidate_manager import CandidateManager
10
+ import streamlit as st
11
+ from annotated_text import annotated_text
12
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+
14
@st.cache_resource
def load_model():
    """
    Build the SpEL annotator, load its checkpoint from torch hub and create the optional candidate manager.

    The three settings below are fixed in code; with ``candidate_setting == "n"`` no candidate manager is created.

    :return: the tuple ``(model, candidates_manager_to_use)``; the second element is ``None`` without candidates.
    """
    load_aida_finetuned = True
    load_full_vocabulary = True
    candidate_setting = "n"

    model = SpELAnnotator()
    model.init_model_from_scratch(device=device)

    if candidate_setting == "n":
        candidates_manager_to_use = None
    else:
        candidates_manager_to_use = CandidateManager(
            dl_sa.mentions_vocab,
            is_kb_yago=candidate_setting == "k",
            is_ppr_for_ned=candidate_setting.startswith("p"),
            is_context_agnostic=candidate_setting == "pg",
            is_indexed_for_spans=True)

    # Choose which fine-tuning step's checkpoint to load.
    if not load_aida_finetuned:
        checkpoint_step = 2
    elif load_full_vocabulary:
        checkpoint_step = 4
    else:
        # The classification head is shrunk to the AIDA vocabulary before this checkpoint is loaded.
        model.shrink_classification_head_to_aida(device=device)
        checkpoint_step = 3
    model.load_checkpoint(None, device=device, load_from_torch_hub=True, finetuned_after_step=checkpoint_step)
    return model, candidates_manager_to_use
34
+
35
annotator, candidates_manager = load_model()
st.title("SpEL Prediction Visualization")
mention = st.text_input("Enter the text:")
process_button = st.button("Annotate")

if process_button and mention:
    phrase_annotations = chunk_annotate_and_merge_to_phrase(
        annotator, mention, k_for_top_k_to_keep=5, normalize_for_chinese_characters=True)
    # Restrict the predictions to the candidate set BEFORE reading p.resolved_annotation below;
    # doing it afterwards (as before) left the displayed annotations unaffected by the candidates.
    if candidates_manager:
        for p in phrase_annotations:
            candidates_manager.modify_phrase_annotation_using_candidates(p, mention)
    # [begin character, end character, (entity name, subword annotations)] for every annotated phrase.
    last_step_annotations = [[p.words[0].token_offsets[0][1][0],
                              p.words[-1].token_offsets[-1][1][-1],
                              (dl_sa.mentions_itos[p.resolved_annotation], p.subword_annotations)]
                             for p in phrase_annotations if p.resolved_annotation != 0]
    if last_step_annotations:
        # (begin, end, entity name) triples ordered by their begin character.
        anns = sorted([(l_ann[0], l_ann[1], l_ann[2][0]) for l_ann in last_step_annotations], key=lambda x: x[0])
        begin = 0
        last_char = len(mention)
        anns_pointer = 0
        processed_anns = []
        anno_text = []
        # Walk over the text once, alternating plain segments and (text, entity) annotated segments.
        while begin < last_char:
            if anns_pointer == len(anns):
                # No annotations left: the remainder of the text is plain.
                processed_anns.append((begin, last_char, "O"))
                anno_text.append(mention[begin: last_char])
                begin = last_char
                continue
            first_unprocessed_annotation = anns[anns_pointer]
            if first_unprocessed_annotation[0] > begin:
                # Plain text up to the start of the next annotation.
                processed_anns.append((begin, first_unprocessed_annotation[0], "O"))
                anno_text.append(mention[begin: first_unprocessed_annotation[0]])
                begin = first_unprocessed_annotation[0]
            else:
                processed_anns.append(first_unprocessed_annotation)
                anns_pointer += 1
                begin = first_unprocessed_annotation[1]
                anno_text.append((mention[first_unprocessed_annotation[0]: first_unprocessed_annotation[1]], first_unprocessed_annotation[2]))
        annotated_text(anno_text)
    else:
        annotated_text(mention)
base_model.cfg ADDED
@@ -0,0 +1 @@
 
 
1
+ roberta-base
candidate_manager.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file contains the implementation of the candidate manager in charge of loading the candidate sets,
3
+ and modifying the phrase annotations using the loaded candidates.
4
+ """
5
+ import json
6
+ from span_annotation import PhraseAnnotation
7
+ from configuration import get_resources_dir
8
+
9
+
10
class CandidateManager:
    """Loads a predefined candidate set and binds phrase annotations to the candidates it contains."""

    def __init__(self, mentions_vocab, is_kb_yago=False, is_ppr_for_ned=False, is_context_agnostic=False,
                 is_indexed_for_spans=False):
        """
        :param mentions_vocab: mapping from entity name to its vocabulary id.
        :param is_kb_yago: load the KB+YAGO candidates (forces context agnostic, non span-indexed look up).
        :param is_ppr_for_ned: load the PPRforNED candidates (only considered when ``is_kb_yago`` is False).
        :param is_context_agnostic: look candidates up by mention text only.
        :param is_indexed_for_spans: look context aware candidates up by character span.
        :raises ValueError: when neither ``is_kb_yago`` nor ``is_ppr_for_ned`` is set.
        """
        self.mentions_vocab = mentions_vocab
        self.candidates = None
        if is_kb_yago:
            print(" * Loading the candidates stored for KB+YAGO ...")
            is_context_agnostic = True
            is_indexed_for_spans = False
            self.load_kb_plus_yago()
        elif is_ppr_for_ned:
            print(" * Loading the {} PPRforNED candidate set ...".format(
                'context agnostic' if is_context_agnostic else 'context aware'))
            self.load_ppr_for_ned_candidates(is_context_agnostic, is_indexed_for_spans)
        else:
            raise ValueError("Either \'is_kb_yago\' or \'is_ppr_for_ned\' flags must be True!")
        self.is_context_agnostic = is_context_agnostic
        self.is_indexed_for_spans = is_indexed_for_spans
        self.is_kb_yago = is_kb_yago
        self.is_ppr_for_ned = is_ppr_for_ned

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at ``path``, closing the file handle afterwards."""
        with open(path, "r") as json_file:
            return json.load(json_file)

    def load_ppr_for_ned_candidates(self, is_context_agnostic, is_indexed_for_spans):
        """Load the testa and testb PPRforNED candidate files and merge them into ``self.candidates``."""
        if is_context_agnostic:
            file_address = "context_agnostic_mentions.json"
        elif is_indexed_for_spans:
            file_address = "context_aware_spans.json"
        else:
            file_address = "context_aware_mentions.json"
        candidates_dir = get_resources_dir() / "data" / "candidates"
        candidates_a = self._load_json(candidates_dir / "aida_testa_pprforned" / file_address)
        candidates_b = self._load_json(candidates_dir / "aida_testb_pprforned" / file_address)
        if is_context_agnostic:
            # Keys are shared between the two files here, so candidate lists are unioned per key.
            for key in candidates_b:
                if key in candidates_a:
                    for elem in candidates_b[key]:
                        if elem not in candidates_a[key]:
                            candidates_a[key].append(elem)
                else:
                    candidates_a[key] = candidates_b[key]
        else:
            candidates_a.update(candidates_b)
        self.candidates = candidates_a

    def load_kb_plus_yago(self):
        """Load the KB+YAGO candidate file into ``self.candidates``."""
        self.candidates = self._load_json(
            get_resources_dir() / "data" / "candidates" / "kb_plus_yago_candidates.json")

    def _fetch_candidates(self, phrase_annotation, sentence=None):
        """Return the stored candidates for ``phrase_annotation`` (an empty list when there are none)."""
        candidates = []
        if self.is_kb_yago:
            phrase_to_check = phrase_annotation.word_string.lower()
            if phrase_to_check in self.candidates:
                candidates = self.candidates[phrase_to_check]
        elif self.is_ppr_for_ned:
            # TODO lower-cased check mention surface forms
            span_key = f"({phrase_annotation.begin_character}, {phrase_annotation.end_character})"
            if self.is_context_agnostic and phrase_annotation.word_string in self.candidates:
                candidates = self.candidates[phrase_annotation.word_string]
            elif not self.is_context_agnostic and sentence in self.candidates:
                if self.is_indexed_for_spans and span_key in self.candidates[sentence]:
                    candidates = self.candidates[sentence][span_key]
                elif not self.is_indexed_for_spans and phrase_annotation.word_string in self.candidates[sentence]:
                    candidates = self.candidates[sentence][phrase_annotation.word_string]
        return candidates

    def modify_phrase_annotation_using_candidates(self, phrase_annotation: "PhraseAnnotation", sentence: str = None):
        """
        The method post processes the :param phrase_annotation: found in a :param sentence: to make sure it is bound to
            the predefined {self.candidates} set.
        It is not possible to perform candidate look up for spans in context agnostic scenario
            so {self.is_indexed_for_spans} will only be considered where {self.is_context_agnostic} is False.

        When no stored candidate matches, or none of the model's possible annotations is among the candidates,
        the resolved annotation is reset to 0.
        """
        if self.candidates is None or phrase_annotation.resolved_annotation == 0:
            return
        candidates = self._fetch_candidates(phrase_annotation, sentence)
        if not candidates:
            phrase_annotation.set_alternative_as_resolved_annotation(0)
            return
        if self.is_kb_yago:
            # KB+YAGO candidates are (entity name, prior probability) pairs.
            candidates_ = [self.mentions_vocab[x[0]] for x in candidates if x[0] in self.mentions_vocab]
        else:
            candidates_ = [self.mentions_vocab[x] for x in candidates if x in self.mentions_vocab]
        # TODO use the prior probabilities (x[1] of the KB+YAGO candidates) to adjust the probabilities
        if candidates_:
            all_p_anns = phrase_annotation.all_possible_annotations()
            # Keep only the predictions that are valid candidates, ordered by their second element (descending).
            filtered_p_predictions = sorted(
                [x for x in all_p_anns if x[0] in candidates_], key=lambda y: y[1], reverse=True)
            if filtered_p_predictions:
                phrase_annotation.set_alternative_as_resolved_annotation(filtered_p_predictions[0][0])
            else:
                phrase_annotation.set_alternative_as_resolved_annotation(0)
configuration.py ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import pathlib
4
+ import json
5
+ from datetime import date
6
+
7
# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Caches for the canonical-redirect JSON files; filled on first use by the getter functions in this module.
AIDA_CANONICAL_REDIRECTS = None
OOD_CANONICAL_REDIRECTS = None
11
+
12
+
13
def get_base_model_name():
    """
    Return the base model name stored in ``base_model.cfg``, with surrounding whitespace removed.

    The file is looked up relative to the current working directory.
    """
    # The context manager closes the file; the previous bare open() left the handle to the garbage collector.
    with open("base_model.cfg", "r") as cfg_file:
        return cfg_file.read().strip()
15
+
16
+
17
def get_project_top_dir():
    """Return the directory that contains this module as an absolute ``pathlib.Path``."""
    module_path = os.path.abspath(__file__)
    return pathlib.Path(os.path.dirname(module_path))
19
+
20
+
21
def get_resources_dir():
    """Return the ``resources`` directory located directly under the project top directory."""
    top_dir = get_project_top_dir()
    return top_dir.joinpath('resources')
23
+
24
+
25
def get_checkpoints_dir():
    """Return the project's ``.checkpoints`` directory, creating it first when it does not exist."""
    path_ = get_project_top_dir() / '.checkpoints'
    # exist_ok avoids the check-then-create race when several processes start at the same time.
    path_.mkdir(exist_ok=True)
    return path_
30
+
31
+
32
def get_logdir_dir():
    """Return the project's ``.logdir`` directory, creating it first when it does not exist."""
    path_ = get_project_top_dir() / '.logdir'
    # exist_ok avoids the check-then-create race when several processes start at the same time.
    path_.mkdir(exist_ok=True)
    return path_
37
+
38
+
39
def get_aida_train_canonical_redirects():
    """
    Return the content of ``resources/data/aida_canonical_redirects.json``.

    The parsed JSON is kept in the module-level ``AIDA_CANONICAL_REDIRECTS`` and reused while it is non-empty.
    """
    global AIDA_CANONICAL_REDIRECTS
    if AIDA_CANONICAL_REDIRECTS:
        return AIDA_CANONICAL_REDIRECTS
    redirects_path = get_resources_dir() / "data" / "aida_canonical_redirects.json"
    with redirects_path.open() as handle:
        AIDA_CANONICAL_REDIRECTS = json.load(handle)
    return AIDA_CANONICAL_REDIRECTS
46
+
47
def get_ood_canonical_redirects():
    """
    Return the content of ``resources/data/ood_canonical_redirects.json``.

    The parsed JSON is kept in the module-level ``OOD_CANONICAL_REDIRECTS`` and reused while it is non-empty.
    """
    global OOD_CANONICAL_REDIRECTS
    if OOD_CANONICAL_REDIRECTS:
        return OOD_CANONICAL_REDIRECTS
    redirects_path = get_resources_dir() / "data" / "ood_canonical_redirects.json"
    with redirects_path.open() as handle:
        OOD_CANONICAL_REDIRECTS = json.load(handle)
    return OOD_CANONICAL_REDIRECTS
54
+
55
+
56
def get_aida_yago_tsv_file_path():
    """Return the path of ``AIDA-YAGO2-dataset.tsv`` inside the ``data`` folder of the resources directory."""
    data_dir = get_resources_dir() / "data"
    return data_dir / "AIDA-YAGO2-dataset.tsv"
58
+
59
+
60
def get_exec_run_file():
    """Return the path of today's annotator log file inside the log directory."""
    # Today's date formatted as year, abbreviated month name and day.
    day_stamp = date.today().strftime('%Y-%b-%d')
    return get_logdir_dir() / f"annotator_log-{day_stamp}.log"
62
+
63
+
64
def _read_vocab_entries(file_name):
    """Return the newline-separated entries of ``resources/vocab/<file_name>``, closing the file afterwards."""
    with (get_resources_dir() / "vocab" / file_name).open("r") as vocab_file:
        # Split on "\n" without stripping or dropping blank entries: the ids assigned by the callers depend
        # on every entry of the file, including an empty one produced by a trailing newline.
        return vocab_file.read().split("\n")


def get_aida_vocab():
    """Return the AIDA vocabulary: ``'|||O|||'`` -> 0, ``'<pad>'`` -> 1, then the entries of ``aida.txt``."""
    mentions_vocab = dict({'|||O|||': 0, '<pad>': 1})
    for _ad_element in _read_vocab_entries("aida.txt"):
        mentions_vocab[_ad_element] = len(mentions_vocab)
    return mentions_vocab


def get_ood_vocab():
    """Return the out-of-domain vocabulary built from ``out_of_domain.txt`` (after the two special entries)."""
    # This function might be used if one is interested in testing out the "masking all the candidates not in our
    # expected entity set" which is mentioned in the footnote of section 4.1 of the paper.
    mentions_vocab = dict({'|||O|||': 0, '<pad>': 1})
    for _ad_element in _read_vocab_entries("out_of_domain.txt"):
        mentions_vocab[_ad_element] = len(mentions_vocab)
    return mentions_vocab


def get_aida_plus_wikipedia_vocab():
    """Return the AIDA vocabulary extended with the not-yet-present entries of ``enwiki_20230827.txt``."""
    mentions_vocab = get_aida_vocab()
    for _ad_element in _read_vocab_entries("enwiki_20230827.txt"):
        if _ad_element not in mentions_vocab:
            mentions_vocab[_ad_element] = len(mentions_vocab)
    return mentions_vocab


def get_aida_plus_wikipedia_plus_out_of_domain_vocab():
    """Return the AIDA + Wikipedia vocabulary extended with the not-yet-present entries of ``out_of_domain.txt``."""
    mentions_vocab = get_aida_plus_wikipedia_vocab()
    for _ad_element in _read_vocab_entries("out_of_domain.txt"):
        if _ad_element not in mentions_vocab:
            mentions_vocab[_ad_element] = len(mentions_vocab)
    return mentions_vocab
100
+
101
def get_n3_entity_to_kb_mappings():
    """Return the parsed content of ``resources/data/n3_kb_mappings.json``."""
    kb_file = get_resources_dir() / "data" / "n3_kb_mappings.json"
    # The context manager closes the file handle that was previously left open.
    with kb_file.open("r") as kb_handle:
        knowledge_base = json.load(kb_handle)
    return knowledge_base
data_loader.py ADDED
@@ -0,0 +1,390 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The enwiki/conll Dataset reader/provider using torchtext.
3
+ The datasets were created using the scripts from:
4
+ https://github.com/samuelbroscheit/entity_knowledge_in_bert/tree/master/bert_entity/preprocessing
5
+ The get_dataset.collate_batch function is influenced by:
6
+ https://raw.githubusercontent.com/samuelbroscheit/entity_knowledge_in_bert/master/bert_entity/data_loader_wiki.py
7
+
8
+ Please note that the pre-processed fine-tuning data will be automatically downloaded upon instantiation of the data
9
+ readers and the result will be saved under /home/<user_name>/.cache/torch/text/datasets/ (in linux systems)
10
+
11
+ The expected sizes of the auto-downloaded datasets:
12
+ - Step 1 (general knowledge fine-tuning):
13
+ enwiki-2023-spel-roberta-tokenized-aug-27-2023.tar.gz: 19.1 GBs
14
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
15
+ * You can delete the file above once fine-tuning step 1 is done, and you are moving on to step 2. *
16
+ * in the cleaning up process, make sure you remove the cached validation set files under .checkpoints *
17
+ * directory as well *
18
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
19
+ - Step 2 (general knowledge fine-tuning):
20
+ enwiki-2023-spel-roberta-tokenized-aug-27-2023-retokenized.tar.gz: 17.5 GBs
21
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
22
+ * You can delete the file above once fine-tuning step 2 is done, and you are moving on to step 3. *
23
+ * in the cleaning up process, make sure you remove the cached validation set files under .checkpoints *
24
+ * directory as well *
25
+ * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *
26
+ - Step 3 (domain specific fine-tuning):
27
+ aida-conll-spel-roberta-tokenized-aug-23-2023.tar.gz: 5.1 MBs
28
+
29
+ No extra preprocessing step will be required, as soon as you start the fine-tuning script for each step,
30
+ the proper fine-tuning dataset will be downloaded and will be served **without** the need for unzipping.
31
+ """
32
+ import os
33
+ import json
34
+ import numpy
35
+ from functools import partial
36
+ from collections import OrderedDict
37
+ from tqdm import tqdm
38
+ from typing import Union, Tuple
39
+
40
+ import torch
41
+ from torch.utils.data import DataLoader, Dataset
42
+ from torch.utils.data.distributed import DistributedSampler
43
+ from torchdata.datapipes.iter import FileOpener, IterableWrapper, HttpReader, FileLister
44
+ from torchtext.data.datasets_utils import _wrap_split_argument, _create_dataset_directory
45
+ from torchtext.utils import download_from_url
46
+
47
+ from transformers import AutoTokenizer, BatchEncoding
48
+
49
+ from configuration import (get_aida_plus_wikipedia_plus_out_of_domain_vocab, get_aida_train_canonical_redirects,
50
+ get_aida_vocab, get_ood_vocab, get_checkpoints_dir, get_base_model_name, device)
51
+
52
+ BERT_MODEL_NAME = get_base_model_name()
53
+ MAX_SPAN_ANNOTATION_SIZE = 4
54
+
55
+
56
class StaticAccess:
    """Holds the entity vocabulary, its inverse list, the AIDA canonical redirects and cached vocabulary masks."""

    def __init__(self):
        self.mentions_vocab, self.mentions_itos = None, None
        self.set_vocab_and_itos_to_all()
        self.aida_canonical_redirects = get_aida_train_canonical_redirects()
        # Masks are built lazily on first request and then reused.
        self._all_vocab_mask_for_aida = None
        self._all_vocab_mask_for_ood = None

    def set_vocab_and_itos_to_all(self):
        """Switch to the full (AIDA + Wikipedia + out-of-domain) vocabulary."""
        self.mentions_vocab = get_aida_plus_wikipedia_plus_out_of_domain_vocab()
        self.mentions_itos = [w[0] for w in sorted(self.mentions_vocab.items(), key=lambda x: x[1])]

    @staticmethod
    def get_aida_vocab_and_itos():
        """Return the AIDA vocabulary and the list of its entries ordered by id."""
        aida_mentions_vocab = get_aida_vocab()
        aida_mentions_itos = [w[0] for w in sorted(aida_mentions_vocab.items(), key=lambda x: x[1])]
        return aida_mentions_vocab, aida_mentions_itos

    def shrink_vocab_to_aida(self):
        """Replace the current vocabulary and inverse list with the AIDA-only ones."""
        self.mentions_vocab, self.mentions_itos = self.get_aida_vocab_and_itos()

    @staticmethod
    def _build_all_vocab_mask(allowed_entries):
        """Return a mask over the full vocabulary: 0 at the ids of ``allowed_entries``, -10000 elsewhere."""
        mentions_vocab = get_aida_plus_wikipedia_plus_out_of_domain_vocab()
        mask = torch.full((len(mentions_vocab),), -10000.0).to(device)
        # Build the index tensor as int64 directly; going through a float32 tensor first
        # cannot represent every integer above 2**24 exactly.
        allowed_ids = torch.tensor([mentions_vocab[x] for x in allowed_entries], dtype=torch.long)
        mask[allowed_ids] = 0
        return mask

    def get_all_vocab_mask_for_aida(self):
        """Return (building it on first use) the full-vocabulary mask that keeps only AIDA entries."""
        if self._all_vocab_mask_for_aida is None:
            self._all_vocab_mask_for_aida = self._build_all_vocab_mask(get_aida_vocab())
        return self._all_vocab_mask_for_aida

    def get_all_vocab_mask_for_ood(self):
        """Return (building it on first use) the full-vocabulary mask that keeps only out-of-domain entries."""
        if self._all_vocab_mask_for_ood is None:
            self._all_vocab_mask_for_ood = self._build_all_vocab_mask(get_ood_vocab())
        return self._all_vocab_mask_for_ood
94
+
95
+
96
+ dl_sa = StaticAccess()
97
+
98
+
99
class ENWIKI20230827Config:
    """Download settings of the archive read by the ``ENWIKI20230827`` dataset function."""
    # Name under which the dataset directory is created.
    DATASET_NAME = "WIKIPEDIA20230827"
    # Archive file name inside the dataset root directory.
    PATH = "enwiki-2023-spel-roberta-tokenized-aug-27-2023.tar.gz"
    # Download location and the MD5 hash the downloaded archive is checked against.
    URL = "https://1sfu-my.sharepoint.com/:u:/g/personal/sshavara_sfu_ca/Ea3IVbOpkTJKpASNyL9aFGMBQpH0ABU2hQa-wYyakkZ9TQ?e=DJFF3v&download=1"
    MD5 = "eb9a54a8f1f858cdcbf6c750942a896f"
    # Presumably the number of records per split - TODO confirm.
    NUM_LINES = {'train': 3055221, 'valid': 1000, 'test': 1000}
105
+
106
+
107
class ENWIKI20230827V2Config:
    """Download settings of the archive read by the ``ENWIKI20230827V2`` dataset function."""
    # Name under which the dataset directory is created.
    DATASET_NAME = "WIKIPEDIA20230827V2"
    # Archive file name inside the dataset root directory.
    PATH = "enwiki-2023-spel-roberta-tokenized-aug-27-2023-retokenized.tar.gz"
    # Download location and the MD5 hash the downloaded archive is checked against.
    URL = 'https://1sfu-my.sharepoint.com/:u:/g/personal/sshavara_sfu_ca/EeS_Tgl_CFJNiTh6YH5IDrsBocEZUsZV3lxPB6pleTxyxw?e=caH1cf&download=1'
    MD5 = "83a37f528800a463cd1a376e80ffc744"
    # Presumably the number of records per split - TODO confirm.
    NUM_LINES = {'train': 3038581, 'valid': 996}
113
+
114
+
115
class AIDA20230827Config:
    """Download settings of the archive read by the ``AIDA20230827`` dataset function."""
    # Name under which the dataset directory is created.
    DATASET_NAME = "AIDA20230827"
    # Archive file name inside the dataset root directory.
    PATH = "aida-conll-spel-roberta-tokenized-aug-23-2023.tar.gz"
    # Download location and the MD5 hash the cached archive is checked against.
    URL = "https://1sfu-my.sharepoint.com/:u:/g/personal/sshavara_sfu_ca/EajEGYyf8LBOoxqDaiPBvbgBwFuEC08nssvZwGJWsG_HXg?e=wAwV6H&download=1"
    MD5 = "8078529d5df96d0d1ecf6a505fdb767a"
    # Presumably the number of records per split - TODO confirm.
    NUM_LINES = {'train': 1585, 'valid': 391, 'test': 372}
121
+
122
+
123
+ tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL_NAME, cache_dir=get_checkpoints_dir() / "hf")
124
+
125
# File name looked for inside the extracted archive for every split. The "test" entry used to carry a leading
# space (" test.json") and therefore never matched a member path such as ".../test.json"; because the filter
# below is a substring check, the corrected name still matches a member that really is named " test.json".
WIKI_EXTRACTED_FILES = {"train": "train.json", "valid": "valid.json", "test": "test.json"}


def wiki_filter_fn(split, fname_and_stream):
    """
    Tell whether an extracted archive member belongs to ``split``.

    :param split: one of the keys of ``WIKI_EXTRACTED_FILES``.
    :param fname_and_stream: a pair whose first element is the member's file name.
    :return: True when the split's file name occurs in the member's file name.
    """
    return WIKI_EXTRACTED_FILES[split] in fname_and_stream[0]
130
+
131
+
132
def wiki_data_record_convert(line):
    """
    Convert one JSON line of the extracted wiki data into the record format served by the dataset.

    For every token the mentions are ordered by entity probability (descending), ``None``/empty mentions are
    dropped, AIDA canonical redirects are applied and at most ``MAX_SPAN_ANNOTATION_SIZE`` mentions are kept.
    When mentions were cut off, the kept entity probabilities are re-normalised to sum to one.

    :param line: a JSON object with the keys 'tokens', 'mentions', 'mention_entity_probs' and 'mention_probs'.
    :return: a dict with the same four keys.
    """
    element = json.loads(line)
    redirects = dl_sa.aida_canonical_redirects
    r = {'tokens': element['tokens'], 'mentions': [], 'mention_entity_probs': [], 'mention_probs': []}
    # 'tokens' takes part in the zip so that the shortest of the four lists bounds the iteration.
    for _token, mentions, mention_entity_probs, mention_probs in zip(element['tokens'], element['mentions'],
                                                                     element['mention_entity_probs'],
                                                                     element['mention_probs']):
        if len(mention_probs) < len(mentions):
            # Missing mention probabilities default to 1.0.
            mention_probs.extend([1.0 for _ in range(len(mentions) - len(mention_probs))])
        sorted_mentions = sorted(list(zip(mentions, mention_entity_probs, mention_probs)),
                                 key=lambda x: x[1], reverse=True)
        kept = [x for x in sorted_mentions if x[0]]  # ignore mentions that are None
        mentions_ = [redirects[x[0]] if x[0] in redirects else x[0] for x in kept]
        entity_probs_ = [x[1] for x in kept][:MAX_SPAN_ANNOTATION_SIZE]
        if len(mentions_) > MAX_SPAN_ANNOTATION_SIZE:
            # Sum once instead of once per element; an all-zero list is left as it is rather than
            # raising ZeroDivisionError.
            total = sum(entity_probs_)
            if total:
                entity_probs_ = [x / total for x in entity_probs_]
        r['mentions'].append(mentions_[:MAX_SPAN_ANNOTATION_SIZE])
        r['mention_probs'].append([x[2] for x in kept][:MAX_SPAN_ANNOTATION_SIZE])
        r['mention_entity_probs'].append(entity_probs_)
    return r
153
+
154
@_create_dataset_directory(dataset_name=ENWIKI20230827Config.DATASET_NAME)
@_wrap_split_argument(("train", "valid", "test"))
def ENWIKI20230827(root: str, split: Union[Tuple[str], str]):
    """
    Return a datapipe over the converted records of ``split`` from the ENWIKI20230827 archive.

    The archive is downloaded into ``root`` (``".data"`` when ``root`` is empty) if it is not already there, and
    is read directly from the tar file.
    """
    cfg = ENWIKI20230827Config
    root = root or ".data"
    archive_path = root + "/" + cfg.PATH
    if not os.path.exists(archive_path):
        download_from_url(cfg.URL, root=root, path=archive_path, hash_value=cfg.MD5, hash_type='md5')
    archive_listing = FileLister(root, cfg.PATH)
    extracted_members = FileOpener(archive_listing, mode="b").load_from_tar()
    # Keep only the member that holds the requested split, then convert it line by line.
    split_members = extracted_members.filter(partial(wiki_filter_fn, split))
    return split_members.readlines(return_path=False).map(wiki_data_record_convert)
166
+
167
@_create_dataset_directory(dataset_name=ENWIKI20230827V2Config.DATASET_NAME)
@_wrap_split_argument(("train", "valid"))
def ENWIKI20230827V2(root: str, split: Union[Tuple[str], str]):
    """
    Return a datapipe over the converted records of ``split`` from the ENWIKI20230827V2 archive.

    The archive is downloaded into ``root`` (``".data"`` when ``root`` is empty) if it is not already there, and
    is read directly from the tar file.
    """
    cfg = ENWIKI20230827V2Config
    root = root or ".data"
    archive_path = root + "/" + cfg.PATH
    if not os.path.exists(archive_path):
        download_from_url(cfg.URL, root=root, path=archive_path, hash_value=cfg.MD5, hash_type='md5')
    archive_listing = FileLister(root, cfg.PATH)
    extracted_members = FileOpener(archive_listing, mode="b").load_from_tar()
    # Keep only the member that holds the requested split, then convert it line by line.
    split_members = extracted_members.filter(partial(wiki_filter_fn, split))
    return split_members.readlines(return_path=False).map(wiki_data_record_convert)
179
+
180
def aida_path_fn(r, _=None):
    """Return the location of the AIDA archive inside directory ``r``; the second argument is ignored."""
    archive_name = AIDA20230827Config.PATH
    return os.path.join(r, archive_name)
182
+
183
+
184
def aida_select_split(s, file_name_data):
    """Return the entry stored under key ``s`` in the second element of ``file_name_data``."""
    content = file_name_data[1]
    return content[s]
186
+
187
+
188
def aida_data_record_convert(r):
    """Convert one AIDA record (a list of per-token annotation rows) into the loader's dictionary format.

    Every row must have 7 columns, or 8 when a trailing candidates column is present. Column 0 is used as the
    token and column 4 as its mention label; a falsy column 4 is replaced by the ``"|||O|||"`` label.

    Whether the record carries candidates is decided from its last row (as before). The decision no longer relies
    on a leaked loop variable, so an empty record yields empty lists instead of raising, and a 7-column row inside
    an 8-column record gets an empty candidate list instead of raising ``IndexError``.

    :param r: list of annotation rows for the tokens of one record
    :return: dict with ``tokens``, ``mentions``, ``mention_entity_probs``, ``mention_probs`` and ``candidates``,
        each holding one entry per row
    """
    for x in r:  # making sure each token comes with exactly one annotation
        assert len(x) == 7 or len(x) == 8  # whether it contains the candidates or not
    has_candidates = bool(r) and len(r[-1]) == 8
    return {
        "tokens": [x[0] for x in r],
        "mentions": [[x[4] if x[4] else "|||O|||"] for x in r],
        # each token has exactly one label, so it gets the full probability mass
        "mention_entity_probs": [[1.0] for _ in r],
        "mention_probs": [[1.0] for _ in r],
        "candidates": [(x[7] or []) if has_candidates and len(x) == 8 else [] for x in r],
    }
194
+
195
+
196
@_create_dataset_directory(dataset_name=AIDA20230827Config.DATASET_NAME)
@_wrap_split_argument(('train', 'valid', 'test'))
def AIDA20230827(root, split):
    """Build the data pipe that reads one split of the AIDA20230827 archive.

    The archive URL is read through an on-disk cache located at ``aida_path_fn(root)`` and checked against the
    configured MD5 hash. The archive members are parsed as JSON, the requested split is selected from each parsed
    file and every record of that split is mapped through ``aida_data_record_convert``.

    :param root: directory used for the on-disk cache of the archive
    :param split: the requested split name
    :return: a data pipe of converted AIDA records
    """
    cache_path_fn = partial(aida_path_fn, root)
    remote_dp = HttpReader(IterableWrapper([AIDA20230827Config.URL]))
    caching_dp = remote_dp.on_disk_cache(
        filepath_fn=cache_path_fn, hash_dict={aida_path_fn(root): AIDA20230827Config.MD5}, hash_type="md5")
    cached_archive_dp = caching_dp.end_caching(mode="wb", same_filepath_fn=True)
    parsed_files_dp = FileOpener(cached_archive_dp, mode="b").load_from_tar().parse_json_files()
    split_records_dp = parsed_files_dp.flatmap(partial(aida_select_split, split))
    return split_records_dp.map(aida_data_record_convert)
204
+
205
+
206
class DistributableDataset(Dataset):
    """
    Map-style facade over an iterable dataset so that it can be combined with a `DistributedSampler`.

    According to the torch.utils.data.DataLoader documentation, an `IterableDataset` cannot be used with a custom
    `sampler`, hence the DistributedSampler cannot split its samples between processes. This class works around
    that by exposing `__len__`/`__getitem__` on top of a single iterator over the wrapped dataset.
    """
    def __init__(self, dataset, size, world_size, rank):
        self.world_size = world_size
        self.rank = rank
        self.size = size
        # one shared iterator; every __getitem__ call advances it
        self.data = iter(dataset)
        self.initial_fetch = True

    def __len__(self):
        return self.size

    def __getitem__(self, index):
        # The requested index is ignored: the data cannot be accessed by position, so instead the items presumed
        # to belong to other processes are skipped. The first fetch skips `rank` items, every later fetch skips
        # `world_size - 1` items.
        if self.initial_fetch:
            to_skip = self.rank
        else:
            to_skip = self.world_size - 1
        self.initial_fetch = False
        while to_skip > 0:
            next(self.data)
            to_skip -= 1
        return next(self.data)
230
+
231
+
232
def convert_is_in_mention_to_bioes(is_in_mention):
    """Translate a per-token in-mention flag sequence into BIOES tag ids.

    Tag ids: B = 0, I = 1, O = 2, E = 3, S = 4. Positions outside the sequence count as "not in a mention".

    :param is_in_mention: sequence of truthy/falsy flags, one per token
    :return: list of tag ids with the same length as the input
    """
    last_index = len(is_in_mention) - 1
    tags = []
    for position, inside in enumerate(is_in_mention):
        has_left = is_in_mention[position - 1] if position > 0 else 0
        has_right = is_in_mention[position + 1] if position < last_index else 0
        if not inside:
            tag = 2  # O
        elif not has_left and not has_right:
            tag = 4  # S: single-token mention
        elif not has_left:
            tag = 0  # B: mention starts here
        elif not has_right:
            tag = 3  # E: mention ends here
        else:
            tag = 1  # I: strictly inside a mention
        tags.append(tag)
    return tags
241
+
242
+
243
def get_dataset(dataset_name: str, split: str, batch_size: int, get_labels_with_high_model_score=None,
                label_size: int = 0, load_distributed: bool = False, world_size: int = 1, rank: int = 0,
                use_retokenized_wikipedia_data: bool = False):
    """
    Create a DataLoader over the requested dataset split whose batches are built by ``collate_batch``.

    The underlying dataset is now constructed exactly once per call (previously it was built twice and one
    instance was thrown away).

    :param dataset_name: The dataset name can either be "enwiki" or "aida"
    :param split: the requested dataset split which can be 'train', 'valid' or 'test'
    :param batch_size: the size of the resulting batch from the data loader
    :param get_labels_with_high_model_score: The function that finds high scoring negative samples for the model
    :param label_size: The maximum output distribution size. You can pass the output vocabulary size for this parameter.
    :param load_distributed: The flag hinting whether the data loader will be loaded in a multi-gpu setting.
    :param world_size: the number of machines that the dataloader is expected to serve.
    :param rank: the rank of the gpu on which the data is expected to be served.
    :param use_retokenized_wikipedia_data: a flag indicating whether to use ENWIKI20230827 dataset or ENWIKI20230827V2
    :return: a DataLoader yielding ``(inputs, subword_mentions)`` pairs; a DistributedSampler is attached only when
        ``load_distributed`` is set and ``split`` is "train"
    """

    assert dataset_name in ["enwiki", "aida"]
    if not load_distributed or rank == 0:
        print(f"Initializing the {dataset_name.upper()}/{split} dataset ...")

    def collate_batch(batch):
        """Turn a list of dataset records into padded tensors plus the label distribution for the batch."""
        data = {}
        for key in ["tokens", "mentions", "mention_entity_probs", "eval_mask", "candidates", "is_in_mention", "bioes"]:
            data[key] = []
        for annotated_line_in_file in batch:
            data["tokens"].append(tokenizer.convert_tokens_to_ids(annotated_line_in_file["tokens"]))
            # map every mention label to its vocabulary id, following canonical redirects when one exists;
            # None and the explicitly excluded label are mapped to the "|||O|||" id
            data["mentions"].append([
                [(dl_sa.mentions_vocab[x] if x not in dl_sa.aida_canonical_redirects else
                  dl_sa.mentions_vocab[dl_sa.aida_canonical_redirects[x]])
                 if x is not None and x not in ['Gmina_Żabno'] else dl_sa.mentions_vocab["|||O|||"] for x in el]
                for el in annotated_line_in_file["mentions"]
            ])
            data["mention_entity_probs"].append(annotated_line_in_file["mention_entity_probs"])
            # a token is evaluated only when it has exactly one mention probability
            data["eval_mask"].append(list(map(
                lambda item: 1 if len(item) == 1 else 0, annotated_line_in_file["mention_probs"])))
            # NOTE(review): a token whose maximum probability is shared by several labels contributes several
            # entries here, which would make this list longer than the token list - confirm ties cannot occur.
            is_in_mention = [1 if x != '|||O|||' else 0 for el, elp in zip(
                annotated_line_in_file["mentions"], annotated_line_in_file["mention_entity_probs"])
                for x, y in zip(el, elp) if y == max(elp)]
            data["is_in_mention"].append(is_in_mention)
            data["bioes"].append(convert_is_in_mention_to_bioes(is_in_mention))

        # right-pad every sequence to the longest one in the batch (0 everywhere, tag id 2 for bioes)
        maxlen = max([len(x) for x in data["tokens"]])
        token_ids = torch.LongTensor([sample + [0] * (maxlen - len(sample)) for sample in data["tokens"]])
        eval_mask = torch.LongTensor([sample + [0] * (maxlen - len(sample)) for sample in data["eval_mask"]])
        is_in_mention = torch.LongTensor([sample + [0] * (maxlen - len(sample)) for sample in data["is_in_mention"]])
        bioes = torch.LongTensor([sample + [2] * (maxlen - len(sample)) for sample in data["bioes"]])
        if get_labels_with_high_model_score:
            labels_with_high_model_score = get_labels_with_high_model_score(token_ids)
        else:
            labels_with_high_model_score = None
        subword_mentions = create_output_with_negative_examples(
            data["mentions"], data["mention_entity_probs"], token_ids.size(0), token_ids.size(1),
            len(dl_sa.mentions_vocab), label_size, labels_with_high_model_score)
        inputs = BatchEncoding({
            'token_ids': token_ids,
            'eval_mask': eval_mask,
            'raw_mentions': data["mentions"],
            'is_in_mention': is_in_mention,
            "bioes": bioes
        })
        return inputs, subword_mentions

    if not load_distributed or rank == 0:
        print(f"Done initializing the {dataset_name.upper()}/{split} dataset ...")

    if dataset_name == "enwiki":
        if use_retokenized_wikipedia_data:
            dset_class, dset_config = ENWIKI20230827V2, ENWIKI20230827V2Config
        else:
            dset_class, dset_config = ENWIKI20230827, ENWIKI20230827Config
    else:
        dset_class, dset_config = AIDA20230827, AIDA20230827Config
    # looked up eagerly so that an unknown split for the selected dataset still fails here, as before
    d_size = dset_config.NUM_LINES[split]

    def build_split():
        return dset_class(split=split, root=get_checkpoints_dir())

    if load_distributed and split == "train":
        dataset_ = DistributableDataset(build_split(), d_size, world_size, rank)
        return DataLoader(dataset_, batch_size=batch_size, collate_fn=collate_batch,
                          sampler=DistributedSampler(dataset_, num_replicas=world_size, rank=rank))
    return DataLoader(build_split(), batch_size=batch_size, collate_fn=collate_batch)
323
+
324
+
325
def create_output_with_negative_examples(batch_entity_ids, batch_entity_probs, batch_size, maxlen, label_vocab_size,
                                         label_size, labels_with_high_model_score=None):
    """Build the shared label set of a batch and the per-token probability tensor over it.

    Every distinct entity id of the batch gets a batch-local index in order of first appearance. When fewer than
    ``label_size`` ids were collected, the set is topped up first with ``labels_with_high_model_score`` (if given)
    and then with ids drawn at random from ``range(label_vocab_size)``; the result is cut to ``label_size``.

    The former ``label_size is None`` branch was removed: it could never run, because comparing the label count
    with ``None`` raises ``TypeError`` before that point.

    NOTE(review): probabilities are always written at the batch-local index, also when ``label_size`` is 0 and the
    tensor spans the whole vocabulary - confirm that callers expect this. A batch with more distinct ids than a
    positive ``label_size`` would index past the last column - confirm this cannot happen.

    :param batch_entity_ids: per batch item, per token, the list of entity ids
    :param batch_entity_probs: per batch item, per token, the probabilities matching ``batch_entity_ids``
    :param batch_size: first dimension of the probability tensor
    :param maxlen: second dimension of the probability tensor
    :param label_vocab_size: size of the full label vocabulary
    :param label_size: requested size of the shared label set; 0 selects a vocabulary-sized tensor
    :param labels_with_high_model_score: optional iterable of label ids to use as negative examples
    :return: a BatchEncoding with ``ids`` (shared label ids), ``probs`` (the probability tensor) and
        ``dictionary`` (batch-local index -> original entity id)
    """
    all_entity_ids = OrderedDict()
    # zip is kept so that ids without a matching probability entry are ignored exactly as in the second pass
    for item_entity_ids, item_entity_probs in zip(batch_entity_ids, batch_entity_probs):
        for token_entity_ids, _ in zip(item_entity_ids, item_entity_probs):
            for eid in token_entity_ids:
                if eid not in all_entity_ids:
                    all_entity_ids[eid] = len(all_entity_ids)
    # #####################################################
    shared_label_ids = list(all_entity_ids.keys())

    if len(shared_label_ids) < label_size and labels_with_high_model_score is not None:
        negative_examples = set(labels_with_high_model_score)
        negative_examples.difference_update(shared_label_ids)
        shared_label_ids += list(negative_examples)

    if len(shared_label_ids) < label_size:
        negative_samples = set(numpy.random.choice(label_vocab_size, label_size, replace=False))
        negative_samples.difference_update(shared_label_ids)
        shared_label_ids += list(negative_samples)

    shared_label_ids = shared_label_ids[: label_size]

    if label_size > 0:
        label_probs = torch.zeros(batch_size, maxlen, len(shared_label_ids))
    else:
        label_probs = torch.zeros(batch_size, maxlen, label_vocab_size)
    # loop through the batch x tokens x (label_ids, label_probs)
    for batch_offset, (item_entity_ids, item_entity_probs) in enumerate(zip(batch_entity_ids, batch_entity_probs)):
        # loop through tokens x (label_ids, label_probs)
        for tok_id, (token_entity_ids, token_entity_probs) in enumerate(zip(item_entity_ids, item_entity_probs)):
            local_indices = torch.LongTensor(list(map(all_entity_ids.__getitem__, token_entity_ids)))
            label_probs[batch_offset][tok_id][local_indices] = torch.Tensor(token_entity_probs)

    label_ids = torch.LongTensor(shared_label_ids)
    return BatchEncoding({
        "ids": label_ids,  # of size label_size
        "probs": label_probs,  # of size input_batch_size x input_max_len x label_size
        "dictionary": {v: k for k, v in all_entity_ids.items()}  # contains all original ids for mentions in batch
    })
378
+
379
+
380
def _make_vocab_file():
    """Write every distinct mention label of the ENWIKI20230827 splits to ``enwiki_20230827.txt``.

    Labels are written one per line, in order of first appearance over the train, valid and test splits. The file
    is written as UTF-8 and is closed even when reading the dataset fails part-way.
    """
    wiki_vocab = set()
    with open("enwiki_20230827.txt", "w", encoding="utf-8") as vocab_file:
        for spl in ['train', 'valid', 'test']:
            for el in tqdm(ENWIKI20230827(split=spl, root=get_checkpoints_dir())):
                for x in el['mentions']:
                    for y in x:
                        if y not in wiki_vocab:
                            vocab_file.write(f"{y}\n")
                            wiki_vocab.add(y)
decao_eval.py ADDED
@@ -0,0 +1,1391 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file serves as a standalone evaluation provider for evaluating the predictions of an entity linking system.
3
+ The contents of this module are taken from https://github.com/nicola-decao/efficient-autoregressive-EL and the necessary
4
+ boilerplate code is copied along with the metric classes to help the code act as standalone.
5
+
6
+ To perform evaluation, import the following classes (or any subset of the evaluation metrics that you need):
7
+ MicroF1, MicroPrecision, MicroRecall, MacroRecall, MacroPrecision, MacroF1
8
+ Collect the el_model predictions in the format of {(start_index, end_index, annotation string)} for document d.
9
+ Collect the gold dataset annotations in the format of {(start_index, end_index, annotation string)} for document d.
10
+ Call the metric instances for the two mentioned sets p and g:
11
+ micro_f1(p, g)
12
+ micro_prec(p, g)
13
+ micro_rec(p, g)
14
+ macro_f1(p, g)
15
+ macro_prec(p, g)
16
+ macro_rec(p, g)
17
+
18
+ Once you are done with all the documents and all predictions are added, you may access the evaluation results using:
19
+ {'macro_f1': macro_f1.compute(),
20
+ 'macro_prec': macro_prec.compute(),
21
+ 'macro_rec': macro_rec.compute(),
22
+ 'micro_f1': micro_f1.compute(),
23
+ 'micro_prec': micro_prec.compute(),
24
+ 'micro_rec': micro_rec.compute()}
25
+ """
26
+ from abc import ABC, abstractmethod
27
+ from typing import Any, Dict, Hashable, Iterable, Generator, Sequence, Tuple, Union, List, Mapping, Callable, Optional
28
+ import operator as op
29
+ import functools
30
+ import torch
31
+ import torch.nn as nn
32
+ from torch import Tensor
33
+ import torch.nn.functional as F
34
+ from contextlib import contextmanager
35
+ import inspect
36
+ from collections import OrderedDict
37
+ from copy import deepcopy
38
+ from importlib import import_module
39
+ from importlib.util import find_spec
40
+
41
+ from packaging.version import Version
42
+ from pkg_resources import DistributionNotFound, get_distribution
43
+
44
+
45
def dim_zero_sum(x: Tensor) -> Tensor:
    """Sum the entries of ``x`` over its first dimension."""
    return x.sum(dim=0)
48
+
49
+
50
def dim_zero_mean(x: Tensor) -> Tensor:
    """Average the entries of ``x`` over its first dimension."""
    return x.mean(dim=0)
53
+
54
+
55
def dim_zero_max(x: Tensor) -> Tensor:
    """Take the maximum of ``x`` over its first dimension (values only, indices are dropped)."""
    reduced = x.max(dim=0)
    return reduced.values
58
+
59
+
60
def dim_zero_min(x: Tensor) -> Tensor:
    """Take the minimum of ``x`` over its first dimension (values only, indices are dropped)."""
    reduced = x.min(dim=0)
    return reduced.values
63
+
64
+
65
def dim_zero_cat(x: Union[Tensor, List[Tensor]]) -> Tensor:
    """Concatenate one tensor or a list/tuple of tensors along the first dimension.

    Zero-dimensional tensors are given a leading dimension first so that they can be concatenated.

    Raises:
        ValueError: if an empty list or tuple is passed
    """
    tensors = x if isinstance(x, (list, tuple)) else [x]
    if not tensors:  # empty list
        raise ValueError("No samples to concatenate")
    prepared = []
    for item in tensors:
        is_scalar = item.numel() == 1 and item.ndim == 0
        prepared.append(item.unsqueeze(0) if is_scalar else item)
    return torch.cat(prepared, dim=0)
72
+
73
+
74
def _module_available(module_path: str) -> bool:
    """Report whether ``module_path`` can be located by the import system, without importing it.

    A lookup that fails with ``AttributeError`` (Python 3.6) or ``ModuleNotFoundError`` (Python 3.7+) is treated
    as "not available".
    """
    try:
        spec = find_spec(module_path)
    except (AttributeError, ModuleNotFoundError):
        return False
    return spec is not None
83
+
84
+
85
def _compare_version(package: str, op: Callable, version: str) -> Optional[bool]:
    """Compare the installed version of ``package`` with ``version`` using the comparison ``op``.

    Args:
        package: importable name of the package to inspect
        op: binary comparison applied as ``op(installed_version, Version(version))``
        version: version string to compare against

    Returns:
        ``None`` when the package is not available or cannot be resolved, ``True`` when its version attribute is
        not something ``Version`` can parse, otherwise the result of ``op``.
    """
    if not _module_available(package):
        return None
    try:
        raw_version = import_module(package).__version__  # type: ignore
    except (ModuleNotFoundError, DistributionNotFound):
        return None
    except ImportError:
        # catches cyclic imports - the case with integrated libs
        # see: https://stackoverflow.com/a/32965521
        raw_version = get_distribution(package).version
    try:
        installed_version = Version(raw_version)
    except TypeError:
        # the package is mocked by sphinx, so return True to let it generate all summaries
        return True
    return op(installed_version, Version(version))
103
+
104
+
105
class TorchMetricsUserError(Exception):
    """Error used to inform users of a wrong combination of Metric API calls."""
107
+
108
+
109
def _simple_gather_all_tensors(result: Tensor, group: Any, world_size: int) -> List[Tensor]:
    """Gather ``result`` from every process of ``group`` into a list of ``world_size`` tensors.

    The receive buffers are created with the shape of the local ``result``.
    """
    receive_buffers = []
    for _ in range(world_size):
        receive_buffers.append(torch.zeros_like(result))
    torch.distributed.all_gather(receive_buffers, result, group)
    return receive_buffers
113
+
114
+
115
def gather_all_tensors(result: Tensor, group: Optional[Any] = None) -> List[Tensor]:
    """Function to gather all tensors from several ddp processes onto a list that is broadcasted to all processes.
    Works on tensors that have the same number of dimensions, but where each dimension may differ. In this case
    tensors are padded, gathered and then trimmed to secure equal workload for all processes.

    Args:
        result: the value to sync
        group: the process group to gather results from. Defaults to all processes (world)

    Return:
        gathered_result: list with size equal to the process group where
            gathered_result[i] corresponds to result tensor from process i
    """
    if group is None:
        group = torch.distributed.group.WORLD

    # convert tensors to contiguous format
    result = result.contiguous()

    world_size = torch.distributed.get_world_size(group)
    torch.distributed.barrier(group=group)

    # if the tensor is scalar, things are easy
    if result.ndim == 0:
        return _simple_gather_all_tensors(result, group, world_size)

    # 1. Gather sizes of all tensors
    local_size = torch.tensor(result.shape, device=result.device)
    local_sizes = [torch.zeros_like(local_size) for _ in range(world_size)]
    torch.distributed.all_gather(local_sizes, local_size, group=group)
    max_size = torch.stack(local_sizes).max(dim=0).values
    all_sizes_equal = all(all(ls == max_size) for ls in local_sizes)

    # 2. If shapes are all the same, then do a simple gather:
    if all_sizes_equal:
        return _simple_gather_all_tensors(result, group, world_size)

    # 3. If not, we need to pad each local tensor to maximum size, gather and then truncate
    pad_dims = []
    pad_by = (max_size - local_size).detach().cpu()
    # F.pad expects (left, right) pairs starting from the last dimension; only the right side is padded
    for val in reversed(pad_by):
        pad_dims.append(0)
        pad_dims.append(val.item())
    result_padded = F.pad(result, pad_dims)
    gathered_result = [torch.zeros_like(result_padded) for _ in range(world_size)]
    torch.distributed.all_gather(gathered_result, result_padded, group)
    for idx, item_size in enumerate(local_sizes):
        # index with a tuple of plain-int slices: indexing a tensor with a list of slices is deprecated
        slice_param = tuple(slice(dim_size) for dim_size in item_size.tolist())
        gathered_result[idx] = gathered_result[idx][slice_param]
    return gathered_result
165
+
166
+
167
def apply_to_collection(
    data: Any,
    dtype: Union[type, tuple],
    function: Callable,
    *args: Any,
    wrong_dtype: Optional[Union[type, tuple]] = None,
    **kwargs: Any,
) -> Any:
    """Recursively applies a function to all elements of a certain dtype.

    ``wrong_dtype`` is now honoured at every nesting level; previously it was only checked for the top-level
    object and silently dropped for the elements of nested collections.

    Args:
        data: the collection to apply the function to
        dtype: the given function will be applied to all elements of this dtype
        function: the function to apply
        *args: positional arguments (will be forwarded to calls of ``function``)
        wrong_dtype: the given function won't be applied if this type is specified and the given collections is of
            the :attr:`wrong_type` even if it is of type :attr`dtype`
        **kwargs: keyword arguments (will be forwarded to calls of ``function``)

    Returns:
        the resulting collection

    Example:
        >>> apply_to_collection(torch.tensor([8, 0, 2, 6, 7]), dtype=Tensor, function=lambda x: x ** 2)
        tensor([64,  0,  4, 36, 49])
        >>> apply_to_collection([8, 0, 2, 6, 7], dtype=int, function=lambda x: x ** 2)
        [64, 0, 4, 36, 49]
        >>> apply_to_collection(dict(abc=123), dtype=int, function=lambda x: x ** 2)
        {'abc': 15129}
    """
    elem_type = type(data)

    # Breaking condition
    if isinstance(data, dtype) and (wrong_dtype is None or not isinstance(data, wrong_dtype)):
        return function(data, *args, **kwargs)

    def _recurse(item: Any) -> Any:
        # keep the exclusion active for nested elements as well
        return apply_to_collection(item, dtype, function, *args, wrong_dtype=wrong_dtype, **kwargs)

    # Recursively apply to collection items
    if isinstance(data, Mapping):
        return elem_type({k: _recurse(v) for k, v in data.items()})

    if isinstance(data, tuple) and hasattr(data, "_fields"):  # named tuple
        return elem_type(*(_recurse(d) for d in data))

    if isinstance(data, Sequence) and not isinstance(data, str):
        return elem_type([_recurse(d) for d in data])

    # data is neither of dtype, nor a collection
    return data
215
+
216
+
217
def _flatten(x: Sequence) -> list:
    """Merge the items of all sub-sequences of ``x`` into one flat list (one nesting level only)."""
    flat: list = []
    for sublist in x:
        flat.extend(sublist)
    return flat
219
+
220
+
221
def jit_distributed_available() -> bool:
    """Return True only when torch.distributed is both available and initialized."""
    if not torch.distributed.is_available():
        return False
    return torch.distributed.is_initialized()
223
+
224
+
225
+ class _Metric(nn.Module, ABC):
226
+ __jit_ignored_attributes__ = ["device"]
227
+ __jit_unused_properties__ = ["is_differentiable"]
228
+ is_differentiable: Optional[bool] = None
229
+ higher_is_better: Optional[bool] = None
230
+
231
def __init__(
    self,
    compute_on_step: bool = True,
    dist_sync_on_step: bool = False,
    process_group: Optional[Any] = None,
    dist_sync_fn: Optional[Callable] = None,
) -> None:
    """Set up the bookkeeping shared by all metrics.

    Args:
        compute_on_step: stored as-is; ``forward`` computes a value for the current call only when it is True
        dist_sync_on_step: stored as-is; ``forward`` uses it to decide whether its per-call compute synchronizes
        process_group: stored as-is; used by ``_sync_dist`` when no group is passed explicitly
        dist_sync_fn: stored as-is; handed to ``sync_context`` by the wrapped ``compute``
    """
    super().__init__()

    # see (https://github.com/pytorch/pytorch/blob/3e6bb5233f9ca2c5aa55d9cda22a7ee85439aa6e/
    # torch/nn/modules/module.py#L227)
    torch._C._log_api_usage_once(f"torchmetrics.metric.{self.__class__.__name__}")

    # None when pytorch_lightning is not importable, otherwise whether its version is >= 1.3.0
    self._LIGHTNING_GREATER_EQUAL_1_3 = _compare_version("pytorch_lightning", op.ge, "1.3.0")
    self._device = torch.device("cpu")

    self.dist_sync_on_step = dist_sync_on_step
    self.compute_on_step = compute_on_step
    self.process_group = process_group
    self.dist_sync_fn = dist_sync_fn
    self._to_sync = True
    self._should_unsync = True

    # the signature is captured before ``update`` is replaced by its wrapper below
    self._update_signature = inspect.signature(self.update)
    self.update: Callable = self._wrap_update(self.update)  # type: ignore
    self.compute: Callable = self._wrap_compute(self.compute)  # type: ignore
    self._computed = None
    self._forward_cache = None
    self._update_called = False

    # initialize state
    # all three dictionaries are keyed by state name and filled by ``add_state``
    self._defaults: Dict[str, Union[List, Tensor]] = {}
    self._persistent: Dict[str, bool] = {}
    self._reductions: Dict[str, Union[str, Callable[[Union[List[Tensor], Tensor]], Tensor], None]] = {}

    # state management
    self._is_synced = False
    # local state values saved by ``sync`` so that ``unsync`` can restore them
    self._cache: Optional[Dict[str, Union[List[Tensor], Tensor]]] = None
269
+
270
def add_state(
    self,
    name: str,
    default: Union[list, Tensor],
    dist_reduce_fx: Optional[Union[str, Callable]] = None,
    persistent: bool = False,
) -> None:
    """Register a metric state variable and set it as an attribute on the metric.

    Args:
        name: attribute name of the state
        default: initial value; either a tensor or an empty list
        dist_reduce_fx: reduction stored for this state: one of the strings "sum", "mean", "max", "min", "cat",
            a custom callable, or None
        persistent: flag stored for this state under the same name

    Raises:
        ValueError: if ``default`` is not a tensor or an empty list, or if ``dist_reduce_fx`` is neither a
            supported string, a callable nor None
    """
    if not isinstance(default, (Tensor, list)) or (isinstance(default, list) and default):
        raise ValueError("state variable must be a tensor or any empty list (where you can append tensors)")

    if dist_reduce_fx == "sum":
        dist_reduce_fx = dim_zero_sum
    elif dist_reduce_fx == "mean":
        dist_reduce_fx = dim_zero_mean
    elif dist_reduce_fx == "max":
        dist_reduce_fx = dim_zero_max
    elif dist_reduce_fx == "min":
        dist_reduce_fx = dim_zero_min
    elif dist_reduce_fx == "cat":
        dist_reduce_fx = dim_zero_cat
    elif dist_reduce_fx is not None and not callable(dist_reduce_fx):
        # the message lists every string accepted above (it used to omit 'max' and 'min')
        raise ValueError("`dist_reduce_fx` must be callable or one of ['mean', 'sum', 'max', 'min', 'cat', None]")

    if isinstance(default, Tensor):
        default = default.contiguous()

    setattr(self, name, default)

    # a deep copy is kept so that later in-place changes of the state do not alter the default
    self._defaults[name] = deepcopy(default)
    self._persistent[name] = persistent
    self._reductions[name] = dist_reduce_fx
301
+
302
@torch.jit.unused
def forward(self, *args: Any, **kwargs: Any) -> Any:
    """Automatically calls ``update()``.

    Returns the metric value over inputs if ``compute_on_step`` is True.

    Raises:
        TorchMetricsUserError: if the metric is currently in the synced state
    """
    # add current step
    if self._is_synced:
        raise TorchMetricsUserError(
            "The Metric shouldn't be synced when performing ``update``. "
            "HINT: Did you forget to call ``unsync`` ?."
        )

    # first update: accumulate the inputs into the running state
    with torch.no_grad():
        self.update(*args, **kwargs)

    if self.compute_on_step:
        self._to_sync = self.dist_sync_on_step
        # skip restore cache operation from compute as cache is stored below.
        self._should_unsync = False

        # save context before switch
        cache = {attr: getattr(self, attr) for attr in self._defaults}

        # call reset, update, compute, on single batch
        # (second update: the state now holds only the current inputs)
        self.reset()
        self.update(*args, **kwargs)
        self._forward_cache = self.compute()

        # restore context
        for attr, val in cache.items():
            setattr(self, attr, val)
        self._is_synced = False

        # put the flags back to their defaults and drop the value cached by ``compute``
        self._should_unsync = True
        self._to_sync = True
        self._computed = None

        return self._forward_cache
341
+
342
def _sync_dist(self, dist_sync_fn: Callable = gather_all_tensors, process_group: Optional[Any] = None) -> None:
    """Gather every registered state with ``dist_sync_fn`` and store the reduced result on the metric.

    A list state that is still empty is now left as an empty list instead of raising ``IndexError``.
    NOTE(review): an empty list state issues no gather call on this process - confirm that all processes are
    guaranteed to hold the same number of list entries.

    Args:
        dist_sync_fn: function applied to every tensor of the states; it receives the tensor and ``group=...``
        process_group: group passed to ``dist_sync_fn``; falls back to ``self.process_group`` when falsy

    Raises:
        TypeError: if a stored reduction is neither callable nor None
    """
    input_dict = {attr: getattr(self, attr) for attr in self._reductions}

    for attr, reduction_fn in self._reductions.items():
        # pre-concatenate metric states that are lists to reduce number of all_gather operations
        if reduction_fn == dim_zero_cat and isinstance(input_dict[attr], list) and len(input_dict[attr]) > 1:
            input_dict[attr] = [dim_zero_cat(input_dict[attr])]

    output_dict = apply_to_collection(
        input_dict,
        Tensor,
        dist_sync_fn,
        group=process_group or self.process_group,
    )

    for attr, reduction_fn in self._reductions.items():
        gathered = output_dict[attr]

        # nothing was accumulated for this list state: there is no first element to inspect
        if isinstance(gathered, list) and not gathered:
            setattr(self, attr, [])
            continue

        # pre-processing ops (stack or flatten for inputs)
        if isinstance(gathered[0], Tensor):
            gathered = torch.stack(gathered)
        elif isinstance(gathered[0], list):
            gathered = _flatten(gathered)

        if not (callable(reduction_fn) or reduction_fn is None):
            raise TypeError("reduction_fn must be callable or None")
        reduced = reduction_fn(gathered) if reduction_fn is not None else gathered
        setattr(self, attr, reduced)
368
+
369
def _wrap_update(self, update: Callable) -> Callable:
    """Return ``update`` wrapped so that every call first drops the cached compute result and records that an
    update happened."""

    def wrapped_func(*args: Any, **kwargs: Any) -> Optional[Any]:
        self._update_called = True
        self._computed = None
        return update(*args, **kwargs)

    return functools.wraps(update)(wrapped_func)
377
+
378
def sync(
    self,
    dist_sync_fn: Optional[Callable] = None,
    process_group: Optional[Any] = None,
    should_sync: bool = True,
    distributed_available: Optional[Callable] = jit_distributed_available,
) -> None:
    """Manually synchronize the metric states across processes.

    The local states are cached before synchronizing so that ``unsync`` can restore them. Nothing happens when
    ``should_sync`` is False or when ``distributed_available`` does not report a distributed setting.

    Args:
        dist_sync_fn: Function used to synchronize the states; ``gather_all_tensors`` when None
        process_group:
            Process group on which synchronization is called, forwarded to ``_sync_dist``.
            default: None
        should_sync: Whether to synchronize at all
        distributed_available: Callable that tells whether we are running inside a distributed setting; a
            non-callable value is treated as "not distributed"

    Raises:
        TorchMetricsUserError: if the metric is already synced and ``should_sync`` is True
    """
    if self._is_synced and should_sync:
        raise TorchMetricsUserError("The Metric has already been synced.")

    is_distributed = distributed_available() if callable(distributed_available) else None
    if not (should_sync and is_distributed):
        return

    sync_fn = gather_all_tensors if dist_sync_fn is None else dist_sync_fn

    # cache prior to syncing
    self._cache = {attr: getattr(self, attr) for attr in self._defaults}

    # sync
    self._sync_dist(sync_fn, process_group=process_group)
    self._is_synced = True
413
+
414
def unsync(self, should_unsync: bool = True) -> None:
    """Manually revert the metric states to the local values cached by ``sync``.

    Args:
        should_unsync: Whether to perform unsync; when False the call is a no-op

    Raises:
        TorchMetricsUserError: if the metric is not synced, or if no cache is available to restore from
    """
    if should_unsync:
        if not self._is_synced:
            raise TorchMetricsUserError("The Metric has already been un-synced.")

        if self._cache is None:
            raise TorchMetricsUserError("The internal cache should exist to unsync the Metric.")

        # if we synced, restore to cache so that we can continue to accumulate un-synced state
        for attr, val in self._cache.items():
            setattr(self, attr, val)
        self._is_synced = False
        self._cache = None
435
+
436
@contextmanager
def sync_context(
    self,
    dist_sync_fn: Optional[Callable] = None,
    process_group: Optional[Any] = None,
    should_sync: bool = True,
    should_unsync: bool = True,
    distributed_available: Optional[Callable] = jit_distributed_available,
) -> Generator:
    """Context manager that calls ``sync`` on entry and ``unsync`` after the managed block has finished.

    Args:
        dist_sync_fn: Function used to synchronize the states, forwarded to ``sync``
        process_group:
            Process group on which synchronization is called, forwarded to ``sync``.
            default: None
        should_sync: Whether to synchronize, forwarded to ``sync``
        should_unsync: Whether to restore the cached local states afterwards; the restore only happens when
            the metric is actually synced at that point
        distributed_available: Callable that tells whether we are running inside a distributed setting,
            forwarded to ``sync``
    """
    sync_arguments = dict(
        dist_sync_fn=dist_sync_fn,
        process_group=process_group,
        should_sync=should_sync,
        distributed_available=distributed_available,
    )
    self.sync(**sync_arguments)

    yield

    restore_local_state = self._is_synced and should_unsync
    self.unsync(should_unsync=restore_local_state)
469
+
470
def _wrap_compute(self, compute: Callable) -> Callable:
    """Wrap ``compute`` so that its result is cached and computed under ``sync_context``.

    Args:
        compute: the metric's raw ``compute`` callable.

    Returns:
        A callable with the same signature that returns ``self._computed`` when
        it is already set, and otherwise evaluates ``compute`` inside
        ``self.sync_context`` and stores the result in ``self._computed``.
    """

    @functools.wraps(compute)
    def wrapped_func(*args: Any, **kwargs: Any) -> Any:
        if self._computed is None:
            # compute relies on the sync context manager to gather the states across processes
            # and apply reduction; if synchronization happened, the current rank accumulated
            # states will be restored to keep accumulation going if ``should_unsync=True``.
            sync_options = dict(
                dist_sync_fn=self.dist_sync_fn,
                should_sync=self._to_sync,
                should_unsync=self._should_unsync,
            )
            with self.sync_context(**sync_options):
                self._computed = compute(*args, **kwargs)

        return self._computed

    return wrapped_func
488
+
489
@abstractmethod
def update(self, *_: Any, **__: Any) -> None:
    """Accumulate new inputs into the metric's state variables.

    Abstract hook: every concrete metric class must override it.
    """
492
+
493
@abstractmethod
def compute(self) -> Any:
    """Produce the final metric value from the state variables synchronized across the
    distributed backend.

    Abstract hook: every concrete metric class must override it.
    """
497
+
498
def reset(self) -> None:
    """Put every registered state and the internal bookkeeping back to the initial values.

    Tensor states are replaced by a detached clone of their default, moved to
    the device the current value lives on; every other state becomes an empty
    list.
    """
    self._update_called = False
    self._forward_cache = None
    # lower lightning versions requires this implicitly to log metric objects correctly in self.log
    self._computed = None

    for state_name, initial in self._defaults.items():
        live_value = getattr(self, state_name)
        if not isinstance(initial, Tensor):
            setattr(self, state_name, [])
            continue
        setattr(self, state_name, initial.detach().clone().to(live_value.device))

    # reset internal states
    self._cache = None
    self._is_synced = False
515
+
516
def clone(self) -> "_Metric":
    """Return an independent deep copy of this metric."""
    duplicate = deepcopy(self)
    return duplicate
519
+
520
def __getstate__(self) -> Dict[str, Any]:
    """Return the instance ``__dict__`` for pickling, minus the entries that are rebuilt on load.

    The ``update`` / ``compute`` callables and the stored update signature are
    left out.
    """
    skipped = ["update", "compute", "_update_signature"]
    picklable = {}
    for key, value in self.__dict__.items():
        if key in skipped:
            continue
        picklable[key] = value
    return picklable
523
+
524
def __setstate__(self, state: Dict[str, Any]) -> None:
    """Restore the instance from ``state`` and rebuild what was not pickled.

    After loading the attributes, the update signature is re-inspected and the
    ``update`` / ``compute`` callables are wrapped again with ``_wrap_update``
    and ``_wrap_compute``.
    """
    self.__dict__.update(state)
    # manually restore update and compute functions for pickling
    refreshed_signature = inspect.signature(self.update)
    self._update_signature = refreshed_signature
    self.update: Callable = self._wrap_update(self.update)  # type: ignore
    self.compute: Callable = self._wrap_compute(self.compute)  # type: ignore
530
+
531
def __setattr__(self, name: str, value: Any) -> None:
    """Set an attribute, refusing to overwrite the two constant flags.

    Raises:
        RuntimeError: if ``name`` is ``higher_is_better`` or ``is_differentiable``.
    """
    protected = ("higher_is_better", "is_differentiable")
    if name not in protected:
        super().__setattr__(name, value)
        return
    raise RuntimeError(f"Can't change const `{name}`.")
535
+
536
@property
def device(self) -> "torch.device":
    """The device recorded for this metric (the value of ``self._device``)."""
    current_device = self._device
    return current_device
540
+
541
def type(self, dst_type: Union[str, torch.dtype]) -> "_Metric":
    """Ignore the requested cast and hand back the metric unchanged.

    Overrides the default so that generic dtype casts do not touch the metric
    states. Please use `metric.set_dtype(dtype)` instead.
    """
    unchanged = self
    return unchanged
547
+
548
def float(self) -> "_Metric":
    """Ignore the cast request and hand back the metric unchanged.

    Overrides the default so that generic dtype casts do not touch the metric
    states. Please use `metric.set_dtype(dtype)` instead.
    """
    unchanged = self
    return unchanged
554
+
555
def double(self) -> "_Metric":
    """Ignore the cast request and hand back the metric unchanged.

    Overrides the default so that generic dtype casts do not touch the metric
    states. Please use `metric.set_dtype(dtype)` instead.
    """
    unchanged = self
    return unchanged
561
+
562
def half(self) -> "_Metric":
    """Ignore the cast request and hand back the metric unchanged.

    Overrides the default so that generic dtype casts do not touch the metric
    states. Please use `metric.set_dtype(dtype)` instead.
    """
    unchanged = self
    return unchanged
568
+
569
def set_dtype(self, dst_type: Union[str, torch.dtype]) -> "_Metric":
    """Special version of `type` for transferring all metric states to specific dtype.

    ``type``/``float``/``double``/``half`` are overridden on this class to be
    no-ops, so this method is the explicit way to cast: it delegates to the
    parent class' ``type`` implementation.

    Arguments:
        dst_type (type or string): the desired type

    Returns:
        Whatever the parent ``type`` call returns (presumably the metric
        itself, as with ``torch.nn.Module.type`` -- confirm against the base
        class, which is declared outside this excerpt).
    """
    return super().type(dst_type)
575
+
576
def _apply(self, fn: Callable) -> nn.Module:
    """Overwrite _apply function such that we can also move metric states to the correct device when `.to`,
    `.cuda`, etc methods are called.

    Args:
        fn: the tensor transformation handed to the parent ``_apply``; it is
            additionally applied to every state default, every current state
            value, the cached ``_computed`` result and the ``_forward_cache``.

    Returns:
        The object returned by the parent ``_apply`` call.

    Raises:
        TypeError: if a current state value is neither a ``Tensor`` nor a ``Sequence``.
    """
    this = super()._apply(fn)
    # Also apply fn to metric states and defaults
    for key, value in this._defaults.items():
        # Re-assigning existing keys while iterating is safe: the dict size never changes.
        if isinstance(value, Tensor):
            this._defaults[key] = fn(value)
        elif isinstance(value, Sequence):
            this._defaults[key] = [fn(v) for v in value]

        current_val = getattr(this, key)
        if isinstance(current_val, Tensor):
            setattr(this, key, fn(current_val))
        elif isinstance(current_val, Sequence):
            setattr(this, key, [fn(cur_v) for cur_v in current_val])
        else:
            # The two literals are concatenated, so the first one needs a trailing space.
            raise TypeError(
                "Expected metric state to be either a Tensor "
                f"or a list of Tensor, but encountered {current_val}"
            )

    # make sure to update the device attribute
    # if the dummy tensor moves device by fn function we should also update the attribute
    self._device = fn(torch.zeros(1, device=self.device)).device

    # Additional apply to forward cache and computed attributes (may be nested)
    if this._computed is not None:
        this._computed = apply_to_collection(this._computed, Tensor, fn)
    if this._forward_cache is not None:
        this._forward_cache = apply_to_collection(this._forward_cache, Tensor, fn)

    return this
608
+
609
def persistent(self, mode: bool = False) -> None:
    """Switch, after construction, whether the metric states are saved to the ``state_dict``.

    Args:
        mode: the flag stored for every entry of ``self._persistent``.
    """
    flags = self._persistent
    for state_name in flags:
        flags[state_name] = mode
613
+
614
def state_dict(
    self,
    destination: Dict[str, Any] = None,
    prefix: str = "",
    keep_vars: bool = False,
) -> Optional[Dict[str, Any]]:
    """Build the parent ``state_dict`` and add the metric states flagged as persistent.

    Args:
        destination: forwarded to the parent implementation.
        prefix: prepended to every state name used as key.
        keep_vars: when false, tensor states (and tensors inside list states)
            are detached before being stored.

    Returns:
        The dictionary returned by the parent call, extended with a deep copy
        of each persistent state under ``prefix + name``.
    """
    destination = super().state_dict(destination=destination, prefix=prefix, keep_vars=keep_vars)
    # Register metric states to be part of the state_dict
    for state_name in self._defaults:
        if self._persistent[state_name]:
            stored = getattr(self, state_name)
            if not keep_vars:
                if isinstance(stored, Tensor):
                    stored = stored.detach()
                elif isinstance(stored, list):
                    stored = [item.detach() if isinstance(item, Tensor) else item for item in stored]
            destination[prefix + state_name] = deepcopy(stored)  # type: ignore
    return destination
633
+
634
def _load_from_state_dict(
    self,
    state_dict: dict,
    prefix: str,
    local_metadata: dict,
    strict: bool,
    missing_keys: List[str],
    unexpected_keys: List[str],
    error_msgs: List[str],
) -> None:
    """Loads metric states from state_dict.

    Every registered state found under ``prefix + name`` is popped from
    ``state_dict`` and assigned to the metric; the remaining entries are handed
    to the parent implementation.
    """
    for state_name in self._defaults:
        qualified_name = prefix + state_name
        if qualified_name not in state_dict:
            continue
        setattr(self, state_name, state_dict.pop(qualified_name))

    # NOTE(review): the parent is always called with ``True`` in the ``strict``
    # position; the ``strict`` argument received here is not forwarded.
    super()._load_from_state_dict(
        state_dict, prefix, local_metadata, True, missing_keys, unexpected_keys, error_msgs
    )
653
+
654
def _filter_kwargs(self, **kwargs: Any) -> Dict[str, Any]:
    """Keep only the keyword arguments that match the update signature of the metric.

    A keyword is kept when it names a parameter of ``self._update_signature``
    that is not of kind ``*args`` / ``**kwargs``. When nothing is kept, all
    keyword arguments are returned unchanged.
    """
    variadic_kinds = (inspect.Parameter.VAR_POSITIONAL, inspect.Parameter.VAR_KEYWORD)
    update_params = self._update_signature.parameters

    accepted = {}
    for name, value in kwargs.items():
        if name in update_params.keys() and update_params[name].kind not in variadic_kinds:
            accepted[name] = value

    # if no kwargs filtered, return all kwargs as default
    return accepted if accepted else kwargs
669
+
670
def __hash__(self) -> int:
    """Hash the class name, the object id and the current value of every registered state."""
    # we need to add the id here, since PyTorch requires a module hash to be unique.
    # Internally, PyTorch nn.Module relies on that for children discovery
    # (see https://github.com/pytorch/pytorch/blob/v1.9.0/torch/nn/modules/module.py#L1544)
    # For metrics that include tensors it is not a problem,
    # since their hash is unique based on the memory location but we cannot rely on that for every metric.
    components = [self.__class__.__name__, id(self)]

    for state_name in self._defaults:
        state = getattr(self, state_name)
        # Special case: allow list values, so long as their elements are hashable
        is_plain_iterable = hasattr(state, "__iter__") and not isinstance(state, Tensor)
        if is_plain_iterable:
            components.extend(state)
        else:
            components.append(state)

    return hash(tuple(components))
688
+
689
# Operator overloads: each one builds a ``CompositionalMetric`` that applies the
# named torch function to the operands' results. For the reflected variants the
# other operand is passed first, except ``__rand__`` (see the comment there).

def __add__(self, other: "Metric") -> "Metric":
    """``self + other``."""
    return CompositionalMetric(torch.add, self, other)

def __and__(self, other: "Metric") -> "Metric":
    """``self & other``."""
    return CompositionalMetric(torch.bitwise_and, self, other)

# Fixme: this shall return bool instead of Metric
def __eq__(self, other: "Metric") -> "Metric":  # type: ignore
    """``self == other`` (returns a composed metric, not a bool)."""
    return CompositionalMetric(torch.eq, self, other)

def __floordiv__(self, other: "Metric") -> "Metric":
    """``self // other``."""
    return CompositionalMetric(torch.floor_divide, self, other)

def __ge__(self, other: "Metric") -> "Metric":
    """``self >= other``."""
    return CompositionalMetric(torch.ge, self, other)

def __gt__(self, other: "Metric") -> "Metric":
    """``self > other``."""
    return CompositionalMetric(torch.gt, self, other)

def __le__(self, other: "Metric") -> "Metric":
    """``self <= other``."""
    return CompositionalMetric(torch.le, self, other)

def __lt__(self, other: "Metric") -> "Metric":
    """``self < other``."""
    return CompositionalMetric(torch.lt, self, other)

def __matmul__(self, other: "Metric") -> "Metric":
    """``self @ other``."""
    return CompositionalMetric(torch.matmul, self, other)

def __mod__(self, other: "Metric") -> "Metric":
    """``self % other`` (implemented with ``torch.fmod``)."""
    return CompositionalMetric(torch.fmod, self, other)

def __mul__(self, other: "Metric") -> "Metric":
    """``self * other``."""
    return CompositionalMetric(torch.mul, self, other)

# Fixme: this shall return bool instead of Metric
def __ne__(self, other: "Metric") -> "Metric":  # type: ignore
    """``self != other`` (returns a composed metric, not a bool)."""
    return CompositionalMetric(torch.ne, self, other)

def __or__(self, other: "Metric") -> "Metric":
    """``self | other``."""
    return CompositionalMetric(torch.bitwise_or, self, other)

def __pow__(self, other: "Metric") -> "Metric":
    """``self ** other``."""
    return CompositionalMetric(torch.pow, self, other)

def __radd__(self, other: "Metric") -> "Metric":
    """``other + self``."""
    return CompositionalMetric(torch.add, other, self)

def __rand__(self, other: "Metric") -> "Metric":
    """``other & self``."""
    # swap them since bitwise_and only supports that way and it's commutative
    return CompositionalMetric(torch.bitwise_and, self, other)

def __rfloordiv__(self, other: "Metric") -> "Metric":
    """``other // self``."""
    return CompositionalMetric(torch.floor_divide, other, self)

def __rmatmul__(self, other: "Metric") -> "Metric":
    """``other @ self``."""
    return CompositionalMetric(torch.matmul, other, self)

def __rmod__(self, other: "Metric") -> "Metric":
    """``other % self`` (implemented with ``torch.fmod``)."""
    return CompositionalMetric(torch.fmod, other, self)

def __rmul__(self, other: "Metric") -> "Metric":
    """``other * self``."""
    return CompositionalMetric(torch.mul, other, self)

def __ror__(self, other: "Metric") -> "Metric":
    """``other | self``."""
    return CompositionalMetric(torch.bitwise_or, other, self)

def __rpow__(self, other: "Metric") -> "Metric":
    """``other ** self``."""
    return CompositionalMetric(torch.pow, other, self)

def __rsub__(self, other: "Metric") -> "Metric":
    """``other - self``."""
    return CompositionalMetric(torch.sub, other, self)

def __rtruediv__(self, other: "Metric") -> "Metric":
    """``other / self``."""
    return CompositionalMetric(torch.true_divide, other, self)

def __rxor__(self, other: "Metric") -> "Metric":
    """``other ^ self``."""
    return CompositionalMetric(torch.bitwise_xor, other, self)

def __sub__(self, other: "Metric") -> "Metric":
    """``self - other``."""
    return CompositionalMetric(torch.sub, self, other)

def __truediv__(self, other: "Metric") -> "Metric":
    """``self / other``."""
    return CompositionalMetric(torch.true_divide, self, other)

def __xor__(self, other: "Metric") -> "Metric":
    """``self ^ other``."""
    return CompositionalMetric(torch.bitwise_xor, self, other)

def __abs__(self) -> "Metric":
    """``abs(self)``."""
    return CompositionalMetric(torch.abs, self, None)

def __inv__(self) -> "Metric":
    """Bitwise NOT of the metric result; ``__invert__`` delegates here."""
    return CompositionalMetric(torch.bitwise_not, self, None)

def __invert__(self) -> "Metric":
    """``~self``."""
    return self.__inv__()

def __neg__(self) -> "Metric":
    """``-self``: applies the module-level ``_neg`` helper to the metric result."""
    return CompositionalMetric(_neg, self, None)

def __pos__(self) -> "Metric":
    """``+self``: note that this applies ``torch.abs`` to the metric result."""
    return CompositionalMetric(torch.abs, self, None)

def __getitem__(self, idx: int) -> "Metric":
    """``self[idx]``: index into the metric result."""
    return CompositionalMetric(lambda x: x[idx], self, None)
793
+
794
+
795
+
796
class CompositionalMetric(_Metric):
    """Composition of two metrics with a specific operator which will be executed upon metrics compute.

    An operand may be a metric (any ``_Metric`` subclass, which includes other
    compositions), a tensor (registered as a buffer), a plain number, or -- for
    the second operand of unary operators -- ``None``.
    """

    def __init__(
        self,
        operator: Callable,
        metric_a: Union[_Metric, int, float, Tensor],
        metric_b: Union[_Metric, int, float, Tensor, None],
    ) -> None:
        """
        Args:
            operator: the operator taking in one (if metric_b is None)
                or two arguments. Will be applied to outputs of metric_a.compute()
                and (optionally if metric_b is not None) metric_b.compute()
            metric_a: first metric whose compute() result is the first argument of operator
            metric_b: second metric whose compute() result is the second argument of operator.
                For operators taking in only one input, this should be None
        """
        super().__init__()

        self.op = operator

        if isinstance(metric_a, Tensor):
            self.register_buffer("metric_a", metric_a)
        else:
            self.metric_a = metric_a

        if isinstance(metric_b, Tensor):
            self.register_buffer("metric_b", metric_b)
        else:
            self.metric_b = metric_b

    def _wrap_compute(self, compute: Callable) -> Callable:
        """Return ``compute`` unwrapped.

        The inherited wrapper caches the first result in ``_computed``. A
        composition holds no state of its own, so that cache would go stale as
        soon as a child metric is updated or reset directly. The children keep
        their own caches, and ``_sync_dist`` is a no-op here, so nothing else
        from the inherited wrapper is needed.
        """
        return compute

    def _sync_dist(self, dist_sync_fn: Optional[Callable] = None, process_group: Optional[Any] = None) -> None:
        """Do nothing: syncing is done in metric_a and metric_b."""
        # No syncing required here. syncing will be done in metric_a and metric_b
        pass

    def update(self, *args: Any, **kwargs: Any) -> None:
        """Forward the inputs to every operand that is a metric.

        Positional arguments are passed through unchanged; keyword arguments
        are filtered per operand with its ``_filter_kwargs``.
        """
        # Check against the base class so that nested compositions count as metrics too.
        if isinstance(self.metric_a, _Metric):
            self.metric_a.update(*args, **self.metric_a._filter_kwargs(**kwargs))

        if isinstance(self.metric_b, _Metric):
            self.metric_b.update(*args, **self.metric_b._filter_kwargs(**kwargs))

    def compute(self) -> Any:
        """Apply the operator to the operands' values.

        Metric operands contribute their ``compute()`` result, every other
        operand is used as is. When the second value is ``None`` the operator
        is called with the first value only.
        """
        # also some parsing for kwargs?
        if isinstance(self.metric_a, _Metric):
            val_a = self.metric_a.compute()
        else:
            val_a = self.metric_a

        if isinstance(self.metric_b, _Metric):
            val_b = self.metric_b.compute()
        else:
            val_b = self.metric_b

        if val_b is None:
            return self.op(val_a)

        return self.op(val_a, val_b)

    def reset(self) -> None:
        """Reset every operand that is a metric."""
        if isinstance(self.metric_a, _Metric):
            self.metric_a.reset()

        if isinstance(self.metric_b, _Metric):
            self.metric_b.reset()

    def persistent(self, mode: bool = False) -> None:
        """Forward the persistence flag to every operand that is a metric."""
        if isinstance(self.metric_a, _Metric):
            self.metric_a.persistent(mode=mode)
        if isinstance(self.metric_b, _Metric):
            self.metric_b.persistent(mode=mode)

    def __repr__(self) -> str:
        """Render the class name followed by the operator name and both operands."""
        _op_metrics = f"(\n {self.op.__name__}(\n {repr(self.metric_a)},\n {repr(self.metric_b)}\n )\n)"
        repr_str = self.__class__.__name__ + _op_metrics

        return repr_str
875
+
876
+
877
class MetricCollection_(nn.ModuleDict):
    """Module dictionary that drives several metrics through a single call.

    Metrics are stored under a base name (the dict key, or the metric's class
    name when given as a sequence). ``keys()``/``items()`` -- and therefore the
    result dictionaries of ``forward``/``compute`` -- use the base name wrapped
    in the optional ``prefix``/``postfix`` unless ``keep_base=True`` is passed.
    """

    def __init__(
        self,
        metrics: Union[_Metric, Sequence[_Metric], Dict[str, _Metric]],
        *additional_metrics: _Metric,
        prefix: Optional[str] = None,
        postfix: Optional[str] = None,
    ) -> None:
        """
        Args:
            metrics: a single metric, a sequence of metrics, or a name -> metric dict.
            additional_metrics: further metrics; only allowed when ``metrics``
                is not a dict.
            prefix: optional string put in front of every reported key.
            postfix: optional string put after every reported key.

        Raises:
            ValueError: on invalid metrics (see ``add_metrics``) or a
                non-string ``prefix``/``postfix``.
        """
        super().__init__()

        self.add_metrics(metrics, *additional_metrics)

        self.prefix = self._check_arg(prefix, "prefix")
        self.postfix = self._check_arg(postfix, "postfix")

    @torch.jit.unused
    def forward(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
        """Iteratively call forward for each metric.

        Positional arguments (args) will be passed to every metric in the collection, while keyword arguments (kwargs)
        will be filtered based on the signature of the individual metric.
        """
        return {k: m(*args, **m._filter_kwargs(**kwargs)) for k, m in self.items()}

    def update(self, *args: Any, **kwargs: Any) -> None:
        """Iteratively call update for each metric.

        Positional arguments (args) will be passed to every metric in the collection, while keyword arguments (kwargs)
        will be filtered based on the signature of the individual metric.
        """
        for _, m in self.items(keep_base=True):
            m_kwargs = m._filter_kwargs(**kwargs)
            m.update(*args, **m_kwargs)

    def compute(self) -> Dict[str, Any]:
        """Return ``{reported key: metric.compute()}`` for every metric."""
        return {k: m.compute() for k, m in self.items()}

    def reset(self) -> None:
        """Iteratively call reset for each metric."""
        for _, m in self.items(keep_base=True):
            m.reset()

    def clone(self, prefix: Optional[str] = None, postfix: Optional[str] = None) -> "MetricCollection_":
        """Make a copy of the metric collection

        Args:
            prefix: a string to append in front of the metric keys
            postfix: a string to append after the keys of the output dict

        A falsy ``prefix``/``postfix`` keeps the value of the original collection.
        """
        mc = deepcopy(self)
        if prefix:
            mc.prefix = self._check_arg(prefix, "prefix")
        if postfix:
            mc.postfix = self._check_arg(postfix, "postfix")
        return mc

    def persistent(self, mode: bool = True) -> None:
        """Method for post-init to change if metric states should be saved to its state_dict."""
        for _, m in self.items(keep_base=True):
            m.persistent(mode)

    def add_metrics(
        self, metrics: Union[_Metric, Sequence[_Metric], Dict[str, _Metric]], *additional_metrics: _Metric
    ) -> None:
        """Add new metrics to Metric Collection.

        Any ``_Metric`` subclass is accepted (so composed metrics work too).
        Extra positional arguments that are not metrics are skipped, and a
        ``UserWarning`` names them.

        Raises:
            ValueError: if extra positional metrics are combined with a dict,
                if a value is not a metric, if two metrics in a sequence share
                a class name, or if ``metrics`` has an unsupported type.
        """
        import warnings

        if isinstance(metrics, _Metric):
            # set compatible with original type expectations
            metrics = [metrics]
        if isinstance(metrics, Sequence):
            # prepare for optional additions
            metrics = list(metrics)
            remain: list = []
            for m in additional_metrics:
                (metrics if isinstance(m, _Metric) else remain).append(m)

            if remain:
                warnings.warn(
                    f"Extra arguments {remain} are not metrics and have been ignored.",
                    UserWarning,
                )

        elif additional_metrics:
            raise ValueError(
                f"You have passes extra arguments {additional_metrics} which are not compatible"
                f" with first passed dictionary {metrics} so they will be ignored."
            )

        if isinstance(metrics, dict):
            # Check all values are metrics
            # Make sure that metrics are added in deterministic order
            for name in sorted(metrics.keys()):
                metric = metrics[name]
                if not isinstance(metric, _Metric):
                    raise ValueError(
                        f"Value {metric} belonging to key {name} is not an instance of `pl.metrics.Metric`"
                    )
                self[name] = metric
        elif isinstance(metrics, Sequence):
            for metric in metrics:
                if not isinstance(metric, _Metric):
                    raise ValueError(f"Input {metric} to `MetricCollection` is not a instance of `pl.metrics.Metric`")
                name = metric.__class__.__name__
                if name in self:
                    raise ValueError(f"Encountered two metrics both named {name}")
                self[name] = metric
        else:
            raise ValueError("Unknown input to MetricCollection.")

    def _set_name(self, base: str) -> str:
        """Wrap ``base`` in the configured prefix and postfix (each skipped when ``None``)."""
        name = base if self.prefix is None else self.prefix + base
        name = name if self.postfix is None else name + self.postfix
        return name

    def _to_renamed_ordered_dict(self) -> OrderedDict:
        """Return the stored modules keyed by their prefixed/postfixed names, in storage order."""
        od = OrderedDict()
        for k, v in self._modules.items():
            od[self._set_name(k)] = v
        return od

    def keys(self, keep_base: bool = False) -> Iterable[Hashable]:
        r"""Return an iterable of the ModuleDict key.

        Args:
            keep_base: When true, return the base keys; otherwise the keys with
                prefix/postfix applied.
        """
        if keep_base:
            return self._modules.keys()
        return self._to_renamed_ordered_dict().keys()

    def items(self, keep_base: bool = False) -> Iterable[Tuple[str, nn.Module]]:
        r"""Return an iterable of the ModuleDict key/value pairs.

        Args:
            keep_base: When true, pair the modules with their base keys;
                otherwise with the keys that have prefix/postfix applied.
        """
        if keep_base:
            return self._modules.items()
        return self._to_renamed_ordered_dict().items()

    @staticmethod
    def _check_arg(arg: Optional[str], name: str) -> Optional[str]:
        """Return ``arg`` when it is ``None`` or a string, otherwise raise ``ValueError``."""
        if arg is None or isinstance(arg, str):
            return arg
        raise ValueError(f"Expected input `{name}` to be a string, but got {type(arg)}")

    def __repr__(self) -> str:
        """Extend the parent representation with the prefix/postfix when they are set."""
        # Drop the last two characters of the parent repr so the extra fields
        # can be appended before the closing "\n)" is added again.
        repr_str = super().__repr__()[:-2]
        if self.prefix:
            repr_str += f",\n prefix={self.prefix}{',' if self.postfix else ''}"
        if self.postfix:
            repr_str += f"{',' if not self.prefix else ''}\n postfix={self.postfix}"
        return repr_str + "\n)"
1021
+
1022
+
1023
class Metric(_Metric):
    r"""
    This implementation refers to :class:`~torchmetrics.Metric`.

    .. warning:: This metric is deprecated, use ``torchmetrics.Metric``. Will be removed in v1.5.0.

    The operator overloads below build their result with the
    ``CompositionalMetric`` defined in this module. They no longer import the
    class from ``pytorch_lightning.metrics.compositional``: that import
    shadowed the local class and made every operator depend on an external
    package that may not provide the module.
    """

    def __init__(
        self,
        compute_on_step: bool = True,
        dist_sync_on_step: bool = False,
        process_group: Optional[Any] = None,
        dist_sync_fn: Optional[Callable] = None,
    ):
        """Forward all four options unchanged to the base-class constructor."""
        super().__init__(
            compute_on_step=compute_on_step,
            dist_sync_on_step=dist_sync_on_step,
            process_group=process_group,
            dist_sync_fn=dist_sync_fn,
        )

    def __hash__(self):
        # Defined explicitly because this class also defines ``__eq__``;
        # otherwise Python would set ``__hash__`` to None for the class.
        return super().__hash__()

    # --- binary operators: ``self <op> other`` ---

    def __add__(self, other: Any):
        return CompositionalMetric(torch.add, self, other)

    def __and__(self, other: Any):
        return CompositionalMetric(torch.bitwise_and, self, other)

    def __eq__(self, other: Any):
        return CompositionalMetric(torch.eq, self, other)

    def __floordiv__(self, other: Any):
        return CompositionalMetric(torch.floor_divide, self, other)

    def __ge__(self, other: Any):
        return CompositionalMetric(torch.ge, self, other)

    def __gt__(self, other: Any):
        return CompositionalMetric(torch.gt, self, other)

    def __le__(self, other: Any):
        return CompositionalMetric(torch.le, self, other)

    def __lt__(self, other: Any):
        return CompositionalMetric(torch.lt, self, other)

    def __matmul__(self, other: Any):
        return CompositionalMetric(torch.matmul, self, other)

    def __mod__(self, other: Any):
        return CompositionalMetric(torch.fmod, self, other)

    def __mul__(self, other: Any):
        return CompositionalMetric(torch.mul, self, other)

    def __ne__(self, other: Any):
        return CompositionalMetric(torch.ne, self, other)

    def __or__(self, other: Any):
        return CompositionalMetric(torch.bitwise_or, self, other)

    def __pow__(self, other: Any):
        return CompositionalMetric(torch.pow, self, other)

    # --- reflected operators: ``other <op> self`` ---

    def __radd__(self, other: Any):
        return CompositionalMetric(torch.add, other, self)

    def __rand__(self, other: Any):
        # swap them since bitwise_and only supports that way and it's commutative
        return CompositionalMetric(torch.bitwise_and, self, other)

    def __rfloordiv__(self, other: Any):
        return CompositionalMetric(torch.floor_divide, other, self)

    def __rmatmul__(self, other: Any):
        return CompositionalMetric(torch.matmul, other, self)

    def __rmod__(self, other: Any):
        return CompositionalMetric(torch.fmod, other, self)

    def __rmul__(self, other: Any):
        return CompositionalMetric(torch.mul, other, self)

    def __ror__(self, other: Any):
        return CompositionalMetric(torch.bitwise_or, other, self)

    def __rpow__(self, other: Any):
        return CompositionalMetric(torch.pow, other, self)

    def __rsub__(self, other: Any):
        return CompositionalMetric(torch.sub, other, self)

    def __rtruediv__(self, other: Any):
        return CompositionalMetric(torch.true_divide, other, self)

    def __rxor__(self, other: Any):
        return CompositionalMetric(torch.bitwise_xor, other, self)

    def __sub__(self, other: Any):
        return CompositionalMetric(torch.sub, self, other)

    def __truediv__(self, other: Any):
        return CompositionalMetric(torch.true_divide, self, other)

    def __xor__(self, other: Any):
        return CompositionalMetric(torch.bitwise_xor, self, other)

    # --- unary operators ---

    def __abs__(self):
        return CompositionalMetric(torch.abs, self, None)

    def __inv__(self):
        return CompositionalMetric(torch.bitwise_not, self, None)

    def __invert__(self):
        return self.__inv__()

    def __neg__(self):
        # ``_neg`` is the module-level helper defined right after this class.
        return CompositionalMetric(_neg, self, None)

    def __pos__(self):
        # NOTE: unary plus applies ``torch.abs`` to the metric result.
        return CompositionalMetric(torch.abs, self, None)
1179
+
1180
+
1181
def _neg(tensor: torch.Tensor):
    """Return the negated absolute value of ``tensor`` (every element of the result is <= 0)."""
    magnitude = torch.abs(tensor)
    return -magnitude
1183
+
1184
+
1185
class MicroF1(Metric):
    """Micro-averaged F1: overlap, prediction and gold counts are summed over all updates.

    States (all reduced with ``sum`` across processes):
        n: accumulated size of the overlap between gold and predicted items.
        prec_d: accumulated number of predicted items (precision denominator).
        rec_d: accumulated number of gold items (recall denominator).
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state("n", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("prec_d", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("rec_d", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate one sample.

        Args:
            p: predicted items; any sized iterable accepted by ``g.intersection``.
            g: gold items; must provide ``intersection`` and ``len`` (e.g. a set).
        """
        self.n += len(g.intersection(p))
        self.prec_d += len(p)
        self.rec_d += len(g)

    def compute(self):
        """Return the F1 score as a float tensor.

        A zero denominator yields a zero precision/recall instead of NaN, so
        the result is 0 when nothing was predicted or nothing was expected --
        consistent with the guarded MicroPrecision / MicroRecall metrics.
        """
        hits = self.n.float()
        zero = torch.tensor(0.0, device=self.n.device)
        p = hits / self.prec_d if self.prec_d > 0 else zero
        r = hits / self.rec_d if self.rec_d > 0 else zero
        return (2 * p * r / (p + r)) if (p + r) > 0 else (p + r)
1203
+
1204
+
1205
class MacroF1(Metric):
    """Macro-averaged F1: the F1 of each update is computed separately and averaged.

    States (reduced with ``sum`` across processes):
        n: sum of the per-sample F1 scores.
        d: number of samples seen.
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state("n", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("d", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate the F1 of one sample.

        Args:
            p: predicted items; a sized collection accepted by ``g.intersection``.
            g: gold items; must provide ``intersection`` and ``len`` (e.g. a set).

        An empty prediction gives precision 0 and an empty gold set gives
        recall 0, instead of dividing by zero.
        """
        overlap = len(g.intersection(p))
        prec = overlap / len(p) if p else 0.0
        rec = overlap / len(g) if g else 0.0

        self.n += (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else (prec + rec)
        self.d += 1

    def compute(self):
        """Return the mean per-sample F1, or the (zero) counter when no sample was seen."""
        return (self.n / self.d) if self.d > 0 else self.d
1222
+
1223
+
1224
class MicroPrecision(Metric):
    """Micro-averaged precision: total overlap divided by the total number of predicted items.

    States (reduced with ``sum`` across processes):
        n: accumulated size of the overlap between gold and predicted items.
        d: accumulated number of predicted items.
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        for state_name in ("n", "d"):
            self.add_state(state_name, default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate one sample (``p``: predicted items, ``g``: gold set)."""
        overlap = g.intersection(p)
        self.n += len(overlap)
        self.d += len(p)

    def compute(self):
        """Return ``n / d`` as a float tensor, or the (zero) denominator when nothing was predicted."""
        if self.d > 0:
            return self.n.float() / self.d
        return self.d
1237
+
1238
+
1239
class MacroPrecision(Metric):
    """Macro-averaged precision: the precision of each update is computed separately and averaged.

    States (reduced with ``sum`` across processes):
        n: sum of the per-sample precision values.
        d: number of samples seen.
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state("n", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("d", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate the precision of one sample.

        Args:
            p: predicted items; a sized collection accepted by ``g.intersection``.
            g: gold items; must provide ``intersection`` (e.g. a set).

        An empty prediction contributes a precision of 0 instead of dividing by
        zero, mirroring the empty-gold guard used by the recall metric.
        """
        self.n += (len(g.intersection(p)) / len(p)) if p else 0.0
        self.d += 1

    def compute(self):
        """Return the mean per-sample precision, or the (zero) counter when no sample was seen."""
        return (self.n / self.d) if self.d > 0 else self.d
1252
+
1253
+
1254
class MicroRecall(Metric):
    """Micro-averaged recall: total overlap divided by the total number of gold items.

    States (reduced with ``sum`` across processes):
        n: accumulated size of the overlap between gold and predicted items.
        d: accumulated number of gold items.
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        for state_name in ("n", "d"):
            self.add_state(state_name, default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate one sample (``p``: predicted items, ``g``: gold set)."""
        overlap = g.intersection(p)
        self.n += len(overlap)
        self.d += len(g)

    def compute(self):
        """Return ``n / d`` as a float tensor, or the (zero) denominator when no gold item was seen."""
        if self.d > 0:
            return self.n.float() / self.d
        return self.d
1267
+
1268
+
1269
class MacroRecall(Metric):
    """Macro-averaged recall: the recall of each update is computed separately and averaged.

    States (reduced with ``sum`` across processes):
        n: sum of the per-sample recall values.
        d: number of samples seen.
    """

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)

        self.add_state("n", default=torch.tensor(0.0), dist_reduce_fx="sum")
        self.add_state("d", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, p, g):
        """Accumulate the recall of one sample; an empty gold set contributes 0."""
        if g:
            sample_recall = len(g.intersection(p)) / len(g)
        else:
            sample_recall = 0.0
        self.n += sample_recall
        self.d += 1

    def compute(self):
        """Return the mean per-sample recall, or the (zero) counter when no sample was seen."""
        if self.d > 0:
            return self.n / self.d
        return self.d
1282
+
1283
+ # The following two classes are not inherited from https://github.com/nicola-decao/efficient-autoregressive-EL
1284
+ # and are implemented in this project.
1285
+
1286
+
1287
class _EvaluationScores:
    """Bundle of an F1, a precision and a recall metric of the same averaging flavour."""

    def __init__(self, is_micro):
        """Create the three metrics; ``is_micro`` selects the Micro* classes, otherwise Macro*."""
        self.is_micro = is_micro
        if is_micro:
            f1_cls, precision_cls, recall_cls = MicroF1, MicroPrecision, MicroRecall
        else:
            f1_cls, precision_cls, recall_cls = MacroF1, MacroPrecision, MacroRecall
        self.f1 = f1_cls()
        self.p = precision_cls()
        self.r = recall_cls()

    def record_results(self, prediction, gold):
        """Feed one (prediction, gold) pair to the F1, precision and recall metrics, in that order."""
        for metric in (self.f1, self.p, self.r):
            metric(prediction, gold)

    def __str__(self):
        """Format the three computed scores as percentages with three decimals."""
        im = "Micro" if self.is_micro else "Macro"
        f1_pct = self.f1.compute() * 100
        p_pct = self.p.compute() * 100
        r_pct = self.r.compute() * 100
        return f"\t{im} evaluation results: F1: {f1_pct:.3f}%\tP: {p_pct:.3f}%" \
               f"\t R: {r_pct:.3f}%"
1308
+
1309
+
1310
class EntityEvaluationScores:
    """Micro and macro scores of one dataset, tracked separately for mention detection and entity linking."""

    def __init__(self, dataset_name):
        """:param dataset_name: the name of the evaluated set; it is only used in the printed report."""
        self.dataset_name = dataset_name
        self.micro_mention_detection = _EvaluationScores(True)
        self.macro_mention_detection = _EvaluationScores(False)
        self.micro_entity_linking = _EvaluationScores(True)
        self.macro_entity_linking = _EvaluationScores(False)

    def record_mention_detection_results(self, prediction, gold):
        """Record one (prediction, gold) pair in the micro and then the macro mention detection scores."""
        for scores in (self.micro_mention_detection, self.macro_mention_detection):
            scores.record_results(prediction, gold)

    def record_entity_linking_results(self, prediction, gold):
        """Record one (prediction, gold) pair in the micro and then the macro entity linking scores."""
        for scores in (self.micro_entity_linking, self.macro_entity_linking):
            scores.record_results(prediction, gold)

    def __str__(self):
        # The entity linking section comes first; within each section macro precedes micro.
        report_lines = [
            f"Evaluated model for set: {self.dataset_name} (Entity Linking)",
            f"{str(self.macro_entity_linking)}",
            f"{str(self.micro_entity_linking)}",
            f"Evaluated model for set: {self.dataset_name} (Mention Detection)",
            f"{str(self.macro_mention_detection)}",
            f"{str(self.micro_mention_detection)}",
        ]
        return "\n".join(report_lines)
1333
+
1334
+
1335
class InOutMentionEvaluationResult:
    """Subword-level accuracy of deciding whether a subword is inside or outside of a mention."""

    def __init__(self, activation_threshold=0.5, vocab_index_of_o=-1):
        """
        :param activation_threshold: the cut-off applied to the logits when ``vocab_index_of_o`` is not set.
        :param vocab_index_of_o: the vocabulary index of the "outside" label; when it is greater than -1, a
            subword is predicted to be in a mention whenever its argmax label differs from this index.
        """
        self.activation_threshold = activation_threshold
        self.vocab_index_of_o = vocab_index_of_o
        # The counters are floats; they are cast to int only when printed.
        self.total_predictions = 0.0
        self.correct_predictions = 0.0
        self.total_true_predictions = 0.0
        self.correct_true_predictions = 0.0
        self.total_false_predictions = 0.0
        self.correct_false_predictions = 0.0

    @staticmethod
    def _percentage(correct, total):
        """Return ``correct`` as a percentage of ``total``, or 0.0 when ``total`` is not positive."""
        return correct * 100 / total if total > 0.0 else 0.0

    def _preprocess_logits(self, subword_logits):
        """Convert the logits into a boolean "is in mention" prediction per subword."""
        if self.vocab_index_of_o <= -1:
            return (subword_logits > self.activation_threshold).squeeze(-1)
        return (subword_logits.argmax(-1) != self.vocab_index_of_o).bool()

    def update_scores(self, inputs_eval_mask, s_mentions_is_in_mention, subword_logits):
        """
        Update the counters with one batch; only positions with a truthy evaluation mask are compared.

        :param inputs_eval_mask: per-subword evaluation mask (iterated row by row).
        :param s_mentions_is_in_mention: per-subword gold in-mention flags (cast to bool).
        :param subword_logits: the model logits, converted to predictions by ``_preprocess_logits``.
        """
        self.total_predictions += inputs_eval_mask.sum().item()
        gold_flags = s_mentions_is_in_mention.bool()
        predicted_flags = self._preprocess_logits(subword_logits)
        for mask_row, gold_row, predicted_row in zip(inputs_eval_mask, gold_flags, predicted_flags):
            for keep, gold, predicted in zip(mask_row, gold_row, predicted_row):
                if not keep:
                    continue
                if gold == predicted:
                    self.correct_predictions += 1.0
                if gold:
                    self.total_true_predictions += 1.0
                    if predicted:
                        self.correct_true_predictions += 1.0
                else:
                    self.total_false_predictions += 1.0
                    if not predicted:
                        self.correct_false_predictions += 1.0

    @property
    def overall_mention_detection_accuracy(self):
        """Accuracy (in percent) over all the evaluated subwords."""
        return self._percentage(self.correct_predictions, self.total_predictions)

    @property
    def in_mention_mention_detection_accuracy(self):
        """Accuracy (in percent) over the evaluated subwords whose gold flag is in-mention."""
        return self._percentage(self.correct_true_predictions, self.total_true_predictions)

    @property
    def out_of_mention_overall_mention_detection_accuracy(self):
        """Accuracy (in percent) over the evaluated subwords whose gold flag is out-of-mention."""
        return self._percentage(self.correct_false_predictions, self.total_false_predictions)

    def __str__(self):
        overall = f"Subword-level mention detection accuracy = {self.overall_mention_detection_accuracy:.3f}% " \
                  f"({int(self.correct_predictions)}/{int(self.total_predictions)})"
        in_mention = f"\t In-Mention Subword-level mention detection accuracy = " \
                     f"{self.in_mention_mention_detection_accuracy:.3f}% " \
                     f"({int(self.correct_true_predictions)}/{int(self.total_true_predictions)})"
        out_of_mention = f"\tOut-of-Mention Subword-level mention detection accuracy = " \
                         f"{self.out_of_mention_overall_mention_detection_accuracy:.3f}% " \
                         f"({int(self.correct_false_predictions)}/{int(self.total_false_predictions)})"
        return "\n".join((overall, in_mention, out_of_mention))
model.py ADDED
@@ -0,0 +1,439 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ The implementation of the main annotator class from "SpEL: Structured Prediction for Entity Linking"
3
+ """
4
+ import os
5
+ import re
6
+ import pickle
7
+ import numpy
8
+ from typing import List
9
+ from glob import glob
10
+ from itertools import chain
11
+
12
+ from transformers import AutoModelForMaskedLM
13
+ import torch
14
+ import torch.nn as nn
15
+ from torch import optim
16
+ from tqdm import tqdm
17
+
18
+ from utils import store_validation_data_wiki, chunk_annotate_and_merge_to_phrase, \
19
+ get_aida_set_phrase_splitted_documents, compare_gold_and_predicted_annotation_documents
20
+ from decao_eval import EntityEvaluationScores, InOutMentionEvaluationResult
21
+ from span_annotation import SubwordAnnotation
22
+ from data_loader import BERT_MODEL_NAME, dl_sa, tokenizer
23
+ from configuration import get_checkpoints_dir, get_aida_train_canonical_redirects, get_ood_canonical_redirects, \
24
+ get_logdir_dir, get_exec_run_file
25
+
26
class SpELAnnotator:
    """
    The base annotator of "SpEL: Structured Prediction for Entity Linking".

    It holds a masked-language-model encoder (``bert_lm``) and an ``nn.Embedding`` (``out``) whose weight rows
    are multiplied with the last hidden states of the encoder to score every entry of ``dl_sa.mentions_vocab``.
    Sub-models must re-implement ``get_mode_checkpoint_name`` and ``annotate`` and may re-implement
    ``sub_model_specific_checkpoint_data`` and ``sub_model_specific_load_checkpoint_data``.
    """

    # Hosted roberta-base checkpoints: fine-tuning step -> (local file name, download url).
    _BASE_CHECKPOINTS = {
        # Step 4 is the same SpEL finetuned model after step 3 except that its classification layer projects to
        # the entirety of the step-2 model rather than shrinking it in size (created on Oct-03-2023).
        4: ("spel-base-step-3-500K.pt", 'https://vault.sfu.ca/index.php/s/8nw5fFXdz2yBP5z/download'),
        # The finetuned model checkpoint created on Sep-26-2023 with P=92.06|R=91.93|F1=91.99
        3: ("spel-base-step-3.pt", 'https://vault.sfu.ca/index.php/s/HpQ3PMm6A3y1NBl/download'),
        # The pretrained model checkpoint created on Sep-26-2023 with P=77.60|R=77.91|F1=77.75
        2: ('spel-base-step-2.pt', 'https://vault.sfu.ca/index.php/s/Hf37vc1foluHPBh/download'),
        # The pretrained model checkpoint created on Sep-11-2023 with P=82.50|R=83.16|F1=82.83
        1: ('spel-base-step-1.pt', 'https://vault.sfu.ca/index.php/s/9OAoAG5eYeREE9V/download'),
    }

    # Hosted roberta-large checkpoints: fine-tuning step -> (local file name, download url).
    _LARGE_CHECKPOINTS = {
        # Step 4 is the same SpEL finetuned model after step 3 except that its classification layer projects to
        # the entirety of the step-2 model rather than shrinking it in size (created on Oct-03-2023).
        4: ("spel-large-step-3-500K.pt", 'https://vault.sfu.ca/index.php/s/BCvputD1ByAvILC/download'),
        # The finetuned model checkpoint created on Oct-02-2023 with P=92.53|R=92.99|F1=93.76
        3: ("spel-large-step-3.pt", 'https://vault.sfu.ca/index.php/s/kBBlYVM4Tr59P0q/download'),
        # The pretrained model checkpoint created on Oct-02-2023 with P=77.36|R=73.11|F1=75.18
        2: ('spel-large-step-2.pt', 'https://vault.sfu.ca/index.php/s/rnDiuKns7gzADyb/download'),
        # The pretrained model checkpoint created on Sep-11-2023 with P=84.02|R=82.74|F1=83.37
        1: ('spel-large-step-1.pt', 'https://vault.sfu.ca/index.php/s/bTp6UN2xL7Yh52w/download'),
    }

    def __init__(self):
        super().__init__()
        self.checkpoints_root = get_checkpoints_dir()
        self.logdir = get_logdir_dir()
        self.exec_run_file = get_exec_run_file()

        self.text_chunk_length = 254
        self.text_chunk_overlap = 20

        # The following attributes are filled in by init_model_from_scratch.
        self.bert_lm = None
        self.number_of_bert_layers = 0
        self.bert_lm_h = 0
        self.out = None
        self.softmax = None

    def init_model_from_scratch(self, base_model=BERT_MODEL_NAME, device="cpu"):
        """
        This is required to be called to load up the base model architecture before loading the fine-tuned checkpoint.

        :param base_model: the name of the pretrained masked language model to load; a falsy value skips loading
            the encoder.
        :param device: the device to which the encoder and the classification head are moved.
        """
        if base_model:
            self.bert_lm = AutoModelForMaskedLM.from_pretrained(base_model, output_hidden_states=True,
                                                                cache_dir=get_checkpoints_dir() / "hf").to(device)
            self.disable_roberta_lm_head()
            # One is added to the number of hidden layers reported by the model configuration.
            self.number_of_bert_layers = self.bert_lm.config.num_hidden_layers + 1
            self.bert_lm_h = self.bert_lm.config.hidden_size
        self.out = nn.Embedding(num_embeddings=len(dl_sa.mentions_vocab),
                                embedding_dim=self.bert_lm_h, sparse=True).to(device)
        self.softmax = nn.Softmax(dim=-1)

    def shrink_classification_head_to_aida(self, device):
        """
        This will be called in fine-tuning step 3 to shrink the classification head to in-domain data vocabulary.

        The rows of the current classification head which belong to the AIDA vocabulary are copied into a new,
        smaller ``nn.Embedding``; nothing happens when the head already has the AIDA vocabulary size.

        :param device: the device to which the new classification head is moved.
        """
        aida_mentions_vocab, aida_mentions_itos = dl_sa.get_aida_vocab_and_itos()
        if self.out_module.num_embeddings == len(aida_mentions_vocab):
            return
        current_state_dict = self.out_module.state_dict()
        new_out = nn.Embedding(num_embeddings=len(aida_mentions_vocab),
                               embedding_dim=self.bert_lm_h, sparse=True).to(device)
        new_state_dict = new_out.state_dict()
        for index_new, item_new in enumerate(aida_mentions_itos):
            assert item_new in dl_sa.mentions_vocab, \
                "the aida fine-tuned mention vocab must be a subset of the original vocab"
            index_current = dl_sa.mentions_vocab[item_new]
            new_state_dict['weight'][index_new] = current_state_dict['weight'][index_current]
        new_out.load_state_dict(new_state_dict, strict=False)
        self.out = new_out.to(device)
        dl_sa.shrink_vocab_to_aida()
        model_params = sum(p.numel() for p in self.bert_lm.parameters())
        out_params = sum(p.numel() for p in self.out.parameters())
        print(f' * Shrank model to {model_params+out_params} number of parameters ({model_params} parameters '
              f'for the encoder and {out_params} parameters for the classification head)!')

    @property
    def current_device(self):
        """The device of the (unwrapped) encoder."""
        return self.lm_module.device

    @property
    def lm_module(self):
        """The encoder, unwrapped from ``DataParallel`` / ``DistributedDataParallel`` when it is wrapped."""
        return self.bert_lm.module if isinstance(self.bert_lm, nn.DataParallel) or \
            isinstance(self.bert_lm, nn.parallel.DistributedDataParallel) else self.bert_lm

    @property
    def out_module(self):
        """The classification head, unwrapped from ``DataParallel`` / ``DistributedDataParallel`` when wrapped."""
        return self.out.module if isinstance(self.out, nn.DataParallel) or \
            isinstance(self.out, nn.parallel.DistributedDataParallel) else self.out

    @staticmethod
    def get_canonical_redirects(limit_to_conll=True):
        """Return the AIDA train canonical redirects when ``limit_to_conll`` is set, the OOD ones otherwise."""
        return get_aida_train_canonical_redirects() if limit_to_conll else get_ood_canonical_redirects()

    def create_optimizers(self, encoder_lr=5e-5, decoder_lr=0.1, exclude_parameter_names_regex=None):
        """
        Create an Adam optimizer for the encoder and a SparseAdam optimizer for the classification head.

        :param encoder_lr: the learning rate of the encoder optimizer.
        :param decoder_lr: the learning rate of the classification head optimizer.
        :param exclude_parameter_names_regex: when provided, the encoder parameters whose name contains a match
            of this regular expression are left out of the encoder optimizer.
        :return: the tuple (encoder optimizer, classification head optimizer).
        """
        if exclude_parameter_names_regex is not None:
            regex = re.compile(exclude_parameter_names_regex)
            bert_lm_parameters = [p for n, p in self.lm_module.named_parameters() if regex.search(n) is None]
        else:
            bert_lm_parameters = list(self.lm_module.parameters())
        bert_optim = optim.Adam(bert_lm_parameters, lr=encoder_lr)
        if decoder_lr < 1e-323:
            # IMPORTANT! This is a hack since if we don't consider an optimizer for the last layer(e.g. decoder_lr=0.0),
            # BCEWithLogitsLoss will become unstable and memory will explode.
            decoder_lr = 1e-323
        out_optim = optim.SparseAdam(self.out.parameters(), lr=decoder_lr)
        return bert_optim, out_optim

    @staticmethod
    def create_warmup_scheduler(optimizer, warmup_steps):
        """
        Creates a scheduler which increases the :param optimizer: learning rate from 0 to the specified learning rate
            in :param warmup_steps: number of batches.
        You need to call scheduler.step() after optimizer.step() in your code for this scheduler to take effect
        """
        return optim.lr_scheduler.LambdaLR(
            optimizer, lambda epoch: epoch / warmup_steps if epoch < warmup_steps else 1.0)

    def get_highest_confidence_model_predictions(self, batch_token_ids, topk_per_token=20, topk_from_batch=8196):
        """
        This function will be used for hard negative mining. For a given input batch, it will return
         the `topk_from_batch` mentions which have had model puzzled. In the process, to reduce the computational
         complexity the model will first select `topk_per_token` number of candidates from the vocabulary, and then
         applies the topk selection on it.

        :return: a python list of vocabulary ids, ordered by the descending sum of their selected logits.
        """
        with torch.no_grad():
            logits = self.get_model_raw_logits_inference(batch_token_ids)
            # topk is applied to each batch item separately,
            # this is a workaround to the torch.topk bug for large sized tensors
            topk_logit_per_token, topk_eids_per_token = [], []
            for batch_item in logits:
                topk_probs, topk_ids = batch_item.topk(topk_per_token, sorted=False, dim=-1)
                topk_logit_per_token.append(topk_probs)
                topk_eids_per_token.append(topk_ids)
            topk_logit_per_token = torch.stack(topk_logit_per_token, dim=0)
            topk_eids_per_token = torch.stack(topk_eids_per_token, dim=0)
            # Row 0 of the sparse indices holds the vocabulary ids and row 1 is constantly zero, therefore
            # coalescing the sparse tensor sums up all the logits which were selected for the same vocabulary id.
            flat_eids = topk_eids_per_token.view(1, -1)
            i = torch.cat([flat_eids, torch.zeros_like(flat_eids)], dim=0)
            v = topk_logit_per_token.view(-1)
            # torch.sparse_coo_tensor replaces the deprecated torch.sparse.FloatTensor legacy constructor.
            stc = torch.sparse_coo_tensor(i, v).coalesce()
            topk_indices = stc.values().sort(descending=True)[1][:topk_from_batch]
            result = stc.indices()[0, topk_indices]

        return result.cpu().tolist()
    # ###########################################################################################

    def annotate_subword_ids(self, subword_ids_list: List, k_for_top_k_to_keep: int, token_offsets=None) \
            -> List[SubwordAnnotation]:
        """
        Run the model on the provided subword ids and return the top-k annotations of each subword.

        :param subword_ids_list: the subword ids, converted to a ``torch.LongTensor`` before being fed to the model.
        :param k_for_top_k_to_keep: the number of highest scoring vocabulary entries to keep for each subword.
        :param token_offsets: optional per-subword records, forwarded to ``get_model_logits_inference``.
        """
        with torch.no_grad():
            token_ids = torch.LongTensor(subword_ids_list)
            raw_logits, hidden_states = self.get_model_raw_logits_inference(token_ids, return_hidden_states=True)
            logits = self.get_model_logits_inference(raw_logits, hidden_states, k_for_top_k_to_keep, token_offsets)
            return logits

    def get_model_raw_logits_training(self, token_ids, label_ids, label_probs):
        """Score the last hidden states of ``token_ids`` against the head rows selected by ``label_ids``."""
        # label_probs is not used in this function but provided for the classes inheriting SpELAnnotator.
        enc = self.bert_lm(token_ids).hidden_states[-1]
        out = self.out(label_ids)
        logits = enc.matmul(out.transpose(0, 1))
        return logits

    def get_model_logits_inference(self, raw_logits, hidden_states, k_for_top_k_to_keep, token_offsets=None) \
            -> List[SubwordAnnotation]:
        """
        Convert the raw logits into one ``SubwordAnnotation`` per subword, keeping the top-k softmax scores.

        :param raw_logits: the output of ``get_model_raw_logits_inference``.
        :param k_for_top_k_to_keep: the number of highest scoring vocabulary entries to keep for each subword.
        :param token_offsets: optional per-subword records whose first element is passed to ``SubwordAnnotation``
            as its third argument; an empty string is passed when it is not provided.
        """
        # hidden_states is not used in this function but provided for the classes inheriting SpELAnnotator.
        logits = self.softmax(raw_logits)
        # The following line could possibly cause errors in torch version 1.13.1
        # see https://github.com/pytorch/pytorch/issues/95455 for more information
        top_k_logits, top_k_indices = logits.topk(k_for_top_k_to_keep)
        top_k_logits = top_k_logits.squeeze(0).cpu().tolist()
        top_k_indices = top_k_indices.squeeze(0).cpu().tolist()
        # Each placeholder is a one-element tuple so that x[0] below yields "" (indexing a bare "" with [0],
        # as the previous version did, raises IndexError whenever token_offsets is not provided).
        chunk = [("",)] * len(top_k_logits) if token_offsets is None else token_offsets
        return [SubwordAnnotation(p, i, x[0]) for p, i, x in zip(top_k_logits, top_k_indices, chunk)]

    def get_model_raw_logits_inference(self, token_ids, return_hidden_states=False):
        """
        Score the last hidden states of ``token_ids`` against all the rows of the classification head.

        :return: the logits, or the tuple (logits, all the hidden states) when ``return_hidden_states`` is set.
        """
        encs = self.lm_module(token_ids.to(self.current_device)).hidden_states
        out = self.out_module.weight
        logits = encs[-1].matmul(out.transpose(0, 1))
        return (logits, encs) if return_hidden_states else logits

    def evaluate(self, epoch, batch_size, label_size, best_f1, is_training=True, use_retokenized_wikipedia_data=False,
                 potent_score_threshold=0.82):
        """
        Evaluate the model on the stored validation batches and save the checkpoint(s) which deserve it.

        The checkpoint is saved when the F1 score beats ``best_f1``; during training, an additional "-potent"
        checkpoint is saved when both precision and recall exceed ``potent_score_threshold``. A line with the
        results is appended to ``self.exec_run_file`` and both modules are put back into train mode.

        :return: precision, recall, f1, f0.5, num_proposed, num_correct, num_gold and the subword-level
            ``InOutMentionEvaluationResult``.
        """
        self.bert_lm.eval()
        self.out.eval()
        vocab_pad_id = dl_sa.mentions_vocab['<pad>']

        all_words, all_tags, all_y, all_y_hat, all_predicted, all_token_ids = [], [], [], [], [], []
        subword_eval = InOutMentionEvaluationResult(vocab_index_of_o=dl_sa.mentions_vocab['|||O|||'])
        dataset_name = store_validation_data_wiki(
            self.checkpoints_root, batch_size, label_size, is_training=is_training,
            use_retokenized_wikipedia_data=use_retokenized_wikipedia_data)
        with torch.no_grad():
            for d_file in tqdm(sorted(glob(os.path.join(self.checkpoints_root, dataset_name, "*")))):
                # NOTE: pickle must only be used on trusted files; these are expected to be the batches written
                # by store_validation_data_wiki. The context manager makes sure the file handle is released.
                with open(d_file, "rb") as batch_file:
                    batch_token_ids, label_ids, label_probs, eval_mask, label_id_to_entity_id_dict, \
                        batch_entity_ids, is_in_mention, _ = pickle.load(batch_file)
                logits = self.get_model_raw_logits_inference(batch_token_ids)
                subword_eval.update_scores(eval_mask, is_in_mention, logits)
                y_hat = logits.argmax(-1)

                tags = list()
                predtags = list()
                y_resolved_list = list()
                y_hat_resolved_list = list()
                token_list = list()

                for batch_id, seq in enumerate(label_probs.max(-1)[1]):
                    # The last text_chunk_overlap positions of each sequence are left out of the evaluation.
                    for token_id, label_id in enumerate(seq[:-self.text_chunk_overlap]):
                        if eval_mask[batch_id][token_id].item() == 0:
                            y_resolved = vocab_pad_id
                        else:
                            y_resolved = label_ids[label_id].item()
                        y_resolved_list.append(y_resolved)
                        tags.append(dl_sa.mentions_itos[y_resolved])
                        y_hat_resolved = y_hat[batch_id][token_id].item()
                        y_hat_resolved_list.append(y_hat_resolved)
                        predtags.append(dl_sa.mentions_itos[y_hat_resolved])
                        token_list.append(batch_token_ids[batch_id][token_id].item())

                all_y.append(y_resolved_list)
                all_y_hat.append(y_hat_resolved_list)
                all_tags.append(tags)
                all_predicted.append(predtags)
                all_words.append(tokenizer.convert_ids_to_tokens(token_list))
                all_token_ids.append(token_list)
                del batch_token_ids, label_ids, label_probs, eval_mask, \
                    label_id_to_entity_id_dict, batch_entity_ids, logits, y_hat

        y_true = numpy.array(list(chain(*all_y)))
        y_pred = numpy.array(list(chain(*all_y_hat)))
        all_token_ids = numpy.array(list(chain(*all_token_ids)))

        # Only the labels with a vocabulary id above 1 on the tokens with a token id above 0 are counted.
        num_proposed = len(y_pred[(1 < y_pred) & (all_token_ids > 0)])
        num_correct = (((y_true == y_pred) & (1 < y_true) & (all_token_ids > 0))).astype(int).sum()
        num_gold = len(y_true[(1 < y_true) & (all_token_ids > 0)])

        precision = num_correct / num_proposed if num_proposed > 0.0 else 0.0
        recall = num_correct / num_gold if num_gold > 0.0 else 0.0
        f1 = 2.0 * precision * recall / (precision + recall) if precision + recall > 0.0 else 0.0
        # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R); beta = 0.5 leads to 1.25 * P * R / (0.25 * P + R).
        f05 = 1.25 * precision * recall / (0.25 * precision + recall) if precision + recall > 0.0 else 0.0
        if f1 > best_f1:
            print("Saving the best checkpoint ...")
            config = self.prepare_model_checkpoint(epoch)
            fname = self.get_mode_checkpoint_name()
            torch.save(config, f"{fname}.pt")
            print(f"weights were saved to {fname}.pt")
        if precision > potent_score_threshold and recall > potent_score_threshold and is_training:
            print(f"Saving the potent checkpoint with both precision and recall above {potent_score_threshold} ...")
            config = self.prepare_model_checkpoint(epoch)
            try:
                fname = self.get_mode_checkpoint_name()
                torch.save(config, f"{fname}-potent.pt")
                print(f"weights were saved to {fname}-potent.pt")
            except NotImplementedError:
                # Best effort: a model which does not name its checkpoints does not store the potent checkpoint.
                pass
        self.bert_lm.train()
        self.out.train()
        with open(self.exec_run_file, "a+") as exec_file:
            exec_file.write(f"{precision}, {recall}, {f1}, {f05}, {num_proposed}, {num_correct}, {num_gold}, "
                            f"{epoch+1},,\n")
        return precision, recall, f1, f05, num_proposed, num_correct, num_gold, subword_eval

    def inference_evaluate(self, epoch, best_f1, dataset_name='testa'):
        """
        Annotate the documents of the requested AIDA set and score mention detection and entity linking.

        The checkpoint is saved when the micro entity linking F1 beats ``best_f1``; both modules are put back
        into train mode before returning.

        :return: the populated ``EntityEvaluationScores``.
        """
        self.bert_lm.eval()
        self.out.eval()
        evaluation_results = EntityEvaluationScores(dataset_name)
        gold_documents = get_aida_set_phrase_splitted_documents(dataset_name)
        for gold_document in tqdm(gold_documents):
            t_sentence = " ".join([x.word_string for x in gold_document])
            predicted_document = chunk_annotate_and_merge_to_phrase(self, t_sentence, k_for_top_k_to_keep=1)
            comparison_results = compare_gold_and_predicted_annotation_documents(gold_document, predicted_document)
            # In each comparison pair e, e[0] carries the gold annotation and e[1] the predicted one; the
            # character span is always read from e[1].
            g_md = set((e[1].begin_character, e[1].end_character)
                       for e in comparison_results if e[0].resolved_annotation)
            p_md = set((e[1].begin_character, e[1].end_character)
                       for e in comparison_results if e[1].resolved_annotation)
            g_el = set((e[1].begin_character, e[1].end_character, dl_sa.mentions_itos[e[0].resolved_annotation])
                       for e in comparison_results if e[0].resolved_annotation)
            p_el = set((e[1].begin_character, e[1].end_character, dl_sa.mentions_itos[e[1].resolved_annotation])
                       for e in comparison_results if e[1].resolved_annotation)
            # The documents without any prediction are not recorded.
            if p_el:
                evaluation_results.record_mention_detection_results(p_md, g_md)
                evaluation_results.record_entity_linking_results(p_el, g_el)
        if evaluation_results.micro_entity_linking.f1.compute() > best_f1:
            print("Saving the best checkpoint ...")
            config = self.prepare_model_checkpoint(epoch)
            fname = self.get_mode_checkpoint_name()
            torch.save(config, f"{fname}.pt")
            print(f"weights were saved to {fname}.pt")
        self.bert_lm.train()
        self.out.train()
        return evaluation_results

    def prepare_model_checkpoint(self, epoch):
        """
        Build the dictionary that is stored as the model checkpoint.

        :return: the state dicts of the encoder and the classification head, their dimensions, the epoch, and
            everything returned by ``sub_model_specific_checkpoint_data``.
        """
        chk_point = {
            "bert_lm": self.lm_module.state_dict(),
            "number_of_bert_layers": self.number_of_bert_layers,
            "bert_lm_h": self.bert_lm_h,
            "out": self.out_module.state_dict(),
            "epoch": epoch,
        }
        sub_model_specific_checkpoint_data = self.sub_model_specific_checkpoint_data()
        for key in sub_model_specific_checkpoint_data:
            assert key not in ["bert_lm", "number_of_bert_layers", "bert_lm_h", "out", "epoch"], \
                f"{key} is already considered in prepare_model_checkpoint function"
            chk_point[key] = sub_model_specific_checkpoint_data[key]
        return chk_point

    def disable_roberta_lm_head(self):
        """Turn off the gradients of the language modeling head parameters of the encoder."""
        assert self.bert_lm is not None
        self.bert_lm.lm_head.layer_norm.bias.requires_grad = False
        self.bert_lm.lm_head.layer_norm.weight.requires_grad = False
        self.bert_lm.lm_head.dense.bias.requires_grad = False
        self.bert_lm.lm_head.dense.weight.requires_grad = False
        self.bert_lm.lm_head.decoder.bias.requires_grad = False

    def _load_from_checkpoint_object(self, checkpoint, device="cpu"):
        """Load the content of a checkpoint dictionary into the model and put the model in eval mode."""
        torch.cuda.empty_cache()
        self.bert_lm.load_state_dict(checkpoint["bert_lm"], strict=False)
        self.bert_lm.to(device)
        self.disable_roberta_lm_head()
        self.out.load_state_dict(checkpoint["out"], strict=False)
        self.out.to(device)
        self.number_of_bert_layers = checkpoint["number_of_bert_layers"]
        self.bert_lm_h = checkpoint["bert_lm_h"]
        self.sub_model_specific_load_checkpoint_data(checkpoint)
        self.bert_lm.eval()
        self.out.eval()
        model_params = sum(p.numel() for p in self.bert_lm.parameters())
        out_params = sum(p.numel() for p in self.out.parameters())
        print(f' * Loaded model with {model_params+out_params} number of parameters ({model_params} parameters '
              f'for the encoder and {out_params} parameters for the classification head)!')

    @staticmethod
    def _download_checkpoint(hosted_checkpoints, finetuned_after_step):
        """Download (or reuse from the checkpoints directory) the hosted checkpoint of the requested step."""
        assert 4 >= finetuned_after_step >= 1
        # Any accepted value other than 4, 3 and 2 resolves to the step-1 checkpoint.
        file_name, url = hosted_checkpoints.get(finetuned_after_step, hosted_checkpoints[1])
        checkpoint = torch.hub.load_state_dict_from_url(url, model_dir=str(get_checkpoints_dir()),
                                                        map_location="cpu", file_name=file_name)
        print(f" * Loaded pretrained model checkpoint: {file_name}")
        return checkpoint

    @staticmethod
    def download_from_torch_hub(finetuned_after_step=1):
        """
        Download and return the hosted roberta-base SpEL checkpoint.

        :param finetuned_after_step: a value between 1 and 4 (see ``_BASE_CHECKPOINTS``).
        """
        return SpELAnnotator._download_checkpoint(SpELAnnotator._BASE_CHECKPOINTS, finetuned_after_step)

    @staticmethod
    def download_large_from_torch_hub(finetuned_after_step=1):
        """
        Download and return the hosted roberta-large SpEL checkpoint.

        :param finetuned_after_step: a value between 1 and 4 (see ``_LARGE_CHECKPOINTS``).
        """
        return SpELAnnotator._download_checkpoint(SpELAnnotator._LARGE_CHECKPOINTS, finetuned_after_step)

    def load_checkpoint(self, checkpoint_name, device="cpu", rank=0, load_from_torch_hub=False, finetuned_after_step=1):
        """
        Load a checkpoint into the model, either from the hosted checkpoints or from the local checkpoints directory.

        :param checkpoint_name: the file name inside ``self.checkpoints_root`` (used for local loading only).
        :param device: the device to which the loaded modules are moved.
        :param rank: the loading message is printed only when this is 0.
        :param load_from_torch_hub: download the hosted checkpoint which matches ``BERT_MODEL_NAME``; the local
            checkpoint is loaded when ``BERT_MODEL_NAME`` is neither "roberta-large" nor "roberta-base".
        :param finetuned_after_step: the fine-tuning step of the hosted checkpoint to download.
        """
        if load_from_torch_hub and BERT_MODEL_NAME == "roberta-large":
            checkpoint = self.download_large_from_torch_hub(finetuned_after_step)
        elif load_from_torch_hub and BERT_MODEL_NAME == "roberta-base":
            checkpoint = self.download_from_torch_hub(finetuned_after_step)
        else:  # load from the local .checkpoints directory
            if rank == 0:
                print("Loading model checkpoint: {}".format(checkpoint_name))
            fname = os.path.join(self.checkpoints_root, checkpoint_name)
            checkpoint = torch.load(fname, map_location="cpu")
        self._load_from_checkpoint_object(checkpoint, device)

    # #############################FUNCTIONS THAT THE SUB-MODELS MUST REIMPLEMENT####################################
    def sub_model_specific_checkpoint_data(self):
        """
        :return: a dictionary of key values containing everything that matters to the sub-model and is not already
            considered in prepare_model_checkpoint.
        """
        return {}

    def sub_model_specific_load_checkpoint_data(self, checkpoint):
        """Restore the sub-model specific data from the checkpoint; the base class has nothing to restore."""
        return

    def get_mode_checkpoint_name(self):
        """Return the path prefix (without the ".pt" extension) under which the checkpoints are saved."""
        raise NotImplementedError

    def annotate(self, nif_collection, **kwargs):
        """Annotate the provided collection; must be implemented by the sub-models."""
        raise NotImplementedError
resources/data/aida_canonical_redirects.json ADDED
The diff for this file is too large to render. See raw diff
 
resources/vocab/aida.txt ADDED
@@ -0,0 +1,5598 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 1._FC_Köln
2
+ 1._FC_Tatran_Prešov
3
+ 1._HFC_Humenné
4
+ 10_Downing_Street
5
+ 14th_Dalai_Lama
6
+ 1936_Summer_Olympics
7
+ 1966_FIFA_World_Cup
8
+ 1972_Summer_Olympics
9
+ 1976_Winter_Olympics
10
+ 1990_FIFA_World_Cup
11
+ 1992_Summer_Olympics
12
+ 1994_Asian_Games
13
+ 1995_Rugby_World_Cup
14
+ 1996_AFC_Asian_Cup
15
+ 1996_Belgian_Grand_Prix
16
+ 1996_IAAF_Grand_Prix_Final
17
+ 1996_Summer_Olympics
18
+ 1996_Wimbledon_Championships
19
+ 1996–97_UEFA_Cup
20
+ 1997_Fed_Cup
21
+ 1998_FIFA_World_Cup
22
+ 2,000_Guineas_Stakes
23
+ 2000_Summer_Olympics
24
+ 7-Eleven_(cycling_team)
25
+ A-Ram
26
+ A.C._Cesena
27
+ A.C._ChievoVerona
28
+ A.C._Milan
29
+ A.C._Monza_Brianza_1912
30
+ A.C._Reggiana_1919
31
+ A.F.C._Bournemouth
32
+ A.S.G._Nocerina
33
+ A.S._Andria_BAT
34
+ A.S._Bari
35
+ A.S._Lucchese_Libertas_1905
36
+ A.S._Roma
37
+ A._G._Edwards
38
+ ABC_(newspaper)
39
+ ABN_AMRO
40
+ ACC_Limited
41
+ ACES_Colombia
42
+ ACF_Fiorentina
43
+ ACF_Gloria_1922_Bistriţa
44
+ AEK_Athens_F.C.
45
+ AEK_Larnaca
46
+ AES_Eletropaulo
47
+ AEX_index
48
+ AFC_Ajax
49
+ AFC_Asian_Cup
50
+ AFC_Progresul_Bucureşti
51
+ AIDS
52
+ AJ_Auxerre
53
+ AK-47
54
+ ALBA_Berlin
55
+ AMR_Corporation
56
+ AOL
57
+ ASARCO
58
+ ASEAN
59
+ ASEC_Mimosas
60
+ ASVEL_Lyon-Villeurbanne
61
+ AS_Cannes
62
+ AS_Monaco_FC
63
+ AS_Nancy
64
+ AT&T_Classic
65
+ AZ_(football_club)
66
+ A_Coruña
67
+ Aamer_Sohail
68
+ Aaron_Slight
69
+ Aashish_Kapoor
70
+ Abdalá_Bucaram
71
+ Abdou_Diouf
72
+ Abdul_Rashid_Dostum
73
+ Abdullah_Ercan
74
+ Abdullah_of_Saudi_Arabia
75
+ Abel_Antón
76
+ Abel_Balbo
77
+ Aberdeen_F.C.
78
+ Abidjan
79
+ Abimael_Guzmán
80
+ Abraham_Lincoln
81
+ Abu_Dhabi
82
+ Abu_Dhabi_(emirate)
83
+ Abuja
84
+ Abulhassan_Banisadr
85
+ Academic_Staff_Union_of_Universities
86
+ Academy_Award
87
+ Academy_Award_for_Best_Actor
88
+ Acatepec
89
+ Action_Against_Hunger
90
+ Ad-Diyar
91
+ Adam_Fedoruk
92
+ Adam_Hollioake
93
+ Adam_Hunter_(golfer)
94
+ Adelaide_Football_Club
95
+ Adige
96
+ Adilson_da_Silva
97
+ Adnan_Al_Talyani
98
+ Adolf_Hitler
99
+ Adrian_Ilie
100
+ Adrian_Knup
101
+ Adrian_Năstase
102
+ Adrian_Voinea
103
+ Adriatic_Sea
104
+ Aegon
105
+ Afghanistan
106
+ Africa
107
+ Africa_Cup_of_Nations
108
+ African_American
109
+ African_National_Congress
110
+ Afrikaans
111
+ Afrikaner
112
+ Agent_Orange
113
+ Agenzia_Nazionale_Stampa_Associata
114
+ Agnieszka_Kotlarska
115
+ Agung_Setyabudi
116
+ Ahmad_Shah_Massoud
117
+ Ahmed_Barada
118
+ Ahmed_Sékou_Touré
119
+ Ahmed_Yassin
120
+ Ahmedou_Ould-Abdallah
121
+ Ahold
122
+ Ai_Sugiyama
123
+ Aidan_Quinn
124
+ Aimé_Jacquet
125
+ Air_France
126
+ Airdrieonians_F.C.
127
+ Ajaccio
128
+ Ajay_Jadeja
129
+ Akira_Ryō
130
+ Akron,_Ohio
131
+ Al-Aqsa_Mosque
132
+ Al_Ain
133
+ Al_Akhbar_(Lebanon)
134
+ Al_Gore
135
+ Al_Karak
136
+ Al_Martin
137
+ Al_Rai
138
+ Al_Unser,_Jr.
139
+ Alabama
140
+ Alain_Caveglia
141
+ Alain_Juppé
142
+ Alama_Ieremia
143
+ Alan_Ball,_Jr.
144
+ Alan_Benes
145
+ Alan_Budikusuma
146
+ Alan_Greenspan
147
+ Alan_Hunte
148
+ Alan_Kelly,_Sr.
149
+ Alan_Kernaghan
150
+ Alan_McLoughlin
151
+ Alan_Moore_(footballer)
152
+ Alan_Mullally
153
+ Alan_Shearer
154
+ Alaska
155
+ Alaska_Aces_(PBA)
156
+ Albania
157
+ Albania_national_football_team
158
+ Albanian_Football_Association
159
+ Albanian_language
160
+ Albanians
161
+ Albert_Belle
162
+ Albert_Emon
163
+ Albert_Ferrer
164
+ Albert_II,_Prince_of_Monaco
165
+ Albert_Schweitzer
166
+ Alberta
167
+ Alberto_Berasategui
168
+ Alberto_García_Aspe
169
+ Albuquerque,_New_Mexico
170
+ Aldi
171
+ Alec_Stewart
172
+ Alejandro_Agustín_Lanusse
173
+ Aleksander_Kwaśniewski
174
+ Aleksandra_Olsza
175
+ Alen_Bokšić
176
+ Alessandra_Mussolini
177
+ Alessandro_Del_Piero
178
+ Alessandro_Lambruschini
179
+ Alessandro_Melli
180
+ Alessandro_Moscardi
181
+ Alessandro_Troncon
182
+ Alex_Arias
183
+ Alex_Ferguson
184
+ Alex_Fernandez_(baseball)
185
+ Alex_Rodriguez
186
+ Alex_Rădulescu
187
+ Alex_Zanardi
188
+ Alex_Čejka
189
+ Alexander_Downer
190
+ Alexander_Gontchenkov
191
+ Alexander_III_of_Scotland
192
+ Alexander_Lebed
193
+ Alexander_Lukashenko
194
+ Alexander_Popov_(swimmer)
195
+ Alexander_Vladimirovich_Volkov
196
+ Alexander_Zickler
197
+ Alexandra_Fusai
198
+ Alexandra_Meissnitzer
199
+ Alexandre_Comisetti
200
+ Alexei_Markov
201
+ Alexia_Dechaume-Balleret
202
+ Aleš_Valenta
203
+ Alfred_Berkeley
204
+ Alfred_Sant
205
+ Algemeen_Nederlands_Persbureau
206
+ Algeria
207
+ Algiers
208
+ Ali_Alatas
209
+ Ali_Brown
210
+ Ali_Shah
211
+ Alicante
212
+ Alicia_Machado
213
+ Alija_Izetbegović
214
+ Alina_Astafei
215
+ Alistair_Campbell_(cricketer)
216
+ Aliuska_López
217
+ Alla_Dudayeva
218
+ Allah
219
+ Allan_Bateman
220
+ Allan_Donald
221
+ Allan_Wells
222
+ Allen_Johnson
223
+ Allenby_Bridge
224
+ Allensbach
225
+ Allentown,_Pennsylvania
226
+ Alliance_'90/The_Greens
227
+ Alloa_Athletic_F.C.
228
+ Ally_McCoist
229
+ Almere
230
+ Aloÿs_Nizigama
231
+ Alpay_Özalan
232
+ Alpine_skiing_at_the_1994_Winter_Olympics
233
+ Alsace
234
+ Altenberg,_Germany
235
+ Altin_Haxhi
236
+ Altin_Rraklli
237
+ Amalgamated_Roadstone_Corporation
238
+ Amanda_Coetzer
239
+ Amara_Essy
240
+ Amarillo,_Texas
241
+ Amazon_River
242
+ American_Airlines
243
+ American_Broadcasting_Company
244
+ American_Civil_War
245
+ American_Depositary_Receipt
246
+ American_League
247
+ American_League_Central
248
+ American_League_East
249
+ American_League_West
250
+ American_Stock_Exchange
251
+ American_University
252
+ American_Veterinary_Medical_Association
253
+ American_studies
254
+ Amica_Wronki
255
+ Amman
256
+ Amnesty_International
257
+ Amoco
258
+ Amr_Moussa
259
+ Amr_Shabana
260
+ Amsterdam
261
+ Amstetten,_Lower_Austria
262
+ Amtrak
263
+ Amy_Frazier
264
+ An-Nahar
265
+ Ana_Fidelia_Quirot
266
+ Anaheim_Ducks
267
+ Anatolia
268
+ Anchorage,_Alaska
269
+ Anders_Forsbrand
270
+ Andhra_Pradesh
271
+ Andre_Agassi
272
+ Andre_Markgraaff
273
+ Andre_Snyman
274
+ Andrea_Collinelli
275
+ Andrea_Ferrigato
276
+ Andrea_Gaudenzi
277
+ Andrea_Giaconi
278
+ Andrea_Glass
279
+ Andreas_Andersson
280
+ Andreas_Goldberger
281
+ Andreas_Heraf
282
+ Andreas_Herzog
283
+ Andreas_Kappes
284
+ Andreas_Köpke
285
+ Andreas_Möller
286
+ Andreas_Ogris
287
+ Andreas_Seelig
288
+ Andreas_Thom
289
+ Andreas_Zeyer
290
+ Andrei_Chesnokov
291
+ Andrei_Kanchelskis
292
+ Andrei_Olhovskiy
293
+ Andrei_Pavel
294
+ Andrei_Tchmil
295
+ Andrew_Caddick
296
+ Andrew_Coltart
297
+ Andrew_Hudson
298
+ Andrew_Magee
299
+ Andrew_Mehrtens
300
+ Andrew_Symonds
301
+ Andrew_Wakefield
302
+ Andriy_Medvedev
303
+ Andruw_Jones
304
+ André_Joubert
305
+ André_Ribeiro
306
+ André_Trulsen
307
+ Andrés_Galarraga
308
+ Andy_Benes
309
+ Andy_Etchebarren
310
+ Andy_Flower
311
+ Andy_Goram
312
+ Andy_Hinchcliffe
313
+ Andy_Melville
314
+ Andy_Pettitte
315
+ Andy_Sinton
316
+ Andy_Townsend
317
+ Andy_Whittall
318
+ Ange-Félix_Patassé
319
+ Angel_Miranda
320
+ Anghel_Iordănescu
321
+ Anglicanism
322
+ Anglo-Welsh_Cup
323
+ Angola
324
+ Angola_national_football_team
325
+ Angélica_Gavaldón
326
+ Anhui
327
+ Anil_Kumble
328
+ Anita_Gradin
329
+ Anita_Wachter
330
+ Anja_Rücker
331
+ Ankara
332
+ Anke_Huber
333
+ Ann_Grossman
334
+ Anna_Kournikova
335
+ Anne-Gaëlle_Sidot
336
+ Anne_Boleyn
337
+ Annemari_Sandell-Hyvärinen
338
+ Annemarie_Jorritsma
339
+ Annett_Neumann
340
+ Antara_(news_agency)
341
+ Anthony_Bancarel
342
+ Anthony_Gobert
343
+ Anthony_Quayle
344
+ Anthony_Raine_Barker
345
+ Anthony_Sullivan
346
+ Anthony_Washington
347
+ Anthuan_Maybank
348
+ Antioquia_Department
349
+ Anto_Drobnjak
350
+ Antoine_Kombouaré
351
+ Antoine_Lahad
352
+ Anton_Bruckner
353
+ Anton_Doboş
354
+ Anton_Pfeffer
355
+ Anton_Shantyr
356
+ Antonella_Bellutti
357
+ Antonio_Esposito
358
+ Antonio_Quarracino
359
+ Antonio_Tartaglia
360
+ Antony_Marlow
361
+ Antwerp
362
+ António_Folha
363
+ António_Luís_Alves_Ribeiro_Oliveira
364
+ Anyang_LG_Cheetahs
365
+ Apple_Inc.
366
+ Appleton,_Wisconsin
367
+ Aqaba
368
+ Arab_Contractors_(company)
369
+ Arab_World
370
+ Arab_citizens_of_Israel
371
+ Arab_people
372
+ Arabic_language
373
+ Arad,_Romania
374
+ Arantxa_Parra_Santonja
375
+ Arantxa_Sánchez_Vicario
376
+ Aravinda_de_Silva
377
+ Arbil
378
+ Arbroath_F.C.
379
+ Arctic
380
+ Ards_F.C.
381
+ Argentina
382
+ Ari-Pekka_Nikkola
383
+ Arif_Erdem
384
+ Arizona
385
+ Arizona_Cardinals
386
+ Arizona_State_Sun_Devils_football
387
+ Arjan_Xhumba
388
+ Arjuna_Ranatunga
389
+ Arkadiusz_Bąk
390
+ Arkansas
391
+ Arkansas_State_Police
392
+ Arlen_Specter
393
+ Arlington,_Texas
394
+ Armando_Reynoso
395
+ Armen_Martirosyan_(athlete)
396
+ Armenia
397
+ Armenia_national_football_team
398
+ Arminia_Bielefeld
399
+ Arnaud_Boetsch
400
+ Arnold_Rüütel
401
+ Aron_Winter
402
+ Arrigo_Sacchi
403
+ Arrows
404
+ Arsenal_F.C.
405
+ Art_Howe
406
+ Arthur_Levitt
407
+ Arthur_Numan
408
+ Artur_Jorge_(footballer)
409
+ Artur_Lekbello
410
+ Arundel
411
+ Aryan_race
412
+ As-Safir
413
+ Asanka_Gurusinha
414
+ Ascot_Racecourse
415
+ Ashford_Town_F.C._(Kent)
416
+ Ashia_Hansen
417
+ Ashoknagar
418
+ Ashta
419
+ Asia
420
+ Asif_Ali_Zardari
421
+ Asif_Mujtaba
422
+ Aslan_Maskhadov
423
+ Asmara
424
+ Association_for_Relations_Across_the_Taiwan_Straits
425
+ Association_of_Tennis_Professionals
426
+ Associação_Desportiva_Vasco_da_Gama
427
+ Associação_Portuguesa_de_Desportos
428
+ Aston_Villa_F.C.
429
+ Astrid_Kumbernuss
430
+ Ata-ur-Rehman
431
+ Atalanta_B.C.
432
+ Athens
433
+ Athens_Metro
434
+ Athletic_Bilbao
435
+ Atlanta
436
+ Atlanta_Braves
437
+ Atlanta_Falcons
438
+ Atlanta_Hawks
439
+ Atlante_F.C.
440
+ Atlantic_City,_New_Jersey
441
+ Atlantic_Division_(NBA)
442
+ Atlantic_Division_(NHL)
443
+ Atlantic_Highlands,_New_Jersey
444
+ Atlético_Madrid
445
+ Ato_Boldon
446
+ Auchan
447
+ Auckland
448
+ Audi
449
+ Aung_San
450
+ Aung_San_Suu_Kyi
451
+ Australia
452
+ Australia_Davis_Cup_team
453
+ Australia_national_cricket_team
454
+ Australia_national_rugby_union_team
455
+ Australian_Capital_Territory
456
+ Australian_Democrats
457
+ Australian_Football_League
458
+ Australian_Greens
459
+ Australian_Labor_Party
460
+ Australian_Open
461
+ Australian_people
462
+ Australian_rules_football
463
+ Austria
464
+ Austria_Fed_Cup_team
465
+ Austria_national_football_team
466
+ Austria_national_under-21_football_team
467
+ Austrian_Empire
468
+ Austrians
469
+ Avalon,_New_Jersey
470
+ Avianca
471
+ Avigdor_Kahalani
472
+ Axel_Schulz
473
+ Axis_Bank
474
+ Ayodhya
475
+ Ayr_United_F.C.
476
+ Azad_Kashmir
477
+ Azerbaijan
478
+ Azerbaijan_national_football_team
479
+ BBC
480
+ BNP_Paribas
481
+ BP
482
+ BSC_Young_Boys
483
+ BVSC_Budapest
484
+ Ba'ath_Party
485
+ Babel_(newspaper)
486
+ Baburam_Bhattarai
487
+ Bachirou_Salou
488
+ Baghdad
489
+ Bagram
490
+ Bahrain
491
+ Bahía_Blanca
492
+ Bailundo
493
+ Baja_California
494
+ Baku
495
+ Bali
496
+ Balkan_Bulgarian_Airlines
497
+ Balkans
498
+ Bally_Shoe
499
+ Ballybunion
500
+ Balmain_Tigers
501
+ Balochistan,_Pakistan
502
+ Baltimore
503
+ Baltimore_Orioles
504
+ Baltimore_Ravens
505
+ Bancomext
506
+ Bandundu
507
+ Bangkok
508
+ Bangladesh
509
+ Bangladesh_Awami_League
510
+ Bangladesh_Nationalist_Party
511
+ Bangui
512
+ Banharn_Silpa-archa
513
+ Banja_Luka
514
+ Bank_Indonesia
515
+ Bank_One_Corporation
516
+ Bank_of_Canada
517
+ Bank_of_Finland
518
+ Bank_of_Israel
519
+ Bank_of_Japan
520
+ Bank_of_Mexico
521
+ Bank_of_New_Zealand
522
+ Bank_of_Spain
523
+ Banque_de_France
524
+ Barangay_Ginebra_Kings
525
+ Barbara_Paulus
526
+ Barbara_Rittner
527
+ Barbara_Schett
528
+ Barbarian_F.C.
529
+ Barcelona
530
+ Barentsburg
531
+ Barnet_F.C.
532
+ Barnsley_F.C.
533
+ Barrick_Gold
534
+ Barry,_Vale_of_Glamorgan
535
+ Barry_Bonds
536
+ Barry_Lane
537
+ Barry_Larkin
538
+ Bart_Voskamp
539
+ Barvikha
540
+ Basarab_Panduru
541
+ Basque_Country_(autonomous_community)
542
+ Basque_Country_(greater_region)
543
+ Batajnica
544
+ Batasuna
545
+ Bath_Rugby
546
+ Baudouin_of_Belgium
547
+ Bavaria
548
+ Bayer_04_Leverkusen
549
+ Bayer_Giants_Leverkusen
550
+ Bayer_HealthCare_Pharmaceuticals
551
+ Beatrix_of_the_Netherlands
552
+ Bedouin
553
+ Beijing
554
+ Beirut
555
+ Beirut_Stock_Exchange
556
+ Beitar_Jerusalem_F.C.
557
+ Belarus
558
+ Belarus_national_football_team
559
+ Belfast
560
+ Belga_(news_agency)
561
+ Belgium
562
+ Belgium_Fed_Cup_team
563
+ Belgium_national_football_team
564
+ Belgrade
565
+ BellSouth
566
+ Bellerive_Oval
567
+ Belém
568
+ Benazir_Bhutto
569
+ Benedetto_Santapaola
570
+ Benelux
571
+ Benetton_Formula
572
+ Benghazi
573
+ Benin_national_football_team
574
+ Benito_Mussolini
575
+ Benito_Santiago
576
+ Benjamin_Netanyahu
577
+ Benoît_Cauet
578
+ Benson_Koech
579
+ Bergen
580
+ Berkshire_Hathaway
581
+ Berlin
582
+ Berlin_Tegel_Airport
583
+ Berlin_Tempelhof_Airport
584
+ Bermuda
585
+ Bern
586
+ Bernama
587
+ Bernard_Barmasai
588
+ Bernard_Collomb
589
+ Bernard_Ingham
590
+ Bernard_Lama
591
+ Bernard_Tapie
592
+ Bernd_Karbacher
593
+ Bernhard_Langer
594
+ Berovo
595
+ Bert_Konterman
596
+ Bertelsmann
597
+ Berti_Vogts
598
+ Bertrand_Crasson
599
+ Berwick_Rangers_F.C.
600
+ Bessemer_Venture_Partners
601
+ Best_Products
602
+ Beth_Daniel
603
+ Bethlehem
604
+ Be��chatów
605
+ Beşiktaş_J.K.
606
+ Bharat_Ratna
607
+ Bharatiya_Janata_Party
608
+ Bhavnagar
609
+ Biathlon_World_Cup
610
+ Bible
611
+ Bilbao
612
+ Bild
613
+ Biljana_Plavšić
614
+ Bill_Birch
615
+ Bill_Clinton
616
+ Bill_Russell_(baseball)
617
+ Billa_(supermarket)
618
+ Billie_Jean_King
619
+ Billings,_Montana
620
+ Billy_Andrade
621
+ Billy_Ashley
622
+ Billy_Davies
623
+ Billy_Dodds
624
+ Billy_Mayfair
625
+ Bima_Sakti
626
+ Biogen_Idec
627
+ Birendra_of_Nepal
628
+ Birmingham
629
+ Birmingham_City_F.C.
630
+ Bistriţa
631
+ Bixente_Lizarazu
632
+ Black_Sea
633
+ Blackburn_Rovers_F.C.
634
+ Blackpool_F.C.
635
+ Blaise_Compaoré
636
+ Blantyre,_Malawi
637
+ Bledar_Kola
638
+ Blendi_Nallbani
639
+ Blida_Province
640
+ Bnei_Yehuda_Tel_Aviv_F.C.
641
+ Board_of_Control_for_Cricket_in_India
642
+ Boavista_F.C.
643
+ Bob_Brett
644
+ Bob_Dole
645
+ Bob_Dwyer
646
+ Bob_Estes
647
+ Bob_Halverson
648
+ Bob_Kennedy_(athlete)
649
+ Bob_May_(golfer)
650
+ Bob_Wickman
651
+ Bob_Willis
652
+ Bob_Wollek
653
+ Bob_Woolmer
654
+ Bobbie_Goulding
655
+ Bobby_Bonilla
656
+ Bobby_Charlton
657
+ Bobby_Rahal
658
+ Bobby_Robson
659
+ Bobby_Valentine
660
+ Boca_Juniors
661
+ Boddington_Gold_Mine
662
+ Boeing
663
+ Bogdan_Stelea
664
+ Bogotá
665
+ Bogra
666
+ Bohdan_Ulihrach
667
+ Bohemians_1905
668
+ Boland_cricket_team
669
+ Bolivia
670
+ Bolloré
671
+ Bologna_F.C._1909
672
+ Bolton_Wanderers_F.C.
673
+ Bonn
674
+ Bordeaux
675
+ Boreham_Wood_F.C.
676
+ Boris_Becker
677
+ Boris_Yeltsin
678
+ Borodino_(village),_Mozhaysky_District,_Moscow_Oblast
679
+ Boroughmuir_RFC
680
+ Borussia_Dortmund
681
+ Borussia_Mönchengladbach
682
+ Borussia_Neunkirchen
683
+ Bosnia
684
+ Bosnia_(region)
685
+ Boston
686
+ Boston_Bruins
687
+ Boston_Celtics
688
+ Boston_Red_Sox
689
+ Boston_United_F.C.
690
+ Botafogo_de_Futebol_e_Regatas
691
+ Botswana
692
+ Botswana_national_football_team
693
+ Boulder,_Colorado
694
+ Boutros_Boutros-Ghali
695
+ Boutros_Harb
696
+ Bovine_spongiform_encephalopathy
697
+ Boxer_Protocol
698
+ Brad_Ausmus
699
+ Brad_Bryant
700
+ Brad_Clontz
701
+ Brad_Hogg
702
+ Bradford_Bulls
703
+ Bradford_City_A.F.C.
704
+ Bradford_Vaughan
705
+ Bradley_Hughes_(golfer)
706
+ Brady_Anderson
707
+ Brahim_Lahlafi
708
+ Brasília
709
+ Bratislava
710
+ Brazil
711
+ Brazil_national_football_team
712
+ Brazilian_Football_Confederation
713
+ Bre-X
714
+ Brechin_City_F.C.
715
+ Breda
716
+ Breisach
717
+ Bremen_Airport
718
+ Brenda_Schultz-McCarthy
719
+ Brenden_Pappas
720
+ Brent_Mayne
721
+ Brentford_F.C.
722
+ Brett_Liddle
723
+ Brett_Martin
724
+ Brian_Burke_(ice_hockey)
725
+ Brian_Currin
726
+ Brian_De_Palma
727
+ Brian_Giles
728
+ Brian_Henninger
729
+ Brian_Lara
730
+ Brian_Laudrup
731
+ Brian_McMillan
732
+ Brian_McRae
733
+ Brian_Shimer
734
+ Brian_Wellman
735
+ Bridgend_Ravens
736
+ Brighton_&_Hove_Albion_F.C.
737
+ Brigita_Bukovec
738
+ Brisbane
739
+ Brisbane_Broncos
740
+ Brisbane_Lions
741
+ Bristol
742
+ Bristol_City_F.C.
743
+ Bristol_Rovers_F.C.
744
+ Bristol_Rugby
745
+ British_Airways
746
+ British_Columbia
747
+ British_Heart_Foundation
748
+ British_Jews
749
+ British_Land
750
+ British_Masters
751
+ British_Universities_cricket_team
752
+ British_West_Indies
753
+ British_people
754
+ Brno
755
+ Brown_Deer_Park_Golf_Course
756
+ Bruce_Dyer
757
+ Bruce_Grobbelaar
758
+ Brunei
759
+ Bruno_Risi
760
+ Bruno_Rodriguez
761
+ Bruno_Thiry
762
+ Bruny_Surin
763
+ Brush_Engineered_Materials
764
+ Brussels
765
+ Bryan_Herta
766
+ Bryan_Robson
767
+ Bucharest
768
+ Budapest
769
+ Budapest_Honvéd_FC
770
+ Budapest_Stock_Exchange
771
+ Buddy_Groom
772
+ Buenaventura,_Valle_del_Cauca
773
+ Buenos_Aires
774
+ Buenos_Aires_Province
775
+ Buffalo_Bills
776
+ Buffalo_Sabres
777
+ Building_(magazine)
778
+ Bukavu
779
+ Bulgaria
780
+ Bulgaria_national_football_team
781
+ Bulgaria_national_under-21_football_team
782
+ Bulgarians
783
+ Bunge_Limited
784
+ Bureau_Veritas
785
+ Burhanuddin_Rabbani
786
+ Burkhard_Reich
787
+ Burkina_Faso
788
+ Burma
789
+ Burnley_F.C.
790
+ Burundi
791
+ Burundi_national_football_team
792
+ Bury_F.C.
793
+ Business_Recorder
794
+ Butch_Harmon
795
+ Buñol
796
+ Byron_Black
797
+ Béja
798
+ Béla_Markó
799
+ C.D._Guadalajara
800
+ C.F._Monterrey
801
+ C.F._Os_Belenenses
802
+ C.F._Pachuca
803
+ C._Rangarajan
804
+ CB_Estudiantes
805
+ CB_Sevilla
806
+ CD_Tenerife
807
+ CF_Extremadura
808
+ CITIC_Pacific
809
+ CNBC
810
+ CNN
811
+ CONCACAF_Champions_League
812
+ CRH_plc
813
+ CR_Vasco_da_Gama
814
+ CSP_Limoges
815
+ CS_Jiul_Petroşani
816
+ Cabinet_(government)
817
+ Caerphilly_RFC
818
+ Caesarea
819
+ Cagliari_Calcio
820
+ Cairo
821
+ Cal_Eldred
822
+ Cal_Ripken,_Jr.
823
+ Calgary
824
+ Calgary_Flames
825
+ California
826
+ Caltex
827
+ Calvin_Davis
828
+ Cambodia
829
+ Cambridge,_Massachusetts
830
+ Cambridge_United_F.C.
831
+ Camden,_Arkansas
832
+ Camelot,_Chesapeake,_Virginia
833
+ Cameroon
834
+ Cameroon_national_football_team
835
+ Camilla_Martin
836
+ Canada
837
+ Canadian_Grain_Commission
838
+ Canadian_Prairies
839
+ Canadian_Wheat_Board
840
+ Canberra
841
+ Canberra_Raiders
842
+ Canopic_jar
843
+ Canterbury-Bankstown_Bulldogs
844
+ Cape_Town
845
+ Caracas
846
+ Cardiff
847
+ Cardiff_City_F.C.
848
+ Cardiff_RFC
849
+ Cargill
850
+ Caribbean
851
+ Carl-Uwe_Steeb
852
+ Carl_Fogarty
853
+ Carl_Hooper
854
+ Carl_Lewis
855
+ Carl_Mason
856
+ Carl_Suneson
857
+ Carla_Sacramento
858
+ Carlisle_United_F.C.
859
+ Carlo_Ancelotti
860
+ Carlo_Checchinato
861
+ Carlos_Bianchi
862
+ Carlos_Calado
863
+ Carlos_Checa
864
+ Carlos_Costa
865
+ Carlos_Delgado
866
+ Carlos_Filipe_Ximenes_Belo
867
+ Carlos_Moyá
868
+ Carlos_Pavón
869
+ Carlos_Sainz
870
+ Carlos_Secretário
871
+ Carlsbad,_California
872
+ Carlsberg_Group
873
+ Carlton_Football_Club
874
+ Carole_Montillet
875
+ Carolina_Panthers
876
+ Caroline,_Princess_of_Hanover
877
+ Carroll_A._Campbell,_Jr.
878
+ Carsten_Ramelow
879
+ Carsten_Wolf
880
+ Cartagena,_Colombia
881
+ Casa_de_Nariño
882
+ Casey_FitzRandolph
883
+ Castle_Park_Cricket_Ground
884
+ Castleford_Tigers
885
+ Catalonia
886
+ Catania
887
+ Catholic_Church
888
+ Catholicism
889
+ Cathy_Freeman
890
+ CeBIT
891
+ Cecil_Mamiit
892
+ Celsius
893
+ Celta_de_Vigo
894
+ Celtic_F.C.
895
+ Central_African_Republic
896
+ Central_African_Republic_national_football_team
897
+ Central_Asia
898
+ Central_Bank_of_Brazil
899
+ Central_Clinical_Hospital
900
+ Central_Division_(NBA)
901
+ Central_Division_(NHL)
902
+ Central_European_Free_Trade_Agreement
903
+ Central_Narcotics_Bureau
904
+ Cercle_Brugge_K.S.V.
905
+ Chabab_Massira
906
+ Chad_Curtis
907
+ Chad_Ogea
908
+ Challenge_Cup
909
+ Chaminda_Vaas
910
+ Chanda_Rubin
911
+ Chandigarh
912
+ Chandra_Sturrup
913
+ Chandrika_Kumaratunga
914
+ Changi_Prison
915
+ Channel_2_(Israel)
916
+ Channel_Islands
917
+ Charleroi
918
+ Charles,_Prince_of_Wales
919
+ Charles_Austin
920
+ Charles_Nagy
921
+ Charles_Taylor_(Liberia)
922
+ Charles_VI,_Holy_Roman_Emperor
923
+ Charleston,_South_Carolina
924
+ Charleston,_West_Virginia
925
+ Charlotte_Bobcats
926
+ Charlton_Athletic_F.C.
927
+ Charmaine_Crooks
928
+ Chatichai_Choonhavan
929
+ Chavakacheri
930
+ Chavalit_Yongchaiyudh
931
+ Cheah_Soon_Kit
932
+ Chechen_people
933
+ Chechnya
934
+ Chelsea_Clinton
935
+ Chelsea_F.C.
936
+ Chenab_River
937
+ Cherbourg-Octeville
938
+ Chesapeake,_Virginia
939
+ Chester-le-Street
940
+ Chester_City_F.C.
941
+ Chesterfield
942
+ Chesterfield_F.C.
943
+ Chevron_Corporation
944
+ Chiapas
945
+ Chicago
946
+ Chicago_Bears
947
+ Chicago_Blackhawks
948
+ Chicago_Board_Options_Exchange
949
+ Chicago_Board_of_Trade
950
+ Chicago_Bulls
951
+ Chicago_Cubs
952
+ Chicago_Mercantile_Exchange
953
+ Chicago_Stock_Exchange
954
+ Chicago_White_Sox
955
+ Chihuahua,_Chihuahua
956
+ Chile
957
+ Chile_national_football_team
958
+ Chili_Davis
959
+ Chillicothe,_Ohio
960
+ Chilpancingo
961
+ China_Daily
962
+ China_PR_national_football_team
963
+ China_Steel
964
+ Chinese_language
965
+ Chinese_people
966
+ Chipper_Jones
967
+ Chiquinho_Conde
968
+ Chiron_Corporation
969
+ Chişinău
970
+ Chongqing
971
+ Chorzów
972
+ Chris_Adams_(cricketer)
973
+ Chris_Boardman
974
+ Chris_Cairns
975
+ Chris_Harris_(cricketer)
976
+ Chris_Hay
977
+ Chris_Hoiles
978
+ Chris_Lewis_(cricketer)
979
+ Chris_Patten,_Baron_Patten_of_Barnes
980
+ Chris_Powell
981
+ Chris_Sutton
982
+ Chris_Walker_(squash_player)
983
+ Chris_Witty
984
+ Chris_Woodruff
985
+ Christchurch
986
+ Christian_Blanc
987
+ Christian_Cullen
988
+ Christian_Cévaër
989
+ Christian_Karembeu
990
+ Christian_Ruud
991
+ Christian_Ruuttu
992
+ Christian_Springer
993
+ Christian_Ziege
994
+ Christian_theology
995
+ Christine_Magnusson
996
+ Christine_Wachtel
997
+ Christophe_Bonvin
998
+ Christophe_Ohrel
999
+ Christopher_Reeve
1000
+ Christopher_Wreh
1001
+ Chrysler_Classic_of_Tucson
1002
+ Chryste_Gaines
1003
+ Chua_Jui_Meng
1004
+ Chuck_Adams
1005
+ Chuck_Finley
1006
+ Cincinnati
1007
+ Cincinnati_Bengals
1008
+ Cincinnati_Reds
1009
+ Circuit_de_Spa-Francorchamps
1010
+ Ciriaco_Sforza
1011
+ Civil_Aviation_Administration_of_China
1012
+ Civil_Aviation_Authority_(United_Kingdom)
1013
+ Clarence_Park,_Weston-super-Mare
1014
+ Clarence_Rose
1015
+ Clarence_Seedorf
1016
+ Clarence_Woolmer
1017
+ Clarksburg,_West_Virginia
1018
+ Claude_Lelouch
1019
+ Claude_Makélélé
1020
+ Claudio_Suárez
1021
+ Cleveland
1022
+ Cleveland_Cavaliers
1023
+ Cleveland_Indians
1024
+ Cliftonville_F.C.
1025
+ Clinton_Whitelaw
1026
+ Clive_Lloyd
1027
+ Club_América
1028
+ Club_Atlas
1029
+ Club_Atlético_Banfield
1030
+ Club_Atlético_Huracán
1031
+ Club_Atlético_Independiente
1032
+ Club_Atlético_Lanús
1033
+ Club_Atlético_Platense
1034
+ Club_Atlético_River_Plate
1035
+ Club_Atlético_Vélez_Sársfield
1036
+ Club_Brugge_K.V.
1037
+ Club_Celaya
1038
+ Club_León
1039
+ Club_Necaxa
1040
+ Club_Santos_Laguna
1041
+ Club_Universidad_Nacional
1042
+ Clube_Atlético_Bragantino
1043
+ Clube_Atlético_Mineiro
1044
+ Clube_Atlético_Paranaense
1045
+ Clube_de_Regatas_do_Flamengo
1046
+ Clyde_F.C.
1047
+ Clydebank_F.C.
1048
+ Coalition_(Australia)
1049
+ Coast_guard
1050
+ Cochabamba
1051
+ Cocker_Spaniel
1052
+ Codelco
1053
+ Codos
1054
+ Colchester
1055
+ Colchester_United_F.C.
1056
+ Coleraine_F.C.
1057
+ Colin_Calderwood
1058
+ Colin_Cameron_(footballer)
1059
+ Colin_Edwards
1060
+ Colin_Hendry
1061
+ Colin_Jackson
1062
+ Colin_McRae
1063
+ Colin_Montgomerie
1064
+ Collingtree_Park
1065
+ Collingwood_Football_Club
1066
+ Cologne
1067
+ Colombia
1068
+ Colombian_Liberal_Party
1069
+ Colombo
1070
+ Colorado
1071
+ Colorado_Avalanche
1072
+ Colorado_Rockies
1073
+ Columbia_University
1074
+ Columbus,_Ohio
1075
+ Commack,_New_York
1076
+ Commerce_Bancshares
1077
+ Commission_on_Presidential_Debates
1078
+ Commonwealth_Games
1079
+ Commonwealth_of_Nations
1080
+ Communist_Party_of_China
1081
+ Communist_party
1082
+ Community_of_Madrid
1083
+ Competition_Commission_(United_Kingdom)
1084
+ Comprehensive_Nuclear-Test-Ban_Treaty
1085
+ Conakry
1086
+ Conchita_Martínez
1087
+ Confederate_States_of_America
1088
+ Confederation_of_African_Football
1089
+ Confederation_of_British_Industry
1090
+ Congo_DR_national_football_team
1091
+ Congo_national_football_team
1092
+ Conservative_Party_(UK)
1093
+ Constand_Viljoen
1094
+ Constantin_Gâlcă
1095
+ Constanţa
1096
+ Construction_Industry_Council
1097
+ Continental_AG
1098
+ Copenhagen
1099
+ Coppa_Italia
1100
+ Corey_Pavin
1101
+ Corina_Morariu
1102
+ Coritiba_Foot_Ball_Club
1103
+ Cornelius_Euser
1104
+ Cornell_Brown
1105
+ Corporación_Venezolana_de_Guayana
1106
+ Corsica
1107
+ Cosenza_Calcio_1914
1108
+ Cosmin_Contra
1109
+ Costa_Rica
1110
+ Costa_Rica_national_football_team
1111
+ Costain_Group
1112
+ Costantino_Rocca
1113
+ Costas_Simitis
1114
+ Council_of_Europe
1115
+ County_Antrim
1116
+ County_Championship
1117
+ County_Cricket_Ground,_Bristol
1118
+ County_Cricket_Ground,_Hove
1119
+ County_Tipperary
1120
+ Court_of_Cassation_(France)
1121
+ Courtney_Walsh
1122
+ Coventry_City_F.C.
1123
+ Cowdenbeath_F.C.
1124
+ Craig_Brown_(footballer)
1125
+ Craig_Burley
1126
+ Craig_Dowd
1127
+ Craig_Evans
1128
+ Craig_MacLean
1129
+ Craig_Matthews
1130
+ Craig_Parry
1131
+ Craig_Quinnell
1132
+ Craig_Spearman
1133
+ Craig_Stadler
1134
+ Craig_Wishart
1135
+ Crawley
1136
+ Credit_Suisse
1137
+ Creedmoor,_North_Carolina
1138
+ Creutzfeldt–Jakob_disease
1139
+ Crewe_Alexandra_F.C.
1140
+ Criciúma_Esporte_Clube
1141
+ Cricket_World_Cup
1142
+ Criminal_Investigation_Department
1143
+ Croatia
1144
+ Croatia_Davis_Cup_team
1145
+ Croatia_national_football_team
1146
+ Crohn's_disease
1147
+ Cronulla-Sutherland_Sharks
1148
+ Crossfire_(film)
1149
+ Crusaders_F.C.
1150
+ Cruz_Azul
1151
+ Cruzeiro_Esporte_Clube
1152
+ Crystal_Palace,_London
1153
+ Crystal_Palace_F.C.
1154
+ Crédit_Agricole_(cycling_team)
1155
+ Csepel_SC
1156
+ Cuauhtémoc_Blanco
1157
+ Cuba
1158
+ Cuba_national_football_team
1159
+ Culpeper,_Virginia
1160
+ Curitiba
1161
+ Curragh_Racecourse
1162
+ Currie_RFC
1163
+ Curtis_Fleming
1164
+ Curtly_Ambrose
1165
+ Cy_Young_Award
1166
+ Cycle_Collstrop
1167
+ Cynthia_McKinney
1168
+ Cyprus
1169
+ Cyprus_national_football_team
1170
+ Czech_National_Bank
1171
+ Czech_Republic
1172
+ Czech_Republic_Fed_Cup_team
1173
+ Czech_Republic_men's_national_ice_hockey_team
1174
+ Czech_Republic_national_football_team
1175
+ Czechoslovakia
1176
+ Czechs
1177
+ Cédric_Pioline
1178
+ Côte_d'Ivoire
1179
+ D._A._Weibring
1180
+ DAX
1181
+ DFB-Pokal
1182
+ Daewoo
1183
+ Daily_News_(New_York)
1184
+ Dakar
1185
+ Dale_McIntosh
1186
+ Dalian
1187
+ Dallas
1188
+ Dallas_Cowboys
1189
+ Dallas_Mavericks
1190
+ Dallas_Stars
1191
+ Dally_Randriantefy
1192
+ Dalmatian_(dog)
1193
+ Damascus
1194
+ Damian_Lynch
1195
+ Damion_Easley
1196
+ Damon_Hill
1197
+ Dampier,_Western_Australia
1198
+ Dan_Crowley
1199
+ Dan_Glickman
1200
+ Dan_Jenson
1201
+ Dan_Petrescu
1202
+ Dana_Rohrabacher
1203
+ Daniel_Andersson_(footballer_born_1977)
1204
+ Daniel_Bravo
1205
+ Daniel_Chopra
1206
+ Daniel_Ducruet
1207
+ Daniel_Herbert
1208
+ Daniel_Komen
1209
+ Daniel_Nestor
1210
+ Daniel_Prodan
1211
+ Daniel_Vacek
1212
+ Daniel_da_Cruz_Carvalho
1213
+ Daniele_Carnasciali
1214
+ Danilo_Hondo
1215
+ Danny_Blind
1216
+ Danny_Tartabull
1217
+ Dante_Bichette
1218
+ Darius_Milhaud
1219
+ Dariusz_Rosati
1220
+ Dariusz_Wosz
1221
+ Darlington_F.C.
1222
+ Darren_Anderton
1223
+ Darren_Bragg
1224
+ Darren_Dreifort
1225
+ Darren_Eadie
1226
+ Darren_Garforth
1227
+ Darren_Gough
1228
+ Darren_Jackson
1229
+ Darren_Lehmann
1230
+ Darryl_Powell
1231
+ Darryl_Strawberry
1232
+ Darryn_Hill
1233
+ Daryll_Cullinan
1234
+ Dave_Barr_(golfer)
1235
+ Dave_Gilbert_(cricketer)
1236
+ Dave_McPherson_(footballer)
1237
+ Dave_Nilsson
1238
+ Dave_Richardson
1239
+ Dave_Telgheder
1240
+ Dave_Wilson_(rugby_union)
1241
+ Davey_Johnson
1242
+ David
1243
+ David_Batty
1244
+ David_Boon
1245
+ David_Brabham
1246
+ David_Byas
1247
+ David_Campese
1248
+ David_Carter_(golfer)
1249
+ David_Coulthard
1250
+ David_Elleray
1251
+ David_Evans_(squash_player)
1252
+ David_Giffin
1253
+ David_Gilford
1254
+ David_Ginola
1255
+ David_Howell_(golfer)
1256
+ David_Hulse_(baseball)
1257
+ David_J_Russell
1258
+ David_Kelly_(footballer)
1259
+ David_Levy_(Israeli_politician)
1260
+ David_MacEachern
1261
+ David_Millns
1262
+ David_Peleg
1263
+ David_Platt_(footballer)
1264
+ David_Prinosil
1265
+ David_Richards_(racing)
1266
+ David_Rikl
1267
+ David_Rowson
1268
+ David_Segui
1269
+ David_Sesa
1270
+ David_Wheaton
1271
+ David_Zitelli
1272
+ David_Škoch
1273
+ Davidson_Ezinwa
1274
+ Davis_Cup
1275
+ Davis_Kamoga
1276
+ Davis_Love_III
1277
+ Davor_Šuker
1278
+ Dawn_(newspaper)
1279
+ Dayton,_Ohio
1280
+ Dayton_Agreement
1281
+ De_Graafschap
1282
+ Dead_Sea
1283
+ Dean_Gorré
1284
+ Dean_Headley
1285
+ Dean_Holdsworth
1286
+ Dean_Jones_(cricketer)
1287
+ Dean_Palmer
1288
+ Dean_Richards_(rugby_union)
1289
+ Dean_Saunders
1290
+ Dean_Sturridge
1291
+ Dean_Ward
1292
+ Dean_Windass
1293
+ Debbie_Graham
1294
+ Deborah_Compagnoni
1295
+ Debreceni_VSC
1296
+ Decatur,_Illinois
1297
+ Deere_&_Company
1298
+ Dejan_Koturović
1299
+ Dejan_Savićević
1300
+ Dejan_Stefanović
1301
+ Del_Harris_(squash_player)
1302
+ Delfino_Pescara_1936
1303
+ Delino_DeShields
1304
+ Democracy_Wall
1305
+ Democratic_Karen_Buddhist_Army
1306
+ Democratic_Left_Alliance
1307
+ Democratic_National_Convention
1308
+ Democratic_Party_(Hong_Kong)
1309
+ Democratic_Party_(United_States)
1310
+ Democratic_Party_of_Iranian_Kurdistan
1311
+ Democratic_Republic_of_the_Congo
1312
+ Democratic_Union_of_Hungarians_in_Romania
1313
+ Deng_Xiaoping
1314
+ Denis_Irwin
1315
+ Denis_Streak
1316
+ Denmark
1317
+ Dennis_Bergkamp
1318
+ Dennis_Lillee
1319
+ Dennis_Mitchell
1320
+ Dennis_Ross
1321
+ Denny_Neagle
1322
+ Denver
1323
+ Denver_Broncos
1324
+ Denver_Nuggets
1325
+ Deon_Hemmings
1326
+ Department_for_International_Development
1327
+ Deportivo_Español
1328
+ Deportivo_Toluca_F.C.
1329
+ Deportivo_de_La_Coruña
1330
+ Der_Spiegel
1331
+ Derby_County_F.C.
1332
+ Derbyshire_County_Cricket_Club
1333
+ Derek_Crookes
1334
+ Derek_Jeter
1335
+ Derek_Mills
1336
+ Derek_Ringer
1337
+ Derek_Ryan
1338
+ Derrick_Adkins
1339
+ Derrick_Cooper
1340
+ Des_Moines,_Iowa
1341
+ Des_Smyth
1342
+ Des_Terblanche
1343
+ Desmond_Tutu
1344
+ Desvonde_Botes
1345
+ Detroit
1346
+ Detroit_Lions
1347
+ Detroit_Pistons
1348
+ Detroit_Red_Wings
1349
+ Detroit_Tigers
1350
+ Deutsche_Bahn
1351
+ Deutsche_Bundesbank
1352
+ Deutsche_Mark
1353
+ Devon_White_(baseball)
1354
+ Dewas
1355
+ Dhahran
1356
+ Dhaka
1357
+ Dhaka_Stock_Exchange
1358
+ Dhar
1359
+ Dia_(supermarket_chain)
1360
+ Diana,_Princess_of_Wales
1361
+ Dianne_Feinstein
1362
+ Diario_16
1363
+ Dick_Gephardt
1364
+ Dick_Morris
1365
+ Dick_Schreuder
1366
+ Dick_Spring
1367
+ Didier_Deschamps
1368
+ Diego_Borrego
1369
+ Diego_Domínguez
1370
+ Dieter_Eilts
1371
+ Dieter_Ramusch
1372
+ Dieter_Thoma
1373
+ Dietmar_Beiersdorfer
1374
+ Dietmar_Hirsch
1375
+ Dietmar_Kühbauer
1376
+ Dili
1377
+ Dimas_Teixeira
1378
+ Dinamina
1379
+ Dirk_Coetzee
1380
+ Dirk_Medved
1381
+ Dirk_Wiese
1382
+ Discovery_Channel_Pro_Cycling_Team
1383
+ Divaina
1384
+ Dmitri_Dashinski
1385
+ Dmitri_Markov
1386
+ Dnevni_Avaz
1387
+ Doberman_Pinscher
1388
+ Doboj
1389
+ Dodge_City,_Kansas
1390
+ Doetinchem
1391
+ Doku_Zavgayev
1392
+ Dominic_Cork
1393
+ Dominic_Hewson
1394
+ Dominion_Bond_Rating_Service
1395
+ Dominique_Baratelli
1396
+ Dominique_Monami
1397
+ Don_McKinnon
1398
+ Don_Wengert
1399
+ Donald_Tsang
1400
+ Donaldson,_Lufkin_&_Jenrette
1401
+ Donato_Gama_da_Silva
1402
+ Doncaster_Cup
1403
+ Doncaster_Rovers_F.C.
1404
+ Dong_Jiong
1405
+ Donna_Andrews_(golfer)
1406
+ Donna_Weinbrecht
1407
+ Donne_Wall
1408
+ Donovan_Bailey
1409
+ Dore_Gold
1410
+ Dorinel_Munteanu
1411
+ Doug_Flach
1412
+ Doug_Young
1413
+ Dow_Chemical_Company
1414
+ Dow_Jones_Industrial_Average
1415
+ Dresden
1416
+ Dresden_Airport
1417
+ Dubai
1418
+ Dublin
1419
+ Ducati
1420
+ Ducati_Corse
1421
+ Duff_&_Phelps
1422
+ Duffy_Waldorf
1423
+ Duhok,_Iraq
1424
+ Duilio_Davino
1425
+ Duluth,_Minnesota
1426
+ Duma
1427
+ Dumbarton_F.C.
1428
+ Duncan_Ferguson
1429
+ Dundee_F.C.
1430
+ Dundee_United_F.C.
1431
+ Dunedin
1432
+ Dunfermline_Athletic_F.C.
1433
+ Dunnes_Stores
1434
+ Dunvant_RFC
1435
+ Durban
1436
+ Durham_County_Cricket_Club
1437
+ Durrës
1438
+ Dushanbe
1439
+ Dust_Bowl
1440
+ Dutch_people
1441
+ Dwight_Yorke
1442
+ Dynamo_Sports_Club
1443
+ Dzhokhar_Dudayev
1444
+ Dési_Bouterse
1445
+ Dănuţ_Lupu
1446
+ ETA
1447
+ Eamonn_Darcy
1448
+ East_Fife_F.C.
1449
+ East_Germany
1450
+ East_Java
1451
+ East_Jerusalem
1452
+ East_Kalimantan
1453
+ East_North_Central_States
1454
+ East_Stirlingshire_F.C.
1455
+ East_Timor
1456
+ Eau_Claire,_Wisconsin
1457
+ Ebbw_Vale_RFC
1458
+ Eberhard_Carl
1459
+ Economic_Community_of_West_African_States
1460
+ Economic_Community_of_West_African_States_Monitoring_Group
1461
+ Ecuador
1462
+ Ed_Vosberg
1463
+ Ed_de_Goey
1464
+ Eddie_Irvine
1465
+ Eddie_Murray
1466
+ Eddo_Brandes
1467
+ Edgar_Davids
1468
+ Edgar_Martínez
1469
+ Edgar_Rentería
1470
+ Edgbaston
1471
+ Edgbaston_Cricket_Ground
1472
+ Edinburgh
1473
+ Edmonton
1474
+ Edmonton_Oilers
1475
+ Eduard_Gritsun
1476
+ Eduardo_Romero
1477
+ Edvard_Grieg
1478
+ Edward_Dmytryk
1479
+ Edward_Said
1480
+ Edwin_Godee
1481
+ Edwin_Vurens
1482
+ Edwin_van_der_Sar
1483
+ Edwina_Currie
1484
+ Efan_Ekoku
1485
+ Efes_Pilsen_S.K.
1486
+ Egypt
1487
+ EgyptAir
1488
+ Egyptians
1489
+ Eindhoven
1490
+ Ejup_Ganić
1491
+ El_Al
1492
+ El_Mundo_(Spain)
1493
+ El_Nuevo_Diario
1494
+ El_País
1495
+ El_Salvador
1496
+ El_Salvador_national_football_team
1497
+ El_Watan
1498
+ Elakhbar
1499
+ Elena_Likhovtseva
1500
+ Elena_Makarova
1501
+ Elena_Pampoulova
1502
+ Eleusina
1503
+ Elf_Aquitaine
1504
+ Elgin_City_F.C.
1505
+ Elia_Kazan
1506
+ Eliyahu_Ben-Elissar
1507
+ Elizabeth,_New_Jersey
1508
+ Elizabeth_I_of_England
1509
+ Elizabeth_McIntyre
1510
+ Elland_Road
1511
+ Ellina_Zvereva
1512
+ Ellis_Burks
1513
+ Ellis_Park_Stadium
1514
+ Els_Callens
1515
+ Emanuele_Canonica
1516
+ Emil_Constantinescu
1517
+ Emiliano_Mondonico
1518
+ Emilio_Valle
1519
+ Emmanuel_Tetteh
1520
+ Empire_State_Building
1521
+ Empire_of_Brazil
1522
+ Empoli_F.C.
1523
+ En_Avant_de_Guingamp
1524
+ Endrio_Leoni
1525
+ Enfield_Town_F.C.
1526
+ England
1527
+ England_cricket_team
1528
+ England_national_football_team
1529
+ England_national_rugby_union_team
1530
+ English_language
1531
+ English_people
1532
+ Enrico_Chiesa
1533
+ Enrique_Alfaro
1534
+ Environment_Canada
1535
+ Enzo_Scifo
1536
+ Equipe_Ligier
1537
+ Equitas
1538
+ Eredivisie
1539
+ Erez
1540
+ Eric_Anthony
1541
+ Eric_Bergoust
1542
+ Eric_Davis_(baseball)
1543
+ Eric_Rush
1544
+ Eric_Thomas_(athlete)
1545
+ Eric_Wynalda
1546
+ Ericsson
1547
+ Erik_Breukink
1548
+ Erik_Dekker
1549
+ Erik_Hanson
1550
+ Erik_Zabel
1551
+ Eritrea
1552
+ Erjon_Bogdani
1553
+ Ernest_Faber
1554
+ Ernesto_Samper
1555
+ Ernie_Els
1556
+ Ervin_Fakaj
1557
+ Erzincan
1558
+ Eschen
1559
+ Espen_Bredesen
1560
+ Esporte_Clube_Bahia
1561
+ Esporte_Clube_Flamengo
1562
+ Esporte_Clube_Juventude
1563
+ Esporte_Clube_Vitória
1564
+ Essendon_Football_Club
1565
+ Essex
1566
+ Essex_County_Cricket_Club
1567
+ Essilor
1568
+ Estonia
1569
+ Estonia_national_football_team
1570
+ Estonian_Reform_Party
1571
+ Estudiantes_Tecos
1572
+ Estudiantes_de_La_Plata
1573
+ Eternit
1574
+ Ethiopia
1575
+ Ethiopia_national_football_team
1576
+ Etienne_Saqr
1577
+ Etruria
1578
+ Eugene_Emeralds
1579
+ Eugene_de_Kock
1580
+ Eugenio_Corini
1581
+ Eurodollar
1582
+ Euroleague_Basketball
1583
+ Euronext_Paris
1584
+ Europe
1585
+ Europe_1
1586
+ European_Commission
1587
+ European_Court_of_Human_Rights
1588
+ European_Cup_(athletics)
1589
+ European_Economic_Community
1590
+ European_Union
1591
+ Euroscepticism
1592
+ Eurostat
1593
+ Eurotunnel
1594
+ Evelyne_Leu
1595
+ Everton_F.C.
1596
+ Ewald_Brenner
1597
+ Exeter_City_F.C.
1598
+ Expansión
1599
+ Exxon
1600
+ ExxonMobil
1601
+ Ezer_Weizman
1602
+ F.C._Hansa_Rostock
1603
+ F.C._Internazionale_Milano
1604
+ F.C._Porto
1605
+ F._W._de_Klerk
1606
+ FA_Cup
1607
+ FC_Aarau
1608
+ FC_Alania_Vladikavkaz
1609
+ FC_Baltika_Kaliningrad
1610
+ FC_Baník_Ostrava
1611
+ FC_Barcelona
1612
+ FC_Barcelona_Bàsquet
1613
+ FC_Bayern_Munich
1614
+ FC_CSKA_Kyiv
1615
+ FC_Ceahlăul_Piatra_Neamţ
1616
+ FC_Chornomorets_Odesa
1617
+ FC_Dinamo_Batumi
1618
+ FC_Dinamo_Bucureşti
1619
+ FC_Dnipro_Dnipropetrovsk
1620
+ FC_Dynamo_Kyiv
1621
+ FC_Dynamo_Moscow
1622
+ FC_Girondins_de_Bordeaux
1623
+ FC_Groningen
1624
+ FC_Hradec_Králové
1625
+ FC_Karpaty_Lviv
1626
+ FC_Kotayk_Abovian
1627
+ FC_Kremin_Kremenchuk
1628
+ FC_Krylia_Sovetov_Samara
1629
+ FC_Kryvbas_Kryvyi_Rih
1630
+ FC_Lada_Togliatti
1631
+ FC_Lausanne-Sport
1632
+ FC_Linz
1633
+ FC_Lokomotiv_Moscow
1634
+ FC_Lokomotiv_Nizhny_Novgorod
1635
+ FC_Lokomotíva_Košice
1636
+ FC_Lugano
1637
+ FC_Luzern
1638
+ FC_Metalurh_Zaporizhya
1639
+ FC_Metz
1640
+ FC_Nantes
1641
+ FC_Nitra
1642
+ FC_Nyva_Ternopil
1643
+ FC_Oţelul_Galaţi
1644
+ FC_Rapid_Bucureşti
1645
+ FC_Red_Bull_Salzburg
1646
+ FC_Rostov
1647
+ FC_Rotor_Volgograd
1648
+ FC_Schalke_04
1649
+ FC_Shakhtar_Donetsk
1650
+ FC_Sion
1651
+ FC_Slovan_Liberec
1652
+ FC_Spartak_Moscow
1653
+ FC_Spartak_Trnava
1654
+ FC_Sportul_Studenţesc_Bucureşti
1655
+ FC_St._Gallen
1656
+ FC_St._Pauli
1657
+ FC_Steaua_Bucureşti
1658
+ FC_Tekstilshchik_Kamyshin
1659
+ FC_Tiraspol
1660
+ FC_Tirol_Innsbruck
1661
+ FC_Torpedo_Moscow
1662
+ FC_Torpedo_Zaporizhya
1663
+ FC_Twente
1664
+ FC_Universitatea_Cluj
1665
+ FC_Universitatea_Craiova
1666
+ FC_Ural_Sverdlovsk_Oblast
1667
+ FC_Utrecht
1668
+ FC_Vaduz
1669
+ FC_Viktoria_Plzeň
1670
+ FC_Volendam
1671
+ FC_Volgograd
1672
+ FC_Vorskla_Poltava
1673
+ FC_Zbrojovka_Brno
1674
+ FC_Zenit_Saint_Petersburg
1675
+ FC_Zhemchuzhina-Sochi
1676
+ FDA_(trade_union)
1677
+ FIFA
1678
+ FIFA_World_Cup
1679
+ FIS_Alpine_Ski_World_Cup
1680
+ FIS_Freestyle_Skiing_World_Cup
1681
+ FIS_Ski_Jumping_World_Cup
1682
+ FK_Austria_Wien
1683
+ FK_Baumit_Jablonec
1684
+ FK_Borac_Čačak
1685
+ FK_Budućnost_Podgorica
1686
+ FK_DAC_1904_Dunajská_Streda
1687
+ FK_Drnovice
1688
+ FK_Dukla_Banská_Bystrica
1689
+ FK_Hajduk_Beograd
1690
+ FK_Hajduk_Kula
1691
+ FK_Inter_Bratislava
1692
+ FK_Kareda_Kaunas
1693
+ FK_Kikinda
1694
+ FK_Metalurg_Skopje
1695
+ FK_Partizan
1696
+ FK_Proleter_Zrenjanin
1697
+ FK_Rad
1698
+ FK_Rudar_Pljevlja
1699
+ FK_Rudar_Ugljevik
1700
+ FK_Sloboda_Tuzla
1701
+ FK_Sloga_Jugomagnat
1702
+ FK_Spartak_Zlatibor_Voda
1703
+ FK_Sutjeska_Foča
1704
+ FK_Sutjeska_Nikšić
1705
+ FK_Teplice
1706
+ FK_Viktoria_Žižkov
1707
+ FK_Vojvodina
1708
+ FK_Zemun
1709
+ FSC_Prykarpattya_Ivano-Frankivsk
1710
+ FTSE_100_Index
1711
+ Fabio_Baldato
1712
+ Fabio_Cannavaro
1713
+ Fabio_Capello
1714
+ Fabrizio_Mori
1715
+ Fabrizio_Ravanelli
1716
+ Fairmont,_West_Virginia
1717
+ Fairview,_Texas
1718
+ Falilat_Ogunkoya
1719
+ Falk_Balzer
1720
+ Falkirk_F.C.
1721
+ Fanie_de_Villiers
1722
+ Farmington_Hills,_Michigan
1723
+ Faroe_Islands
1724
+ Faroe_Islands_national_football_team
1725
+ Fatima_Yusuf
1726
+ Fatmir_Vata
1727
+ Fatos_Nano
1728
+ Faustino_Asprilla
1729
+ Fausto_Pizzi
1730
+ Fed_Cup
1731
+ Federal_Aviation_Administration
1732
+ Federal_Bureau_of_Investigation
1733
+ Federal_Open_Market_Committee
1734
+ Federal_Reserve_System
1735
+ Felipe_Lira
1736
+ Fenerbahçe_S.K.
1737
+ Ferdi_Vierklau
1738
+ Ferenc_Horváth
1739
+ Ferencvárosi_TC
1740
+ Fernanda_Ribeiro
1741
+ Fernando_Couto
1742
+ Fernando_Henrique_Cardoso
1743
+ Fernando_Hierro
1744
+ Fernando_Meligeni
1745
+ Fernando_Redondo
1746
+ Fernando_Sanz
1747
+ Ferrari
1748
+ Ferro_Carril_Oeste
1749
+ Feyenoord
1750
+ Fidel_Castro
1751
+ Fidel_V._Ramos
1752
+ Filip_De_Wilde
1753
+ Filip_Dewulf
1754
+ Filippo_Inzaghi
1755
+ Finance_minister
1756
+ Financial_Post
1757
+ Financial_Services_Authority
1758
+ Finland
1759
+ Finland_men's_national_ice_hockey_team
1760
+ Fiona_May
1761
+ First_Chicago_Bank
1762
+ First_Pacific
1763
+ Fita_Bayisa
1764
+ Fitzroy_Football_Club
1765
+ Five_Nations_XV
1766
+ Flachau
1767
+ Flag_of_the_United_States
1768
+ Flamurtari_Vlorë
1769
+ Flavio_Cotti
1770
+ Florence
1771
+ Florence_Masnada
1772
+ Florencia_Labat
1773
+ Florian_Maurice
1774
+ Florian_Rousseau
1775
+ Florian_Schwarthoff
1776
+ Florida_Marlins
1777
+ Florida_Panthers
1778
+ Florin_Prunea
1779
+ Fluminense_Football_Club
1780
+ Flushing_Meadows_–_Corona_Park
1781
+ Foindu
1782
+ Football_Association_of_Ireland
1783
+ Footwork_Arrows
1784
+ Ford_Escort
1785
+ Ford_Escort_(Europe)
1786
+ Ford_Motor_Company
1787
+ Ford_World_Rally_Team
1788
+ Foreign_Office_(Germany)
1789
+ Foreign_minister
1790
+ Forfar_Athletic_F.C.
1791
+ Formula_One
1792
+ Forrest_Gump
1793
+ Forsa_institute
1794
+ Fort_Lauderdale,_Florida
1795
+ Fort_Worth,_Texas
1796
+ Forth_Road_Bridge
1797
+ Fortitudo_Bologna
1798
+ Fortuna_Düsseldorf
1799
+ Fortuna_Sittard
1800
+ Fos-sur-Mer
1801
+ Fourth_World_Conference_on_Women
1802
+ Fox_Broadcasting_Company
1803
+ France
1804
+ France_Fed_Cup_team
1805
+ France_Soir
1806
+ France_Télécom
1807
+ France_national_football_team
1808
+ Francesca_Lubiani
1809
+ Francesco_Casagrande
1810
+ Francesco_Mazzariol
1811
+ Francesco_Toldo
1812
+ Francesco_Totti
1813
+ Francis_Agyepong
1814
+ Francis_Moreau
1815
+ Francisco_Clavet
1816
+ Francisco_Palencia
1817
+ Franco_Baresi
1818
+ Frank_Bruno
1819
+ Frank_Bunce
1820
+ Frank_Busemann
1821
+ Frank_Leboeuf
1822
+ Frank_Nobilo
1823
+ Frank_Rodriguez
1824
+ Frank_Thomas_(baseball,_born_1968)
1825
+ Frank_Vandenbroucke_(cyclist)
1826
+ Frank_de_Boer
1827
+ Franka_Dietzsch
1828
+ Frankfurt_Airport
1829
+ Frankfurt_Stock_Exchange
1830
+ Frankfurt_am_Main
1831
+ Frankie_Fredericks
1832
+ Franklin_D._Roosevelt
1833
+ Franz_Fischler
1834
+ Franz_Konrad
1835
+ Franziska_Schenk
1836
+ François_Pienaar
1837
+ Fraser,_Australian_Capital_Territory
1838
+ Fred_Couples
1839
+ Fred_Funk
1840
+ Fred_McGriff
1841
+ Fred_Trueman
1842
+ Frederick_Chiluba
1843
+ Fredi_Bobic
1844
+ Free_Democratic_Party_(Germany)
1845
+ Free_Democratic_Party_of_Switzerland
1846
+ Freedom_Front_Plus
1847
+ Freetown
1848
+ Freiburg_im_Breisgau
1849
+ Fremantle_Football_Club
1850
+ French_Democratic_Confederation_of_Labour
1851
+ French_Guiana
1852
+ French_Open
1853
+ French_Riviera
1854
+ Fribourg
1855
+ Fritz_van_Heerden
1856
+ Frode_Andresen
1857
+ Frédéric_Magné
1858
+ Frédéric_Peiremans
1859
+ Fulgencio_Batista
1860
+ Fulham_F.C.
1861
+ Fung_Permadi
1862
+ Fußball-Bundesliga
1863
+ Fédération_Syndicale_Unitaire
1864
+ Félicia_Ballanger
1865
+ Félix_Mantilla_Botella
1866
+ GKS_Bełchatów
1867
+ GKS_Katowice
1868
+ Gabon
1869
+ Gabon_national_football_team
1870
+ Gabriel_Batistuta
1871
+ Gabriel_Curuchet
1872
+ Gabriela_Sabatini
1873
+ Gabriela_Szabo
1874
+ Gabriele_Colombo
1875
+ Gail_Devers
1876
+ Gala_León_García
1877
+ Galanta
1878
+ Galatasaray_S.K._(football_team)
1879
+ Gale_Norton
1880
+ Galina_Malchugina
1881
+ Galo_Blanco
1882
+ Garamba_National_Park
1883
+ Gareth_Farrelly
1884
+ Garhi_Habibullah
1885
+ Garret_Anderson
1886
+ Garrett_Hines
1887
+ Garry_Galley
1888
+ Garry_Pagel
1889
+ Gartner
1890
+ Gary_Breen
1891
+ Gary_DiSarcina
1892
+ Gary_Emerson
1893
+ Gary_Kelly_(footballer_born_1974)
1894
+ Gary_Kirsten
1895
+ Gary_McAllister
1896
+ Gary_Neiwand
1897
+ Gary_Orr
1898
+ Gary_Sheffield
1899
+ Gary_Speed
1900
+ Gary_Teichmann
1901
+ Gaston_Taument
1902
+ Gatwick_Airport
1903
+ Gaza
1904
+ Gaza_Strip
1905
+ Gazeta_Wyborcza
1906
+ Gazprom
1907
+ Gdańsk
1908
+ Geelong_Football_Club
1909
+ Geir_Moen
1910
+ General_Administration_of_Customs
1911
+ General_Confederation_of_Labour_(France)
1912
+ General_Motors
1913
+ Geneva
1914
+ Genoa
1915
+ Genoa_C.F.C.
1916
+ Gente_(magazine)
1917
+ Geoff_Aunger
1918
+ Geoff_Marsh
1919
+ Geoffrey_Claeys
1920
+ Geoffrey_Fieger
1921
+ George_Duffield
1922
+ George_H._W._Bush
1923
+ George_Hincapie
1924
+ George_Joulwan
1925
+ George_Weah
1926
+ Georges_Simenon
1927
+ Georgia_(U.S._state)
1928
+ Georgia_(country)
1929
+ Georgios_Panagiotopoulos
1930
+ Georgiy_Mammadov
1931
+ Gerard_van_Velde
1932
+ Gerhard_Berger
1933
+ German_Open_(golf)
1934
+ German_Shepherd_Dog
1935
+ German_language
1936
+ Germans
1937
+ Germany
1938
+ Germany_Davis_Cup_team
1939
+ Germany_Fed_Cup_team
1940
+ Germany_men's_national_ice_hockey_team
1941
+ Germany_national_football_team
1942
+ Germán_Villa
1943
+ Geronimo
1944
+ Gerry_Britton
1945
+ Gert_Verheyen
1946
+ Gete_Wami
1947
+ Ghana
1948
+ Gheorghe_Craioveanu
1949
+ Gheorghe_Funar
1950
+ Gheorghe_Hagi
1951
+ Gheorghe_Popescu
1952
+ Gianfranco_Fini
1953
+ Gianfranco_Zola
1954
+ Gianluca_Pozzi
1955
+ Gianluca_Vialli
1956
+ Gianluigi_Lentini
1957
+ Gianni_Bugno
1958
+ Gianni_Versace
1959
+ Gigi_Fernández
1960
+ Gil_Vicente_F.C.
1961
+ Gil_de_Ferran
1962
+ Gilbert_Agius
1963
+ Gilbert_Gress
1964
+ Gilbert_Schaller
1965
+ Gilles_De_Bilde
1966
+ Gillian_Russell
1967
+ Gillingham_F.C.
1968
+ Giovanni_Lavaggi
1969
+ Giovanni_Lombardi
1970
+ Giovanni_Silva_de_Oliveira
1971
+ Giovanni_van_Bronckhorst
1972
+ Gisenyi
1973
+ Giuseppe_Garibaldi
1974
+ Gjon_Buzuku
1975
+ Glamorgan_County_Cricket_Club
1976
+ Glasgow
1977
+ Glen_Osborne
1978
+ Glenallen_Hill
1979
+ Glenavon_F.C.
1980
+ Glenn_Hoddle
1981
+ Glenn_McGrath
1982
+ Glentoran_F.C.
1983
+ Gloria_Pizzichini
1984
+ Gloucester_Rugby
1985
+ Gloucestershire_County_Cricket_Club
1986
+ God
1987
+ Goiás_Esporte_Clube
1988
+ Golan_Heights
1989
+ Gold_Coast_Chargers
1990
+ Gold_Coast_Titans
1991
+ Golden_State_Warriors
1992
+ Goldman_Sachs
1993
+ Goma
1994
+ Gong_Zhichao
1995
+ Goran_Ivanišević
1996
+ Gordon_Durie
1997
+ Gordon_Parsons
1998
+ Gorgona,_Colombia
1999
+ Gorleben
2000
+ Gouda
2001
+ Government_of_Russia
2002
+ Governor_of_South_Carolina
2003
+ Grace_Kelly
2004
+ Grace_Road
2005
+ Graeme_Hick
2006
+ Graeme_Obree
2007
+ Graham_Gooch
2008
+ Graham_Lloyd
2009
+ Graham_Thorpe
2010
+ Grand_Rapids,_Michigan
2011
+ Grand_Slam_(tennis)
2012
+ Grand_Slam_Cup
2013
+ Grande_Prairie
2014
+ Granma_(newspaper)
2015
+ Grant_Flower
2016
+ Grant_Stafford
2017
+ Grasshopper_Club_Zürich
2018
+ Grazer_AK
2019
+ Great_Britain_national_rugby_league_team
2020
+ Great_Hall_of_the_People
2021
+ Great_Lakes
2022
+ Greece
2023
+ Green_Bay_Packers
2024
+ Greenock_Morton_F.C.
2025
+ Greenville_Braves
2026
+ Greenwich_Mean_Time
2027
+ Greg_Blewett
2028
+ Greg_Chalmers
2029
+ Greg_Chappell
2030
+ Greg_Gagne_(baseball)
2031
+ Greg_Kraft
2032
+ Greg_Norman
2033
+ Greg_Rusedski
2034
+ Greg_Turner
2035
+ Gregor_Townsend
2036
+ Grimsby_Town_F.C.
2037
+ Griqualand_West
2038
+ Groningen_(city)
2039
+ Grozny
2040
+ Grupo_Santander
2041
+ Grupos_Antiterroristas_de_Liberación
2042
+ Grégory_Carraz
2043
+ Grêmio_Foot-Ball_Porto_Alegrense
2044
+ Guangdong
2045
+ Guangxi
2046
+ Guangzhou
2047
+ Guarani_Futebol_Clube
2048
+ Guardians_of_the_Cedars
2049
+ Guatemala
2050
+ Guernsey
2051
+ Guerrero
2052
+ Guido_Acklin
2053
+ Guido_Fulst
2054
+ Guilin
2055
+ Guillaume_Raoux
2056
+ Guillermo_Amor
2057
+ Guinea_national_football_team
2058
+ Gujar_Khan
2059
+ Gujarat
2060
+ Gulbuddin_Hekmatyar
2061
+ Gulf_War
2062
+ Gulf_of_Mexico
2063
+ Gunhild_Haugen
2064
+ Gunn_Margit_Andreassen
2065
+ Gunther_Schepens
2066
+ Guus_Hiddink
2067
+ Guy_Forget
2068
+ Guy_Hellers
2069
+ Guy_Whittall
2070
+ Guy_Whittingham
2071
+ Gwen_Torrence
2072
+ Győri_ETO_FC
2073
+ Gérard_de_Nooijer
2074
+ Górnik_Zabrze
2075
+ Günther_Huber
2076
+ H._D._Deve_Gowda
2077
+ HALO_Trust
2078
+ HINA
2079
+ HP_Enterprise_Services
2080
+ Haaretz
2081
+ Haarlem
2082
+ Habib_Boularès
2083
+ Hachette_Filipacchi_Médias
2084
+ Hafez_al-Assad
2085
+ Haile_Gebrselassie
2086
+ Hainan
2087
+ Haiti
2088
+ Hakan_Ünsal
2089
+ Hakan_Şükür
2090
+ Hakkâri
2091
+ Hakkâri_Province
2092
+ Hal_Morris
2093
+ Hal_Sutton
2094
+ Halifax_RLFC
2095
+ Hamas
2096
+ Hambrecht_&_Quist
2097
+ Hamburg
2098
+ Hamburg_Airport
2099
+ Hamburger_SV
2100
+ Hamid_Algabid
2101
+ Hamilton,_New_Zealand
2102
+ Hamilton_Academical_F.C.
2103
+ Hammerson
2104
+ Hampshire
2105
+ Hampshire_County_Cricket_Club
2106
+ Haneda_Airport
2107
+ Hang_Seng_Index
2108
+ Hangzhou
2109
+ Hanne_Haugland
2110
+ Hannes_Strydom
2111
+ Hanover
2112
+ Hans_Segers
2113
+ Hansie_Cronje
2114
+ Hanson_plc
2115
+ Hanwha_Eagles
2116
+ Hapoel_Be'er_Sheva_A.F.C.
2117
+ Hapoel_Haifa_F.C.
2118
+ Hapoel_Ironi_Rishon_LeZion_F.C.
2119
+ Hapoel_Jerusalem_F.C.
2120
+ Hapoel_Petah_Tikva_F.C.
2121
+ Hapoel_Tayibe_F.C.
2122
+ Hapoel_Tel_Aviv_F.C.
2123
+ Hapoel_Tzafririm_Holon_F.C.
2124
+ Harald_Spörl
2125
+ Harare
2126
+ Harlequin_F.C.
2127
+ Harlequins_Rugby_League
2128
+ Harleysville,_Pennsylvania
2129
+ Harold_Baines
2130
+ HarperCollins
2131
+ Harrison_Dillard
2132
+ Harry_Boland
2133
+ Harry_Decheiver
2134
+ Hartford,_Connecticut
2135
+ Hartford_Whalers
2136
+ Hartford_Wolf_Pack
2137
+ Hartlepool_United_F.C.
2138
+ Haruchika_Aoki
2139
+ Hasan_Muratović
2140
+ Hasely_Crawford
2141
+ Hassan_Abbas
2142
+ Hassan_II_of_Morocco
2143
+ Hassan_al-Turabi
2144
+ Hassania_Agadir
2145
+ Hastings_Banda
2146
+ Havana
2147
+ Havnar_Bóltfelag
2148
+ Havre,_Montana
2149
+ Hawaii
2150
+ Hawick_RFC
2151
+ Hawthorn_Football_Club
2152
+ Hay_Point,_Queensland
2153
+ Headingley
2154
+ Headingley_Stadium
2155
+ Heart_of_Midlothian_F.C.
2156
+ Heath_Streak
2157
+ Heathrow,_London
2158
+ Hebrew_University_of_Jerusalem
2159
+ Hebrew_language
2160
+ Hebron
2161
+ Hednesford_Town_F.C.
2162
+ Heidi_Zurbriggen
2163
+ Heidrun_oil_field
2164
+ Heike_Drechsler
2165
+ Heineken_International
2166
+ Heineken_Pilsener
2167
+ Heinz-Harald_Frentzen
2168
+ Helen_Clark
2169
+ Helena_Suková
2170
+ Heli_Rantanen
2171
+ Helibor
2172
+ Hellas_Verona_F.C.
2173
+ Helmut_Kohl
2174
+ Helsinki
2175
+ Hendrik_Dreekmann
2176
+ Hendro_Kartiko
2177
+ Hengelo
2178
+ Henri_Konan_Bédié
2179
+ Henrik_Larsson
2180
+ Henry_Campbell-Bannerman
2181
+ Henry_Honiball
2182
+ Henry_Hub
2183
+ Henry_VIII_of_England
2184
+ Herbert_Prohaska
2185
+ Herculez_Gomez
2186
+ Hereford_United_F.C.
2187
+ Herfried_Sabitzer
2188
+ Herman_Wijffels
2189
+ Hermann_Göring
2190
+ Hermawan_Susanto
2191
+ Hernán_Gumy
2192
+ Herschelle_Gibbs
2193
+ Hervé_de_Charette
2194
+ Hezbi_Islami
2195
+ Hezbollah
2196
+ Hibernian_F.C.
2197
+ Hicham_Arazi
2198
+ Hicham_El_Guerrouj
2199
+ Hidemichi_Tanaka
2200
+ Hideo_Nomo
2201
+ High_Court_(Hong_Kong)
2202
+ High_Plains_(United_States)
2203
+ Hilary_Lindh
2204
+ Hilde_Gerg
2205
+ Hilde_Synnøve_Lid
2206
+ Hillary_Rodham_Clinton
2207
+ Himalaya_Kingdom
2208
+ Himalayas
2209
+ Hindu_nationalism
2210
+ Hintsa_kaKhawuta
2211
+ Hiroaki_Morishima
2212
+ Hiroshi_Nanami
2213
+ Hiroshige_Yanagimoto
2214
+ Hobart
2215
+ Holding_company
2216
+ Hollywood
2217
+ Holy_See
2218
+ Holzgerlingen
2219
+ Honda
2220
+ Honda_Racing_Corporation
2221
+ Honduras
2222
+ Honduras_national_football_team
2223
+ Hong_Kong
2224
+ Hong_Kong_Economic_Times
2225
+ Hong_Kong_dollar
2226
+ Horst_Siegl
2227
+ Hosni_Mubarak
2228
+ House_of_Commons
2229
+ House_of_Orléans
2230
+ House_of_Representatives
2231
+ Houston
2232
+ Houston_Astros
2233
+ Houston_Cougars_football
2234
+ Houston_Rockets
2235
+ Houston_Texans
2236
+ Hove
2237
+ Howard_Wilkinson
2238
+ Howard_Wolpe
2239
+ Hristo_Stoichkov
2240
+ Hubei
2241
+ Hubert_Schösser
2242
+ Huddersfield_Town_F.C.
2243
+ Hugh_Baiocchi
2244
+ Huize_County
2245
+ Hull_City_A.F.C.
2246
+ Human_Rights_in_China_(organization)
2247
+ Humayun_Rashid_Choudhury
2248
+ Hun_Sen
2249
+ Hungarian_National_Bank
2250
+ Hungary
2251
+ Huntington,_West_Virginia
2252
+ Huntly_F.C.
2253
+ Hurricane_Edouard_(1996)
2254
+ Husaberg
2255
+ Husqvarna_Motorcycles
2256
+ Hussein_of_Jordan
2257
+ Hutnik_Kraków
2258
+ Hutu
2259
+ Hwang_Sun-Hong
2260
+ Hyderabad,_Sindh
2261
+ HypoVereinsbank
2262
+ Héctor_Camacho
2263
+ Hélder_Cristóvão
2264
+ Hércules_CF
2265
+ IBM
2266
+ IFK_Göteborg
2267
+ IFOR
2268
+ IK_Start
2269
+ ING_Group
2270
+ ISO_9000
2271
+ Iain_Pyman
2272
+ Ian_Botham
2273
+ Ian_Ferguson_(footballer_born_1967)
2274
+ Ian_Harte
2275
+ Ian_Harvey
2276
+ Ian_Healy
2277
+ Ian_Jones_(rugby_union)
2278
+ Ian_Lang,_Baron_Lang_of_Monkton
2279
+ Ian_McGeechan
2280
+ Ian_Rush
2281
+ Ian_Salisbury
2282
+ Ian_Woosnam
2283
+ Ian_Wright
2284
+ Iberian_Peninsula
2285
+ Ice-Cold_in_Alex
2286
+ Iceland
2287
+ Idris_I_of_Libya
2288
+ Idriss_Déby
2289
+ Ieng_Mouly
2290
+ Ieng_Sary
2291
+ Igor_Ivanov
2292
+ Igor_Kolyvanov
2293
+ Igor_Korneev
2294
+ Igor_Potapovich
2295
+ Igor_Shkvyrin
2296
+ Igor_Trandenkov
2297
+ Ijaz_Ahmed_(cricketer)
2298
+ Ilir_Shulku
2299
+ Ilke_Wyludda
2300
+ Illawarra_Steelers
2301
+ Illinois
2302
+ Iltalehti
2303
+ Imam_Utomo
2304
+ Imola
2305
+ Imran_Khan
2306
+ In-Nazzjon
2307
+ Ina-Yoko_Teutenberg
2308
+ Incheon
2309
+ Independence_Day_(film)
2310
+ Independent_Commission_Against_Corruption_(Hong_Kong)
2311
+ Independent_State_of_Croatia
2312
+ Inder_Kumar_Gujral
2313
+ India
2314
+ India_national_cricket_team
2315
+ Indian_Oil_Corporation
2316
+ Indiana
2317
+ Indiana_Pacers
2318
+ Indianapolis
2319
+ Indianapolis_Colts
2320
+ Indigenous_peoples_of_the_Americas
2321
+ Indonesia
2322
+ Indonesia_national_football_team
2323
+ Indonesian_Democratic_Party
2324
+ Indore
2325
+ Indrajit_Gupta
2326
+ Indre
2327
+ Industrial_Production_Index
2328
+ IndyCar_Series
2329
+ Inez_Turner
2330
+ Information_Technology_Association_of_America
2331
+ Information_Telegraph_Agency_of_Russia
2332
+ Inger_Miller
2333
+ Ingolstadt
2334
+ Inha_Babakova
2335
+ Inia,_Paphos
2336
+ Inkatha_Freedom_Party
2337
+ Insein_Prison
2338
+ Insein_Township
2339
+ Institute_for_Supply_Management
2340
+ Institute_of_Policy_Studies_(Singapore)
2341
+ Intellectual_Property_Owners_Association
2342
+ Intelligent_network
2343
+ Interfax
2344
+ Interferon_beta-1b
2345
+ Interior_ministry
2346
+ International_Association_of_Athletics_Federations
2347
+ International_Boxing_Association
2348
+ International_Committee_of_the_Red_Cross
2349
+ International_Confederation_of_Free_Trade_Unions
2350
+ International_Labour_Organization
2351
+ International_Monetary_Fund
2352
+ International_Petroleum_Exchange
2353
+ International_Union_of_Railways
2354
+ International_cricketers_of_South_African_origin
2355
+ Interstate_5
2356
+ Interstate_80
2357
+ Interstate_95
2358
+ Inverness
2359
+ Inverness_Caledonian_Thistle_F.C.
2360
+ Inverness_Thistle_F.C.
2361
+ Inzamam-ul-Haq
2362
+ Inés_Gorrochategui
2363
+ Ion_Iliescu
2364
+ Ionel_Dănciulescu
2365
+ Ipsos
2366
+ Ipswich_Town_F.C.
2367
+ Iran
2368
+ Iranian_Kurdistan
2369
+ Iraq
2370
+ Iraqi_Kurdistan
2371
+ Iraqi_National_Congress
2372
+ Ireland
2373
+ Ireland_national_rugby_union_team
2374
+ Irina_Korzhanenko
2375
+ Irina_Privalova
2376
+ Irina_Spîrlea
2377
+ Irish_Independent
2378
+ Irish_Republican_Army
2379
+ Irvine,_California
2380
+ Iryna_Yatchenko
2381
+ Isaac_Viciosa
2382
+ Isel_López
2383
+ Islam
2384
+ Islamabad
2385
+ Islamic_Museum
2386
+ Islamic_Republic_News_Agency
2387
+ Islamic_Republic_of_Iran
2388
+ Islamic_Salvation_Front
2389
+ Islamism
2390
+ Island_Beach_State_Park
2391
+ Isolde_Kostner
2392
+ Israel
2393
+ Israel_national_football_team
2394
+ Israel_national_under-21_football_team
2395
+ Israeli_Labor_Party
2396
+ Israelis
2397
+ Istanbul
2398
+ Isthmus_of_Tehuantepec
2399
+ Italy
2400
+ Italy_national_football_team
2401
+ Italy_national_rugby_union_team
2402
+ Itamar_Rabinovich
2403
+ Iulian_Filipescu
2404
+ Iva_Majoli
2405
+ Ivan_Francescato
2406
+ Ivan_Lendl
2407
+ Iván_García
2408
+ Iván_Rodríguez
2409
+ Iván_Zamorano
2410
+ J._Russell_(Essex_cricketer)
2411
+ JCPenney_Classic
2412
+ JK_Tallinna_Sadam
2413
+ Jaap_Stam
2414
+ Jacek_Dembiński
2415
+ Jack_Charlton
2416
+ Jack_Kemp
2417
+ Jack_Kevorkian
2418
+ Jack_Pierce_(athlete)
2419
+ Jackie_McNamara
2420
+ Jackie_Stewart
2421
+ Jacksonville,_Florida
2422
+ Jacksonville_Jaguars
2423
+ Jacky_Martens
2424
+ Jacob_Brumfield
2425
+ Jacques_Chirac
2426
+ Jacques_Kallis
2427
+ Jacques_Toubon
2428
+ Jacques_Villeneuve
2429
+ Jacqui_Cooper
2430
+ Jaffna
2431
+ Jaime_Oncins
2432
+ Jakarta
2433
+ Jakob_Hlasek
2434
+ Jalal_Talabani
2435
+ Jalalabad
2436
+ Jamaat-e-Islami
2437
+ Jamaica
2438
+ James_Baker
2439
+ James_Baldwin_(baseball)
2440
+ James_Brady
2441
+ James_Dalton_(rugby_player)
2442
+ James_Debbah
2443
+ James_Heath_(Boxer)
2444
+ James_Love
2445
+ James_Van_Allen
2446
+ James_Weaver_(racing_driver)
2447
+ Jamie_Baulch
2448
+ Jamie_Moyer
2449
+ Jamie_Spence
2450
+ Jan_Bos
2451
+ Jan_Ove_Pedersen
2452
+ Jan_Siemerink
2453
+ Jan_van_Eijden
2454
+ Jana_Kandarr
2455
+ Jana_Novotná
2456
+ Janakantha
2457
+ Janet_Reno
2458
+ Janette_Husárová
2459
+ Janez_Drnovšek
2460
+ Jani_Soininen
2461
+ Janne_Ojanen
2462
+ Jans_Koerts
2463
+ Jansher_Khan
2464
+ Jaora
2465
+ Japan
2466
+ Japan_Fed_Cup_team
2467
+ Japan_national_football_team
2468
+ Japie_Mulder
2469
+ Jared_Palmer
2470
+ Jared_Tomich
2471
+ Jari_Litmanen
2472
+ Jarmo_Kytölehto
2473
+ Jason_Belser
2474
+ Jason_Dickson
2475
+ Jason_Gillespie
2476
+ Jason_Little_(rugby_union)
2477
+ Jason_McAteer
2478
+ Jason_Rouser
2479
+ Jason_Stoltenberg
2480
+ Jason_Weaver_(jockey)
2481
+ Java_(programming_language)
2482
+ Javagal_Srinath
2483
+ Javier_Clemente
2484
+ Javier_Frana
2485
+ Javier_Sánchez
2486
+ Jay_Buhner
2487
+ Jean-Denis_Délétraz
2488
+ Jean-Louis_Debré
2489
+ Jean-Luc_Brassard
2490
+ Jean-Marc_Gounon
2491
+ Jean-Michel_Bayle
2492
+ Jean-Paul_van_Gastel
2493
+ Jean-Philippe_Fleurian
2494
+ Jean-Pierre_Cyprien
2495
+ Jean_Alesi
2496
+ Jean_Carlo_Witte
2497
+ Jean_Galfione
2498
+ Jean_Van_de_Velde
2499
+ Jearl_Miles_Clark
2500
+ Jed-Forest_RFC
2501
+ Jeddah
2502
+ Jeff_Bagwell
2503
+ Jeff_Bean
2504
+ Jeff_Parrett
2505
+ Jeff_Russell
2506
+ Jeff_Tarango
2507
+ Jeff_Williams_(athlete)
2508
+ Jeff_Wilson_(sportsman)
2509
+ Jeffrey_Lurie
2510
+ Jennifer_Capriati
2511
+ Jennifer_Flavin
2512
+ Jens_Fiedler_(cyclist)
2513
+ Jens_Todt
2514
+ Jeremy_Guscott
2515
+ Jeremy_Wotherspoon
2516
+ Jeroen_Blijlevens
2517
+ Jerry_Kelly
2518
+ Jersey
2519
+ Jerusalem
2520
+ Jesper_Parnevik
2521
+ Jesper_Skibby
2522
+ Jesse_Owens
2523
+ Jesus
2524
+ Jesús_Arellano
2525
+ Jesús_Tavárez
2526
+ Jews
2527
+ Jiang_Zemin
2528
+ Jiangsu
2529
+ Jiangxi
2530
+ Jill_Craybas
2531
+ Jim_Bolger
2532
+ Jim_Courier
2533
+ Jim_Edmonds
2534
+ Jim_Furyk
2535
+ Jim_Grabb
2536
+ Jim_Harbaugh
2537
+ Jim_Hines
2538
+ Jim_Leyritz
2539
+ Jim_Payne_(golfer)
2540
+ Jim_Telfer
2541
+ Jim_Thome
2542
+ Jimi_Hendrix
2543
+ Jimmy_Adams
2544
+ Jimmy_Key
2545
+ Jimmy_Thomson_(footballer)
2546
+ Jimmy_Vasser
2547
+ Jimy_Szymanski
2548
+ Jiří_Dopita
2549
+ Jiří_Džmura
2550
+ Jiří_Novák
2551
+ Joakim_Haeggman
2552
+ Joakim_Persson
2553
+ Joan_Llaneras
2554
+ Joannette_Kruger
2555
+ Joaquín_del_Olmo
2556
+ Jocelyn_Gourvennec
2557
+ Jodie_Foster
2558
+ Jody_Reed
2559
+ Joe-Max_Moore
2560
+ Joe_Carter
2561
+ Joe_Kneipp
2562
+ Joe_Miller_(footballer)
2563
+ Joe_Roff
2564
+ Joel_Stransky
2565
+ Joey_Hamilton
2566
+ Johan_Capiot
2567
+ Johan_Museeuw
2568
+ Johan_de_Kock
2569
+ Johann_Sebastian_Bach
2570
+ Johannesburg
2571
+ John_Boles_(baseball)
2572
+ John_C._Kornblum
2573
+ John_Collins_(footballer)
2574
+ John_Cook_(golfer)
2575
+ John_Crawley
2576
+ John_Fashanu
2577
+ John_Hagelin
2578
+ John_Hart_(rugby_coach)
2579
+ John_Hartson
2580
+ John_Hinckley,_Jr.
2581
+ John_Howard
2582
+ John_J._Sheehan
2583
+ John_Jaha
2584
+ John_Kerry
2585
+ John_Kocinski
2586
+ John_Langmore
2587
+ John_Lewis_Partnership
2588
+ John_Major
2589
+ John_Mark_Inienger
2590
+ John_Marzano
2591
+ John_Mayock
2592
+ John_McNamara_(baseball)
2593
+ John_Michael_Gorst
2594
+ John_Newcombe
2595
+ John_Regis_(athlete)
2596
+ John_Robinson_(footballer)
2597
+ John_Smiley
2598
+ John_Smoltz
2599
+ John_Stephenson_(cricketer,_born_1965)
2600
+ John_Talen
2601
+ John_Toshack
2602
+ John_Veldman
2603
+ John_White_(squash_player)
2604
+ John_Winslow_Bissell
2605
+ John_Y._Simon
2606
+ Johnny_Bench
2607
+ Johnny_Damon
2608
+ Johnny_Herbert
2609
+ Johnny_Oates
2610
+ Johns_Hopkins_Hospital
2611
+ Johnson_Controls
2612
+ Johor_Bahru
2613
+ Jon_Dahl_Tomasson
2614
+ Jon_Drummond
2615
+ Jonah_Lomu
2616
+ Jonas_Björkman
2617
+ Jonas_Savimbi
2618
+ Jonathan_Akpoborie
2619
+ Jonathan_Bachini
2620
+ Jonathan_Edwards_(athlete)
2621
+ Jonathan_Lomas
2622
+ Jonathan_Stark
2623
+ Jonathon_Power
2624
+ Jonty_Rhodes
2625
+ Joost_van_der_Westhuizen
2626
+ Jordan
2627
+ Jordan_Grand_Prix
2628
+ Jordan_River
2629
+ Jordi_Burillo
2630
+ Jordi_Cruyff
2631
+ Jorge_Cadete
2632
+ Jorge_Costa
2633
+ Jos_Verstappen
2634
+ Josef_Zieleniec
2635
+ Joseph_Bernardin
2636
+ Joseph_Keter
2637
+ Joseph_Nanven_Garba
2638
+ Joseph_Stalin
2639
+ Josh_Kronfeld
2640
+ José_Antonio_Escuredo
2641
+ José_Cóceres
2642
+ José_Emilio_Amavisca
2643
+ José_Hernández_(baseball)
2644
+ José_Herrera_(1990s_outfielder)
2645
+ José_Luis_Caminero
2646
+ José_Luís_Vidigal
2647
+ José_Maria_Cañizares
2648
+ José_María_Aznar
2649
+ José_María_Olazábal
2650
+ José_Miguel_Insulza
2651
+ José_Offerman
2652
+ José_Parra_(baseball)
2653
+ José_Ramos-Horta
2654
+ José_Rivero
2655
+ José_Rosado
2656
+ José_Taira
2657
+ José_Valentín
2658
+ João_Domingos_Pinto
2659
+ João_Havelange
2660
+ João_Vieira_Pinto
2661
+ Joël_Smets
2662
+ Juan_Antonio_Pizzi
2663
+ Juan_Curuchet
2664
+ Juan_Esnáider
2665
+ Juan_González_(baseball)
2666
+ Juan_Guzmán_(baseball)
2667
+ Juan_Perón
2668
+ Juan_Roque
2669
+ Juan_Somavía
2670
+ Judith_Arndt
2671
+ Judith_Wiesner
2672
+ Juha_Kankkunen
2673
+ Juha_Ylönen
2674
+ Julen_Lopetegui
2675
+ Jules_Maigret
2676
+ Julia_Carling
2677
+ Julia_Roberts
2678
+ Juliana_of_the_Netherlands
2679
+ Juliet_Cuthbert
2680
+ Julio_Dely_Valdés
2681
+ Julius_Nyerere
2682
+ Julián_Tavárez
2683
+ Jumet
2684
+ Junior_Murray
2685
+ Justin_Hobday
2686
+ Justin_Leonard
2687
+ Justin_Marshall
2688
+ Justin_Thompson
2689
+ Justin_Vaughan
2690
+ Juventus_F.C.
2691
+ Jyrki_Lumme
2692
+ Jyväskylä
2693
+ Ján_Krošlák
2694
+ Jörg_Heinrich
2695
+ Jürgen_Klinsmann
2696
+ Jürgen_Kohler
2697
+ Jürgen_Schult
2698
+ K.A.A._Gent
2699
+ K.F.C._Lommel_S.K.
2700
+ K.R.C._Genk
2701
+ K.R.C._Zuid-West-Vlaanderen
2702
+ K.S.C._Lokeren_Oost-Vlaanderen
2703
+ K._Sint-Truidense_V.V.
2704
+ KBC_Bank
2705
+ KIL_Toppfotball
2706
+ KK_Cibona
2707
+ KK_Crvena_zvezda
2708
+ KK_Partizan
2709
+ KK_Split
2710
+ KSE_100_Index
2711
+ KTM
2712
+ KV_Mechelen
2713
+ Ka_Wah_Bank
2714
+ Kabul
2715
+ Kader_Ferhaoui
2716
+ Kamiel_Maase
2717
+ Kampala
2718
+ Kandahar
2719
+ Kanpur
2720
+ Kansai_International_Airport
2721
+ Kansas
2722
+ Kansas_City,_Missouri
2723
+ Kansas_City_Chiefs
2724
+ Kansas_City_Royals
2725
+ Kaohsiung
2726
+ Kaolack
2727
+ Kapil_Dev
2728
+ Karachi
2729
+ Karachi_Stock_Exchange
2730
+ Karen_National_Union
2731
+ Karen_people
2732
+ Karim_Alami
2733
+ Karin_Janke
2734
+ Karin_Kschwendt
2735
+ Karina_Habšudová
2736
+ Karl-Heinz_Riedle
2737
+ Karl_Inderfurth
2738
+ Karl_Marx
2739
+ Karlsruher_SC
2740
+ Karol_Kučera
2741
+ Karsten_Bäron
2742
+ Kashmir
2743
+ Kassala
2744
+ Kassala_(state)
2745
+ Katarína_Studeníková
2746
+ Kate_Michelman
2747
+ Kate_Pace
2748
+ Katharina_Gutensohn
2749
+ Kathmandu
2750
+ Kathy_Rinaldi
2751
+ Katja_Seizinger
2752
+ Kawasaki_Heavy_Industries
2753
+ Kawasaki_Motors_Racing
2754
+ Kazakhstan
2755
+ Kazanlak
2756
+ Kazuyoshi_Funaki
2757
+ Kazuyoshi_Miura
2758
+ Keane_(company)
2759
+ Kecskemét
2760
+ Kehl
2761
+ Keirin
2762
+ Keith_Fletcher
2763
+ Keith_Wright_(footballer)
2764
+ Kelli_Kuehne
2765
+ Ken_Caminiti
2766
+ Ken_Green_(golfer)
2767
+ Ken_Griffey,_Jr.
2768
+ Ken_Hill_(baseball)
2769
+ Ken_Saro-Wiwa
2770
+ Kengo_Wa_Dondo
2771
+ Kenichi_Shimokawa
2772
+ Kennedy_Ochieng
2773
+ Kennedy_family
2774
+ Kennet_Andersson
2775
+ Kenneth_Benjamin
2776
+ Kenneth_Carlsen
2777
+ Kenneth_Clarke
2778
+ Kenneth_Eriksson
2779
+ Kenny_Cunningham
2780
+ Kenny_Dalglish
2781
+ Kenny_Harrison
2782
+ Kenny_Lofton
2783
+ Kenny_Rogers_(baseball)
2784
+ Kent_County_Cricket_Club
2785
+ Kentucky
2786
+ Kenya
2787
+ Kenya_national_cricket_team
2788
+ Kenya_national_football_team
2789
+ Kerwin_Bell
2790
+ Kevin_Brown_(right-handed_pitcher)
2791
+ Kevin_Foster_(baseball)
2792
+ Kevin_Keegan
2793
+ Kevin_Kim
2794
+ Kevin_Mitchell_(baseball)
2795
+ Kevin_Seitzer
2796
+ Kevin_Tapani
2797
+ KeyBank
2798
+ Khaled_Al_Zaher
2799
+ Khaleda_Zia
2800
+ Khalid_Boulami
2801
+ Khandwa
2802
+ Khartoum
2803
+ Khasavyurt
2804
+ Khmer_people
2805
+ Kia_Motors
2806
+ Kid_Brands
2807
+ Kiev
2808
+ Kigali
2809
+ Kilmarnock_F.C.
2810
+ Kim_Graham
2811
+ Kim_Yoon-man
2812
+ Kim_Young-sam
2813
+ Kimberley,_Northern_Cape
2814
+ Kimberly_Po
2815
+ Kimiko_Date_Krumm
2816
+ KinderCare_Learning_Centers
2817
+ Kindu
2818
+ King_Baudouin_Stadium
2819
+ Kingdom_of_England
2820
+ Kingdome
2821
+ Kingston_Technology
2822
+ Kinshasa
2823
+ Kirkcaldy
2824
+ Kirkland,_Washington
2825
+ Kisangani
2826
+ Kitty_Kiernan
2827
+ Kivu
2828
+ Klas_Eriksson
2829
+ Klaus_Kinkel
2830
+ Kleiner_Perkins_Caufield_&_Byers
2831
+ Knowsley_Road
2832
+ Kobus_Wiese
2833
+ Kofi_Annan
2834
+ Koha_Jonë
2835
+ Kol_Yisrael
2836
+ Kolkata
2837
+ Komerční_banka
2838
+ Kompas
2839
+ Konstantinos_Mitsotakis
2840
+ Konstantinos_Stephanopoulos
2841
+ Korea
2842
+ Korea_Republic_national_football_team
2843
+ Kosovo
2844
+ Kostas_Skandalidis
2845
+ Kragujevac
2846
+ Krasnoyarsk_Krai
2847
+ Krassimir_Balakov
2848
+ Kristian_Brenden
2849
+ Kristie_Boogert
2850
+ Kristina_Brandi
2851
+ Krzysztof_Piskuła
2852
+ Krzysztof_Warzycha
2853
+ Kuala_Lumpur
2854
+ Kubilay_Türkyilmaz
2855
+ Kumar_Dharmasena
2856
+ Kunar_River
2857
+ Kurdish_nationalism
2858
+ Kurdish_people
2859
+ Kurdistan
2860
+ Kurdistan_Democratic_Party
2861
+ Kurdistan_Workers'_Party
2862
+ Kurt_Abbott
2863
+ Kurt_Betschart
2864
+ Kurt_Schork
2865
+ Kuusamo
2866
+ Kuwait
2867
+ Kuwait_national_football_team
2868
+ KwaZulu-Natal
2869
+ Kyle_Abbott
2870
+ Kyran_Bracken
2871
+ Kyōko_Nagatsuka
2872
+ L-Orizzont
2873
+ LASK_Linz
2874
+ LG_Telecom
2875
+ LPGA_Championship
2876
+ LaMont_Smith
2877
+ La_Gazzetta_dello_Sport
2878
+ La_Jornada
2879
+ La_Nación
2880
+ La_Plagne
2881
+ La_Presse_(French_newspaper)
2882
+ La_Stampa
2883
+ La_création_du_monde
2884
+ Laban_Rotich
2885
+ Labor_Day
2886
+ Labour_Day
2887
+ Labour_Party_(Malta)
2888
+ Labour_Party_(UK)
2889
+ Lagos
2890
+ Lahore
2891
+ Lake_Maracaibo
2892
+ Lamberto_Dini
2893
+ Lamonts
2894
+ Lanarkshire
2895
+ Lancashire_County_Cricket_Club
2896
+ Lance_Armstrong
2897
+ Lance_Gibbs
2898
+ Land_Securities
2899
+ Landgraaf
2900
+ Landskrona
2901
+ Lansana_Conté
2902
+ Lantau_Peak
2903
+ Larisa_Neiland
2904
+ Larnaca
2905
+ Larry_Walker
2906
+ Lars_Bohinen
2907
+ Lars_Riedel
2908
+ Las_Cruces,_New_Mexico
2909
+ Las_Vegas,_Nevada
2910
+ Latin_America
2911
+ Latvia
2912
+ Latvia_national_football_team
2913
+ Laurence_Courtois
2914
+ Laurent-Désiré_Kabila
2915
+ Laurent_Blanc
2916
+ Laurent_Bénézech
2917
+ Laurent_Cabannes
2918
+ Laurent_Charvet
2919
+ Laurent_Gané
2920
+ Laurent_Jalabert
2921
+ Laurent_Ottoz
2922
+ Laurie_Brereton
2923
+ Laurie_Harper
2924
+ Lawrence_B._Lindsey
2925
+ Lawrence_of_Arabia_(film)
2926
+ Laxmi_Poruri
2927
+ Le_Canard_enchaîné
2928
+ Le_Figaro
2929
+ Le_Havre_AC
2930
+ Le_Jour
2931
+ Le_Matin_(France)
2932
+ Le_Monde
2933
+ Le_Temps
2934
+ League_of_Women_Voters
2935
+ Leah_Pells
2936
+ Leander_Paes
2937
+ Lebanon
2938
+ Lech_Poznań
2939
+ Lee's_Summit,_Missouri
2940
+ Lee_Bowyer
2941
+ Lee_Germon
2942
+ Lee_Sharpe
2943
+ Lee_Tinsley
2944
+ Lee_Wan_Wah
2945
+ Lee_Westwood
2946
+ Leeds
2947
+ Leeds_Rhinos
2948
+ Leeds_United_A.F.C.
2949
+ Lega_Nord
2950
+ Legal_Department
2951
+ Legia_Warszawa
2952
+ Lehman_Brothers
2953
+ Leicester
2954
+ Leicester_City_F.C.
2955
+ Leicester_Tigers
2956
+ Leicestershire_County_Cricket_Club
2957
+ Leipzig/Halle_Airport
2958
+ Lennart_Meri
2959
+ Lenzing_AG
2960
+ Leonid_Kuchma
2961
+ Leopold_III_of_Belgium
2962
+ Les_Ferdinand
2963
+ Les_Échos_(France)
2964
+ Letitia_Vriesde
2965
+ Levothyroxine
2966
+ Lewis_Gunn
2967
+ Leyton_Orient_F.C.
2968
+ Lhasa
2969
+ Li_Peng
2970
+ Liam_Botham
2971
+ Liam_Daish
2972
+ Liam_Fox
2973
+ Liam_Gallagher
2974
+ Liam_Neeson
2975
+ Liaoning
2976
+ Liberal_Democratic_Party_(Japan)
2977
+ Liberal_Party_(UK)
2978
+ Liberal_Party_of_Australia
2979
+ Liberation_Tigers_of_Tamil_Eelam
2980
+ Liberia
2981
+ Liberia_national_football_team
2982
+ Libya
2983
+ Lidl
2984
+ Liechtenstein
2985
+ Liechtenstein_national_football_team
2986
+ Lien_Chan
2987
+ Lierse_S.K.
2988
+ Lihir_Island
2989
+ Likud
2990
+ Lilian_Thuram
2991
+ Lille_OSC
2992
+ Lillehammer
2993
+ Lima
2994
+ Limerick
2995
+ Limited_company
2996
+ Lincoln,_Nebraska
2997
+ Lincoln_City_F.C.
2998
+ Linda_Kisabaka
2999
+ Linda_Wild
3000
+ Lindsay_Davenport
3001
+ Lindy_Remigino
3002
+ Linfield_F.C.
3003
+ Linford_Christie
3004
+ Linz
3005
+ Lisa_Raymond
3006
+ Lisbeth_Stuer-Lauridsen
3007
+ Lisbon
3008
+ Lithuania
3009
+ Lithuania_national_football_team
3010
+ Lithuania_national_under-21_football_team
3011
+ Little_Rock,_Arkansas
3012
+ Liverpool_F.C.
3013
+ Livingston_F.C.
3014
+ Liviu_Ciobotariu
3015
+ Lixion_Avila
3016
+ Liège
3017
+ Ljubljana
3018
+ Llanelli_RFC
3019
+ Lloyd_Axworthy
3020
+ Lloyd_Bentsen
3021
+ Lockerbie
3022
+ Loftus_Versfeld_Stadium
3023
+ Lombardi_Award
3024
+ Lomé
3025
+ London
3026
+ London_Heathrow_Airport
3027
+ London_Interbank_Offered_Rate
3028
+ London_Irish
3029
+ London_Metal_Exchange
3030
+ London_Stansted_Airport
3031
+ London_Stock_Exchange
3032
+ London_Wasps
3033
+ Long_Island
3034
+ Longchamp_Racecourse
3035
+ Longyearbyen
3036
+ Lonmin
3037
+ Loren_Roberts
3038
+ Lorenzo_Amoruso
3039
+ Lori_McNeil
3040
+ Los_Angeles
3041
+ Los_Angeles_Angels_of_Anaheim
3042
+ Los_Angeles_Clippers
3043
+ Los_Angeles_Dodgers
3044
+ Los_Angeles_Kings
3045
+ Los_Angeles_Lakers
3046
+ Lou_Gehrig
3047
+ Louis_Farrakhan
3048
+ Louis_Freeh
3049
+ Louise_Currey
3050
+ Loyola_de_Palacio
3051
+ Luanda
3052
+ Luc_Nilis
3053
+ Luca_Cadalora
3054
+ Lucas_Moreira_Neves
3055
+ Lucy_Tyler-Sharman
3056
+ Ludmila_Engquist
3057
+ Ludmila_Richterová
3058
+ Ludovic_Giuly
3059
+ Lufthansa
3060
+ Luis_Enrique_Martínez_García
3061
+ Luis_García_Postigo
3062
+ Luis_Milla
3063
+ Luis_Oliveira
3064
+ Luis_Polonia
3065
+ Luke_Jensen
3066
+ Luke_Kipkosgei
3067
+ Lukoil
3068
+ Lusaka_Protocol
3069
+ Luton_Town_F.C.
3070
+ Luxembourg
3071
+ Luxembourg_(city)
3072
+ Luís_Figo
3073
+ Lynda_Chalker,_Baroness_Chalker_of_Wallasey
3074
+ Lyngby_Boldklub
3075
+ M._Karunanidhi
3076
+ MBC_Dynamo_Moscow
3077
+ MCI_Communications
3078
+ MFK_Dubnica
3079
+ MFK_Košice
3080
+ MFK_Petržalka
3081
+ MMC_Norilsk_Nickel
3082
+ MSV_Duisburg
3083
+ MTK_Hungária_FC
3084
+ MYPA
3085
+ Maariv_(newspaper)
3086
+ Maarten_den_Bakker
3087
+ Maccabi_Haifa_F.C.
3088
+ Maccabi_Herzliya_F.C.
3089
+ Maccabi_Petah_Tikva_F.C.
3090
+ Maccabi_Tel_Aviv_B.C.
3091
+ Maccabi_Tel_Aviv_F.C.
3092
+ Macedonia_national_football_team
3093
+ Madagascar
3094
+ Madame_Bovary
3095
+ Madeleine_Albright
3096
+ Madhumalla
3097
+ Madrid
3098
+ Mae_Sot
3099
+ Mafikeng
3100
+ Magdalena_Grzybowska
3101
+ Magdalena_Maleeva
3102
+ Magna_International
3103
+ Magnus_Gustafsson
3104
+ Magnus_Larsson
3105
+ Mahala
3106
+ Mahmoud_Abbas
3107
+ Major_League_Baseball
3108
+ Major_League_Baseball_Most_Valuable_Player_Award
3109
+ Maksim_Tarasov
3110
+ Malawi
3111
+ Malawi_national_football_team
3112
+ Malaysia
3113
+ Malaysia_Open_(badminton)
3114
+ Malcolm_Marshall
3115
+ Malcolm_Rifkind
3116
+ Mali
3117
+ MaliVai_Washington
3118
+ Malta
3119
+ Malta_Freeport
3120
+ Malta_national_football_team
3121
+ Managua
3122
+ Manama
3123
+ Manchester
3124
+ Manchester_City_F.C.
3125
+ Manchester_United_F.C.
3126
+ Mandsaur
3127
+ Manfred_Kanther
3128
+ Manfred_Schwabl
3129
+ Mangosuthu_Buthelezi
3130
+ Manhattan
3131
+ Manhattan_Beach,_California
3132
+ Maniema
3133
+ Manly-Warringah_Sea_Eagles
3134
+ Mansfield_Town_F.C.
3135
+ Manuel_Medina_(boxer)
3136
+ Mao_Zedong
3137
+ Maputo
3138
+ Maracaibo
3139
+ Marc-Vivien_Foé
3140
+ Marc_Blume
3141
+ Marc_Degryse
3142
+ Marc_Dutroux
3143
+ Marc_Hottiger
3144
+ Marc_Keller
3145
+ Marc_Newfield
3146
+ Marc_Overmars
3147
+ Marc_Rosset
3148
+ Marc_Wilkins_(baseball)
3149
+ Marc_Wilmots
3150
+ Marcel_Desailly
3151
+ Marcello_Cuttitta
3152
+ Marcelo_Filippini
3153
+ Marcelo_Otero
3154
+ Marcelo_Ríos
3155
+ Marcelo_Salas
3156
+ Marcelo_Silva_Ramos
3157
+ Marcin_Mięciel
3158
+ Marco_Bode
3159
+ Marco_Delvecchio
3160
+ Marco_Lietti
3161
+ Marco_Pascolo
3162
+ Marco_Vaccari
3163
+ Marco_Villa
3164
+ Marcos_Ondruska
3165
+ Marcus_Gayle
3166
+ Marcus_Grönholm
3167
+ Marek_Citko
3168
+ Margaret_Crowley_(athlete)
3169
+ Margaret_Thatcher
3170
+ Maria_Mutola
3171
+ Mariaan_de_Swardt
3172
+ Marianne_Timmer
3173
+ Marianne_Werdel
3174
+ Mariano_Bombarda
3175
+ Mariano_Duncan
3176
+ Mariano_Juan
3177
+ Mariano_Rivera
3178
+ Maribor
3179
+ Marie-José_Pérec
3180
+ Marie_Lindgren
3181
+ Marina_Trandenkova
3182
+ Mario_Basler
3183
+ Mario_Silva
3184
+ Mario_Stanić
3185
+ Marion_Clignet
3186
+ Marius_Hurter
3187
+ Mark_Acre
3188
+ Mark_Andrews_(rugby_player)
3189
+ Mark_Brooks_(golfer)
3190
+ Mark_Butcher
3191
+ Mark_Cairns_(squash_player)
3192
+ Mark_Calcavecchia
3193
+ Mark_Chaloner
3194
+ Mark_Crear
3195
+ Mark_Davis_(golfer)
3196
+ Mark_Dekker
3197
+ Mark_Ealham
3198
+ Mark_Gleeson_(journalist)
3199
+ Mark_Hughes
3200
+ Mark_Hutton
3201
+ Mark_Ilott
3202
+ Mark_Kennedy_(footballer)
3203
+ Mark_Knowles
3204
+ Mark_Lawrence_Wolf
3205
+ Mark_McGwire
3206
+ Mark_McNulty
3207
+ Mark_Mouland
3208
+ Mark_Pembridge
3209
+ Mark_Petkovsek
3210
+ Mark_Philippoussis
3211
+ Mark_Prescott
3212
+ Mark_Richardson_(athlete)
3213
+ Mark_Roe
3214
+ Mark_Rutherford_(footballer)
3215
+ Mark_Spitz
3216
+ Mark_Taylor_(cricketer)
3217
+ Mark_Thompson_(baseball)
3218
+ Mark_Waugh
3219
+ Mark_Woodforde
3220
+ Markham,_Ontario
3221
+ Marko_Koers
3222
+ Markus_Babbel
3223
+ Markus_Feldhoff
3224
+ Markus_Schopp
3225
+ Marlene_Thomsen
3226
+ Maronite_Church
3227
+ Marseille
3228
+ Marshall_Faulk
3229
+ Martela
3230
+ Martin_Brundle
3231
+ Martin_Damm
3232
+ Martin_Driller
3233
+ Martin_Heath
3234
+ Martin_Max
3235
+ Martin_McCague
3236
+ Martina_Ertl-Renz
3237
+ Martina_Hingis
3238
+ Marty_Cordova
3239
+ Marty_Nothstein
3240
+ Marubeni
3241
+ Marvin_Benard
3242
+ Marvin_Harrison
3243
+ Marxism
3244
+ Mary_Joe_Fernandez
3245
+ Mary_Onyali-Omagbemi
3246
+ Mary_Pierce
3247
+ Marylebone_Cricket_Club
3248
+ Masakiyo_Maezono
3249
+ Masami_Ihara
3250
+ Masayoshi_Takemura
3251
+ Masayuki_Okano
3252
+ Massachusetts
3253
+ Massimo_Cuttitta
3254
+ Massoud_Barzani
3255
+ Massoud_Rajavi
3256
+ Matagalpa
3257
+ Mate_Granić
3258
+ Mathias_Jack
3259
+ Mathias_Ntawulikura
3260
+ Mats_Lanner
3261
+ Mats_Wilander
3262
+ Matt_Burke
3263
+ Matt_Lawton
3264
+ Matt_Stairs
3265
+ Matthew_Le_Tissier
3266
+ Matthew_Maynard
3267
+ Matthew_Pinsent
3268
+ Matthew_Windows
3269
+ Matthias_Hagner
3270
+ Matthias_Sammer
3271
+ Matuzići
3272
+ Maurice_Baril
3273
+ Maurice_Graef
3274
+ Mauricio_Hadad
3275
+ Mauritania
3276
+ Mauritania_national_football_team
3277
+ Mauritius
3278
+ Mauritius_national_football_team
3279
+ Maurizio_Fondriest
3280
+ Mauro_Silva
3281
+ Maurício_Gugelmin
3282
+ Max_Delbrück
3283
+ Max_Sciandri
3284
+ Max_van_Heeswijk
3285
+ Małgorzata_Rydz
3286
+ McLaren
3287
+ Mecca
3288
+ Medellín
3289
+ Media_Indonesia
3290
+ Mediaset
3291
+ Medicare_(United_States)
3292
+ Mediterranean_Sea
3293
+ Megawati_Sukarnoputri
3294
+ Meghann_Shaughnessy
3295
+ Mehmet_Scholl
3296
+ Melbourne
3297
+ Melbourne_Cricket_Ground
3298
+ Melbourne_Football_Club
3299
+ Melrose_RFC
3300
+ Meluawati
3301
+ Melvin_Nieves
3302
+ Memphis_Grizzlies
3303
+ Menlo_Park,_California
3304
+ Mercedes-Benz
3305
+ Meredith_McGrath
3306
+ Merlene_Ottey
3307
+ Merrill_Lynch
3308
+ Metro_AG
3309
+ Metro_Manila
3310
+ Metropolitan_Fiber_Systems
3311
+ Metropolitan_Museum_of_Art
3312
+ Mette_Bergmann
3313
+ Mexican_Army
3314
+ Mexico
3315
+ Mexico_City
3316
+ Mexico_national_football_team
3317
+ Mhow
3318
+ Mia_Audina
3319
+ Miami
3320
+ Miami_Dolphins
3321
+ Miami_Heat
3322
+ Michael_Andersson
3323
+ Michael_Andretti
3324
+ Michael_Bevan
3325
+ Michael_Branch
3326
+ Michael_Campbell
3327
+ Michael_Chang
3328
+ Michael_Collins_(Irish_leader)
3329
+ Michael_Collins_(film)
3330
+ Michael_Di_Venuto
3331
+ Michael_Doohan
3332
+ Michael_Green_(sprinter)
3333
+ Michael_Heseltine
3334
+ Michael_Hübner
3335
+ Michael_J._Astrue
3336
+ Michael_Jeffrey
3337
+ Michael_Johnson_(athlete)
3338
+ Michael_Jones_(rugby_union)
3339
+ Michael_Jonzon
3340
+ Michael_Joyce_(tennis)
3341
+ Michael_Konsel
3342
+ Michael_Lynagh
3343
+ Michael_Marsh_(athlete)
3344
+ Michael_Möllenbeck
3345
+ Michael_Reiziger
3346
+ Michael_Schumacher
3347
+ Michael_Slater
3348
+ Michael_Stich
3349
+ Michael_Tebbutt
3350
+ Michael_Tucker_(baseball)
3351
+ Michael_Zorc
3352
+ Michaela_Dorfmeister
3353
+ Michel_Hansenne
3354
+ Michele_Bartoli
3355
+ Michelle_Freeman
3356
+ Michigan
3357
+ Michigan_City,_Indiana
3358
+ Michigan_National_Bank
3359
+ Michoacán
3360
+ Mick_McCarthy
3361
+ Mickey_Cochrane
3362
+ Mickey_Kantor
3363
+ Mickey_Rivers
3364
+ Microsoft
3365
+ Microsoft_Windows
3366
+ Middle_East
3367
+ Middle_East_Economic_Survey
3368
+ Middle_Kingdom_of_Egypt
3369
+ Middlesbrough_F.C.
3370
+ Middlesex_County_Cricket_Club
3371
+ Midwestern_United_States
3372
+ Miguel_Ángel_Jiménez
3373
+ Miguel_Ángel_Martín_(golfer)
3374
+ Miguel_Ángel_Nadal
3375
+ Mihai_Tararache
3376
+ Mika_Häkkinen
3377
+ Mika_Salo
3378
+ Mikael_Tillström
3379
+ Mike_Atherton
3380
+ Mike_Conley,_Sr.
3381
+ Mike_Devereaux
3382
+ Mike_Fetters
3383
+ Mike_Gatting
3384
+ Mike_Harwood
3385
+ Mike_Hulbert
3386
+ Mike_McCurry_(press_secretary)
3387
+ Mike_Mordecai
3388
+ Mike_Oquist
3389
+ Mike_Piazza
3390
+ Mike_Stanley
3391
+ Mike_Sullivan_(golfer)
3392
+ Mike_Tyson
3393
+ Mike_Watkinson
3394
+ Mike_Williams_(baseball)
3395
+ Mikhail_Gorbachev
3396
+ Mikhail_Kutuzov
3397
+ Mil_Mi-17
3398
+ Miladin_Bečanović
3399
+ Milan
3400
+ Milan_Kučan
3401
+ Miles_Tunnicliff
3402
+ Milinko_Pantić
3403
+ Million_Man_March
3404
+ Millwall_F.C.
3405
+ Miltiadis_Evert
3406
+ Milwaukee
3407
+ Milwaukee_Brewers
3408
+ Milwaukee_Bucks
3409
+ Minardi
3410
+ Mindanao
3411
+ Ming_Pao
3412
+ Minister_for_Foreign_Affairs_(Australia)
3413
+ Minister_of_the_Interior_(France)
3414
+ Ministry_of_Finance_(Chile)
3415
+ Ministry_of_Finance_(Japan)
3416
+ Ministry_of_Foreign_Affairs_(Israel)
3417
+ Ministry_of_Foreign_Affairs_(Poland)
3418
+ Ministry_of_Foreign_Affairs_(Turkey)
3419
+ Ministry_of_Foreign_Affairs_of_the_People's_Republic_of_China
3420
+ Minneapolis
3421
+ Minnesota
3422
+ Minnesota_Timberwolves
3423
+ Minnesota_Twins
3424
+ Minnesota_Vikings
3425
+ Minnetonka,_Minnesota
3426
+ Minsk
3427
+ Miriam_Oremans
3428
+ Miriam_Vogt
3429
+ Miron_Cozma
3430
+ Miroslav_Votava
3431
+ Miss_Universe
3432
+ Missionaries_of_Charity
3433
+ Mississippi_River
3434
+ Missouri
3435
+ Mitsubishi_Lancer
3436
+ Mitsubishi_Motors
3437
+ Mitsui
3438
+ Mladá_fronta_DNES
3439
+ Mo_Vaughn
3440
+ Mobil
3441
+ Mobutu_Sese_Seko
3442
+ Moengo
3443
+ Mohali
3444
+ Mohammad_Akram
3445
+ Mohammad_Azharuddin
3446
+ Moin_Khan
3447
+ Molde_FK
3448
+ Moldova
3449
+ Moldova_national_football_team
3450
+ Mona_Eltahawy
3451
+ Monaco
3452
+ Monarcas_Morelia
3453
+ Mongolia
3454
+ Monica_Seles
3455
+ Monrovia
3456
+ Mons_Ivar_Mjelde
3457
+ Mont-sur-Marchienne
3458
+ Montana
3459
+ Monte_Carlo
3460
+ Montenegrins
3461
+ Montenegro
3462
+ Monterey,_California
3463
+ Montgomery,_Alabama
3464
+ Montpelier,_Vermont
3465
+ Montpellier_HSC
3466
+ Montreal
3467
+ Montreal_Canadiens
3468
+ Montreal_Expos
3469
+ Montrose_F.C.
3470
+ Montserrat
3471
+ Morelia
3472
+ Morgan_Stanley
3473
+ Morinville,_Alberta
3474
+ Moro_National_Liberation_Front
3475
+ Morocco
3476
+ Moscow
3477
+ Moscow_Kremlin
3478
+ Mosenergo
3479
+ Moses_Kiptanui
3480
+ Moss_FK
3481
+ Moss_Landing_Marine_Laboratories
3482
+ Mostostal
3483
+ Mother_Teresa
3484
+ Motherwell_F.C.
3485
+ Motohiro_Yamaguchi
3486
+ Mountain_View,_California
3487
+ Moustapha_Niasse
3488
+ Movement_of_Socialist_Democrats
3489
+ Movladi_Udugov
3490
+ Mozambique
3491
+ Mozambique_national_football_team
3492
+ Mr._Clean
3493
+ Muammar_al-Gaddafi
3494
+ Muhammad
3495
+ Mumbai
3496
+ Munich
3497
+ Munich_Airport
3498
+ Munich_Re
3499
+ Murat_Yakin
3500
+ Murphy_Jensen
3501
+ Murrayfield_Stadium
3502
+ Mushtaq_Ahmed
3503
+ Muslim_Commercial_Bank
3504
+ Mustapha_Hadji
3505
+ Muttahida_Qaumi_Movement
3506
+ Muttiah_Muralitharan
3507
+ Muzaffarabad
3508
+ My_Turn_(memoir)
3509
+ Márcio_Roberto_dos_Santos
3510
+ Mário_Covas
3511
+ Mário_Jardel
3512
+ Médecins_Sans_Frontières
3513
+ MŠK_Rimavská_Sobota
3514
+ MŠK_Žilina
3515
+ N.E.C._(football_club)
3516
+ NAC_Breda
3517
+ NARAL_Pro-Choice_America
3518
+ NASDAQ
3519
+ NATO
3520
+ NATO_intervention_in_Bosnia
3521
+ NBC
3522
+ NK_Olimpija_Ljubljana_(1911)
3523
+ NK_Rudar_Velenje
3524
+ NK_Varaždin
3525
+ NSDAP/AO
3526
+ NTV_(Russia)
3527
+ Nabih_Berri
3528
+ Nablus
3529
+ Nagorno-Karabakh_Republic
3530
+ Nagoum_Yamassoum
3531
+ Nagoya
3532
+ Nagyatád
3533
+ Nail_Beširović
3534
+ Naina_Yeltsina
3535
+ Nairobi
3536
+ Namibia
3537
+ Namibia_national_football_team
3538
+ Nana_Miyagi
3539
+ Nanabhoy_Palkhivala
3540
+ Nancy_Reagan
3541
+ Nandrin
3542
+ Naoki_Soma
3543
+ Naoko_Sawamatsu
3544
+ Naoto_Kan
3545
+ Naples
3546
+ Napoleon_I
3547
+ Napoleon_III
3548
+ Naqoura
3549
+ Narciso_dos_Santos
3550
+ Naseem_Hamed
3551
+ Nasrallah_Boutros_Sfeir
3552
+ Nasser_Hussain
3553
+ Natalya_Pomoshchnikova-Voronova
3554
+ Natalya_Sadova
3555
+ Natalya_Shikolenko
3556
+ Natalya_Torshina-Alimzhanova
3557
+ Natasha_Zvereva
3558
+ Nate_Miller
3559
+ Nathalie_Dechy
3560
+ Nathalie_Lancien
3561
+ Nathalie_Tauziat
3562
+ Nathan_Astle
3563
+ Nation_of_Islam
3564
+ National_Air_Traffic_Services
3565
+ National_Alliance_(Italy)
3566
+ National_Assembly_of_Angola
3567
+ National_Bank_(Bangladesh)
3568
+ National_Bank_of_Poland
3569
+ National_Basketball_Association
3570
+ National_Collegiate_Athletic_Association
3571
+ National_Congress_of_Brazil
3572
+ National_Democratic_Alliance_(Sudan)
3573
+ National_Football_League
3574
+ National_Guard_of_the_United_States
3575
+ National_Hockey_League
3576
+ National_Human_Rights_Commission_(Mexico)
3577
+ National_Hurricane_Center
3578
+ National_Iraqi_News_Agency
3579
+ National_Islamic_Front
3580
+ National_League
3581
+ National_League_Central
3582
+ National_League_East
3583
+ National_League_West
3584
+ National_League_for_Democracy
3585
+ National_Liberation_Front_of_Corsica
3586
+ National_Olympic_Committee
3587
+ National_Park_Service
3588
+ National_Party_(South_Africa)
3589
+ National_Rifle_Association
3590
+ National_Tennis_Centre_(United_Kingdom)
3591
+ National_Transportation_Safety_Board
3592
+ National_University_of_Singapore
3593
+ National_Weather_Service
3594
+ National_stadium
3595
+ NationsBank
3596
+ Native_Americans_in_the_United_States
3597
+ Natural_Law_Party_(United_States)
3598
+ Nature_(journal)
3599
+ Navjot_Singh_Sidhu
3600
+ Nawaz_Sharif
3601
+ Nayan_Mongia
3602
+ Nazi_Germany
3603
+ Nazi_Party
3604
+ Nazism
3605
+ Neal_Lancaster
3606
+ Neale_Fraser
3607
+ Neath_RFC
3608
+ Nebraska
3609
+ Nebraska_Cornhuskers_football
3610
+ Necmettin_Erbakan
3611
+ Neemuch
3612
+ Neil_Back
3613
+ Neil_Fairbrother
3614
+ Neil_Hodgson
3615
+ Neil_Jordan
3616
+ Neil_Williams_(cricketer)
3617
+ Nelson,_New_Zealand
3618
+ Nelson_A._Miles
3619
+ Nelson_Mandela
3620
+ Nelė_Žilinskienė
3621
+ Neo-Nazism
3622
+ Nepal
3623
+ Nepali_Congress
3624
+ Netherlands
3625
+ Netherlands_Fed_Cup_team
3626
+ Netherlands_national_football_team
3627
+ Neuchâtel
3628
+ Neuchâtel_Xamax
3629
+ Nevada
3630
+ Nevil_Dede
3631
+ Nevill_Ground
3632
+ Neville_Godwin
3633
+ NewYork–Presbyterian_Hospital
3634
+ New_Aspiration_Party
3635
+ New_Caledonia
3636
+ New_Delhi
3637
+ New_Democracy_(Greece)
3638
+ New_England
3639
+ New_England_Patriots
3640
+ New_Hampshire,_Ohio
3641
+ New_Haven,_Connecticut
3642
+ New_Jersey
3643
+ New_Jersey_Devils
3644
+ New_Jersey_Nets
3645
+ New_Mexico
3646
+ New_Mexico_Activities_Association
3647
+ New_Orleans_Hornets
3648
+ New_Orleans_Saints
3649
+ New_Party_Sakigake
3650
+ New_Scientist
3651
+ New_South_Wales_rugby_league_team
3652
+ New_Territories
3653
+ New_York
3654
+ New_York_City
3655
+ New_York_City_Department_of_Transportation
3656
+ New_York_Cotton_Exchange
3657
+ New_York_Giants
3658
+ New_York_Islanders
3659
+ New_York_Jets
3660
+ New_York_Knicks
3661
+ New_York_Mercantile_Exchange
3662
+ New_York_Mets
3663
+ New_York_Post
3664
+ New_York_Rangers
3665
+ New_York_Stock_Exchange
3666
+ New_York_Yankees
3667
+ New_Zealand
3668
+ New_Zealand_Barbarians
3669
+ New_Zealand_First
3670
+ New_Zealand_Labour_Party
3671
+ New_Zealand_National_Party
3672
+ New_Zealand_Press_Association
3673
+ New_Zealand_Warriors
3674
+ New_Zealand_dollar
3675
+ New_Zealand_national_cricket_team
3676
+ New_Zealand_national_rugby_union_team
3677
+ New_Zealanders
3678
+ New_media
3679
+ Newark,_New_Jersey
3680
+ Newbridge_RFC
3681
+ Newbury,_Berkshire
3682
+ Newcastle_Knights
3683
+ Newcastle_United_F.C.
3684
+ Newcastle_upon_Tyne
3685
+ Newcrest_Mining
3686
+ Newfoundland_(island)
3687
+ Newmarket,_Suffolk
3688
+ Newmont_Mining_Corporation
3689
+ Newport_Gwent_Dragons
3690
+ Newport_RFC
3691
+ News_Corporation
3692
+ Newsnight
3693
+ Newt_Gingrich
3694
+ Niall_Quinn
3695
+ Nicaragua
3696
+ Nicaraguans
3697
+ Nice
3698
+ Nick_Faldo
3699
+ Nick_Farr-Jones
3700
+ Nick_Knight
3701
+ Nick_Nieland
3702
+ Nick_Popplewell
3703
+ Nick_Price
3704
+ Nicklas_Kulti
3705
+ Nicky_Boje
3706
+ Niclas_Fasth
3707
+ Nico-Jan_Hoogma
3708
+ Nico_Motchebon
3709
+ Nico_van_Kerckhoven
3710
+ Nicola_Mazzucato
3711
+ Nicolas_Ouédec
3712
+ Nicole_Arendt
3713
+ Nicole_Bradtke
3714
+ Nicole_Brown_Simpson
3715
+ Nicoleta_Grasu
3716
+ Nicolás_Lapentti
3717
+ Nicolás_Pereira
3718
+ Nicosia
3719
+ Nigel_Mobbs
3720
+ Nigel_Walker
3721
+ Niger
3722
+ Nigeria
3723
+ Nihon_Keizai_Shimbun
3724
+ Nikkei_225
3725
+ Nir_Sivilia
3726
+ Nixon_McLean
3727
+ Nobel_Peace_Prize
3728
+ Nobel_Prize
3729
+ Nobina_Sverige
3730
+ Noel_Gallagher
3731
+ Noel_Whelan
3732
+ Nolan_Henke
3733
+ Nolan_Ryan
3734
+ Noranda_(mining_company)
3735
+ Norberto_Téllez
3736
+ Nordegg,_Alberta
3737
+ Noriaki_Kasai
3738
+ Norifumi_Abe
3739
+ Norio_Omura
3740
+ Noriyuki_Haga
3741
+ Norm_Charlton
3742
+ Norm_Hewitt
3743
+ Norodom_Ranariddh
3744
+ Norodom_Sihanouk
3745
+ North_America
3746
+ North_Battleford
3747
+ North_Caucasus
3748
+ North_Dakota
3749
+ North_Island
3750
+ North_Kivu
3751
+ North_Korea
3752
+ North_Melbourne_Football_Club
3753
+ North_Queensland_Cowboys
3754
+ North_Shore_City
3755
+ North_Sydney_Bears
3756
+ North_West_(South_African_province)
3757
+ North_Yemen
3758
+ Northampton
3759
+ Northampton_Saints
3760
+ Northampton_Town_F.C.
3761
+ Northamptonshire_County_Cricket_Club
3762
+ Northern_Cape
3763
+ Northern_Ireland
3764
+ Northern_Ireland_national_football_team
3765
+ Northerns_cricket_team
3766
+ Norway
3767
+ Norwest
3768
+ Norwich_City_F.C.
3769
+ Notting_Hill_Carnival
3770
+ Nottingham
3771
+ Nottingham_Forest_F.C.
3772
+ Nottinghamshire_County_Cricket_Club
3773
+ Notts_County_F.C.
3774
+ Nouakchott
3775
+ Noureddine_Morceli
3776
+ Novye_Atagi
3777
+ Numico
3778
+ Nunthorpe_Stakes
3779
+ Nuremberg
3780
+ Nuremberg_Airport
3781
+ O._J._Simpson
3782
+ OFK_Beograd
3783
+ OGC_Nice
3784
+ OKD
3785
+ OKS_1945_Olsztyn
3786
+ OPEC
3787
+ OSCE_Minsk_Group
3788
+ O_Globo
3789
+ Oakland,_California
3790
+ Oakland,_New_Jersey
3791
+ Oakland_Athletics
3792
+ Oakland_Raiders
3793
+ Oasis_(band)
3794
+ Oaxaca
3795
+ Oceano_da_Cruz
3796
+ Odra_Wodzisław
3797
+ Oerlikon_(Zürich)
3798
+ Office_of_Fair_Trading
3799
+ Ogün_Temizkanoğlu
3800
+ Ohio
3801
+ Ohio_State_Buckeyes_football
3802
+ Okinawa_Prefecture
3803
+ Oklahoma
3804
+ Oklahoma_Panhandle
3805
+ Oklahoma_State_University–Stillwater
3806
+ Okocim
3807
+ Oksana_Grishina
3808
+ Olabisi_Afolabi
3809
+ Olaf_Ludwig
3810
+ Olaf_Thon
3811
+ Old_City_(Jerusalem)
3812
+ Old_Testament
3813
+ Old_Trafford
3814
+ Old_Trafford_Cricket_Ground
3815
+ Oldham_Athletic_A.F.C.
3816
+ Oldham_Roughyeds
3817
+ Ole_Einar_Bjørndalen
3818
+ Ole_Gunnar_Solskjær
3819
+ Olga_Chernyavskaya
3820
+ Olga_Melnik
3821
+ Olga_Slyusareva
3822
+ Olin_Browne
3823
+ Oliver_Bierhoff
3824
+ Oliver_Kahn
3825
+ Oliver_McCall
3826
+ Oliver_Reck
3827
+ Olivier_Grouillard
3828
+ Olivier_Panis
3829
+ Olo_Brown
3830
+ Olympiacos_F.C.
3831
+ Olympiad
3832
+ Olympic_Airlines
3833
+ Olympic_sports
3834
+ Olympique_Khouribga
3835
+ Olympique_Lyonnais
3836
+ Olympique_de_Marseille
3837
+ Omaha,_Nebraska
3838
+ Oman
3839
+ Omar_Bongo
3840
+ Omar_Karami
3841
+ On_the_Waterfront
3842
+ Ong_Ewe_Hock
3843
+ Opel
3844
+ Operation_Provide_Comfort
3845
+ Oppenheimer_Holdings
3846
+ Oradea_International_Airport
3847
+ Oran
3848
+ Orel_Hershiser
3849
+ Organisation_of_the_Islamic_Conference
3850
+ Orhan_Çıkırıkçı
3851
+ Orlando_Magic
3852
+ Orlando_Miller
3853
+ Orlando_Pace
3854
+ Orlando_Pirates_FC
3855
+ Orrell_R.U.F.C.
3856
+ Orvieto
3857
+ Osaka
3858
+ Osaka_International_Airport
3859
+ Oscar_Luigi_Scalfaro
3860
+ Oslo
3861
+ Oslobođenje
3862
+ Osmond_Ezinwa
3863
+ Otis_Nixon
3864
+ Ottawa
3865
+ Ottawa_67's
3866
+ Ottawa_Senators
3867
+ Ottoman_Empire
3868
+ Ove_Sellberg
3869
+ Ovidiu_Stângă
3870
+ Owen_Finegan
3871
+ Oxford
3872
+ Oxford_United_F.C.
3873
+ Ozzie_Guillén
3874
+ Oğuz_Çetin
3875
+ P._W._Botha
3876
+ PBC_CSKA_Moscow
3877
+ PFC_CSKA_Moscow
3878
+ PFC_Levski_Sofia
3879
+ PFC_Nyva_Vinnytsia
3880
+ PGA_European_Tour
3881
+ PGA_Tour
3882
+ PMC-Sierra
3883
+ PSINet
3884
+ PSV_Eindhoven
3885
+ Pablo_Sánchez
3886
+ Pacific_Division_(NBA)
3887
+ Pacific_Division_(NHL)
3888
+ Pacific_Exchange
3889
+ Pacific_Rim
3890
+ Pacific_coast
3891
+ Paine_Webber
3892
+ Pakistan
3893
+ Pakistan_Muslim_League_(N)
3894
+ Pakistan_national_cricket_team
3895
+ Palestine
3896
+ Palestinian_Legislative_Council
3897
+ Palestinian_National_Authority
3898
+ Palestinian_territories
3899
+ Palk_Strait
3900
+ Pallacanestro_Treviso
3901
+ Pam_Shriver
3902
+ Pamir_Mountains
3903
+ Pamplona
3904
+ Pan-Turkism
3905
+ Pan_Am_Flight_103
3906
+ Pan_American_World_Airways
3907
+ Panama
3908
+ Panama_City
3909
+ Panama_national_football_team
3910
+ Panathinaikos_BC
3911
+ Panathinaikos_F.C.
3912
+ Panhellenic_Socialist_Movement
3913
+ Panionios_B.C.
3914
+ Paola_Suárez
3915
+ Paolo_Di_Canio
3916
+ Paolo_Negro
3917
+ Paolo_Vaccari
3918
+ Papendrecht
3919
+ Papua_New_Guinea
3920
+ Paraguay
3921
+ Paralympic_Games
3922
+ Paramaribo
3923
+ Paramount,_California
3924
+ Paramount_Pictures
3925
+ Paraná_Clube
3926
+ Parc_des_Princes
3927
+ Paris
3928
+ Paris,_Arkansas
3929
+ Paris-Charles_de_Gaulle_Airport
3930
+ Paris_Saint-Germain_F.C.
3931
+ Paris_Saint_Germain_(rugby_league_team)
3932
+ Park_Sung-hee
3933
+ Parma_F.C.
3934
+ Parramatta_Eels
3935
+ Partick_Thistle_F.C.
3936
+ Partition_of_India
3937
+ Party_of_Democratic_Action
3938
+ Party_of_Democratic_Socialism_(Germany)
3939
+ Pará
3940
+ Pascal_Renier
3941
+ Pascal_Zuberbühler
3942
+ Pat_Cash
3943
+ Pat_Hentgen
3944
+ Pat_Howard
3945
+ Pat_Hurst
3946
+ Pat_Manson
3947
+ Pat_McGinlay
3948
+ Pat_Symcox
3949
+ Patrice_Loko
3950
+ Patricia_Girard
3951
+ Patrick_Colleter
3952
+ Patrick_Kluivert
3953
+ Patrick_Rafter
3954
+ Patrick_Sang
3955
+ Patrick_Seale
3956
+ Patrick_Stevens
3957
+ Patrick_Sylvestre
3958
+ Patrick_Vieira
3959
+ Patrik_Andersson
3960
+ Patrik_Sjöberg
3961
+ Patriotic_Salvation_Movement
3962
+ Patriotic_Union_of_Kurdistan
3963
+ Patsy_Kensit
3964
+ Pattaya
3965
+ Paul_Adams_(cricketer)
3966
+ Paul_Affleck
3967
+ Paul_Belmondo
3968
+ Paul_Broadhurst
3969
+ Paul_Curry_(golfer)
3970
+ Paul_Eales
3971
+ Paul_Gascoigne
3972
+ Paul_Goydos
3973
+ Paul_Haarhuis
3974
+ Paul_Ince
3975
+ Paul_Johnson_(cricketer)
3976
+ Paul_Johnson_(squash_player)
3977
+ Paul_Justin
3978
+ Paul_Keating
3979
+ Paul_Koech
3980
+ Paul_Lawrie
3981
+ Paul_McGinley
3982
+ Paul_McGrath_(footballer)
3983
+ Paul_Newlove
3984
+ Paul_Peschisolido
3985
+ Paul_Prichard
3986
+ Paul_Reiffel
3987
+ Paul_Sorrento
3988
+ Paul_Stankowski
3989
+ Paul_Strang
3990
+ Paul_Tergat
3991
+ Paul_Tracy
3992
+ Paul_Van_Himst
3993
+ Paul_Warhurst
3994
+ Paul_Wright_(footballer)
3995
+ Paula_Radcliffe
3996
+ Pauline_Davis-Thompson
3997
+ Pauline_Konga
3998
+ Paulinho_Santos
3999
+ Paulo_Alves
4000
+ Paulo_Bento
4001
+ Paulo_Roberto_do_Carmo
4002
+ Paulo_Sousa
4003
+ Paulo_Sérgio_Silvestre_do_Nascimento
4004
+ Pavel_Buran
4005
+ Pavel_Bure
4006
+ Pavel_Muslimov
4007
+ Pavel_Nedvěd
4008
+ Pavel_Polomský
4009
+ Payne_Stewart
4010
+ Peace_River_(Canada)
4011
+ Pedro_Diniz
4012
+ Pedro_I_of_Brazil
4013
+ Pedro_Lamy
4014
+ Pedro_Linhart
4015
+ Pedro_Martínez
4016
+ Peer_Gynt
4017
+ Pemberton,_British_Columbia
4018
+ Pembroke,_Massachusetts
4019
+ Pembroke_Pines,_Florida
4020
+ Pendleton,_Oregon
4021
+ Pennsylvania
4022
+ Penrith_Panthers
4023
+ People's_Party_(Spain)
4024
+ People's_Party_–_Movement_for_a_Democratic_Slovakia
4025
+ People's_Republic_of_China
4026
+ People_of_the_United_States
4027
+ Per_Nyman
4028
+ Perm
4029
+ Pernilla_Wiberg
4030
+ Peronism
4031
+ Persian_Gulf
4032
+ Perth,_Western_Australia
4033
+ Peru
4034
+ Perugia_Calcio
4035
+ Pesalai
4036
+ Pete_Harnisch
4037
+ Pete_Incaviglia
4038
+ Pete_Sampras
4039
+ Pete_Wilson
4040
+ Peter_Atherton
4041
+ Peter_Baker_(golfer)
4042
+ Peter_Beardsley
4043
+ Peter_Blank
4044
+ Peter_Dubovský
4045
+ Peter_Hedblom
4046
+ Peter_Johansson_(motorcyclist)
4047
+ Peter_Kox
4048
+ Peter_Martin_(cricketer)
4049
+ Peter_Mitchell_(golfer)
4050
+ Peter_Munk
4051
+ Peter_Nicol
4052
+ Peter_Schöttel
4053
+ Peter_Such
4054
+ Peter_Tramacchi
4055
+ Peter_van_Vossen
4056
+ Peterborough_United_F.C.
4057
+ Petersburg,_Virginia
4058
+ Petr_Gabriel
4059
+ Petr_Korda
4060
+ Petra_Behle
4061
+ Petra_Langrová
4062
+ Petra_News_Agency
4063
+ Petre_Roman
4064
+ Phil_Babb
4065
+ Phil_Gray
4066
+ Phil_Mickelson
4067
+ Phil_Regan_(baseball)
4068
+ Phil_Simmons
4069
+ Philadelphia
4070
+ Philadelphia_76ers
4071
+ Philadelphia_Eagles
4072
+ Philadelphia_Flyers
4073
+ Philadelphia_Phillies
4074
+ Philip_Morris_International
4075
+ Philip_Walton
4076
+ Philippe_Ermenault
4077
+ Philippe_Sella
4078
+ Philippe_Séguin
4079
+ Philippine_Basketball_Association
4080
+ Philippines
4081
+ Phillip_Cocu
4082
+ Phillip_DeFreitas
4083
+ Phillip_Price
4084
+ Phnom_Penh
4085
+ Phoenix,_Arizona
4086
+ Phoenix_Coyotes
4087
+ Phoenix_Suns
4088
+ Phylis_Smith
4089
+ Piacenza_Calcio
4090
+ Picabo_Street
4091
+ Pierluigi_Casiraghi
4092
+ Pierre-Henri_Raphanel
4093
+ Pierre_Buyoya
4094
+ Pierre_Fulke
4095
+ Pierre_Laigle
4096
+ Pierre_Lueders
4097
+ Pierre_van_Hooijdonk
4098
+ Pilot_Pen_Tennis
4099
+ Piper_Jaffray
4100
+ Pirelli
4101
+ Pittsburgh
4102
+ Pittsburgh_Penguins
4103
+ Pittsburgh_Pirates
4104
+ Pittsburgh_Steelers
4105
+ Place_Beauvau
4106
+ Plymouth,_Montserrat
4107
+ Plymouth_Argyle_F.C.
4108
+ Podujevo
4109
+ Pohang_Steelers
4110
+ Pol_Pot
4111
+ Poland
4112
+ Poland_national_football_team
4113
+ Poles
4114
+ Polish_Press_Agency
4115
+ Polonia_Warszawa
4116
+ Pontefract
4117
+ Pontiac,_Michigan
4118
+ Pontypridd
4119
+ Pontypridd_RFC
4120
+ Pope_John_Paul_II
4121
+ Popular_Revolutionary_Army
4122
+ Porsche
4123
+ Port_Arthur,_Tasmania
4124
+ Port_Louis
4125
+ Port_Vale_F.C.
4126
+ Portadown_F.C.
4127
+ Portland,_Oregon
4128
+ Portland_Trail_Blazers
4129
+ Portsmouth
4130
+ Portsmouth_F.C.
4131
+ Portugal
4132
+ Portugal_national_football_team
4133
+ Portuguese_Empire
4134
+ Portuguese_Liga
4135
+ Post-Communism
4136
+ Post-Soviet_states
4137
+ Potsdam
4138
+ Poul-Erik_Høyer_Larsen
4139
+ Prague
4140
+ Prague_Stock_Exchange
4141
+ Prakash_Chandra_Lohani
4142
+ Predrag_Mijatović
4143
+ Premier_League
4144
+ Prensa_Latina
4145
+ Press_Trust_of_India
4146
+ Preston
4147
+ Preston_North_End_F.C.
4148
+ Prestwick
4149
+ Pretoria
4150
+ Primož_Peterka
4151
+ Prince_Eugene_of_Savoy
4152
+ Prince_Rupert,_British_Columbia
4153
+ Prince_William_Sound
4154
+ Princeton_N._Lyman
4155
+ Pro_Bowl
4156
+ Pro_Football_Hall_of_Fame
4157
+ Professional_Squash_Association
4158
+ Progressive_Field
4159
+ Province_of_Mantua
4160
+ Provisional_Irish_Republican_Army
4161
+ Prudential_plc
4162
+ Public_Transport_Corporation
4163
+ Public_utilities_commission
4164
+ Pudong
4165
+ Puebla_F.C.
4166
+ Puerto_Rico
4167
+ Puertollano
4168
+ Punjab,_Pakistan
4169
+ Pyramiden
4170
+ Pádraig_Harrington
4171
+ Pável_Pardo
4172
+ Qasr-e_Shirin
4173
+ Qian_Qichen
4174
+ Qing_Dynasty
4175
+ Queen's_Park,_Chesterfield
4176
+ Queen_of_the_South_F.C.
4177
+ Queens_Park_Rangers_F.C.
4178
+ Queensland
4179
+ Quentin_Coryatt
4180
+ Quetta
4181
+ Quintana_Roo
4182
+ Quito
4183
+ Qur'an
4184
+ R.E._Mouscron
4185
+ R.S.C._Anderlecht
4186
+ R.W.D._Molenbeek
4187
+ R._Charleroi_S.C.
4188
+ R._Nicholas_Burns
4189
+ RAI
4190
+ RAO_UES
4191
+ RCD_Espanyol
4192
+ RC_Lens
4193
+ RC_Strasbourg
4194
+ RKC_Waalwijk
4195
+ RMS_Titanic
4196
+ RSA_Security
4197
+ Rabat
4198
+ Rabobank
4199
+ Rabobank_(cycling_team)
4200
+ Racing_Club_de_Avellaneda
4201
+ Racing_de_Santander
4202
+ Radek_Bonk
4203
+ Radical_Cause
4204
+ Radio_Kabul
4205
+ Radio_New_Zealand
4206
+ Radiotelevision_of_Bosnia-Herzegovina
4207
+ Radka_Zrubáková
4208
+ Rafael_Alkorta
4209
+ Rafael_Palmeiro
4210
+ Rafic_Hariri
4211
+ Rahul_Dravid
4212
+ Rainier_III,_Prince_of_Monaco
4213
+ Raith_Rovers_F.C.
4214
+ Raja_Casablanca
4215
+ Raków_Częstochowa
4216
+ Ralf_Kelleners
4217
+ Rama
4218
+ Ramallah
4219
+ Rambo_(film_series)
4220
+ Rameswaram
4221
+ Ramla
4222
+ Ramon_Vega
4223
+ Ramón_Delgado
4224
+ Ramón_Ramírez
4225
+ Randall_Cunningham
4226
+ Randy_Johnson
4227
+ Randy_Jones_(bobsleigh)
4228
+ Randy_Velarde
4229
+ Rangers_F.C.
4230
+ Ranji_Trophy
4231
+ Ransart,_Belgium
4232
+ Raphaël_Wicky
4233
+ Ras_Tanura
4234
+ Rashid_Latif
4235
+ Rashid_Sidek
4236
+ Ratlam
4237
+ Ravenna_Calcio
4238
+ Ravi_River
4239
+ Ravindra_Pushpakumara
4240
+ Ray_Buchanan
4241
+ Ray_Durham
4242
+ Ray_Houghton
4243
+ Ray_Knight
4244
+ Ray_Lankford
4245
+ Raymond_Atteveld
4246
+ Raymond_James
4247
+ Raymond_Russell
4248
+ Rayo_Vallecano
4249
+ Raúl_González
4250
+ Reading_F.C.
4251
+ Reading_R.F.C.
4252
+ Real_Betis
4253
+ Real_Madrid_C.F.
4254
+ Real_Oviedo
4255
+ Real_Sociedad
4256
+ Real_Valladolid
4257
+ Real_Zaragoza
4258
+ Recep_Çetin
4259
+ Red_Brigades
4260
+ Red_River_of_the_North
4261
+ Red_Sea
4262
+ Red_Star_Belgrade
4263
+ Rede_Globo
4264
+ Redmond,_Washington
4265
+ Reform_Party_of_the_United_States_of_America
4266
+ Regi_Blinker
4267
+ Regina_Jacobs
4268
+ Reinhard_Schwarzenberger
4269
+ Remington_Model_8
4270
+ Renate_Götschl
4271
+ Rennae_Stubbs
4272
+ Rennes
4273
+ Renzo_Furlan
4274
+ René_(novella)
4275
+ René_Eijkelkamp
4276
+ René_Schneider_(footballer)
4277
+ René_Tretschok
4278
+ Repsol_YPF
4279
+ Republic_of_China
4280
+ Republic_of_Ireland
4281
+ Republic_of_Ireland_national_football_team
4282
+ Republic_of_Macedonia
4283
+ Republic_of_Serbian_Krajina
4284
+ Republic_of_the_Congo
4285
+ Republican_National_Convention
4286
+ Republican_Party_(United_States)
4287
+ Republican_Rally_for_Democracy_in_Rwanda
4288
+ Republika_(Indonesian_newspaper)
4289
+ Republika_Srpska
4290
+ Resalat_(newspaper)
4291
+ Reserve_Bank_of_Australia
4292
+ Reserve_Bank_of_India
4293
+ Resistance_movement
4294
+ Retief_Goosen
4295
+ Reto_Götschi
4296
+ Reuters
4297
+ Revolutionary_Armed_Forces_of_Colombia
4298
+ Revolutionary_United_Front
4299
+ Reynald_Pedros
4300
+ Rheinpark_Stadion
4301
+ Ricardo_Peláez
4302
+ Ricardo_Rosset
4303
+ Ricardo_Sá_Pinto
4304
+ Ricco_Groß
4305
+ Richard_Hadlee
4306
+ Richard_Hannon
4307
+ Richard_Kettleborough
4308
+ Richard_Krajicek
4309
+ Richard_Lamm
4310
+ Richard_Roelofsen
4311
+ Richard_Witschge
4312
+ Richey_Reneberg
4313
+ Richmond,_Virginia
4314
+ Richmond_Football_Club
4315
+ Richter_magnitude_scale
4316
+ Rick_Huisman
4317
+ Rickey_Henderson
4318
+ Ricky_Ponting
4319
+ Ricky_Watters
4320
+ Riga
4321
+ Rika_Hiraki
4322
+ Rio_Tinto_Alcan
4323
+ Rio_de_Janeiro
4324
+ Riverside_Ground
4325
+ Riyadh
4326
+ Rob_Andrew
4327
+ Rob_Enderle
4328
+ Rob_Howley
4329
+ Robbie_Earle
4330
+ Robbie_Fowler
4331
+ Robbie_McEwen
4332
+ Robbie_Winters
4333
+ Robert_Allenby
4334
+ Robert_Coles_(golfer)
4335
+ Robert_Croft
4336
+ Robert_Karlsson
4337
+ Robert_Mugabe
4338
+ Robert_P._Casey
4339
+ Robert_Pelletreau
4340
+ Robert_Pirès
4341
+ Robert_Reichel
4342
+ Robert_Rubin
4343
+ Robert_Samuels
4344
+ Robert_Schuman
4345
+ Robert_W._Baird
4346
+ Roberta_Brunet
4347
+ Roberto_Alomar
4348
+ Roberto_Ayala
4349
+ Roberto_Baggio
4350
+ Roberto_Benigni
4351
+ Roberto_Bisconti
4352
+ Roberto_Carlos_(Spanish_footballer)
4353
+ Roberto_Carlos_(footballer)
4354
+ Roberto_Carretero
4355
+ Roberto_Chiappa
4356
+ Roberto_Durán
4357
+ Roberto_M._Levingston
4358
+ Roberto_Mancini
4359
+ Robin_Brooke
4360
+ Robin_Ventura
4361
+ Rochdale_A.F.C.
4362
+ Rochester,_New_Hampshire
4363
+ Rocky
4364
+ Rocky_Coppinger
4365
+ Rod_Beck
4366
+ Roda_JC
4367
+ Rodney_Eyles
4368
+ Roermond
4369
+ Rogel_Nachum
4370
+ Roger_Black
4371
+ Roger_Chapman_(golfer)
4372
+ Roger_Clemens
4373
+ Roger_García_Junyent
4374
+ Roger_Kingdom
4375
+ Roger_Pavlik
4376
+ Rogers_Cup_(tennis)
4377
+ Rohan_Robinson
4378
+ Roland_Holder
4379
+ Rolf_Ekéus
4380
+ Rolf_Fringer
4381
+ Rolf_Sørensen
4382
+ Roman_Empire
4383
+ Romania
4384
+ Romania_national_football_team
4385
+ Romania_national_under-21_football_team
4386
+ Romanian_National_Unity_Party
4387
+ Romanians
4388
+ Romano_Prodi
4389
+ Rome
4390
+ Romesh_Kaluwitharana
4391
+ Ron_Wright_(baseball)
4392
+ Ronald_Brunmayr
4393
+ Ronald_Goldman
4394
+ Ronald_Hamming
4395
+ Ronald_Reagan
4396
+ Ronald_Waterreus
4397
+ Ronald_de_Boer
4398
+ Ronaldo
4399
+ Ronan_Rafferty
4400
+ Ronde_van_Nederland
4401
+ Ronnie_Irani
4402
+ Roosevelt_family
4403
+ Rory_Underwood
4404
+ Rosario,_Santa_Fe
4405
+ Rosario_Central
4406
+ Rose_Bowl_(stadium)
4407
+ Rose_Cheruiyot
4408
+ Rosenborg_BK
4409
+ Roshan_Mahanama
4410
+ Ross_County_F.C.
4411
+ Ross_McFarlane
4412
+ Ross_Perot
4413
+ Rostelecom
4414
+ Rotary_International
4415
+ Rotherham_United_F.C.
4416
+ Rotterdam
4417
+ Rottweiler
4418
+ Rovereto
4419
+ Rowanduz
4420
+ Roxbury,_Vermont
4421
+ Roy_Aitken
4422
+ Roy_Keane
4423
+ Royal_Antwerp_FC
4424
+ Royal_Boskalis_Westminster
4425
+ Royal_Clipper
4426
+ Royal_Free_Hospital
4427
+ Royal_Meteorological_Institute
4428
+ Royal_Tunbridge_Wells
4429
+ Royce_Clayton
4430
+ Rubber_Board
4431
+ Ruben_Kruger
4432
+ Rubens_Barrichello
4433
+ Rubén_Sierra
4434
+ Ruch_Chorzów
4435
+ Rudi_Vata
4436
+ Rudyard,_Montana
4437
+ Rugby_Football_Union
4438
+ Rugby_World_Cup
4439
+ Rugby_football
4440
+ Rugby_league
4441
+ Rugby_union
4442
+ Ruggiero_Rizzitelli
4443
+ Rui_Barros
4444
+ Rui_Correia
4445
+ Rui_Costa
4446
+ Rui_Madeira
4447
+ Rumbek
4448
+ Run_(baseball)
4449
+ Rupert_Murdoch
4450
+ Ruse,_Bulgaria
4451
+ Russ_Cochran
4452
+ Russell_Claydon
4453
+ Russia
4454
+ Russia_national_football_team
4455
+ Russian_Empire
4456
+ Russian_Trading_System
4457
+ Rusty_Greer
4458
+ Ruth_Perry
4459
+ Ruxandra_Dragomir
4460
+ Rwanda
4461
+ Ryder_Cup
4462
+ Ryne_Sandberg
4463
+ Ryszard_Wieczorek
4464
+ Ryutaro_Hashimoto
4465
+ Régine_Cavagnoud
4466
+ S&P_500
4467
+ S.C._Braga
4468
+ S.C._Espinho
4469
+ S.L._Benfica
4470
+ S.S.C._Napoli
4471
+ S.S._Lazio
4472
+ SBC_Communications
4473
+ SBM_Offshore
4474
+ SC_Bastia
4475
+ SC_Freiburg
4476
+ SC_Heerenveen
4477
+ SC_Tavriya_Simferopol
4478
+ SD_Compostela
4479
+ SEGRO
4480
+ SFC_Opava
4481
+ SIDOR
4482
+ SK_Brann
4483
+ SK_Dynamo_České_Budějovice
4484
+ SK_Rapid_Wien
4485
+ SK_Sigma_Olomouc
4486
+ SK_Sturm_Graz
4487
+ SNCF
4488
+ SPAL_1907
4489
+ SV_Ried
4490
+ SV_Werder_Bremen
4491
+ Sabina_Panzanini
4492
+ Sabine_Appelmans
4493
+ Sabine_Hack
4494
+ Sabri_Lamouchi
4495
+ Sachin_Tendulkar
4496
+ Sacramento_Kings
4497
+ Saddam_Hussein
4498
+ Saeb_Erekat
4499
+ Saeed_Anwar
4500
+ Saffet_Sancaklı
4501
+ Saint_Croix,_U.S._Virgin_Islands
4502
+ Saint_Nicholas
4503
+ Sakai,_Osaka
4504
+ Salah_Hissou
4505
+ Salang,_Afghanistan
4506
+ Sale_Grammar_School
4507
+ Sale_Sharks
4508
+ Saleem_Malik
4509
+ Sali_Berisha
4510
+ Sally_Barsosio
4511
+ Sally_Boyden_(cyclist)
4512
+ Salomon_Brothers
4513
+ Salzburg
4514
+ Sammy_Sosa
4515
+ Samson_Kitur
4516
+ Samsung_Heavy_Industries
4517
+ Samuel_Matete
4518
+ San_Antonio_Spurs
4519
+ San_Diego
4520
+ San_Diego_Chargers
4521
+ San_Diego_Padres
4522
+ San_Francisco
4523
+ San_Francisco_49ers
4524
+ San_Francisco_Giants
4525
+ San_Jose,_California
4526
+ San_Jose_Sharks
4527
+ San_José,_Costa_Rica
4528
+ San_Lorenzo_de_Almagro
4529
+ San_Marino
4530
+ San_Marino_national_football_team
4531
+ San_Mateo,_California
4532
+ Sanath_Jayasuriya
4533
+ Sandon_Stolle
4534
+ Sandra_Cacic
4535
+ Sandra_Cecchini
4536
+ Sandra_Kleinová
4537
+ Sandrine_Testud
4538
+ Sandusky,_Ohio
4539
+ Santa_Barbara,_California
4540
+ Santa_Claus
4541
+ Santiago,_Chile
4542
+ Santiago_Bernabéu_Stadium
4543
+ Santiago_de_Cuba
4544
+ Santos_FC
4545
+ Saqlain_Mushtaq
4546
+ Saracens_F.C.
4547
+ Sarah_Brady
4548
+ Sarah_Pitkowski-Malcor
4549
+ Sarah_Thorsett
4550
+ Sarajevo
4551
+ Sardinia
4552
+ Saskatchewan
4553
+ Saskatchewan_Wheat_Pool
4554
+ Satoshi_Higashi
4555
+ Sauber
4556
+ Saudi_Arabia
4557
+ Saudi_Press_Agency
4558
+ Saxony
4559
+ Scarborough_F.C.
4560
+ Schering_AG
4561
+ Scotland
4562
+ Scotland_national_football_team
4563
+ Scotland_national_rugby_union_team
4564
+ Scotland_national_under-21_football_team
4565
+ Scott_Brosius
4566
+ Scott_Draper
4567
+ Scott_Erickson
4568
+ Scott_Hoch
4569
+ Scott_Humphries
4570
+ Scott_McCarron
4571
+ Scott_McGrory
4572
+ Scott_Quinnell
4573
+ Scott_Russell_(motorcyclist)
4574
+ Scott_Sanders_(baseball)
4575
+ Scott_Young_(footballer)
4576
+ Scottish_Cup
4577
+ Scottish_Highlands
4578
+ Scottish_Labour_Party
4579
+ Scottish_League_Cup
4580
+ Scottish_National_Party
4581
+ Scottish_Premier_League
4582
+ Scuderia_Ferrari
4583
+ Scunthorpe_United_F.C.
4584
+ Sean_Berry
4585
+ Sean_Dundee
4586
+ Sean_Fitzpatrick
4587
+ Sean_Olsson
4588
+ Seattle
4589
+ Seattle_Mariners
4590
+ Seattle_Seahawks
4591
+ Seattle_SuperSonics
4592
+ Sebastian_Lindholm
4593
+ Sebastien_Tortelli
4594
+ Second_French_Empire
4595
+ Security_Council_of_Russia
4596
+ Senegal
4597
+ Seoul
4598
+ Sepp_Dostthaler
4599
+ Serbia
4600
+ Serbs
4601
+ Sergey_Alexandrovich_Makarov
4602
+ Sergey_Klevchenya
4603
+ Sergey_Yastrzhembsky
4604
+ Sergi_Barjuán
4605
+ Sergi_Bruguera
4606
+ Sergiyev_Posad
4607
+ Serhiy_Rebrov
4608
+ Serie_A
4609
+ Servais_Knaven
4610
+ Servet_Pëllumbi
4611
+ Servette_FC
4612
+ Severiano_Ballesteros
4613
+ Sevilla_FC
4614
+ Seville
4615
+ Seychelles
4616
+ Seychelles_national_football_team
4617
+ Shadab_Kabir
4618
+ Shahid_Afridi
4619
+ Shane_Kelly
4620
+ Shane_Reynolds
4621
+ Shane_Warne
4622
+ Shanghai
4623
+ Shannon_Airport
4624
+ Sharjah_(city)
4625
+ Sharjah_(emirate)
4626
+ Shatoy
4627
+ Shaun_Pollock
4628
+ Shaun_Young
4629
+ Shawn_Estes
4630
+ Shay_Given
4631
+ Shayne_King
4632
+ Sheffield
4633
+ Sheffield_Eagles
4634
+ Sheffield_Shield
4635
+ Sheffield_United_F.C.
4636
+ Sheffield_Wednesday_F.C.
4637
+ Sheikh_Hasina
4638
+ Shelbourne_F.C.
4639
+ Shell_Oil_Company
4640
+ Shell_Turbo_Chargers
4641
+ Shem_Kororia
4642
+ Sherwin_Campbell
4643
+ Shigeki_Maruyama
4644
+ Shimon_Peres
4645
+ Shinichi_Itoh
4646
+ Shining_Path
4647
+ Shiraz
4648
+ Shivnarine_Chanderpaul
4649
+ Shrewsbury_Town_F.C.
4650
+ Shu_Kamo
4651
+ Shuzo_Matsuoka
4652
+ Shwedagon_Pagoda
4653
+ Sialkot
4654
+ Siam_Commercial_Bank
4655
+ Sibiu_International_Airport
4656
+ Sicily
4657
+ Sidwell_Friends_School
4658
+ Sidya_Touré
4659
+ Sierakowice,_Pomeranian_Voivodeship
4660
+ Sierra_Leone
4661
+ Sierra_Leone_national_football_team
4662
+ Sierra_Nevada_(U.S.)
4663
+ Sigurd_Njerve
4664
+ Silicon_Valley
4665
+ Silke_Renk
4666
+ Silvia_Farina_Elia
4667
+ Silvio_Berlusconi
4668
+ Simon_Brown_(cricketer)
4669
+ Simon_Crafar
4670
+ Simon_Culhane
4671
+ Simon_Doull
4672
+ Simon_Parke
4673
+ Simone_Greiner-Petter-Memm
4674
+ Sinaloa
4675
+ Sindh_High_Court
4676
+ Singapore
4677
+ Singapore_International_Monetary_Exchange
4678
+ Singer_World_Series
4679
+ Sisters_of_Loreto
4680
+ Sivas
4681
+ Six_Nations_Championship
4682
+ Sjeng_Schalken
4683
+ Skeid_Fotball
4684
+ Skeleton_Canyon
4685
+ Skhirat
4686
+ Skopje
4687
+ Sky_Digital_(UK_&_Ireland)
4688
+ Slavia_Prague
4689
+ Sliema
4690
+ Slobodan_Milošević
4691
+ Slovakia
4692
+ Slovakia_national_football_team
4693
+ Slovenia
4694
+ Small_Is_Beautiful
4695
+ Social_Democratic_Party_(Japan)
4696
+ Social_Democratic_Party_(Romania)
4697
+ Social_Democratic_Party_of_Germany
4698
+ Socialist_Federal_Republic_of_Yugoslavia
4699
+ Socialist_Party_of_Albania
4700
+ Socialist_Party_of_Serbia
4701
+ Sociedade_Esportiva_Palmeiras
4702
+ Sofia
4703
+ SoftBank
4704
+ Sokoto
4705
+ Solidere
4706
+ Somalia
4707
+ Somerset_County_Cricket_Club
4708
+ Son_Sen
4709
+ Sonora
4710
+ Sophia_Loren
4711
+ Sourav_Ganguly
4712
+ South_Africa
4713
+ South_Africa_national_cricket_team
4714
+ South_Africa_national_field_hockey_team
4715
+ South_Africa_national_rugby_union_team
4716
+ South_African_Breweries
4717
+ South_America
4718
+ South_China_Morning_Post
4719
+ South_Dakota
4720
+ South_Island
4721
+ South_Kivu
4722
+ South_Korea
4723
+ South_Lebanon_Army
4724
+ South_Queensland_Crushers
4725
+ South_Sydney_Rabbitohs
4726
+ South_Yemen
4727
+ Southampton
4728
+ Southampton_F.C.
4729
+ Southeast_Asia
4730
+ Southend_United_F.C.
4731
+ Southern_California_Open
4732
+ Southern_Illinois_University_Carbondale
4733
+ Southern_New_England_Telecommunications
4734
+ Soviet_Union
4735
+ Spain
4736
+ Spain_Fed_Cup_team
4737
+ Spanish_language
4738
+ Spanish_people
4739
+ Sparta_Prague
4740
+ Sparta_Rotterdam
4741
+ Speed_Skating_World_Cup
4742
+ Spetses
4743
+ Spirou_Charleroi
4744
+ Split_(city)
4745
+ Sport_Club_Corinthians_Paulista
4746
+ Sport_Club_Internacional
4747
+ Sport_Club_do_Recife
4748
+ Sporting_Clube_de_Portugal
4749
+ Sporting_de_Gijón
4750
+ Sprague,_Manitoba
4751
+ Springfield,_Massachusetts
4752
+ Srebrenica
4753
+ Sri_Lanka
4754
+ Sri_Lanka_national_cricket_team
4755
+ Sri_Lankan_Tamil_people
4756
+ Srinagar
4757
+ St._Albans_(city),_Vermont
4758
+ St._Gallen
4759
+ St._George_Dragons
4760
+ St._George_Illawarra_Dragons
4761
+ St._Johnstone_F.C.
4762
+ St._Leger_Stakes
4763
+ St._Louis,_Missouri
4764
+ St._Louis_Blues_(ice_hockey)
4765
+ St._Louis_Cardinals
4766
+ St._Mirren_F.C.
4767
+ St._Petersburg,_Florida
4768
+ St_Albans_City_F.C.
4769
+ St_Helens,_Merseyside
4770
+ St_Helens_RLFC
4771
+ St_Kilda_Football_Club
4772
+ Stabæk_Fotball
4773
+ Stade_Malherbe_Caen
4774
+ Stade_Rennais_F.C.
4775
+ Stagecoach_Group
4776
+ Standard_&_Poor's
4777
+ Standard_Liège
4778
+ Stansted_Mountfitchet
4779
+ Staouéli
4780
+ State_Council_of_the_People's_Republic_of_China
4781
+ State_of_Palestine
4782
+ Statistics_Canada
4783
+ Statoil
4784
+ Stefaan_De_Clerck
4785
+ Stefan_Edberg
4786
+ Stefan_Johansson
4787
+ Stefan_Kuntz
4788
+ Stefan_Reuter
4789
+ Stefan_Schwarz
4790
+ Stefano_Pescosolido
4791
+ Stefano_Tilli
4792
+ Stefano_Zanini
4793
+ Steffen_Freund
4794
+ Steffi_Graf
4795
+ Stefka_Kostadinova
4796
+ Stenhousemuir_F.C.
4797
+ Stephan_Marasek
4798
+ Stephanie_Storp
4799
+ Stephen_Ames
4800
+ Stephen_Brown_(athlete)
4801
+ Stephen_McAllister
4802
+ Stephen_Pate
4803
+ Stephenville_International_Airport
4804
+ Steptoe_&_Johnson
4805
+ Sterling_Hitchcock
4806
+ Steve_Backley
4807
+ Steve_Finley
4808
+ Steve_Howey_(footballer)
4809
+ Steve_Jones_(golfer)
4810
+ Steve_McManaman
4811
+ Steve_Redgrave
4812
+ Steve_Staunton
4813
+ Steve_Stricker
4814
+ Steve_Waugh
4815
+ Steve_Webster
4816
+ Steve_van_Vuuren
4817
+ Stevenage_F.C.
4818
+ Stewart_Ginn
4819
+ Steyr
4820
+ Stirling
4821
+ Stirling_Albion_F.C.
4822
+ Stockholm
4823
+ Stockport_County_F.C.
4824
+ Stoke_City_F.C.
4825
+ Straits_of_Florida
4826
+ Stranraer
4827
+ Stranraer_F.C.
4828
+ Strasbourg
4829
+ Strobe_Talbott
4830
+ Stuart_Appleby
4831
+ Stuart_Cage
4832
+ Stuart_Law
4833
+ Stuart_McCall
4834
+ Stung_Treng
4835
+ Stuttgart
4836
+ Stuttgart_Airport
4837
+ Stéphane_Chapuisat
4838
+ Stéphane_Guivarc'h
4839
+ Stéphane_Henchoz
4840
+ Stéphane_Ortelli
4841
+ Stéphane_Simian
4842
+ Subaru
4843
+ Subaru_Impreza
4844
+ Subaru_Impreza_WRX_STI
4845
+ Subaru_World_Rally_Team
4846
+ Sud-PTT
4847
+ Sudan
4848
+ Sudan_Airways
4849
+ Sudan_national_football_team
4850
+ Sudbury_Town_F.C.
4851
+ Suharto
4852
+ Sui_Southern_Gas_Company
4853
+ Sulaymaniyah
4854
+ Sultan,_Crown_Prince_of_Saudi_Arabia
4855
+ Summer_Olympic_Games
4856
+ SunGard
4857
+ Sun_Jun_(badminton)
4858
+ Sun_Microsystems
4859
+ Sunday_Bada
4860
+ Sunderland_A.F.C.
4861
+ Super_League
4862
+ Supercopa_de_España
4863
+ Supercoppa_Italiana
4864
+ Superman_(film)
4865
+ Supreme_Court_of_Florida
4866
+ Supreme_Court_of_New_South_Wales
4867
+ Supreme_Court_of_the_Republic_of_China
4868
+ Supreme_Court_of_the_United_States
4869
+ Supreme_Federal_Court
4870
+ Surabaya
4871
+ Suraj_Bhan
4872
+ Suriname
4873
+ Surrey
4874
+ Surrey_County_Cricket_Club
4875
+ Surčin
4876
+ Susan_Roosevelt_Weld
4877
+ Susi_Susanti
4878
+ Sussex_County_Cricket_Club
4879
+ Suwon_Samsung_Bluewings
4880
+ Suzuka_Circuit
4881
+ Suzuki
4882
+ Sven_Fischer
4883
+ Sven_Nylander
4884
+ Sven_Pieters
4885
+ Sven_Strüver
4886
+ Svetlana_Masterkova
4887
+ Svetlana_Paramygina
4888
+ Svetlana_Zalevskaya
4889
+ Swansea_City_A.F.C.
4890
+ Swansea_RFC
4891
+ Sweden
4892
+ Sweden_men's_national_ice_hockey_team
4893
+ Sweden_national_football_team
4894
+ Swerford
4895
+ Swindon_Town_F.C.
4896
+ Swiss_Bank_Corporation
4897
+ Switzerland
4898
+ Switzerland_national_football_team
4899
+ Sydney
4900
+ Sydney_Cricket_Ground
4901
+ Sydney_Roosters
4902
+ Sydney_Swans
4903
+ Sylvain_Bouchard
4904
+ Sylvester_Stallone
4905
+ Syria
4906
+ Syria_national_football_team
4907
+ São_Paulo
4908
+ São_Paulo_(state)
4909
+ São_Paulo_FC
4910
+ Sébastien_Foucras
4911
+ Sébastien_Fournier
4912
+ Sébastien_Lareau
4913
+ Sérgio_Conceição
4914
+ Sören_Lausberg
4915
+ Søren_Jessen-Petersen
4916
+ Süddeutsche_Zeitung
4917
+ Süleyman_Demirel
4918
+ Sławomir_Wojciechowski
4919
+ T-72
4920
+ TSV_1860_München
4921
+ TVM_(cycling_team)
4922
+ TV_Guide
4923
+ Ta_Mok
4924
+ Tadayuki_Okada
4925
+ Tae_Satoya
4926
+ Taibe,_Galilee
4927
+ Taipei
4928
+ Taiwan
4929
+ Taiwan_Strait
4930
+ Tajikistan
4931
+ Takanobu_Okabe
4932
+ Takuma_Aoki
4933
+ Takuya_Takagi
4934
+ Taliban
4935
+ Tallahassee,_Florida
4936
+ Tallinn
4937
+ Tamaulipas
4938
+ Tami_Whitlinger
4939
+ Tamil_Eelam_Liberation_Organization
4940
+ Tamil_Nadu
4941
+ Tamil_people
4942
+ Tampa_Bay_Buccaneers
4943
+ Tampa_Bay_Lightning
4944
+ Tampere
4945
+ Tampico,_Tamaulipas
4946
+ Tando_Allahyar
4947
+ Tang_Jiaxuan
4948
+ Tanja_Damaske
4949
+ Tanjug
4950
+ Tanjung_Priok
4951
+ Tansu_Çiller
4952
+ Tanya_Dubnicoff
4953
+ Tanzania
4954
+ Tanzania_national_football_team
4955
+ Tarek_Jabban
4956
+ Tariq_Aziz
4957
+ Tarmac_(company)
4958
+ Tarpon_Springs,_Florida
4959
+ Tartus
4960
+ Tasmania
4961
+ Tasmanian_Tigers
4962
+ Tatarstan
4963
+ Tatiana_Stiajkina
4964
+ Tatjana_Mittermayer
4965
+ Tatyana_Babashkina
4966
+ Tauranga
4967
+ Tavildara
4968
+ Tayfun_Korkut
4969
+ Tea_Vikstedt-Nyman
4970
+ Team_Polti
4971
+ Teddy_Sheringham
4972
+ Teemu_Selänne
4973
+ Tegla_Loroupe
4974
+ Tegucigalpa
4975
+ Tehran
4976
+ Tel_Aviv
4977
+ Telegraph_Agency_of_the_Soviet_Union
4978
+ Televisa
4979
+ Television_New_Zealand
4980
+ Telfer_Mine
4981
+ Telkom_Indonesia
4982
+ Telmex
4983
+ Telstra
4984
+ Tempe,_Arizona
4985
+ Ten_Commandments
4986
+ Tennessee
4987
+ Tennessee_Titans
4988
+ Teresa
4989
+ Terry_Kennedy
4990
+ Terry_Mulholland
4991
+ Terry_Pendleton
4992
+ Terry_Phelan
4993
+ Terry_Price_(golfer)
4994
+ Terry_Steinbach
4995
+ Texas
4996
+ Texas_Panhandle
4997
+ Texas_Rangers_(baseball)
4998
+ Thabo_Mbeki
4999
+ Thai_Nation_Party
5000
+ Thailand
5001
+ The_Ashes
5002
+ The_Bahamas
5003
+ The_Bank_of_Tokyo-Mitsubishi_UFJ
5004
+ The_Conference_Board
5005
+ The_Crying_Game
5006
+ The_Daily_Telegraph_(Australia)
5007
+ The_Hague
5008
+ The_Holocaust
5009
+ The_Irish_Times
5010
+ The_Jakarta_Post
5011
+ The_Jerusalem_Post
5012
+ The_Jordan_Times
5013
+ The_Lancet
5014
+ The_Land_is_Ours
5015
+ The_Miami_Herald
5016
+ The_Nation
5017
+ The_New_Saints_F.C.
5018
+ The_New_York_Times
5019
+ The_Open_Championship
5020
+ The_Oval
5021
+ The_Pentagon
5022
+ The_Statesman
5023
+ The_Sumitomo_Bank
5024
+ The_Sun_(United_Kingdom)
5025
+ The_Sunday_Telegraph
5026
+ The_Times
5027
+ The_Vancouver_Sun
5028
+ The_Wall_Street_Journal
5029
+ The_Walt_Disney_Company
5030
+ The_Washington_Post
5031
+ Theodore_Roosevelt
5032
+ Theodoros_Pangalos
5033
+ Thessaloniki
5034
+ Thierry_Henry
5035
+ Thomas_Bjørn
5036
+ Thomas_Bscher
5037
+ Thomas_Enqvist
5038
+ Thomas_Helmer
5039
+ Thomas_Häßler
5040
+ Thomas_Johansson
5041
+ Thomas_Muster
5042
+ Thomas_Rådström
5043
+ Thomas_Seeliger
5044
+ Thomas_Sobotzik
5045
+ Thomas_Stickroth
5046
+ Thomas_Strunz
5047
+ Thomas_Stuer-Lauridsen
5048
+ Thunder_Bay
5049
+ Tiananmen_Square
5050
+ Tianjin
5051
+ Tibet
5052
+ Tiburones_Rojos_de_Veracruz
5053
+ Tiger_Woods
5054
+ Tignes
5055
+ Tijani_Babangida
5056
+ Tijuana
5057
+ Tim_Belcher
5058
+ Tim_Forsyth
5059
+ Tim_Hancock
5060
+ Tim_Henman
5061
+ Tim_Herron
5062
+ Tim_Horan
5063
+ Tim_Lobinger
5064
+ Tim_Munton
5065
+ Tim_Simpson
5066
+ Tim_Stimpson
5067
+ Tim_Wakefield
5068
+ Timişoara
5069
+ Tina_Križan
5070
+ Tino_Martinez
5071
+ Tirana
5072
+ Todd_Hollandsworth
5073
+ Todd_Martin
5074
+ Todd_Stottlemyre
5075
+ Todd_Van_Poppel
5076
+ Todd_Woodbridge
5077
+ Todd_Worrell
5078
+ Todor_Zhivkov
5079
+ Toftir
5080
+ Togo
5081
+ Togo_national_football_team
5082
+ Tokyo
5083
+ Tokyo_Stock_Exchange
5084
+ Toledo,_Ohio
5085
+ Tolunay_Kafkas
5086
+ Tom_Boyd_(footballer)
5087
+ Tom_Candiotti
5088
+ Tom_Cruise
5089
+ Tom_Daschle
5090
+ Tom_Glavine
5091
+ Tom_Ikimi
5092
+ Tom_Johnson_(boxer)
5093
+ Tom_Kiernan
5094
+ Tom_Lehman
5095
+ Tom_Moody
5096
+ Tom_Nyariki
5097
+ Tom_Pagnozzi
5098
+ Tom_Pukstys
5099
+ Tom_Steels
5100
+ Tom_Watson_(golfer)
5101
+ Tomasz_Moskal
5102
+ Tommy_Haas
5103
+ Tommy_Martyn
5104
+ Tommy_Thompson
5105
+ Tomás_Carbonell
5106
+ Tonga
5107
+ Toni_Polster
5108
+ Tony_Adams_(footballer)
5109
+ Tony_Blair
5110
+ Tony_Cascarino
5111
+ Tony_Clark
5112
+ Tony_Eusebio
5113
+ Tony_Greig
5114
+ Tony_Jarrett
5115
+ Tony_Johnstone
5116
+ Tony_P._Hall
5117
+ Tony_Roche
5118
+ Tony_Siragusa
5119
+ Tony_Underwood
5120
+ Tony_Vairelles
5121
+ Toomas_Savi
5122
+ Topoľčany
5123
+ Tops_Markets_LLC
5124
+ Torino_F.C.
5125
+ Toronto
5126
+ Toronto_Blue_Jays
5127
+ Toronto_Maple_Leafs
5128
+ Toronto_Raptors
5129
+ Toros_Neza
5130
+ Torquay_United_F.C.
5131
+ Torrance_Zellner
5132
+ Tottenham_Hotspur_F.C.
5133
+ Tour_de_France
5134
+ Tour_de_Suisse
5135
+ Towers_Perrin
5136
+ Toyota
5137
+ Toyota_Australia
5138
+ Toyota_Celica
5139
+ Tranmere_Rovers_F.C.
5140
+ Trans_World_Airlines
5141
+ Transcendental_Meditation
5142
+ Transvaal_Province
5143
+ Transylvania
5144
+ Travis_Fryman
5145
+ Treaty_of_Baden
5146
+ Trenidad_Hubbard
5147
+ Trent_Bridge
5148
+ Treorchy_RFC
5149
+ Trevor_Dodds
5150
+ Trincomalee
5151
+ Trinidad
5152
+ Tripoli,_Lebanon
5153
+ Tristan_Hoffman
5154
+ Troy,_Tennessee
5155
+ Troy_Corser
5156
+ Troy_Vincent
5157
+ Tubmanburg
5158
+ Tunceli_Province
5159
+ Tunis
5160
+ Tunisia
5161
+ Tunisia_national_football_team
5162
+ Turin
5163
+ Turkey_national_football_team
5164
+ Turkish_Armed_Forces
5165
+ Turkish_Cypriots
5166
+ Turnhout
5167
+ Tuscany
5168
+ Tutsi
5169
+ Tuzla
5170
+ Twickenham_Stadium
5171
+ Twyford_Down
5172
+ Ty_Detmer
5173
+ Tychy
5174
+ Tyler_Houston
5175
+ Tyrrell_Racing
5176
+ Tórshavn
5177
+ Túlio_Costa
5178
+ U.C._Sampdoria
5179
+ U.S._Bank_Championship_in_Milwaukee
5180
+ U.S._Città_di_Palermo
5181
+ U.S._Cremonese
5182
+ U.S._Lecce
5183
+ U.S._Open_(golf)
5184
+ U._Chandana
5185
+ UBS
5186
+ UCI_Road_World_Cup
5187
+ UD_Salamanca
5188
+ UEFA
5189
+ UEFA_Champions_League
5190
+ UEFA_Cup_Winners'_Cup
5191
+ UEFA_Euro_1992
5192
+ UEFA_Euro_1996
5193
+ UEFA_Europa_League
5194
+ UEFA_European_Football_Championship
5195
+ UEFA_Intertoto_Cup
5196
+ UNITA
5197
+ USS_Carl_Vinson_(CVN-70)
5198
+ USS_Enterprise_(CVN-65)
5199
+ USTA_Billie_Jean_King_National_Tennis_Center
5200
+ US_Airways
5201
+ US_Open_(tennis)
5202
+ Udinese_Calcio
5203
+ Uganda
5204
+ Uganda_national_football_team
5205
+ Ujjain
5206
+ Ukraine
5207
+ Ukraine_national_football_team
5208
+ Ukyo_Katayama
5209
+ Ulan_Bator
5210
+ Ulf_Kirsten
5211
+ Ulsan
5212
+ Ulsan_Hyundai_FC
5213
+ Ulysses_S._Grant
5214
+ Umberto_Bossi
5215
+ Umbria
5216
+ Umkomaas
5217
+ Umm_Qasr
5218
+ Unified_Communist_Party_of_Nepal_(Maoist)
5219
+ Union_Bank_of_Switzerland
5220
+ Union_Luxembourg
5221
+ United_Arab_Emirates
5222
+ United_Arab_Emirates_national_football_team
5223
+ United_Daughters_of_the_Confederacy
5224
+ United_Kingdom
5225
+ United_Liberation_Movement_of_Liberia_for_Democracy_–_Johnson_faction
5226
+ United_National_Party
5227
+ United_Nations
5228
+ United_Nations_Commission_on_Human_Rights
5229
+ United_Nations_High_Commissioner_for_Refugees
5230
+ United_Nations_Interim_Force_in_Lebanon
5231
+ United_Nations_Security_Council
5232
+ United_Nations_Special_Commission
5233
+ United_News_of_India
5234
+ United_States
5235
+ United_States_Air_Force
5236
+ United_States_Amateur_Championship_(golf)
5237
+ United_States_Army
5238
+ United_States_Congress
5239
+ United_States_Court_of_Appeals_for_the_Fourth_Circuit
5240
+ United_States_Department_of_Agriculture
5241
+ United_States_Department_of_Commerce
5242
+ United_States_Department_of_Defense
5243
+ United_States_Department_of_State
5244
+ United_States_Department_of_the_Treasury
5245
+ United_States_Fed_Cup_team
5246
+ United_States_Marine_Corps
5247
+ United_States_Navy
5248
+ United_States_Senate_Committee_on_Armed_Services
5249
+ United_States_Senate_Select_Committee_on_Intelligence
5250
+ United_States_Tennis_Association
5251
+ United_States_Treasury_security
5252
+ United_States_courts_of_appeals
5253
+ United_States_district_court
5254
+ United_States_dollar
5255
+ United_States_men's_national_soccer_team
5256
+ University_of_Michigan
5257
+ University_of_Oklahoma
5258
+ University_of_Oxford
5259
+ University_of_Pennsylvania
5260
+ University_of_Yangon
5261
+ University_of_the_Witwatersrand
5262
+ Unión_de_Santa_Fe
5263
+ Uppsala
5264
+ Uppsala_University_Hospital
5265
+ Upul_Chandana
5266
+ Ural_Mountains
5267
+ Uruguay
5268
+ Urus-Martan
5269
+ Utah
5270
+ Utah_Jazz
5271
+ Uttar_Pradesh
5272
+ Uyo
5273
+ Uzbekistan
5274
+ Uzbekistan_national_football_team
5275
+ Uzi
5276
+ V.C._Eendracht_Aalst_2002
5277
+ Vadim_Sashurin
5278
+ Vaduz
5279
+ Vail,_Colorado
5280
+ Valdo_Filho
5281
+ Vale_(mining_company)
5282
+ Valencia,_Spain
5283
+ Valencia_CF
5284
+ Valentina_Fedyushina
5285
+ Valero_Energy_Corporation
5286
+ Valero_Texas_Open
5287
+ Valletta
5288
+ Valletta_F.C.
5289
+ Vampeta
5290
+ Van,_Turkey
5291
+ Vancouver
5292
+ Vancouver_Canucks
5293
+ Vanderbijlpark
5294
+ Variety_(magazine)
5295
+ Vasas_SC
5296
+ Vatican_City
5297
+ Vatican_Library
5298
+ Vavuniya
5299
+ Veliko_Tarnovo
5300
+ Velten
5301
+ Venezuela
5302
+ Venice
5303
+ Venice_Film_Festival
5304
+ Venkatesh_Prasad
5305
+ Venray
5306
+ Ventura,_California
5307
+ VeriSign
5308
+ Verona
5309
+ Veronica_Brenner
5310
+ VfB_Stuttgart
5311
+ VfL_Bochum
5312
+ Viacom_(1971–2005)
5313
+ Viatcheslav_Ekimov
5314
+ Vicente_Solano_Lima
5315
+ Vicenza_Calcio
5316
+ Victor_Babeş
5317
+ Victoria_(Australia)
5318
+ Victoria_Falls,_Zimbabwe
5319
+ Victorian_Bushrangers
5320
+ Vidadi_Rzayev
5321
+ Videoton_FC
5322
+ Vienna
5323
+ Vietnam
5324
+ Vietnam_War
5325
+ Viking_FK
5326
+ Viktor_Chernomyrdin
5327
+ Viktor_Paço
5328
+ Ville_Peltonen
5329
+ Vince_Lombardi
5330
+ Vince_Spadea
5331
+ Vince_Wells
5332
+ Vinko_Marinović
5333
+ Vinny_Castilla
5334
+ Vinnytsia
5335
+ Vinod_Kambli
5336
+ Violeta_Chamorro
5337
+ Viorel_Ion
5338
+ Viorel_Moldovan
5339
+ Virakesari
5340
+ Virgilijus_Alekna
5341
+ Virginia
5342
+ Virginia_Ruano_Pascual
5343
+ Virginia_Tech_Hokies_football
5344
+ Virtus_Bologna
5345
+ Vitali_Smirnov
5346
+ Vitesse
5347
+ Vitória_S.C.
5348
+ Viva_Zapata!
5349
+ Vladimir_Dubrovshchik
5350
+ Vladimir_Zhirinovsky
5351
+ Vladimír_Mečiar
5352
+ Vladimír_Šmicer
5353
+ Vogue_(magazine)
5354
+ Volker_Rühe
5355
+ Volkswagen
5356
+ Volkswagen_Group
5357
+ Volkswagen_Passat
5358
+ Vtm
5359
+ Václav_Havel
5360
+ Václav_Klaus
5361
+ Vålerenga_Fotball
5362
+ Vénuste_Niyongabo
5363
+ Víctor_Sánchez
5364
+ Vítor_Baía
5365
+ WA_Reds
5366
+ WBUR
5367
+ WGC-Bridgestone_Invitational
5368
+ WSC_World_XI
5369
+ Waiau,_Canterbury
5370
+ Waitrose
5371
+ Wales
5372
+ Wales_national_football_team
5373
+ Wales_national_rugby_union_team
5374
+ Wales_national_under-21_football_team
5375
+ Walikale
5376
+ Wall_Street
5377
+ Wally_Joyner
5378
+ Wally_Whitehurst
5379
+ Walsall_F.C.
5380
+ Walter_Little_(rugby_player)
5381
+ Walter_Mondale
5382
+ Wang_Chen_(badminton)
5383
+ Wang_Shi-ting
5384
+ Waqar_Younis
5385
+ War_of_the_Spanish_Succession
5386
+ Warner_Bros.
5387
+ Warren_Buffett
5388
+ Warrington_Wolves
5389
+ Warsaw
5390
+ Warsaw_Pact
5391
+ Warwickshire_County_Cricket_Club
5392
+ Washington,_D.C.
5393
+ Washington_(U.S._state)
5394
+ Washington_Capitals
5395
+ Washington_Redskins
5396
+ Washington_Wizards
5397
+ Wasim_Akram
5398
+ Wataru_Yoshikawa
5399
+ Waterville,_Washington
5400
+ Watford_F.C.
5401
+ Watsonians_RFC
5402
+ Wayne_Ferreira
5403
+ Wayne_Riley
5404
+ Wayne_Westner
5405
+ Wednesday
5406
+ Wellington
5407
+ Wembley_Arena
5408
+ Wembley_Stadium
5409
+ Wen_Hui_Bao
5410
+ Wen_Wei_Po
5411
+ Wenatchee,_Washington
5412
+ Wenchang
5413
+ Wendy_Everson
5414
+ West_Azarbaijan_province
5415
+ West_Bengal
5416
+ West_Bromwich_Albion_F.C.
5417
+ West_Coast_Eagles
5418
+ West_Ham_United_F.C.
5419
+ West_Hartlepool_R.F.C.
5420
+ West_Indies_cricket_team
5421
+ West_South_Central_States
5422
+ West_Virginia
5423
+ Western_Australia
5424
+ Western_Bulldogs
5425
+ Western_Hemisphere
5426
+ Western_Province_cricket_team
5427
+ Western_Suburbs_Magpies
5428
+ Western_world
5429
+ Weston-super-Mare
5430
+ Whistler,_British_Columbia
5431
+ Whistler_Mountain
5432
+ White_House
5433
+ Whitehill_Welfare_F.C.
5434
+ Whitewater_controversy
5435
+ Whittier,_Alaska
5436
+ Widnes_Vikings
5437
+ Widzew_Łódź
5438
+ Wigan
5439
+ Wigan_Athletic_F.C.
5440
+ Wigan_Warriors
5441
+ Wilfried_Peeters
5442
+ Wilfried_Van_Moer
5443
+ Wilhelmina_of_the_Netherlands
5444
+ Will_Carling
5445
+ Willem_II_(football_club)
5446
+ William_Goldman
5447
+ William_Hill_(bookmaker)
5448
+ William_Jordan,_Baron_Jordan
5449
+ William_Perry
5450
+ William_Tanui
5451
+ William_Weld
5452
+ WilliamsF1
5453
+ Willie_Wood_(golfer)
5454
+ Wilson_Kipketer
5455
+ Wiltshire
5456
+ Wim_Jonk
5457
+ Wimbledon_Championships
5458
+ Wimbledon_F.C.
5459
+ Wincanton_Classic
5460
+ Windhoek
5461
+ Windows_95
5462
+ Windows_NT
5463
+ Windows_NT_4.0
5464
+ Winnipeg
5465
+ Winston_Bogarde
5466
+ Winston_Peters
5467
+ Wisconsin
5468
+ Wisła_Kraków
5469
+ Woking_F.C.
5470
+ Wolf_Point,_Montana
5471
+ Wolfgang_Feiersinger
5472
+ Wolfgang_Ischinger
5473
+ Wolverhampton_Wanderers_F.C.
5474
+ Woodridge,_Illinois
5475
+ Worcester
5476
+ Worcestershire_County_Cricket_Club
5477
+ Workington_Town
5478
+ World_Badminton_Grand_Prix
5479
+ World_Boxing_Organization
5480
+ World_Cup_of_Hockey
5481
+ World_Open
5482
+ World_Series
5483
+ World_Series_Cricket
5484
+ World_Series_of_Golf
5485
+ World_Trade_Organization
5486
+ World_War_II
5487
+ World_Wide_Web
5488
+ World_championship
5489
+ Worthing
5490
+ Wrexham_F.C.
5491
+ Wrocław
5492
+ Wuhan
5493
+ Wuxi
5494
+ Wyche_Fowler
5495
+ Wycombe_Wanderers_F.C.
5496
+ Wydad_Casablanca
5497
+ Xavier_Gravelaine
5498
+ Xhosa
5499
+ Xiao_Qiang
5500
+ Xinhua_News_Agency
5501
+ Xiomara_Rivero
5502
+ Xu_Nannan
5503
+ YMCA
5504
+ Yakoma_people
5505
+ Yamaha_Motor_Company
5506
+ Yangon
5507
+ Yangon_Technological_University
5508
+ Yap_Kim_Hock
5509
+ Yasser_Arafat
5510
+ Yassine_Abdellaoui
5511
+ Yasushi_Akashi
5512
+ Yasuto_Honda
5513
+ Yayuk_Basuki
5514
+ Ye_Zhaoying
5515
+ Yedioth_Ahronoth
5516
+ Yegor_Stroyev
5517
+ Yekaterina_Podkopayeva
5518
+ Yelena_Gulyayeva
5519
+ Yemen
5520
+ Yeosu
5521
+ Yerevan
5522
+ Yevgeny_Kafelnikov
5523
+ Yevgeny_Primakov
5524
+ Yingkou
5525
+ Yitzhak_Mordechai
5526
+ Yitzhak_Shamir
5527
+ Yoelbi_Quesada
5528
+ Yonhap
5529
+ Yonsei_University
5530
+ York
5531
+ York_City_F.C.
5532
+ Yorkshire
5533
+ Yorkshire_County_Cricket_Club
5534
+ Younes_El_Aynaoui
5535
+ Youri_Djorkaeff
5536
+ Youri_Mulder
5537
+ Yugoslavia
5538
+ Yukio_Hatoyama
5539
+ Yunnan
5540
+ Yuriy_Nikiforov
5541
+ Yvan_Quentin
5542
+ Yvonne_McGregor
5543
+ ZDF
5544
+ Zaandam
5545
+ Zabrze
5546
+ Zadar
5547
+ Zagreb
5548
+ Zagłębie_Lubin
5549
+ Zahoor_Elahi
5550
+ Zalaegerszegi_TE
5551
+ Zambia
5552
+ Zambia_national_football_team
5553
+ Zapatista_Army_of_National_Liberation
5554
+ Zastava_Arms
5555
+ Zbigniew_Siemiątkowski
5556
+ Zdeněk_Svoboda
5557
+ Zelimkhan_Yandarbiyev
5558
+ Zenith_Electronics
5559
+ Zevenaar
5560
+ Zhang_Ning
5561
+ Zhanna_Pintusevich-Block
5562
+ Zhejiang
5563
+ Zimbabwe
5564
+ Zimbabwe_Open
5565
+ Zimbabwe_national_cricket_team
5566
+ Zimbabwe_national_football_team
5567
+ Zina_Garrison
5568
+ Zine_El_Abidine_Ben_Ali
5569
+ Zinedine_Zidane
5570
+ Zinzan_Brooke
5571
+ Zionism
5572
+ Zoran_Savić
5573
+ Zulu_Kingdom
5574
+ Zvornik
5575
+ Zygmunt_Solorz-Żak
5576
+ Zëri_i_Popullit
5577
+ Zürich
5578
+ Àlex_Corretja
5579
+ Àlex_Crivillé
5580
+ Álvaro_Espinoza
5581
+ Ángeles_Montolio
5582
+ Åsa_Svensson
5583
+ Écu
5584
+ Élan_Béarnais_Pau-Orthez
5585
+ Éric_Bernard
5586
+ Óscar_Tabárez
5587
+ Östersund
5588
+ Újpest_FC
5589
+ Čelopek
5590
+ Ľubomír_Moravčík
5591
+ ŁKS_Łódź
5592
+ Śląsk_Wrocław
5593
+ Şırnak_Province
5594
+ ŠK_Slovan_Bratislava
5595
+ Šiauliai
5596
+ Šárka_Kašpárková
5597
+ Życie_Warszawy
5598
+ Željko_Petrović
resources/vocab/enwiki_20230827.txt ADDED
The diff for this file is too large to render. See raw diff
 
resources/vocab/out_of_domain.txt ADDED
The diff for this file is too large to render. See raw diff
 
span_annotation.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data structure classes required and used for multiple levels of granularity in spans.
3
+ """
4
+ from data_loader import dl_sa
5
+ from mosestokenizer import MosesDetokenizer
6
+ detokenize = MosesDetokenizer('en')
7
+
8
+
9
class PhraseAnnotation:
    """
    An ordered collection of word-level annotations that is treated as one phrase-level span.

    The phrase starts from a single word and grows through `add`. Its resolved annotation is taken from the
    first word and can later be overridden with `set_alternative_as_resolved_annotation`.
    """

    def __init__(self, initial_word):
        self.words = [initial_word]
        self._resolved_annotation = initial_word.resolved_annotation
        self.ppr_for_ned_candidates = initial_word.ppr_for_ned_candidates

    @property
    def has_valid_bioes_labels(self):
        """True only when every word reports valid BIOES labels and actually carries them."""
        # B = 0, I = 1, O = 2, E = 3, S = 4
        for word in self.words:
            if not (word.has_valid_bioes_labels and word.bioes_labels is not None):
                return False
        return True

    def add(self, word):
        """Append `word` to the phrase and narrow the PPRforNED candidates to those shared with it."""
        self.words.append(word)
        # There are some phrases that are annotated as O but have PPRforNED candidates, those will be ignored!
        is_entity_phrase = self._resolved_annotation > 0
        if is_entity_phrase and self.ppr_for_ned_candidates != word.ppr_for_ned_candidates:
            shared = set(self.ppr_for_ned_candidates) & set(word.ppr_for_ned_candidates)
            self.ppr_for_ned_candidates = list(shared)

    def all_possible_annotations(self):
        """
        Return `(candidate_id, average_confidence)` pairs for the candidate ids common to all words,
        ordered from the highest to the lowest average confidence.
        """
        common_ids = set.intersection(*[set([c[0] for c in word.candidates]) for word in self.words])
        scored = []
        for candidate_id in common_ids:
            # one mean probability per (word, matching candidate) pair, then the mean over the words
            per_word_means = [sum(c[1]) / len(c[1])
                              for word in self.words for c in word.candidates if c[0] == candidate_id]
            scored.append((candidate_id, sum(per_word_means) / len(per_word_means)))
        return sorted(scored, key=lambda pair: pair[1], reverse=True)

    def set_alternative_as_resolved_annotation(self, alternative):
        """Override the annotation resolved for this phrase."""
        self._resolved_annotation = alternative

    @property
    def resolved_annotation(self):
        """The annotation currently resolved for the whole phrase."""
        return self._resolved_annotation

    @property
    def subword_annotations(self):
        """All subword annotations of all words, flattened in order."""
        flattened = []
        for word in self.words:
            flattened.extend(word.annotations)
        return flattened

    @property
    def word_string(self):
        """The detokenized surface string of the phrase."""
        # NOTE(review): as rendered here, several pairs below replace a string with itself; they look like
        # byte-level BPE sequences whose first character was lost in transit - confirm against the repository.
        substitutions = (("\n", "\u010a"), ("£", "£"), ("âĦ¢", '™'), ('ü', 'ü'), ('é', 'é'), ('ÃŃ', 'í'))
        pieces = []
        for word in self.words:
            text = word.word_string
            for old, new in substitutions:
                text = text.replace(old, new)
            pieces.append(text)
        return detokenize(pieces)

    @property
    def begin_character(self):
        """First offset value of the first token of the first word."""
        # presumably token_offsets holds (token, (start, end)) pairs - verify against the caller
        first_word = self.words[0]
        return first_word.token_offsets[0][1][0]

    @property
    def end_character(self):
        """Last offset value of the last token of the last word."""
        last_word = self.words[-1]
        return last_word.token_offsets[-1][1][-1]

    @property
    def average_annotation_confidence(self):
        """Mean of the per-word resolved annotation confidences."""
        confidences = [word.resolved_annotation_confidence for word in self.words]
        return sum(confidences) / len(confidences)

    def __str__(self):
        # NOTE(review): `idx2tag` is not defined by SubwordAnnotation in this file - confirm where it is set.
        return f"{self.word_string} ({self.begin_character}, {self.end_character}) | annotation: " \
               f"{self.words[0].annotations[0].idx2tag[self.resolved_annotation]}"
64
+
65
+
66
class WordAnnotation:
    """
    Word-level annotation built from the subword annotations that make up one word.

    Attributes set by the constructor:
      - annotations: the subword annotations of the word (may be empty).
      - token_offsets: the `(token, offsets)` entries of the word's subwords.
      - ppr_for_ned_candidates: candidate list passed by the caller (defaults to a fresh empty list).
      - is_valid_annotation: False when no subword annotations were given.
      - word_string: the subword tokens concatenated with the '\u0120' marker removed.
      - candidates: `(candidate_id, per_subword_probabilities)` pairs, highest mean probability first.
      - resolved_annotation / resolved_annotation_confidence: the chosen id and its mean probability.
      - has_valid_bioes_labels / bioes_labels: BIOES information gathered from the subwords.
    """

    def __init__(self, subword_annotations, token_offsets, ppr_for_ned_candidates=None):
        if ppr_for_ned_candidates is None:
            ppr_for_ned_candidates = []
        self.annotations = subword_annotations
        self.token_offsets = token_offsets
        self.ppr_for_ned_candidates = ppr_for_ned_candidates
        self.is_valid_annotation = bool(subword_annotations)
        self.word_string = ''.join([x[0].replace('\u0120', '') for x in token_offsets])
        # even if self.is_valid_annotation is True we could still have the candidates to be empty
        # since there could be no consensus among the subword predictions.
        if self.is_valid_annotation:
            shared_ids = set.intersection(*[set(y.top_k_i_list) for y in self.annotations])
            unsorted_candidates = [(cid, self._get_assigned_probabilities(cid)) for cid in shared_ids]
        else:
            unsorted_candidates = []
        self.candidates = sorted(unsorted_candidates, key=lambda x: sum(x[1]) / len(x[1]), reverse=True)
        self.resolved_annotation = self._resolve_annotation()
        rc = self._get_assigned_probabilities(self.resolved_annotation)
        # With no subword annotations `rc` is empty; report zero confidence instead of dividing by zero.
        self.resolved_annotation_confidence = sum(rc) / len(rc) if rc else 0.0
        if not self.candidates:
            self.candidates = [(self.resolved_annotation, rc)]
        assert self.resolved_annotation in [x[0] for x in self.candidates]
        self.has_valid_bioes_labels = all([x.has_valid_bioes_label for x in self.annotations])
        self.bioes_labels = None if not self.has_valid_bioes_labels else [x.bioes_label for x in self.annotations]

    def _resolve_annotation(self):
        """Pick the annotation id for the word; returns 0 when there are no subword annotations."""
        if not self.is_valid_annotation:
            return 0
        r = [x.item() for x in self.annotations]
        if r.count(r[0]) == len(r):
            # every subword agrees on its top prediction
            annotation = r[0]
        elif self.candidates:
            # here we return the annotation with the highest average probability prediction over all the subwords
            annotation = self.candidates[0][0]
        else:
            # here we return the annotation which the model has predicted as highest probability for
            # the majority of the subwords
            most_frequent = max(set(r), key=r.count)
            if r.count(most_frequent) == 1:
                # no majority at all: fall back to the first subword's prediction
                annotation = r[0]
            else:
                annotation = most_frequent
        return annotation

    def _get_assigned_probabilities(self, cid):
        """Return, per subword, the probability assigned to `cid` (0.0 when `cid` is not in its top-k list)."""
        assigned_probabilities = []
        for a in self.annotations:
            found = False
            for i, p in zip(a.top_k_i_list, a.top_k_p_list):
                if i == cid:
                    assigned_probabilities.append(p)
                    found = True
                    break
            if not found:
                assigned_probabilities.append(0.0)
        assert len(assigned_probabilities) == len(self.annotations)
        return assigned_probabilities

    def __str__(self):
        # NOTE(review): `idx2tag` is not defined by SubwordAnnotation in this file - confirm where it is set.
        ann = self.annotations[0].idx2tag[self.resolved_annotation]
        cdns = ','.join([f'({self.annotations[0].idx2tag[x[0]]}: {sum(x[1])/len(x[1])})' for x in self.candidates])
        return f"{self.word_string} | annotation: {ann} | candidates: [{cdns}]"
127
+
128
+
129
class SubwordAnnotation:
    """
    The value of this class will be equal to the value of its "self.top_k_i_list[0]", the rest of the information
    will be carried over for future decision-making and evaluation.

    `top_k_p_list` and `top_k_i_list` are parallel lists: the i-th probability belongs to the i-th id.
    """
    def __init__(self, top_k_p_list, top_k_i_list, subword_string):
        self.top_k_p_list = top_k_p_list
        self.top_k_i_list = top_k_i_list
        subword_string = "UNDEF_STR" if not subword_string else subword_string
        self.subword_string = subword_string.replace('\u0120', '')
        # Defaults until `set_bioes_label` is called: label 2 ("O" in B=0, I=1, O=2, E=3, S=4), flagged invalid.
        self.bioes_label = 2
        self.has_valid_bioes_label = False
        self.bioes_probabilities = None

    def __eq__(self, other):
        """Compare by top prediction id; only ints and SubwordAnnotation instances are accepted."""
        if isinstance(other, int):
            return self.top_k_i_list[0] == other
        elif isinstance(other, SubwordAnnotation):
            return self.top_k_i_list[0] == other.top_k_i_list[0]
        else:
            raise ValueError

    def __str__(self):
        return f"({self.subword_string}, <<" \
               f"{'>> <<'.join([f'{dl_sa.mentions_itos[i]}: {p:.3f}' for i, p in zip(self.top_k_i_list, self.top_k_p_list)])}>>)"

    def item(self):
        """Return the id of the top prediction."""
        return self.top_k_i_list[0]

    def item_probability(self):
        """Return the probability of the top prediction."""
        return self.top_k_p_list[0]

    def set_bioes_label(self, label: int, probs: list):
        """Attach a BIOES label (0..4) together with the five class probabilities it was chosen from."""
        # There are exactly five BIOES classes (B=0, I=1, O=2, E=3, S=4), so 4 is the largest valid label.
        assert 0 <= label <= 4
        assert len(probs) == 5
        self.has_valid_bioes_label = True
        self.bioes_label = label
        self.bioes_probabilities = probs
utils.py ADDED
@@ -0,0 +1,545 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ This file provides many functionalities that can be shared among different components.
3
+ The most important function in this file is `chunk_annotate_and_merge_to_phrase` which receives a model and a raw text,
4
+ annotates the text, and returns the annotation spans.
5
+ """
6
+ import os
7
+ import json
8
+ import pickle
9
+ import string
10
+ from enum import Enum
11
+ from tqdm import tqdm
12
+
13
+ from data_loader import get_dataset, tokenizer, dl_sa
14
+ from span_annotation import SubwordAnnotation, WordAnnotation, PhraseAnnotation
15
+ from aida import AIDADataset
16
+ from configuration import get_resources_dir
17
+ from mosestokenizer import MosesTokenizer, MosesPunctuationNormalizer
18
+ moses_tokenize = MosesTokenizer('en', old_version=True)
19
+ normalize = MosesPunctuationNormalizer('en')
20
+
21
+
22
def get_punc_tokenized_words_list(word_list: list, labels_list: list = None):
    """
    Split leading/trailing punctuation and a trailing possessive ('s / 'S) off each word.

    :param word_list: whitespace-separated words.
    :param labels_list: optional labels parallel to `word_list`; each produced piece inherits the label of the
        word it came from.
    :return: the list of pieces, or `(pieces, labels)` when a non-empty `labels_list` is given.
    """
    tokens = []
    labels = []
    for w_ind, o_token in enumerate(word_list):
        if not o_token:
            # an empty word carries no pieces; skipping it avoids indexing into an empty string below
            continue
        if o_token[0] not in string.punctuation and o_token[-1] not in string.punctuation:
            if o_token.endswith(("\'s", "\'S")):
                # separate the possessive marker from the word it is attached to
                pieces = [o_token[:-2], o_token[-2:]]
            else:
                pieces = [o_token]
        else:
            # cases where the tokens start or end with punctuation
            before_tokens = []
            after_tokens = []
            while o_token and o_token[0] in string.punctuation:
                before_tokens.append(o_token[0])
                o_token = o_token[1:]
            while o_token and o_token[-1] in string.punctuation:
                after_tokens.append(o_token[-1])
                o_token = o_token[:-1]
            # trailing punctuation was collected right-to-left, so restore its reading order
            pieces = [p for p in ("".join(before_tokens), o_token, "".join(after_tokens[::-1])) if p]
        tokens.extend(pieces)
        if labels_list:
            labels.extend([labels_list[w_ind]] * len(pieces))
    if labels_list:
        return tokens, labels
    return tokens
60
+
61
+
62
def save_predictions_result(logdir, epoch, precision, recall, f1, num_proposed, num_correct, num_gold,
                            all_words, all_tags, all_y_hat, all_predicted):
    """
    Write the predictions and the summary scores to `<logdir>/<epoch>.P<precision>_R<recall>_F<f1>`.

    One `word<TAB>tag<TAB>prediction` line is written per token, skipping `<s>` words and `<pad>` tags, with a
    blank line after each sequence; the counts and scores follow at the end of the file.
    `all_y_hat` only takes part in the lock-step iteration and is not written.

    :raises AssertionError: when the words, tags and predictions of a sequence differ in length.
    """
    final = logdir + "/%s.P%.2f_R%.2f_F%.2f" % ("{}".format(str(epoch)), precision, recall, f1,)
    # Tokens and entity names may be non-ASCII, so do not depend on the platform default encoding.
    with open(final, "w", encoding="utf-8") as fout:
        for words, tags, _y_hat, preds in zip(all_words, all_tags, all_y_hat, all_predicted):
            assert len(preds) == len(words) == len(tags)
            for w, t, p in zip(words, tags, preds):
                if w == '<s>' or t == '<pad>':
                    continue
                fout.write(f"{w}\t{t}\t{p}\n")
            fout.write("\n")
        fout.write(f"num_proposed={num_proposed}\n")
        fout.write(f"num_correct={num_correct}\n")
        fout.write(f"num_gold={num_gold}\n")
        fout.write(f"precision={precision}\n")
        fout.write(f"recall={recall}\n")
        fout.write(f"f1={f1}\n")
79
+
80
+
81
def get_subword_to_word_mapping(subword_tokens, original_string, sequence_starts_and_ends_with_bos_eos=True):
    """
    Align the subword tokens with the punctuation-split words of `original_string`.

    :param subword_tokens: the subword tokens of `original_string`.
    :param original_string: the raw text; it is split on whitespace and then on punctuation.
    :param sequence_starts_and_ends_with_bos_eos: when True the first and last subword tokens are dropped
        before aligning (they are expected to be <s> and </s>).
    :return: a list of `(start, end)` subword index pairs, one per word, with `end` exclusive.
    """
    if sequence_starts_and_ends_with_bos_eos:
        # subword_tokens starts with <s> and ends with </s>
        subword_tokens = subword_tokens[1:-1]
    words = get_punc_tokenized_words_list(original_string.split())
    spans = []
    word_pos = 0
    lo = 0
    hi = 0
    while len(spans) != len(words):
        candidate = tokenizer.convert_tokens_to_string(subword_tokens[lo:hi]).strip()
        if candidate == words[word_pos]:
            spans.append((lo, hi))
            word_pos += 1
            lo = hi
        else:
            # not a full word yet: widen the window by one subword
            hi += 1
        if hi - lo <= 1000:
            continue
        # The window grew past 1000 subwords without matching; try to match the next two words together
        # against a short run of subwords instead.
        # NOTE(review): if none of the attempts below matches, nothing changes for the next iteration, so the
        # loop does not appear to terminate - confirm whether that can happen in practice.
        for extra in range(5):
            joined_subwords = tokenizer.convert_tokens_to_string(subword_tokens[lo:lo + 2 + extra]).strip()
            joined_words = "".join(words[word_pos: word_pos + 2]).replace("`", "\'")
            if joined_subwords == joined_words or joined_subwords.replace(" ", "") == joined_words.replace(" ", ""):
                # first word gets a single subword, second word gets the rest of the matched run
                spans.append((lo, lo + 1))
                word_pos += 1
                lo = lo + 1
                spans.append((lo, lo + 1 + extra))
                word_pos += 1
                lo = lo + 1 + extra
                hi = lo
                break
    return spans
114
+
115
+
116
def store_validation_data_wiki(checkpoints_root, batch_size, label_size, is_training, use_retokenized_wikipedia_data):
    """
    Make sure the validation batches are cached on disk and return the cache directory name.

    The directory name (relative to `checkpoints_root`, with a trailing slash) encodes the batch size, the label
    size and the data source. When the directory already exists nothing is recomputed; otherwise it is created
    and every validation batch is pickled into a file named after its index.

    :return: the cache directory name, relative to `checkpoints_root`.
    """
    if not is_training:
        source_tag = 'conll'
    elif use_retokenized_wikipedia_data:
        source_tag = 'rt_wiki'
    else:
        source_tag = 'wiki'
    dataset_name = f"validation_data_cache_b_{batch_size}_l_{label_size}_{source_tag}/"
    cache_dir = os.path.join(checkpoints_root, dataset_name)
    if os.path.exists(cache_dir):
        print("Retrieving the validation data ...")
        return dataset_name
    os.mkdir(cache_dir)
    print("Caching the validation data ...")
    if is_training:
        batches = get_dataset(
            dataset_name='enwiki', split='valid', batch_size=batch_size, label_size=label_size,
            use_retokenized_wikipedia_data=use_retokenized_wikipedia_data)
    else:
        batches = get_dataset(dataset_name='aida', split='valid', batch_size=batch_size, label_size=label_size)
    valid_iter = tqdm(batches)
    for ind, (inputs, subword_mentions) in enumerate(valid_iter):
        batch_path = os.path.join(checkpoints_root, dataset_name, f"{ind}")
        with open(batch_path, "wb") as store_file:
            # everything with a `.cpu()` call is moved to the CPU before being pickled
            pickle.dump((inputs.token_ids.cpu(), subword_mentions.ids.cpu(), subword_mentions.probs.cpu(),
                         inputs.eval_mask.cpu(), subword_mentions.dictionary, inputs.raw_mentions,
                         inputs.is_in_mention.cpu(), inputs.bioes.cpu()), store_file,
                        protocol=pickle.HIGHEST_PROTOCOL)
    return dataset_name
138
+
139
+
140
def postprocess_annotations(annotations, sentence):
    """
    Tighten the character boundaries of annotation spans and drop spans that end up empty.

    Each annotation is indexed as `(begin, end, annotation)`, where `annotation[0]` is the underscore-separated
    entity name. A span is adjusted repeatedly: a trailing possessive that is not part of the entity name is
    cut off, surrounding punctuation and spaces are trimmed (a final "u.s."/"u.n." keeps its dot, and a
    truncated "u.s"/"u.n" is extended over the following dot), and spans that consist only of a common function
    word are discarded.

    :return: a list of `(begin, end, annotation)` tuples for the surviving spans.
    """
    function_words = ["a", "the", "in", "out", "to", "of", "for", "at", "by", "rd", "th", "and",
                      "or", "but", "on", "none", "is", "were", "was", "he", "she", "if", "as",
                      "have", "had", "has", "who", "when", "where", "a lot", "a little", "here",
                      "there", "\'s"]

    def covered_by_entity_name(text, entity):
        # every whitespace-separated piece of `text` must occur inside some part of the entity name
        return all([any([m in c for c in entity[0].lower().split("_")]) for m in text.lower().split()])

    kept = []
    for ann in annotations:
        begin_index, end_index, annotation = ann[0], ann[1], ann[2]
        while end_index > begin_index:
            mention = sentence[begin_index:end_index]
            lowered = mention.lower()
            if lowered.endswith("\'s") and covered_by_entity_name(mention[:-2], annotation) \
                    and not covered_by_entity_name(mention, annotation):
                end_index -= 2
            elif mention[0] in string.punctuation or mention[0] == ' ':
                begin_index += 1
            elif (mention[-1] in string.punctuation and lowered[-4:] not in ["u.s.", "u.n."]) \
                    or mention[-1] == ' ':
                end_index -= 1
            elif lowered[-3:] in ["u.s", "u.n"] and end_index < len(sentence) and sentence[end_index] == '.':
                end_index += 1
            else:
                if lowered in function_words:
                    # collapsing the span makes the check below discard it
                    end_index = begin_index
                break
        if begin_index < end_index:
            kept.append((begin_index, end_index, annotation))
    return kept
172
+
173
+
174
def get_aida_set_phrase_splitted_documents(dataset_name):
    """
    Load one split of the AIDA dataset and convert every document into a list of PhraseAnnotation objects.

    For each document the tokens are joined with spaces, tokenized into subwords, aligned back to the original
    tokens, and wrapped into SubwordAnnotation/WordAnnotation objects carrying the gold label with probability
    1.0. Consecutive words with the same resolved annotation are merged into one phrase.

    :param dataset_name: key into `AIDADataset().dataset`.
    :return: one list of PhraseAnnotation objects per document.
    :raises ValueError: when a token cannot be matched to one or two aligned word spans.
    """
    wiki_prefix = 'http://en.wikipedia.org/wiki/'

    def join_subwords(pieces):
        # drop the leading '\u0120' marker of each subword and concatenate
        return "".join([p[1:] if p.startswith("\u0120") else p for p in pieces])

    phrase_documents = []
    for document in AIDADataset().dataset[dataset_name]:
        document_words = []
        document_labels = []
        document_candidates = []
        for annotation in document.annotations:
            for a in annotation:
                document_words.append(a.token)
                if a.candidates:
                    document_candidates.append([x.url.replace(wiki_prefix, '') for x in a.candidates.candidates])
                else:
                    document_candidates.append([])
                if a.yago_entity and a.yago_entity != "--NME--":
                    # the entity is stored with backslash-escaped unicode characters; decode them
                    document_labels.append(a.yago_entity.encode('ascii').decode('unicode-escape'))
                else:
                    document_labels.append('|||O|||')

        original_string = " ".join(document_words)
        tokenized_mention = tokenizer(original_string)
        # [1:-1] strips the first and last (special) tokens
        tokens_offsets = list(zip(tokenized_mention.tokens(), tokenized_mention.encodings[0].offsets))[1:-1]
        mapping = get_subword_to_word_mapping(tokenized_mention.tokens(), original_string)
        subword_tokens = tokenized_mention.tokens()[1:-1]

        w_ind = 0
        subword_annotations = []
        word_annotations = []
        for w, l, cnds in zip(document_words, document_labels, document_candidates):
            start, stop = mapping[w_ind][0], mapping[w_ind][1]
            if w == join_subwords(subword_tokens[start:stop]):
                consumed = 1
            elif len(mapping) > w_ind + 1 and w == join_subwords(subword_tokens[start:mapping[w_ind + 1][1]]):
                # the token was split in two by the punctuation splitting; take both spans together
                stop = mapping[w_ind + 1][1]
                consumed = 2
            else:
                raise ValueError("This should not happen")
            for sub_w in subword_tokens[start:stop]:
                subword_annotations.append(SubwordAnnotation([1.0], [dl_sa.mentions_vocab[l]], sub_w))
            word_annotations.append(WordAnnotation(subword_annotations[start:stop], tokens_offsets[start:stop], cnds))
            w_ind += consumed

        phrase_annotations = []
        for word in word_annotations:
            if phrase_annotations and phrase_annotations[-1].resolved_annotation == word.resolved_annotation:
                phrase_annotations[-1].add(word)
            else:
                phrase_annotations.append(PhraseAnnotation(word))
        phrase_documents.append(phrase_annotations)
    return phrase_documents
228
+
229
+
230
def _process_last_overlap(text_chunk_overlap, _overlap, l):
    """
    Function intended to merge the predictions in the text chunk overlaps.
    Implemented to be used in chunk_annotate_and_merge_to_phrase function.

    :param text_chunk_overlap: number of overlapping positions to merge (capped at `len(_overlap)`).
    :param _overlap: predictions kept from the end of the previous chunk.
    :param l: predictions of the current chunk; when empty, `_overlap` is used in its place, and when shorter
        than `_overlap` it is right-aligned over a copy of `_overlap`.
    :return: one tuple per merged position: a single prediction when one side is 0 or both agree, otherwise
        both predictions `(current, previous)`.
    """
    current = l if l else _overlap
    if len(current) < len(_overlap):
        padded = list(_overlap)
        padded[-len(current):] = current
        current = padded
    span = min(text_chunk_overlap, len(_overlap))
    merged = []
    for i in range(span):
        previous_pred = _overlap[i]
        if previous_pred == 0:
            merged.append((current[i],))
            continue
        current_pred = current[i]
        if current_pred == 0 or previous_pred == current_pred:
            merged.append((previous_pred,))
        else:  # keeping both for prediction resolution
            merged.append((current_pred, previous_pred))
    return merged
252
+
253
+
254
def normalize_sentence_for_moses_alignment(sentence, normalize_for_chinese_characters=False):
    """
    Replace a fixed set of single characters in `sentence` with simpler stand-ins.

    Curly quotes, dashes, the ellipsis and bullet, and a handful of accented Latin letters are mapped to plain
    counterparts, a newline becomes '\u010a', and the non-breaking space, the byte-order mark and a fixed list
    of CJK characters become a plain space. Every replacement is one character for one character, so the
    sentence length is unchanged.

    :param sentence: the text to normalize.
    :param normalize_for_chinese_characters: when True an additional list of CJK characters is blanked as well.
    :return: the normalized sentence.
    """
    # replacement -> all characters that are rewritten to it
    always_replaced = {
        '\'': '\u2018\u2019',
        '\"': '\u201d\u201c\u201e',
        '-': '\u2013\u2014',
        '.': '\u2026\u2022',
        'o': '\u00f6\u014d',
        'a': '\u00e1',
        'e': '\u00e8',
        'E': '\u00c9\u20ac',
        's': '\u0219',
        'c': '\u0107\u010d',
        'z': '\u017e',
        '\u010a': '\n',
        ' ': '\u00a0\ufeff'
             '\u694a\u9234\u6797\u6636\u4f50\u738b\u5b9c\u6b63\u5168\u52dd\u80e1\u5fd7\u535a\u9673\u7f8e',
    }
    base_table = {ord(ch): replacement for replacement, chars in always_replaced.items() for ch in chars}
    sentence = sentence.translate(base_table)
    if normalize_for_chinese_characters:
        extra_blanked = ('\u5e7c\u5049\u5b8f\u9054\u5bb9\u96fb\u590f\u5b63\u660c\u90b1'
                         '\u4fca\u6587\u56b4\u5b87\u67cf\u8b5a\u9f0e\u6176\u99ac\u82f1'
                         '\u4e5d\u6797\u7537\u9996\u60e0\u7d00\u5143\u8f1d\u5289\u4fd0'
                         '\u8208\u4e2d\u8b1d\u5922\u9e9f\u6e38\u570b\u7167\u658c\u54f2'
                         '\u9ec3\u5433\u53cb\u6e05\u856d\u8000\u5eb7\u6dd1\u83ef')
        sentence = sentence.translate(str.maketrans(extra_blanked, ' ' * len(extra_blanked)))
    return sentence
278
+
279
+
280
def chunk_annotate_and_merge_to_phrase(model, sentence, k_for_top_k_to_keep=5, normalize_for_chinese_characters=False):
    """
    Annotates ``sentence`` with ``model`` chunk by chunk and merges the subword-level predictions into phrases.

    The steps are:
      1. normalize the sentence and tokenize it twice (moses words and model subwords),
      2. split the subwords into overlapping chunks and annotate each chunk with the model,
      3. merge the predictions of the overlapping regions and resolve the conflicting ones,
      4. group the subword predictions into words, re-align these words with the moses words so that no word
         is broken into separate phrases, and
      5. merge consecutive words that share the same resolved annotation into phrases.

    :param model: the annotator; this function reads ``text_chunk_length`` and ``text_chunk_overlap`` from it and
        calls ``annotate_subword_ids(subword_ids, k_for_top_k_to_keep, chunk)`` on it.
    :param sentence: the raw text to annotate.
    :param k_for_top_k_to_keep: forwarded as is to ``model.annotate_subword_ids``.
    :param normalize_for_chinese_characters: forwarded as is to ``normalize_sentence_for_moses_alignment``.
    :return: a list of ``PhraseAnnotation`` objects in the order of appearance in the text.
    :raises ValueError: when the subword-based words and the moses words cannot be aligned.
    """
    sentence = sentence.rstrip()
    sentence = normalize_sentence_for_moses_alignment(sentence, normalize_for_chinese_characters)
    simple_split_words = moses_tokenize(sentence)
    # NOTE(review): '\u010a' looks like the byte-level BPE stand-in for a line break -- confirm with the tokenizer.
    sentence = sentence.replace('\u010a', '\n')
    tokenized_mention = tokenizer(sentence)
    tokens_offsets = list(zip(tokenized_mention.tokens(), tokenized_mention.encodings[0].offsets))
    subword_to_word_mapping = get_subword_to_word_mapping(tokenized_mention.tokens(), sentence)
    # Consecutive chunks share ``model.text_chunk_overlap`` subwords.
    chunks = [tokens_offsets[i: i + model.text_chunk_length] for i in range(
        0, len(tokens_offsets), model.text_chunk_length - model.text_chunk_overlap)]
    result = []
    last_overlap = []
    logits = []
    # ########################################################################################################
    # Convert each chunk to tensors, predict the labels, and merge the overlaps (keep conflicting predictions).
    # ########################################################################################################
    for chunk in chunks:
        subword_ids = [tokenizer.convert_tokens_to_ids([x[0] for x in chunk])]
        logits = model.annotate_subword_ids(subword_ids, k_for_top_k_to_keep, chunk)
        if last_overlap:
            # The head of this chunk was already predicted as the tail of the previous chunk.
            result.extend(_process_last_overlap(model.text_chunk_overlap, last_overlap, logits))
        else:
            result.extend([(x,) for x in logits[:model.text_chunk_overlap]])
        if len(logits) > 2 * model.text_chunk_overlap:
            result.extend([(x,) for x in logits[model.text_chunk_overlap:-model.text_chunk_overlap]])
            # Hold the tail back until the next chunk (or the final flush below) has been seen.
            last_overlap = logits[-model.text_chunk_overlap:]
        else:
            result.extend([(x,) for x in logits[model.text_chunk_overlap:]])
            last_overlap = []
    # Flush the tail of the last chunk; there is no following chunk to merge it with.
    logits = []
    result.extend(_process_last_overlap(model.text_chunk_overlap, last_overlap, logits))
    # ########################################################################################################
    # Resolve the overlap merge conflicts using the model prediction probability
    # ########################################################################################################
    final_result = []
    for p_ind, prediction in enumerate(result):
        if len(prediction) == 1:
            final_result.append(prediction[0])
            continue
        for p in prediction:
            if p == final_result[-1] or (p_ind + 1 < len(result) and p in result[p_ind + 1]):
                # It is equal to the one in the left or in the one to the right
                final_result.append(p)
                break
        else:
            # No candidate agrees with a neighbour: choose the one the model is more confident about.
            final_result.append(sorted(prediction, key=lambda x: x.item_probability(), reverse=True)[0])
    # ########################################################################################################
    # Convert the model predictions (subword-level) to valid GERBIL annotation spans (continuous char-level)
    # ########################################################################################################
    # NOTE(review): the dropped first/last entries are presumably the tokenizer's special boundary tokens -- confirm.
    tokens_offsets = tokens_offsets[1:-1]
    final_result = final_result[1:]
    word_annotations = [WordAnnotation(final_result[m[0]:m[1]], tokens_offsets[m[0]:m[1]])
                        for m in subword_to_word_mapping]
    # ########################################################################################################
    # MAKING SURE WORDS ARE NOT BROKEN IN SEPARATE PHRASES!
    # ########################################################################################################
    # Two-pointer walk: w_p_1 indexes the subword-based words, w_p_2 indexes the moses words. A buffer collects
    # the pieces of whichever side is split finer until both sides spell the same string.
    w_p_1 = 0
    w_p_2 = 0
    w_2_buffer = ""
    w_1_buffer = ""
    while w_p_1 < len(word_annotations) and w_p_2 < len(simple_split_words):
        w_1 = word_annotations[w_p_1]
        w_2 = normalize(simple_split_words[w_p_2]).strip()
        w_1_word_string = normalize(w_1.word_string).strip()
        if w_1_word_string == w_2:
            w_p_1 += 1
            w_p_2 += 1
        elif w_1_buffer and w_2_buffer and normalize(
                w_1_buffer + w_1.word_string).strip() == normalize(w_2_buffer + simple_split_words[w_p_2]).strip():
            w_p_1 += 1
            w_p_2 += 1
            w_1_buffer = ""
            w_2_buffer = ""
        elif w_2_buffer and w_1_word_string == normalize(w_2_buffer + simple_split_words[w_p_2]).strip():
            w_p_1 += 1
            w_p_2 += 1
            w_2_buffer = ""
        elif w_1_buffer and normalize(w_1_buffer + w_1.word_string).strip() == w_2:
            w_p_1 += 1
            w_p_2 += 1
            w_1_buffer = ""
        elif w_1_buffer and len(w_2) < len(normalize(w_1_buffer + w_1.word_string).strip()):
            w_2_buffer += simple_split_words[w_p_2]
            w_p_2 += 1
        elif len(w_2) < len(w_1_word_string):
            w_2_buffer += simple_split_words[w_p_2]
            w_p_2 += 1
        # Connecting the "." in between the names to the word it belongs to.
        elif len(w_2) > len(w_1_word_string) and w_p_1 + 1 < len(word_annotations) \
                and word_annotations[w_p_1 + 1].word_string == ".":
            word_annotations[w_p_1 + 1] = WordAnnotation(
                word_annotations[w_p_1].annotations + word_annotations[w_p_1 + 1].annotations,
                word_annotations[w_p_1].token_offsets + word_annotations[w_p_1 + 1].token_offsets)
            # The emptied word is skipped later when the phrases are built.
            word_annotations[w_p_1].annotations = []
            word_annotations[w_p_1].token_offsets = []
            w_p_1 += 1
        elif len(w_2) > len(w_1_word_string) and w_p_1 + 1 < len(word_annotations):
            w_1_buffer += w_1.word_string
            w_p_1 += 1
        # The bounds check keeps the lookahead inside the list; without it the last word raised an IndexError
        # instead of reaching the ValueError below.
        # NOTE(review): this condition only checks that the concatenation is non-empty; it is not compared to
        # the buffered moses words -- confirm that this is intended.
        elif w_2_buffer and w_p_1 + 1 < len(word_annotations) and normalize(
                word_annotations[w_p_1].word_string + word_annotations[w_p_1 + 1].word_string).strip():
            w_p_1 += 2
            w_2_buffer = ""
        else:
            raise ValueError("This should not happen!")
    # ################################################################################################################
    phrase_annotations = []
    for w in word_annotations:
        if not w.annotations:
            continue
        if phrase_annotations and phrase_annotations[-1].resolved_annotation == w.resolved_annotation:
            phrase_annotations[-1].add(w)
        else:
            phrase_annotations.append(PhraseAnnotation(w))
    return phrase_annotations
398
+
399
+
400
class ComparisonResult(Enum):
    """Possible outcomes of comparing a gold span with the predicted span covering the same text."""
    CORRECTLY_IGNORED_O = 0
    CORRECTLY_FOUND_BOTH_SPAN_AND_ANNOTATION = 1
    CORRECTLY_FOUND_SPAN_BUT_NOT_ANNOTATION = 2
    OVER_GENERATED_ANNOTATION = 3

    @staticmethod
    def get_correct_status(g_span, p_span):
        """
        Classifies the prediction ``p_span`` against the gold span ``g_span``.

        Both spans must cover the same text once the spaces are removed from their ``word_string``; a gold
        ``resolved_annotation`` equal to 0 is treated as "no annotation".

        :param g_span: the gold span (exposes ``resolved_annotation`` and ``word_string``).
        :param p_span: the predicted span (exposes ``resolved_annotation`` and ``word_string``).
        :return: the matching ``ComparisonResult`` member.
        :raises ValueError: when the two spans do not cover the same text.
        """
        gold_is_outside = g_span.resolved_annotation == 0
        annotations_match = p_span.resolved_annotation == g_span.resolved_annotation
        if p_span.word_string.replace(" ", "") != g_span.word_string.replace(" ", ""):
            raise ValueError("This should not happen!")
        if gold_is_outside:
            if annotations_match:
                return ComparisonResult.CORRECTLY_IGNORED_O
            return ComparisonResult.OVER_GENERATED_ANNOTATION
        if annotations_match:
            return ComparisonResult.CORRECTLY_FOUND_BOTH_SPAN_AND_ANNOTATION
        # The span is right but the annotation is not; the prediction may or may not be "no annotation".
        return ComparisonResult.CORRECTLY_FOUND_SPAN_BUT_NOT_ANNOTATION
423
+
424
+
425
def compare_gold_and_predicted_annotation_documents(gold_document, predicted_document, ignore_over_generated=False,
                                                    ignore_predictions_outside_candidate_list=False):
    """
    Compares the output results of the model predictions and the gold annotations.

    The two documents are walked in parallel. Whenever the current gold and predicted spans do not cover the same
    text, words are moved between neighbouring spans until they do, and each aligned pair is then classified with
    ``ComparisonResult.get_correct_status``. The walk stops as soon as either document is exhausted.

    Note that the spans of both documents are modified in place while they are being re-aligned (their ``words``
    are trimmed or extended), and the post-processing options below change the resolved annotation of the
    predicted spans.

    :param gold_document: list of gold phrase spans.
    :param predicted_document: list of predicted phrase spans.
    :param ignore_over_generated: when set, every ``OVER_GENERATED_ANNOTATION`` result is converted to
        ``CORRECTLY_IGNORED_O`` and the resolved annotation of its predicted span is set to 0.
    :param ignore_predictions_outside_candidate_list: when set, and the gold span has candidates known to
        ``dl_sa.mentions_vocab``, the prediction is replaced by its highest scoring alternative among these
        candidates (or by 0 if there is none) and the pair is classified again.
    :return: list of ``(gold_span, predicted_span, ComparisonResult)`` tuples.
    :raises ValueError: when the spans cannot be aligned.
    """
    def _without_spaces(span):
        # Span texts are compared regardless of where the spaces fall.
        return span.word_string.replace(" ", "")

    g_id = 0
    p_id = 0
    comparison_results = []
    while g_id < len(gold_document) and p_id < len(predicted_document):
        p_span = predicted_document[p_id]
        g_span = gold_document[g_id]
        # Identical strings are also identical without spaces, so one comparison covers both original checks.
        if _without_spaces(p_span) == _without_spaces(g_span):
            p_id += 1
            g_id += 1
            comparison_results.append((g_span, p_span, ComparisonResult.get_correct_status(g_span, p_span)))
        elif len(p_span.word_string) < len(g_span.word_string) and \
                len(p_span.words) == len(g_span.words) == 1 and p_id + 1 < len(predicted_document) and \
                len(predicted_document[p_id + 1].words) > 1:
            # Borrow the first word of the next predicted span and try again.
            p_span.add(predicted_document[p_id + 1].words[0])
            predicted_document[p_id + 1].words.pop(0)
            continue
        elif len(p_span.word_string) < len(g_span.word_string):
            # potentially over-generated span later
            # Look for the shortest prefix of the gold words that spells the predicted span.
            new_phrase = PhraseAnnotation(g_span.words[0])
            i = 1
            while _without_spaces(new_phrase) != _without_spaces(p_span) and i < len(g_span.words):
                new_phrase.add(g_span.words[i])
                i += 1
            not_solved = _without_spaces(new_phrase) != _without_spaces(p_span)
            if not_solved and p_id + 1 < len(predicted_document) and len(predicted_document[p_id + 1].words) > 1:
                p_span.add(predicted_document[p_id + 1].words[0])
                predicted_document[p_id + 1].words.pop(0)
                continue
            elif not_solved and p_id + 1 < len(predicted_document) and len(predicted_document[p_id + 1].words) == 1:
                # Fold the current span into the next one, which keeps the annotation of the current span.
                p_span.add(predicted_document[p_id + 1].words[0])
                predicted_document[p_id + 1].words = p_span.words
                predicted_document[p_id + 1].set_alternative_as_resolved_annotation(p_span.resolved_annotation)
                p_id += 1
                continue
            elif not_solved:
                raise ValueError("This should not happen!")
            else:
                comparison_results.append((
                    new_phrase, p_span, ComparisonResult.get_correct_status(new_phrase, p_span)))
                # The rest of the gold span is compared against the following predicted spans.
                g_span.words = g_span.words[i:]
                p_id += 1
        elif len(p_span.word_string) > len(g_span.word_string):
            # potentially missed a span
            # Look for the shortest prefix of the predicted words that spells the gold span.
            new_phrase = PhraseAnnotation(p_span.words[0])
            i = 1
            while _without_spaces(new_phrase) != _without_spaces(g_span) and i < len(p_span.words):
                new_phrase.add(p_span.words[i])
                i += 1
            if _without_spaces(new_phrase) != _without_spaces(g_span):
                # re-alignment not helpful
                # Compare the longest common word-prefix of both spans and keep the remainders for later.
                new_p = PhraseAnnotation(p_span.words[0])
                new_g = PhraseAnnotation(g_span.words[0])
                i = 1
                while new_p.word_string == new_g.word_string:
                    new_p.add(p_span.words[i])
                    new_g.add(g_span.words[i])
                    i += 1
                if i == 1:
                    # The very first words already differ, so nothing would be consumed from either span and
                    # the outer loop would repeat this exact state forever.
                    raise ValueError("This should not happen!")
                # The last added words are the first ones that differ; they belong to the remainders.
                new_p.words = new_p.words[:-1]
                new_g.words = new_g.words[:-1]
                comparison_results.append((new_g, new_p, ComparisonResult.get_correct_status(new_g, new_p)))
                p_span.words = p_span.words[i - 1:]
                g_span.words = g_span.words[i - 1:]
            else:
                comparison_results.append((
                    g_span, new_phrase, ComparisonResult.get_correct_status(g_span, new_phrase)))
                # The rest of the predicted span is compared against the following gold spans.
                p_span.words = p_span.words[i:]
                g_id += 1
        elif _without_spaces(g_span).startswith(_without_spaces(p_span)) and \
                p_id + 1 < len(predicted_document) and _without_spaces(p_span) + \
                _without_spaces(predicted_document[p_id + 1]) == _without_spaces(g_span):
            # The current and the next predicted spans together spell the gold span: merge them.
            for next_span_word in predicted_document[p_id + 1].words:
                p_span.add(next_span_word)
            predicted_document[p_id + 1] = p_span
            p_id += 1
            continue
        elif _without_spaces(g_span).startswith(_without_spaces(p_span)) and \
                p_id + 1 < len(predicted_document) and _without_spaces(p_span) + \
                _without_spaces(predicted_document[p_id + 1].words[0]) == _without_spaces(g_span):
            # Only the first word of the next predicted span is needed to complete the gold span.
            p_span.add(predicted_document[p_id + 1].words[0])
            predicted_document[p_id + 1].words.pop(0)
            continue
        elif _without_spaces(g_span).startswith(_without_spaces(p_span)):
            raise ValueError("This should be handled correctly!")
        elif _without_spaces(p_span).startswith(_without_spaces(g_span)):
            raise ValueError("This should be handled correctly!")
        else:
            raise ValueError("This should not happen!")
    if ignore_over_generated:
        c_results = []
        for g, p, r in comparison_results:
            if r == ComparisonResult.OVER_GENERATED_ANNOTATION:
                p.set_alternative_as_resolved_annotation(0)
                r = ComparisonResult.CORRECTLY_IGNORED_O
            c_results.append((g, p, r))
        comparison_results = c_results
    if ignore_predictions_outside_candidate_list:
        c_results = []
        for g, p, r in comparison_results:
            g_ppr_for_ned_candidates = [
                dl_sa.mentions_vocab[x] for x in g.ppr_for_ned_candidates if x in dl_sa.mentions_vocab]
            if g_ppr_for_ned_candidates:
                all_p_anns = p.all_possible_annotations()
                # Each alternative is indexed as (annotation, score); the best scoring candidate comes first.
                filtered_p_predictions = sorted([x for x in all_p_anns if x[0] in g_ppr_for_ned_candidates],
                                                key=lambda y: y[1], reverse=True)
                if filtered_p_predictions:
                    p.set_alternative_as_resolved_annotation(filtered_p_predictions[0][0])
                else:
                    p.set_alternative_as_resolved_annotation(0)
                # The prediction may have changed, so the pair is classified again.
                r = ComparisonResult.get_correct_status(g, p)
            c_results.append((g, p, r))
        comparison_results = c_results
    return comparison_results