Spaces:
Runtime error
Runtime error
"""tokenize mrtydi files and save in original format""" | |
from tqdm import tqdm | |
import os | |
import json | |
from datasets import load_dataset | |
from tools import get_mbert_tokenize_fn | |
LANGS = "arabic bengali english finnish indonesian japanese korean russian swahili telugu thai".split() | |
n_proc = 15 | |
token_type = "mbert" | |
assert token_type in {"mbert", "whitespace"} | |
print(f"Preparing tokenized mrtydi with {token_type} tokenizer.") | |
def gen_mrtydi(lang, set_name): | |
dataset = load_dataset("castorini/mr-tydi", lang, set_name) | |
for entry in tqdm(dataset[set_name], desc=f"{lang}-topics-{set_name}"): | |
yield entry | |
def gen_mrtydi_corpus(lang): | |
dataset = load_dataset("castorini/mr-tydi-corpus", lang) | |
for entry in tqdm(dataset["train"], desc=f"{lang}-documents"): | |
yield entry | |
def tokenize_single_lang(lang, outp_dir): | |
mbert_tokenize = get_mbert_tokenize_fn() | |
def _tokenize_psgs(psgs): | |
return [{ | |
"docid": psg["docid"], | |
"title": mbert_tokenize(psg["title"]), | |
"text": mbert_tokenize(psg["text"]), | |
} for psg in psgs] | |
mrtydi_dir = os.path.join(outp_dir, "mr-tydi", f"mr-tydi-v1.1-mbert-tokenize-{lang}") | |
os.makedirs(mrtydi_dir, exist_ok=True) | |
# tokenize "mr-tydi" | |
for set_name in ["train", "dev", "test"]: | |
outp_fn = os.path.join(mrtydi_dir, f"{set_name}.jsonl") | |
if os.path.exists(outp_fn): | |
print(f"Found existing file: {outp_fn}.") | |
continue | |
with open(outp_fn, "w") as fout: | |
for entry in gen_mrtydi(lang=lang, set_name=set_name): | |
query = entry["query"] | |
pos_psgs = entry["positive_passages"] | |
neg_psgs = entry["negative_passages"] | |
if set_name == "train": | |
pos_psgs = _tokenize_psgs(pos_psgs) | |
neg_psgs = _tokenize_psgs(neg_psgs) | |
mbert_entry = { | |
"query_id": entry["query_id"], | |
"query": mbert_tokenize(query), | |
"positive_passages": pos_psgs, | |
"negative_passages": neg_psgs, | |
} | |
line = json.dumps(mbert_entry, ensure_ascii=False) | |
fout.write(line + "\n") | |
# tokenize "mr-tydi-corpus" | |
mrtydi_corpus_dir = os.path.join(outp_dir, "mr-tydi-corpus", f"mr-tydi-v1.1-mbert-tokenize-{lang}") | |
os.makedirs(mrtydi_corpus_dir, exist_ok=True) | |
outp_fn = os.path.join(mrtydi_corpus_dir, f"corpus.jsonl") | |
if os.path.exists(outp_fn): | |
print(f"Found existing file: {outp_fn}.") | |
return | |
with open(outp_fn, "w") as fout: | |
for entry in gen_mrtydi_corpus(lang): | |
mbert_entry = { | |
"docid": entry["docid"], | |
"title": mbert_tokenize(entry["title"]), | |
"text": mbert_tokenize(entry["text"]), | |
} | |
line = json.dumps(mbert_entry, ensure_ascii=False) | |
fout.write(line + "\n") | |
def main(): | |
outp_dir = f"mbert-mrtydi/" | |
for i, lang in enumerate(LANGS): | |
tokenize_single_lang(lang, outp_dir + lang) | |
if __name__ == "__main__": | |
main() | |