# NetsPresso_QA/scripts/mrtydi/mbert-tokenize-mrtydi.py
"""tokenize mrtydi files and save in original format"""
import json
import os

from datasets import load_dataset
from tqdm import tqdm

from tools import get_mbert_tokenize_fn
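# `tools.get_mbert_tokenize_fn` is a project-local helper (not included here).
# A minimal sketch of what it is assumed to return, using HuggingFace
# `transformers` (an assumption for illustration, not the project's actual code):
#
#     from transformers import AutoTokenizer
#
#     def get_mbert_tokenize_fn():
#         tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
#         return lambda text: " ".join(tokenizer.tokenize(text))
#
# i.e. a callable that maps a raw string to a whitespace-joined sequence of
# mBERT wordpiece tokens.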
LANGS = "arabic bengali english finnish indonesian japanese korean russian swahili telugu thai".split()
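# NOTE: n_proc is defined but not used anywhere below.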
n_proc = 15
token_type = "mbert"
assert token_type in {"mbert", "whitespace"}
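# NOTE: only the "mbert" path is implemented below; "whitespace" passes the
# assert but is not handled.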
print(f"Preparing tokenized mrtydi with {token_type} tokenizer.")
def gen_mrtydi(lang, set_name):
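    """Yield query entries (with their positive/negative passages) for one language and split."""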
    # set_name must be passed as `split=`; a bare third positional argument
    # would be interpreted as `data_dir` by datasets.load_dataset
    dataset = load_dataset("castorini/mr-tydi", lang, split=set_name)
    for entry in tqdm(dataset, desc=f"{lang}-topics-{set_name}"):
        yield entry
def gen_mrtydi_corpus(lang):
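    """Yield all passages for one language; the corpus dataset exposes a single "train" split."""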
dataset = load_dataset("castorini/mr-tydi-corpus", lang)
for entry in tqdm(dataset["train"], desc=f"{lang}-documents"):
yield entry
def tokenize_single_lang(lang, outp_dir):
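    """Tokenize the Mr. TyDi topics and corpus for one language and write JSONL files under outp_dir."""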
mbert_tokenize = get_mbert_tokenize_fn()
def _tokenize_psgs(psgs):
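        # keep each passage's docid as-is; tokenize its title and text with mBERT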
return [{
"docid": psg["docid"],
"title": mbert_tokenize(psg["title"]),
"text": mbert_tokenize(psg["text"]),
} for psg in psgs]
mrtydi_dir = os.path.join(outp_dir, "mr-tydi", f"mr-tydi-v1.1-mbert-tokenize-{lang}")
os.makedirs(mrtydi_dir, exist_ok=True)
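    # tokenized topic files go under <outp_dir>/mr-tydi/mr-tydi-v1.1-mbert-tokenize-<lang>/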
# tokenize "mr-tydi"
for set_name in ["train", "dev", "test"]:
outp_fn = os.path.join(mrtydi_dir, f"{set_name}.jsonl")
if os.path.exists(outp_fn):
print(f"Found existing file: {outp_fn}.")
continue
with open(outp_fn, "w") as fout:
for entry in gen_mrtydi(lang=lang, set_name=set_name):
query = entry["query"]
pos_psgs = entry["positive_passages"]
neg_psgs = entry["negative_passages"]
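                # only train entries get their passages tokenized here;
                # dev/test passages are written through unchanged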
if set_name == "train":
pos_psgs = _tokenize_psgs(pos_psgs)
neg_psgs = _tokenize_psgs(neg_psgs)
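                # same schema as the source entry, with the query (and, for
                # train, the passages) replaced by their tokenized form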
mbert_entry = {
"query_id": entry["query_id"],
"query": mbert_tokenize(query),
"positive_passages": pos_psgs,
"negative_passages": neg_psgs,
}
line = json.dumps(mbert_entry, ensure_ascii=False)
fout.write(line + "\n")
# tokenize "mr-tydi-corpus"
mrtydi_corpus_dir = os.path.join(outp_dir, "mr-tydi-corpus", f"mr-tydi-v1.1-mbert-tokenize-{lang}")
os.makedirs(mrtydi_corpus_dir, exist_ok=True)
    outp_fn = os.path.join(mrtydi_corpus_dir, "corpus.jsonl")
if os.path.exists(outp_fn):
print(f"Found existing file: {outp_fn}.")
return
with open(outp_fn, "w") as fout:
for entry in gen_mrtydi_corpus(lang):
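            # corpus entries keep the original schema: docid plus tokenized title/text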
mbert_entry = {
"docid": entry["docid"],
"title": mbert_tokenize(entry["title"]),
"text": mbert_tokenize(entry["text"]),
}
line = json.dumps(mbert_entry, ensure_ascii=False)
fout.write(line + "\n")
def main():
    outp_dir = "mbert-mrtydi"
    for lang in LANGS:
        tokenize_single_lang(lang, os.path.join(outp_dir, lang))
if __name__ == "__main__":
main()