versae's picture
Adding checkpointing, wandb, and new mlm script
d988382
raw
history blame
751 Bytes
#!/usr/bin/env python
import kenlm
from datasets import load_dataset
from tqdm import tqdm
def pp(log_score, length):
    """Return the perplexity for an accumulated log score.

    The score is treated as a base-10 logarithm: it is negated, averaged
    over ``length`` and used as the exponent of 10.

    Args:
        log_score: Summed log score of the scored text.
        length: Number of tokens the score is averaged over.

    Returns:
        ``10.0 ** (-log_score / length)`` as a float.

    Raises:
        ZeroDivisionError: If ``length`` is 0.
    """
    exponent = -log_score / length
    return 10.0 ** exponent
# The KenLM binary loaded below can be downloaded from:
# http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
model = kenlm.Model("es.arpa.bin")
mc4 = load_dataset("mc4", "es", streaming=True)

with open("mc4-es-perplexity.txt", "w") as f:
    # Iterate the streamed training split through a shuffle buffer; `total`
    # is the expected number of samples handed to tqdm for its progress bar.
    shuffled_train = mc4["train"].shuffle(buffer_size=100_000)
    for sample in tqdm(shuffled_train, total=416057992):
        doc_log_score, doc_length = 0, 0
        # Each line of the document is scored separately and the scores and
        # lengths are pooled to get a single value per document.
        for line in sample["text"].split("\n"):
            doc_log_score += model.score(line)
            # Whitespace-separated word count plus one extra token per line
            # (presumably the end-of-sentence token added by KenLM -- confirm).
            doc_length += len(line.split()) + 1
        # Emit exactly one perplexity value per document, one per output line.
        perplexity = pp(doc_log_score, doc_length)
        f.write(f"{perplexity}\n")