|
import json |
|
import kenlm |
|
from tqdm import tqdm |
|
|
|
# Load the KenLM model once at import time; get_perplexity() reads this
# module-level object for every line it scores.
model = kenlm.Model("../es.arpa.bin")
|
|
|
def get_perplexity(doc):
    """Return the perplexity of ``doc`` under the module-level ``model``.

    The document is split on ``"\\n"`` and each resulting line is scored
    independently with ``model.score``. The per-line scores are summed and
    normalised by the total token count, where every line contributes its
    whitespace-separated word count plus one (presumably for the
    end-of-sentence token the model appends -- confirm against KenLM).

    Args:
        doc: Document text; may contain several newline-separated lines.

    Returns:
        ``10 ** (-total_score / total_tokens)`` as a float.
    """
    segments = doc.split("\n")

    # Integer token count: words on each line plus one extra per line.
    token_total = sum(len(segment.split()) + 1 for segment in segments)

    # Accumulate scores in document order with plain "+=" so the floating
    # point result matches a straightforward left-to-right running total.
    score_total = 0
    for segment in segments:
        score_total += model.score(segment)

    # The scores are treated as base-10 logs, hence the base-10 exponent.
    return 10.0 ** (-score_total / token_total)
|
|
|
# Stream the JSONL corpus and write one "<word count>,<perplexity>" row per
# document. Both files are opened explicitly as UTF-8: the corpus is JSON
# text, and relying on the platform default encoding can fail or mis-decode
# on non-UTF-8 locales. The output file is opened first (and truncated),
# matching the original ordering.
with open("mc4-es-train-50M-stats.csv", "w", encoding="utf-8") as stats_file:
    with open("mc4-es-train-50M-steps.jsonl", "r", encoding="utf-8") as data:
        # Iterate lazily so the whole corpus is never held in memory;
        # tqdm only adds a progress display around the file iterator.
        for line in tqdm(data):
            text = json.loads(line)["text"]
            stats_file.write(f"{len(text.split())},{get_perplexity(text)}\n")
|
|