File size: 633 Bytes
8fb54c7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 |
import json
import kenlm
from tqdm import tqdm
# KenLM language model, loaded once when the module is executed and shared by
# get_perplexity() below for scoring text.
# NOTE(review): the file name suggests a binarized Spanish ARPA model; the
# path is relative, so the script must be run from the expected directory.
model = kenlm.Model("../es.arpa.bin")
def get_perplexity(doc):
    """Return the perplexity of ``doc`` under the module-level KenLM ``model``.

    The document is split on newline characters and every resulting line is
    scored independently. The per-line scores and per-line token counts are
    accumulated over the whole document and combined as
    ``10 ** (-total_score / total_tokens)``.
    """
    total_score = 0
    total_tokens = 0
    for sentence in doc.split("\n"):
        total_score += model.score(sentence)
        # Whitespace-separated tokens plus one extra per line -- presumably
        # the end-of-sentence token that KenLM includes in its score.
        total_tokens += len(sentence.split()) + 1
    # The base-10 exponent matches the scores being treated as log10 values.
    return 10.0 ** (-total_score / total_tokens)
# Emit one "<word count>,<perplexity>" row per JSONL record.
# The input is opened first so that a missing corpus file does not truncate an
# existing stats file, and both files use an explicit UTF-8 encoding so that
# decoding the text does not depend on the platform's default locale.
with open("mc4-es-train-50M-steps.jsonl", "r", encoding="utf-8") as data, \
        open("mc4-es-train-50M-stats.csv", "w", encoding="utf-8") as stats_file:
    for line in tqdm(data):
        text = json.loads(line)["text"]
        # Word count is the number of whitespace-separated tokens in the text.
        stats_file.write(f"{len(text.split())},{get_perplexity(text)}\n")
|