File size: 633 Bytes
8fb54c7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
import json
import kenlm
from tqdm import tqdm

# Load the KenLM language model once at import time; get_perplexity() reads
# this module-level object for every line it scores. The path is relative.
# NOTE(review): the "es" in the filename suggests a Spanish model — confirm.
model = kenlm.Model("../es.arpa.bin")

def get_perplexity(doc):
    """Return the perplexity of *doc* under the module-level ``model``.

    The document is split on ``"\\n"`` and every resulting piece (including
    empty ones) is scored separately with ``model.score``. The scores are
    summed, and so are the per-line token counts, where each line contributes
    its whitespace-separated word count plus one.

    The result is ``10 ** (-total_score / total_tokens)``.
    NOTE(review): the base-10 exponent assumes ``model.score`` returns a
    log10 probability, and the extra token per line presumably accounts for
    an end-of-sentence marker — confirm against the KenLM docs.
    """
    total_log_score = 0
    total_tokens = 0
    for sentence in doc.split("\n"):
        # One extra token per line on top of the whitespace-delimited words.
        total_tokens += len(sentence.split()) + 1
        total_log_score += model.score(sentence)
    mean_neg_log_score = -total_log_score / total_tokens
    return 10.0 ** mean_neg_log_score

# Stream the JSONL corpus line by line and write one CSV row per document:
# "<whitespace word count>,<perplexity>". No header row is written.
#
# Both files are opened explicitly as UTF-8. Without an explicit encoding,
# open() falls back to the platform's locale encoding, which can fail or
# mis-decode the non-ASCII text in the JSONL input on non-UTF-8 systems.
# The output handle is not called `csv`, so it does not shadow the stdlib
# module of that name.
with open("mc4-es-train-50M-stats.csv", "w", encoding="utf-8") as stats_file, \
        open("mc4-es-train-50M-steps.jsonl", "r", encoding="utf-8") as data:
    for line in tqdm(data):
        # Each input line is a JSON object; only its "text" field is used.
        text = json.loads(line)["text"]
        stats_file.write(f"{len(text.split())},{get_perplexity(text)}\n")