#!/usr/bin/env python
import kenlm
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer


TOTAL_SENTENCES = 20000


def pp(log_score, length):
    """Return the perplexity implied by a log10 score over *length* tokens.

    KenLM reports log10 probabilities, so perplexity is 10 raised to the
    negated average per-token log score.
    """
    avg_neg_log10 = -log_score / length
    return 10.0 ** avg_neg_log10


# --- Setup ---------------------------------------------------------------
# Multilingual sentence embedder; probed once below to learn its output dim.
embedder = "distiluse-base-multilingual-cased-v1"
embedder_model = SentenceTransformer(embedder)
embedding_shape = embedder_model.encode(["foo"])[0].shape[0]
# Spanish KenLM model (download separately):
# http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
model = kenlm.Model("es.arpa.bin")
# Stream mC4-es so the full corpus is never materialized on disk.
mc4 = load_dataset("mc4", "es", streaming=True)

count = 0
embeddings = []
lengths = []  # fixed typo: was "lenghts"
perplexities = []
sentences = []

# Collect TOTAL_SENTENCES lines from a buffer-shuffled stream, scoring each
# with the LM and embedding it; `total` is the known mC4-es document count.
for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
    lines = sample["text"].split("\n")
    for line in lines:
        count += 1
        log_score = model.score(line)
        # +1 presumably accounts for the implicit </s> token KenLM scores
        # alongside the whitespace-split words — TODO confirm.
        length = len(line.split()) + 1
        embedding = embedder_model.encode([line])[0]
        embeddings.append(embedding.tolist())
        perplexities.append(pp(log_score, length))
        lengths.append(length)
        sentences.append(line)
        # >= instead of == so the sentinel cannot be skipped if the counting
        # logic ever changes to advance by more than one.
        if count >= TOTAL_SENTENCES:
            break
    if count >= TOTAL_SENTENCES:
        # One column per embedding dimension alongside the raw sentence,
        # its token length, and its LM perplexity.
        embeddings = np.array(embeddings)
        df = pd.DataFrame({"sentence": sentences, "length": lengths, "perplexity": perplexities})
        for dim in range(embedding_shape):
            df[f"dim_{dim}"] = embeddings[:, dim]
        df.to_csv("mc4-es-perplexity-sentences.tsv", index=False, sep="\t")
        print("DONE!")
        break