|
|
|
import kenlm
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
|
|
|
|
|
# Number of lines to sample from the mC4 stream before writing the TSV and stopping.
TOTAL_SENTENCES = 20000
|
def pp(log_score, length):
    """Turn a KenLM total log10 score into a perplexity.

    KenLM's ``score`` returns the log10 probability of a sentence;
    perplexity is ``10 ** (-log10_prob / token_count)``.
    """
    exponent = -log_score / length
    return pow(10.0, exponent)
|
|
|
|
|
# Multilingual sentence embedder; probe its output dimensionality once so the
# DataFrame below can be given one column per embedding dimension.
embedder = "distiluse-base-multilingual-cased-v1"
embedder_model = SentenceTransformer(embedder)
embedding_shape = embedder_model.encode(["foo"])[0].shape[0]

# KenLM Spanish language model (binarized ARPA) used to score each line.
model = kenlm.Model("es.arpa.bin")

# Stream the Spanish split of mC4 so the full corpus is never materialized.
mc4 = load_dataset("mc4", "es", streaming=True)

count = 0
embeddings = []
lengths = []  # token count per line (renamed from the misspelled `lenghts`)
perplexities = []
sentences = []

# `total` only sizes tqdm's progress bar (presumably the example count of
# mc4/es — TODO confirm); the loop actually stops at TOTAL_SENTENCES.
for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
    for line in sample["text"].split("\n"):
        count += 1
        log_score = model.score(line)
        # +1 so length is never zero for an empty line, avoiding a division
        # by zero inside pp().
        length = len(line.split()) + 1
        embedding = embedder_model.encode([line])[0]
        embeddings.append(embedding.tolist())
        perplexities.append(pp(log_score, length))
        lengths.append(length)
        sentences.append(line)
        # `>=` instead of `==`: robust even if the counter were ever advanced
        # past the target between checks.
        if count >= TOTAL_SENTENCES:
            break
    if count >= TOTAL_SENTENCES:
        embeddings = np.array(embeddings)
        df = pd.DataFrame(
            {"sentence": sentences, "length": lengths, "perplexity": perplexities}
        )
        # Spread the embedding matrix into one column per dimension.
        for dim in range(embedding_shape):
            df[f"dim_{dim}"] = embeddings[:, dim]
        # index=False (not index=None) is the documented way to omit the index.
        df.to_csv("mc4-es-perplexity-sentences.tsv", index=False, sep="\t")
        print("DONE!")
        break
|
|