bertin-roberta-base-spanish / get_embeddings_and_perplexity.py
Add script to generate dataset of embeddings and perplexities. Add script to generate t-SNE plot for embedding and perplexity visualization.
#!/usr/bin/env python
import kenlm
import numpy as np
import pandas as pd
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from tqdm import tqdm

TOTAL_SENTENCES = 20000


def pp(log_score, length):
    # kenlm returns log10 probabilities, so perplexity is 10 ** (-log10(P) / N).
    return 10.0 ** (-log_score / length)


# Multilingual sentence embedder used to embed each line.
embedder = "distiluse-base-multilingual-cased-v1"
embedder_model = SentenceTransformer(embedder)
# Probe the embedding dimensionality with a dummy sentence.
embedding_shape = embedder_model.encode(["foo"])[0].shape[0]

# Spanish KenLM language model from CC-Net:
# http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
model = kenlm.Model("es.arpa.bin")

# Stream mC4 (Spanish) so the full dataset never has to be downloaded.
mc4 = load_dataset("mc4", "es", streaming=True)
count = 0
embeddings = []
lengths = []
perplexities = []
sentences = []
# mC4-es has ~416M training examples; shuffle the stream with a large buffer
# and collect lines until TOTAL_SENTENCES have been processed.
for sample in tqdm(mc4["train"].shuffle(buffer_size=100_000), total=416057992):
    lines = sample["text"].split("\n")
    for line in lines:
        count += 1
        log_score = model.score(line)
        length = len(line.split()) + 1
        embedding = embedder_model.encode([line])[0]
        embeddings.append(embedding.tolist())
        perplexities.append(pp(log_score, length))
        lengths.append(length)
        sentences.append(line)
        if count == TOTAL_SENTENCES:
            break
    if count == TOTAL_SENTENCES:
        embeddings = np.array(embeddings)
        df = pd.DataFrame({"sentence": sentences, "length": lengths, "perplexity": perplexities})
        # Store each embedding dimension as its own column so the TSV is self-contained.
        for dim in range(embedding_shape):
            df[f"dim_{dim}"] = embeddings[:, dim]
        df.to_csv("mc4-es-perplexity-sentences.tsv", index=False, sep="\t")
        print("DONE!")
        break
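
The commit message above also mentions a t-SNE plot for embedding and perplexity visualization. As a rough illustration only, not the repo's plotting script, the sketch below loads the TSV written by this script and projects the dim_* columns to 2-D with t-SNE, coloring points by perplexity; scikit-learn and matplotlib are assumptions here, and only the file and column names come from the script above.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.manifold import TSNE

# Load the TSV produced by get_embeddings_and_perplexity.py (illustrative follow-up step).
df = pd.read_csv("mc4-es-perplexity-sentences.tsv", sep="\t")
dim_cols = [c for c in df.columns if c.startswith("dim_")]
embeddings = df[dim_cols].to_numpy()

# Project the sentence embeddings to 2-D with t-SNE.
projection = TSNE(n_components=2, random_state=0).fit_transform(embeddings)

# Color points by log10 perplexity, since perplexities span several orders of magnitude.
colors = np.log10(df["perplexity"].to_numpy())
plt.scatter(projection[:, 0], projection[:, 1], c=colors, s=2, cmap="viridis")
plt.colorbar(label="log10 perplexity")
plt.title("mC4-es sentence embeddings (t-SNE), colored by KenLM perplexity")
plt.savefig("mc4-es-tsne.png", dpi=150)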