# convbert-base-finnish / dataset_to_sentences.py
import datasets
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

# Fetch the Punkt sentence-tokenizer models (the bundle includes Finnish).
nltk.download('punkt')
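
# Load the deduplicated training corpus from disk; each example is assumed
# to carry one full document in its "text" field.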
dataset = datasets.load_from_disk("/researchdisk/training_dataset_full_deduplicated")


def tokenize_sentences(example):
    # Split the document into sentences with NLTK's Finnish Punkt model.
    sentences = sent_tokenize(example["text"], language="finnish")
    # Keep only sentences with at least five whitespace-separated tokens.
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
    # The trailing empty string yields a blank line after each document,
    # which separates documents in the final one-sentence-per-line files.
    sentences.append("")
    example["text"] = "\n".join(sentences)
    return example

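
# Sentence-split both splits in parallel (64 worker processes); the large
# writer_batch_size reduces Arrow write overhead during the map.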
dataset["train"] = dataset["train"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
dataset["validation"] = dataset["validation"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
np.savetxt('/researchdisk/training_dataset_sentences/train.txt', dataset["train"].to_pandas().values, fmt="%s")
np.savetxt('/researchdisk/training_dataset_sentences/validation.txt', dataset["validation"].to_pandas().values, fmt="%s")