import csv

import datasets
import nltk
import numpy as np
from nltk.tokenize import sent_tokenize

nltk.download('punkt')

# Load the deduplicated Finnish training dataset from disk.
dataset = datasets.load_from_disk("/researchdisk/training_dataset_full_deduplicated")


def tokenize_sentences(example):
    # Split each document into sentences with the Finnish Punkt model,
    # drop sentences shorter than 5 words, and append an empty string so
    # documents stay separated by a blank line in the output.
    sentences = sent_tokenize(example["text"], "finnish")
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
    sentences.append("")
    example["text"] = "\n".join(sentences)
    return example


dataset["train"] = dataset["train"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
dataset["validation"] = dataset["validation"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)

# Write the sentence-split text to plain-text files, one row per document.
np.savetxt('/researchdisk/training_dataset_sentences/train.txt', dataset["train"].to_pandas().values, fmt="%s")
np.savetxt('/researchdisk/training_dataset_sentences/validation.txt', dataset["validation"].to_pandas().values, fmt="%s")