import datasets
import nltk
from nltk.tokenize import sent_tokenize
import numpy as np

# sent_tokenize needs the Punkt sentence tokenizer models
nltk.download('punkt')

# Load the deduplicated Finnish corpus from disk (a DatasetDict with "train" and "validation" splits)
dataset = datasets.load_from_disk("/researchdisk/training_dataset_full_deduplicated")


def tokenize_sentences(example):
    # Split the document text into sentences with NLTK's Finnish Punkt model
    sentences = sent_tokenize(example["text"], "finnish")
    # Keep only sentences that contain at least five words
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= 5]
    # The trailing empty string becomes a blank line separating documents in the output files
    sentences.append("")
    example["text"] = "\n".join(sentences)
    return example
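
# Illustrative example (not part of the original script; the sample text is made up):
# a sentence with fewer than five words is dropped, and the kept text ends with "\n"
# so documents stay separated in the dumped files.
#   tokenize_sentences({"text": "Tämä on ensimmäinen esimerkkilause tässä skriptissä. Lyhyt lause."})
#   -> {"text": "Tämä on ensimmäinen esimerkkilause tässä skriptissä.\n"}
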
# Sentence-split both dataset splits in parallel across 64 worker processes
dataset["train"] = dataset["train"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)
dataset["validation"] = dataset["validation"].map(tokenize_sentences, num_proc=64, batched=False, writer_batch_size=100000)

# Dump both splits as plain text: one sentence per line, with a blank line between documents.
# Note: np.savetxt does not create directories, so the target directory must already exist.
np.savetxt('/researchdisk/training_dataset_sentences/train.txt', dataset["train"].to_pandas().values, fmt="%s")
np.savetxt('/researchdisk/training_dataset_sentences/validation.txt', dataset["validation"].to_pandas().values, fmt="%s")
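
# Optional sanity check (illustrative sketch, not part of the original pipeline):
# uncomment to peek at the first few lines of the written validation file.
# with open('/researchdisk/training_dataset_sentences/validation.txt') as f:
#     for _ in range(5):
#         print(f.readline().rstrip())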