{ "cells": [ { "cell_type": "code", "execution_count": 3, "id": "723b5d4d", "metadata": {}, "outputs": [], "source": [ "import jax\n", "import optax\n", "import flax\n", "import jax.numpy as jnp\n", "import datasets\n", "from flax.training import train_state\n", "from flax.training.common_utils import get_metrics, onehot, shard\n", "from datasets import load_dataset\n", "from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n", "from pathlib import Path\n", "import numpy as np\n", "import transformers\n", "from tqdm.notebook import tqdm\n", "from pathlib import Path\n", "from transformers import AutoConfig\n", "from typing import Dict, List, Optional, Tuple\n", "from transformers import AutoTokenizer\n", "from transformers import PreTrainedTokenizerBase\n", "from transformers import FlaxAutoModelForMaskedLM\n", "from dataclasses import dataclass, field\n", "import time\n", "import glob\n", "import random" ] }, { "cell_type": "code", "execution_count": 11, "id": "59076aa7", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 20 after adding /data/c4_cleaned\n" ] } ], "source": [ "#59G c4_cleaned compressed\n", "#937M nrc_uniq_cleaned_20210223 compressed\n", "#410M nu_uniq_cleaned_20210225 compressed\n", "#9.9G oscar_nl_cleaned compressed\n", "\n", "\n", "\n", "data_files = []\n", "SEED=42\n", "def add_jsonlines_dir(path):\n", " global data_files\n", " #data_files += glob.glob(f\"{path}/*47*.gz\")\n", " #data_files += glob.glob(f\"{path}/*32*.gz\")\n", " #data_files += glob.glob(f\"{path}/*59*.gz\")\n", " data_files += glob.glob(f\"{path}/*11*.gz\")\n", " print(f\"Number of files {len(data_files)} after adding {path}\")\n", " \n", "add_jsonlines_dir(\"/data/c4_cleaned\")\n", "#add_jsonlines_dir(\"/data/nrc_uniq_cleaned_20210223\")\n", "#add_jsonlines_dir(\"/data/nu_uniq_cleaned_20210225\")\n", "#add_jsonlines_dir(\"/data/oscar_nl_cleaned\") This one gives an error like field url not in \n", "\n" ] }, { "cell_type": "code", "execution_count": 5, "id": "fc9519d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 3 after adding /home/dat/subset_c4_cleannl\n", "Got 3 training files and 3 validation files\n" ] }, { "ename": "AssertionError", "evalue": "Train overlaps with test", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_309684/1129081061.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[0mval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdata_files\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 21\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"Got {len(train)} training files and {len(val)} validation files\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 22\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlist\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtrain\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m&\u001b[0m \u001b[0mset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mval\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;34m[\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Train overlaps with 
test\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 23\u001b[0m \u001b[0mdatasets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'json'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_files\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validation'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mval\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 24\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mAssertionError\u001b[0m: Train overlaps with test" ] } ], "source": [ "#59G c4_cleaned compressed\n", "#937M nrc_uniq_cleaned_20210223 compressed\n", "#410M nu_uniq_cleaned_20210225 compressed\n", "#9.9G oscar_nl_cleaned compressed\n", "\n", "\n", "\n", "data_files = []\n", "SEED=42\n", "def add_jsonlines_dir(path,filespec):\n", " global data_files\n", " data_files += glob.glob(f\"{path}/{filespec}\")\n", " print(f\"Number of files {len(data_files)} after adding {path}\")\n", " \n", "add_jsonlines_dir(\"/home/dat/subset_c4_cleannl\",\"*.gz\") \n", "#add_jsonlines_dir(\"/data/oscar_nl_cleaned\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nrc_cleaned_idtextfmt\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\",\"*.gz\")\n", "random.Random(SEED).shuffle(data_files)\n", "total = len(data_files)\n", "val_size = int(0.05 * total)\n", "train_size = total - val_size\n", "print(f\"95%: {train_size}\")\n", "train = data_files[:train_size]\n", "val = data_files[train_size:]\n", "print(f\"Got {len(train)} training files and {len(val)} validation files\")\n", "assert list(set(train) & set(val)) == [], \"Train overlaps with test\"\n", "datasets = load_dataset('json', data_files={'train': train, 'validation': val})\n", "\n", "\n", "assert list(set(train) & set(val)) == [], 'train overlaps with test'\n" ] }, { "cell_type": "code", "execution_count": null, "id": "ed0e2b82", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "id": "b5839c79", "metadata": {}, "outputs": [], "source": [ "\n", " add_jsonlines_dir(\"/home/dat/subset_c4_cleannl\") \n", " add_jsonlines_dir(\"/data/oscar_nl_cleaned\")\n", " add_jsonlines_dir(\"/data/nrc_cleaned_idtextfmt\")\n", " add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\")\n", " random.Random(SEED).shuffle(data_files)\n", " total = len(data_files)\n", " val_size = int(0.05 * total)\n", " train_size = total - val_size\n", " print(f\"95%: {train_size}\")\n", " train = data_files\n", " val = data_files\n", " print(f\"Got {len(train)} training files and {len(val)} validation files\")\n", " assert list(set(train) & set(val)) == [], \"Train overlaps with test\"\n", " return train, val\n", " train, val = train_val_files()\n", " datasets = load_dataset('json', data_files={'train': train, 'validation': val})" ] }, { "cell_type": "code", "execution_count": 4, "id": "6685589f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "from tokenizers import ByteLevelBPETokenizer\n", "tokenizer = ByteLevelBPETokenizer()\n", "\n", "def batch_iterator(batch_size=1000):\n", " for i in range(0, len(datasets), batch_size):\n", " yield datasets[\"train\"][i: i + batch_size][\"text\"]\n", "\n", "tokenizer.train_from_iterator(batch_iterator(), vocab_size=50358, min_frequency=2, special_tokens=[\n", " \"\",\n", " 
\"\",\n", " \"\",\n", " \"\",\n", " \"\",\n", "])" ] }, { "cell_type": "code", "execution_count": 5, "id": "5fed49b4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "39503" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.get_vocab_size()" ] }, { "cell_type": "code", "execution_count": 6, "id": "69401680", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/dat/pino-roberta-base\n" ] } ], "source": [ "cd ~/pino-roberta-base" ] }, { "cell_type": "code", "execution_count": 7, "id": "7a98d754", "metadata": {}, "outputs": [], "source": [ "tokenizer.save(\"tokenizer.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e686b9c8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration nl-lang=nl\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset cc100/nl (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/dat/.cache/huggingface/datasets/cc100/nl-lang=nl/0.0.0/b583dd47b0dd43a3c3773075abd993be12d0eee93dbd2cfe15a0e4e94d481e80...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8bb6155775084c42841d5a786a3f014c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/8.42G [00:00