{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 71,
   "id": "723b5d4d",
   "metadata": {},
   "outputs": [],
   "source": [
    "import jax\n",
    "import optax\n",
    "import flax\n",
    "import jax.numpy as jnp\n",
    "import datasets\n",
    "from flax.training import train_state\n",
    "from flax.training.common_utils import get_metrics, onehot, shard\n",
    "from datasets import load_dataset\n",
    "from tokenizers import trainers, Tokenizer, normalizers, ByteLevelBPETokenizer\n",
    "from pathlib import Path\n",
    "import numpy as np\n",
    "import transformers\n",
    "from tqdm.notebook import tqdm\n",
    "from pathlib import Path\n",
    "from transformers import AutoConfig\n",
    "from typing import Dict, List, Optional, Tuple\n",
    "from transformers import AutoTokenizer\n",
    "from transformers import PreTrainedTokenizerBase\n",
    "from transformers import FlaxAutoModelForMaskedLM\n",
    "from dataclasses import dataclass, field\n",
    "import time\n",
    "import glob\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "id": "f4a5edee",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoConfig\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "9241a429",
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import AutoTokenizer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "348a4dd4",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the tokenizer and the model from the current directory and wrap them\n",
    "# in a fill-mask pipeline.\n",
    "# FlaxBigBirdForMaskedLM is imported here because it is the class used below;\n",
    "# the original import list omitted it (NameError) and listed `pipeline` twice.\n",
    "from transformers import AutoTokenizer, BigBirdForMaskedLM, FlaxBigBirdForMaskedLM, pipeline\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(\".\")\n",
    "model = FlaxBigBirdForMaskedLM.from_pretrained(\".\")\n",
    "# NOTE(review): pipeline() may not accept Flax models in the installed\n",
    "# transformers version. If it fails here, try loading the PyTorch class instead:\n",
    "# BigBirdForMaskedLM.from_pretrained(\".\", from_flax=True) - TODO confirm.\n",
    "\n",
    "unmasker = pipeline('fill-mask',tokenizer=tokenizer, model=model)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4b1bb489",
   "metadata": {},
   "outputs": [],
   "source": [
    "# A fill-mask pipeline needs the tokenizer's mask token in its input;\n",
    "# a plain \"test\" string has nothing to fill in and is rejected.\n",
    "unmasker(f\"test {tokenizer.mask_token}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "595f318e",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "36450"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer.vocab_size"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6b89ed82",
   "metadata": {},
   "outputs": [],
   "source": [
    "import transformers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5ad59ec0",
   "metadata": {},
   "outputs": [],
   "source": [
    "x = transformers.FillMaskPipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4cb13b65",
   "metadata": {},
   "outputs": [],
   "source": [
    "# TODO(review): this cell held the incomplete statement `.mask_token = `\n",
    "# (no target object, no value), which is a SyntaxError and breaks Run All.\n",
    "# Restore the intended assignment or delete this cell."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 46,
   "id": "617073b7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'input_ids': [4, 3620], 'attention_mask': [1, 1]}"
      ]
     },
     "execution_count": 46,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "tokenizer(\" word\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "48daf2ec",
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "\n",
    "config = AutoConfig.from_pretrained(\"google/bigbird-roberta-base\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "fc816572",
   "metadata": {},
   "outputs": [],
   "source": [
    "config.save_pretrained(\"./\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "id": "39b9fc3d",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2848"
      ]
     },
     "execution_count": 60,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len([x for x in data_files if isinstance(x, str)])"
   ]
  },
  { "cell_type": 
"code", "execution_count": 61, "id": "ba855add", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00943-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00018-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01012-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00625-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00070-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00108-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00315-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00056-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00140-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00128-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00221-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00394-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00469-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00547-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00444-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00000-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00129-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00229-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00335-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00792-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00090-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00584-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00986-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00618-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00824-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00114-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00034-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00465-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00185-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01013-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00310-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00071-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00030-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00132-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00074-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00480-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00460-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00847-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00783-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00141-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00967-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00145-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00586-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00188-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00745-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00047-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00850-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00124-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00952-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00333-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00005-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00760-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00882-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00581-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00164-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00120-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00509-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00167-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00180-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00017-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00167-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01004-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00756-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00728-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00033-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00551-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00132-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00231-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00924-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00725-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00362-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00123-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01000-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00161-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00344-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00213-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00721-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01011-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00446-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00235-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00061-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00671-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00294-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00177-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00081-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00407-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00113-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00030-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00293-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00147-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00698-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00598-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00006-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00354-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00860-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00841-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00481-of-01024.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00129-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00109-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00478-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00667-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00390-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00525-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00449-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00126-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00016-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00197-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00762-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00247-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00982-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00130-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00184-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00063-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00445-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00116-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00675-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00838-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00726-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00191-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00832-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00110-of-01024.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00012-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00098-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00257-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00130-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00659-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00355-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00487-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00324-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00459-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00439-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01007-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00706-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00512-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00073-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00051-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00911-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00013-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00987-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00188-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00220-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00885-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00905-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00813-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00326-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01015-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00457-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00562-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00503-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00845-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00755-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00969-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00949-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00668-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00042-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00146-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00302-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00050-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00002-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00068-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00608-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00616-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00573-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00127-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00171-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00149-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00516-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00176-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00451-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00597-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00311-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00747-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00430-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00743-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00561-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00161-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00534-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00029-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00448-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00022-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00096-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00736-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00672-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00533-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00295-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00438-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00776-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00176-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00375-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00298-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00501-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00145-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00152-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00524-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00693-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00252-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00772-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00102-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00468-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00165-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00199-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00409-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00095-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00816-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00136-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00908-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00942-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00611-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00304-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00825-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00880-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00157-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00643-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00602-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00050-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00009-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00995-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00175-of-01024.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00025-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00147-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00039-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00104-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00140-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00278-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00663-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00058-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00846-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00314-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00486-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00073-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00622-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00153-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00630-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00042-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00740-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00172-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00121-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01023-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00156-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00759-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00148-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00007-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00811-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00270-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00360-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00541-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00176-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00121-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00948-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00628-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00106-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00208-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00172-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00072-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00920-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00006-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00582-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00983-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00594-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00461-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00134-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00118-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00081-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00093-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00277-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00377-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00034-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00424-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00421-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00162-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00488-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01016-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00703-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00748-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00866-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00096-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00170-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00248-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00669-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00436-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00085-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00036-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00058-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00695-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01020-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00817-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00844-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00477-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00224-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00464-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00564-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00442-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00065-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00592-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01017-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00181-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00273-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00957-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00153-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00035-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00543-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00940-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00526-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00275-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00161-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00812-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00858-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00992-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00769-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00015-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00753-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00413-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00435-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00351-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00031-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00180-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00021-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00734-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00854-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00859-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00018-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00219-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00836-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00895-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01009-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00632-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00530-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00508-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00709-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00077-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00578-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00945-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00182-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00139-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00047-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00035-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00035-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00387-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00001-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00804-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00474-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00383-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00181-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01021-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00679-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00338-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00179-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00042-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00961-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00010-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00091-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00423-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00290-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00947-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00133-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00380-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00946-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00604-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00045-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00082-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00493-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00552-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00152-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00732-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00427-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00216-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00746-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00057-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00781-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00918-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00134-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00046-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00453-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00099-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00704-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00361-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00067-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00510-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00088-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00210-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00325-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00605-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00080-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00651-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00367-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00822-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00041-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00358-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00142-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00491-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00892-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00190-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00356-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00068-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00359-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00087-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00452-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00554-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00259-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00086-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00904-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00012-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00105-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00032-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00996-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00192-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00443-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00909-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00938-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00162-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00214-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00447-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00839-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00856-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00476-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00371-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00504-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00253-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00921-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00408-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00000-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00973-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01001-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00048-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00179-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00146-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00261-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00902-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00092-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00126-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00026-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00154-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00640-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00994-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00156-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00228-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00038-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00654-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00852-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00128-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00268-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00933-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00492-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00056-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00232-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00808-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00398-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00401-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00386-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00179-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00650-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00197-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00093-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00114-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00626-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00317-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00336-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00012-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00606-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00340-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00861-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00089-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00515-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00378-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00684-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00647-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00196-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00025-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00266-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00204-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00022-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00097-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00160-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00059-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00113-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00182-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00060-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00368-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00001-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00353-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00062-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00198-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00175-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00026-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00143-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00016-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00008-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00189-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00334-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00071-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00519-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00773-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00159-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00624-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00105-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00109-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00112-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00754-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00184-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01005-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00289-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00136-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00194-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00775-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00768-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00402-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00868-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00827-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00689-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00894-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00802-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00980-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00661-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00523-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00631-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00073-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00490-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00473-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00173-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00319-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00791-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00321-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00194-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00715-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00132-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00100-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00053-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00433-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00455-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00142-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00011-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00585-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00798-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00842-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00050-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00657-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00102-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00750-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00072-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00716-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00702-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00285-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00002-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00020-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00761-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01014-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00553-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00181-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00500-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00287-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00422-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00076-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00511-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00246-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00092-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00159-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00320-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00869-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00194-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00031-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00855-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00158-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00098-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00102-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00686-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00167-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00887-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00737-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00155-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00069-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00016-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00299-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00168-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00462-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00416-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00627-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00567-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00559-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00799-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00364-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00172-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00521-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00187-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00062-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00119-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00030-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00027-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00739-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00041-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00888-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00934-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00019-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00154-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00330-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00786-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00939-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00066-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00043-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00881-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00391-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00112-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00332-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00593-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01022-of-01024.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00127-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00141-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00629-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00953-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00242-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00054-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00112-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00207-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00990-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00463-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00145-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00009-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00713-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00414-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00119-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00874-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00682-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00150-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00122-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00193-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00535-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00610-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00198-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00023-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00212-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00470-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00678-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00192-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00117-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00040-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00027-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00349-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00576-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00549-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00023-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00612-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00171-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00003-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00502-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00084-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00884-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00309-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00690-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00696-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00784-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00031-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00280-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00697-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00536-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00195-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00712-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00107-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00454-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00150-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00203-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00806-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00999-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00496-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00404-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00857-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00771-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00185-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00045-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00475-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00575-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00879-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00357-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00665-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00057-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00951-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00979-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00906-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00062-of-01024.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00024-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00495-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00692-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00095-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00014-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00201-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00805-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00039-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00388-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00032-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00589-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00186-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00677-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00411-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00641-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00061-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00079-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00028-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00119-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00527-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00346-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00720-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00829-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00558-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00064-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00676-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00774-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00574-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00899-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00596-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00074-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00069-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00125-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00341-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00456-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00393-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00020-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00258-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00514-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00108-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00271-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00089-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00091-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00146-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00044-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00590-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00008-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00074-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00914-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00296-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00800-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00163-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00190-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00484-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00144-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00100-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00991-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00965-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00507-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00916-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00563-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00269-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00123-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00084-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00889-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00744-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00862-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00777-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00719-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00014-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00087-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00144-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00051-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00196-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00226-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00071-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00028-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00024-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00080-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00428-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00579-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00619-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00607-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00968-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00052-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00020-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00128-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00620-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00372-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00187-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00090-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00099-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00084-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00100-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00171-of-01024.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00120-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00544-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00997-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00078-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00801-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00041-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00699-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00571-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00054-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00034-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00871-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00710-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00653-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00803-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00107-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00382-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00539-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00155-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00782-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00531-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00912-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00680-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00714-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00052-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00255-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00944-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00494-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00603-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00316-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00049-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00158-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00191-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00010-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00066-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00322-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00250-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00656-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00963-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00262-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00168-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00810-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00072-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00138-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00272-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00313-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00318-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00863-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00077-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00126-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00742-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00738-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00780-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00189-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00658-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00701-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00286-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00568-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00853-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00931-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00717-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00138-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00046-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00059-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00118-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00993-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00158-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00166-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00044-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00864-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00017-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00093-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00638-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00327-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00385-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00988-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00117-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00120-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00237-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00941-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00307-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00157-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00705-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00613-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00014-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00244-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00978-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00113-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00730-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00151-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00583-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00870-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00415-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00183-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00065-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00075-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00193-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00022-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00935-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00376-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00192-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00793-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00149-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00160-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00365-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00412-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00998-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00039-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00621-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00182-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00615-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00040-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00168-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00148-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00117-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00345-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00241-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00013-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00896-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00149-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " 
'/data/nu_uniq_cleaned_20210225/part-00061-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00180-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00082-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00114-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00021-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00153-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00420-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00002-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00056-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00007-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00110-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00160-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00018-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00164-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00174-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00555-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00028-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00985-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00397-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00588-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00101-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00066-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00797-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00143-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00103-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00954-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00649-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00722-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00545-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00700-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00254-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00482-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00079-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00550-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00645-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00572-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00837-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00329-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00108-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00635-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00116-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00054-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00283-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00080-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00037-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00483-of-01024.json.gz',\n", " 
'/data/nrc_uniq_cleaned_20210223/part-00059-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00591-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00694-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00134-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00225-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00206-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00970-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00569-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00169-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00472-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00929-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00130-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00300-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00138-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00890-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00005-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00891-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00913-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00038-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00019-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00518-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00830-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-01002-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00363-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00789-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00053-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00683-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00111-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00431-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00223-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00809-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00767-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00642-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00218-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00052-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00685-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00876-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00347-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00027-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00024-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00828-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00075-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00133-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00927-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00831-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00749-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00279-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00005-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00634-of-01024.json.gz',\n", " 
'/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00546-of-01024.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00055-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00849-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00165-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00209-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00029-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00198-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00757-of-01024.json.gz',\n", " '/data/nrc_uniq_cleaned_20210223/part-00124-47c217ad-de48-4b5e-b177-ef4bcd69ad2b-c000.json.gz',\n", " '/data/nu_uniq_cleaned_20210225/part-00105-f31777cc-bf2a-4d2c-95c8-22f5eec50039-c000.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00323-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00498-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00795-of-01024.json.gz',\n", " '/data/c4_cleaned2/cleaned2_c4-nl.tfrecord-00366-of-01024.json.gz',\n", " ...]" ] }, "execution_count": 61, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_files" ] },
{ "cell_type": "code", "execution_count": null, "id": "59076aa7", "metadata": {}, "outputs": [], "source": [
"import glob\n",
"import random\n",
"\n",
"SEED = 12345\n",
"\n",
"# Module-level accumulator extended by add_jsonlines_dir(); reset on every run of this cell\n",
"# so that re-running it never picks up entries left over from an earlier kernel state.\n",
"data_files = []\n",
"\n",
"\n",
"def add_jsonlines_dir(path, filespec):\n",
"    \"\"\"Add every file matching ``{path}/{filespec}`` to the global ``data_files`` list.\n",
"\n",
"    Duplicates are dropped and the list is kept sorted, so its order (and therefore any\n",
"    seeded shuffle applied to it later) does not depend on set iteration order, which\n",
"    can differ between interpreter sessions for strings.\n",
"\n",
"    Args:\n",
"        path: Directory to search.\n",
"        filespec: Glob pattern relative to ``path``, e.g. ``\"*.gz\"``.\n",
"    \"\"\"\n",
"    global data_files\n",
"    data_files = sorted(set(data_files) | set(glob.glob(f\"{path}/{filespec}\")))\n",
"    print(f\"Number of files {len(data_files)} after adding {path} glob {filespec}\")\n",
"\n",
"\n",
"add_jsonlines_dir(\"/data/c4_cleaned2\", \"*.gz\")\n",
"add_jsonlines_dir(\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n",
"add_jsonlines_dir(\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")\n",
"# Show the count only; displaying the list itself dumps thousands of paths into the notebook.\n",
"len(data_files)"
] },
{ "cell_type": "code", "execution_count": 38, "id": "7c5980cd", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 1424 after adding /data/c4_cleaned2 glob *.gz\n", "Number of files 1424 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n" ] } ], "source": [ "train_val_files()" ] },
{ "cell_type": "code", "execution_count": null, "id": "969b8fa4", "metadata": {}, "outputs": [], "source": [
"# File-level train/validation split of data_files.\n",
"# Shuffle a sorted copy: the input order is then fixed, so the seeded shuffle yields the\n",
"# same split on every run, and data_files itself is left untouched (cell is re-runnable).\n",
"shuffled_files = sorted(data_files)\n",
"random.Random(SEED).shuffle(shuffled_files)\n",
"total = len(shuffled_files)\n",
"print(total)\n",
"perc = 0.05  # fraction of the files held out for validation\n",
"val_size = int(perc * total)\n",
"train_size = total - val_size\n",
"train = shuffled_files[:train_size]\n",
"val = shuffled_files[train_size:]\n",
"print(f\"Got {len(train)} training files and {perc*100} % {len(val)} validation files\")\n",
"assert set(train).isdisjoint(val), \"Train overlaps with test\""
] },
{ "cell_type": "code", "execution_count": null, "id": "f92c2b13", "metadata": {}, "outputs": [], "source": [
"# Rebuild the file list from scratch with add_jsonlines_dir() (defined in an earlier cell).\n",
"data_files = []\n",
"add_jsonlines_dir(\"/data/c4_cleaned2\", \"*.gz\")\n",
"add_jsonlines_dir(\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n",
"add_jsonlines_dir(\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")"
] },
{ "cell_type": "code", "execution_count": null, "id": "1ffb5036", "metadata": {}, "outputs": [], "source": [
"from datasets import DatasetDict, load_dataset\n",
"\n",
"# Load all files with the \"json\" builder and slice its \"train\" split by percentage:\n",
"# the first 5% of the rows are held out for validation, the remaining 95% are trained on.\n",
"# NOTE: the slices are taken over rows in data_files order, so the validation rows come\n",
"# from the files listed first.\n",
"# NOTE: this rebinds the name `datasets` from the imported module to a DatasetDict.\n",
"datasets = DatasetDict(\n",
"    {\n",
"        \"train\": load_dataset(\"json\", data_files=data_files, split=\"train[5%:]\"),\n",
"        \"validation\": load_dataset(\"json\", data_files=data_files, split=\"train[:5%]\"),\n",
"    }\n",
")"
] }, { "cell_type": "code", "execution_count": 34, "id": "31e5a164", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 1424 after adding /data/c4_cleaned2 glob *.gz\n", "Number of files 1424 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n", "1424\n", "Got 1353 training files and 5.0 % 71 validation files\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "WARNING:datasets.builder:Using custom data configuration default-28929211ee23e224\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset json/default (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/dat/.cache/huggingface/datasets/json/default-28929211ee23e224/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e7b73482da6744639826bd7a677f17ff", "version_major": 2, "version_minor": 0 }, "text/plain": [ "0 tables [00:00, ?
tables/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "ename": "JSONDecodeError", "evalue": "Extra data: line 2 column 1 (char 651)", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mArrowInvalid\u001b[0m Traceback (most recent call last)", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/packaged_modules/json/json.py\u001b[0m in \u001b[0;36m_generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 80\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"rb\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 81\u001b[0;31m pa_table = paj.read_json(\n\u001b[0m\u001b[1;32m 82\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mread_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpa_read_options\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparse_options\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mconfig\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpa_parse_options\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/_json.pyx\u001b[0m in \u001b[0;36mpyarrow._json.read_json\u001b[0;34m()\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/error.pxi\u001b[0m in \u001b[0;36mpyarrow.lib.pyarrow_internal_check_status\u001b[0;34m()\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/pyarrow/error.pxi\u001b[0m in \u001b[0;36mpyarrow.lib.check_status\u001b[0;34m()\u001b[0m\n", "\u001b[0;31mArrowInvalid\u001b[0m: JSON parse error: Missing a closing quotation mark in string. 
in row 93", "\nDuring handling of the above exception, another exception occurred:\n", "\u001b[0;31mJSONDecodeError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_371965/265278772.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mval\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mtrain_val_files\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 2\u001b[0;31m \u001b[0mdatasets\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mload_dataset\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'json'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdata_files\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m{\u001b[0m\u001b[0;34m'train'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mtrain\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'validation'\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mval\u001b[0m\u001b[0;34m}\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/load.py\u001b[0m in \u001b[0;36mload_dataset\u001b[0;34m(path, name, data_dir, data_files, split, cache_dir, features, download_config, download_mode, ignore_verifications, keep_in_memory, save_infos, script_version, use_auth_token, task, streaming, **config_kwargs)\u001b[0m\n\u001b[1;32m 839\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 840\u001b[0m \u001b[0;31m# Download and prepare data\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 841\u001b[0;31m builder_instance.download_and_prepare(\n\u001b[0m\u001b[1;32m 842\u001b[0m \u001b[0mdownload_config\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_config\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 843\u001b[0m 
\u001b[0mdownload_mode\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdownload_mode\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36mdownload_and_prepare\u001b[0;34m(self, download_config, download_mode, ignore_verifications, try_from_hf_gcs, dl_manager, base_path, use_auth_token, **download_and_prepare_kwargs)\u001b[0m\n\u001b[1;32m 581\u001b[0m \u001b[0mlogger\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwarning\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"HF google storage unreachable. Downloading and preparing it from source\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 582\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0mdownloaded_from_gcs\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 583\u001b[0;31m self._download_and_prepare(\n\u001b[0m\u001b[1;32m 584\u001b[0m \u001b[0mdl_manager\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mdl_manager\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverify_infos\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mverify_infos\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mdownload_and_prepare_kwargs\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 585\u001b[0m )\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_download_and_prepare\u001b[0;34m(self, dl_manager, verify_infos, **prepare_split_kwargs)\u001b[0m\n\u001b[1;32m 659\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 660\u001b[0m \u001b[0;31m# Prepare split will record examples associated to the split\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 661\u001b[0;31m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_prepare_split\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0msplit_generator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mprepare_split_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 662\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mOSError\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 663\u001b[0m raise OSError(\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/builder.py\u001b[0m in \u001b[0;36m_prepare_split\u001b[0;34m(self, split_generator)\u001b[0m\n\u001b[1;32m 1125\u001b[0m \u001b[0mgenerator\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_generate_tables\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m**\u001b[0m\u001b[0msplit_generator\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mgen_kwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1126\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mArrowWriter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minfo\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfeatures\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpath\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mfpath\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mwriter\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1127\u001b[0;31m for key, table in utils.tqdm(\n\u001b[0m\u001b[1;32m 1128\u001b[0m \u001b[0mgenerator\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0munit\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\" tables\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mleave\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mFalse\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mdisable\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mbool\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_verbosity\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0mlogging\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mNOTSET\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1129\u001b[0m ):\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/tqdm/notebook.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 252\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 253\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 254\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0msuper\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtqdm_notebook\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m__iter__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 255\u001b[0m \u001b[0;31m# return super(tqdm...) 
will not catch exception\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 256\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/tqdm/std.py\u001b[0m in \u001b[0;36m__iter__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1176\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1177\u001b[0m \u001b[0;32mtry\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1178\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mobj\u001b[0m \u001b[0;32min\u001b[0m \u001b[0miterable\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 1179\u001b[0m \u001b[0;32myield\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1180\u001b[0m \u001b[0;31m# Update and possibly print the progressbar.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m~/.local/lib/python3.8/site-packages/datasets/packaged_modules/json/json.py\u001b[0m in \u001b[0;36m_generate_tables\u001b[0;34m(self, files)\u001b[0m\n\u001b[1;32m 84\u001b[0m \u001b[0;32mexcept\u001b[0m \u001b[0mpa\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mArrowInvalid\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 85\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mopen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoding\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m\"utf-8\"\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mf\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 86\u001b[0;31m \u001b[0mdataset\u001b[0m \u001b[0;34m=\u001b[0m 
\u001b[0mjson\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mload\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mf\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 87\u001b[0m raise ValueError(\n\u001b[1;32m 88\u001b[0m \u001b[0;34mf\"Not able to read records in the JSON file at {file}. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/python3.8/json/__init__.py\u001b[0m in \u001b[0;36mload\u001b[0;34m(fp, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 291\u001b[0m \u001b[0mkwarg\u001b[0m\u001b[0;34m;\u001b[0m \u001b[0motherwise\u001b[0m\u001b[0;31m \u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0mJSONDecoder\u001b[0m\u001b[0;31m`\u001b[0m\u001b[0;31m`\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0mused\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 292\u001b[0m \"\"\"\n\u001b[0;32m--> 293\u001b[0;31m return loads(fp.read(),\n\u001b[0m\u001b[1;32m 294\u001b[0m \u001b[0mcls\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mcls\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mobject_hook\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mobject_hook\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 295\u001b[0m \u001b[0mparse_float\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparse_float\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mparse_int\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mparse_int\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/python3.8/json/__init__.py\u001b[0m in \u001b[0;36mloads\u001b[0;34m(s, cls, object_hook, parse_float, parse_int, parse_constant, object_pairs_hook, **kw)\u001b[0m\n\u001b[1;32m 355\u001b[0m \u001b[0mparse_int\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m \u001b[0;32mand\u001b[0m \u001b[0mparse_float\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m 
\u001b[0;32mand\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 356\u001b[0m parse_constant is None and object_pairs_hook is None and not kw):\n\u001b[0;32m--> 357\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0m_default_decoder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdecode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 358\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 359\u001b[0m \u001b[0mcls\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mJSONDecoder\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;32m/usr/lib/python3.8/json/decoder.py\u001b[0m in \u001b[0;36mdecode\u001b[0;34m(self, s, _w)\u001b[0m\n\u001b[1;32m 338\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0m_w\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mend\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 339\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mend\u001b[0m \u001b[0;34m!=\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0ms\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 340\u001b[0;31m \u001b[0;32mraise\u001b[0m \u001b[0mJSONDecodeError\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"Extra data\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0ms\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mend\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 341\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mobj\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 342\u001b[0m 
\u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mJSONDecodeError\u001b[0m: Extra data: line 2 column 1 (char 651)" ] } ], "source": [ "train, val = train_val_files()\n", "datasets = load_dataset('json', data_files={'train': train, 'validation': val})" ] }, { "cell_type": "code", "execution_count": 70, "id": "69445179", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 1024 after adding /data/c4_cleaned2 glob *json.gz\n", "Number of files 1224 after adding /data/nrc_uniq_cleaned_20210223 glob *.gz\n", "Number of files 1424 after adding /data/nu_uniq_cleaned_20210225 glob *.gz\n" ] } ], "source": [ "import glob\n", "import random\n", "SEED = 12345\n", "data_files = []\n", "def add_jsonlines_dir(path, filespec):\n", "    \"\"\"Add the files in `path` that match the glob `filespec` to the global `data_files`.\n", "\n", "    Duplicates are dropped and the list is kept sorted. Sorting matters: the order of\n", "    `list(set(...))` depends on string hash randomization, so it differs between kernel\n", "    sessions and would make any later seeded shuffle/split irreproducible.\n", "    \"\"\"\n", "    global data_files\n", "    data_files += glob.glob(f\"{path}/{filespec}\")\n", "    data_files = sorted(set(data_files))\n", "    print(f\"Number of files {len(data_files)} after adding {path} glob {filespec}\")\n", "\n", "add_jsonlines_dir(\"/data/c4_cleaned2\", \"*json.gz\")\n", "add_jsonlines_dir(\"/data/nrc_uniq_cleaned_20210223\", \"*.gz\")\n", "add_jsonlines_dir(\"/data/nu_uniq_cleaned_20210225\", \"*.gz\")" ] }, { "cell_type": "code", "execution_count": 29, "id": "5b1c04f8", "metadata": { "scrolled": true }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "WARNING:datasets.builder:Using custom data configuration default-5aa1f90e962b1369\n", "WARNING:datasets.builder:Reusing dataset json (/home/dat/.cache/huggingface/datasets/json/default-5aa1f90e962b1369/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723)\n" ] } ], "source": [ "datasets = load_dataset('json', data_files={'train': train, 'validation': val})" ] }, { "cell_type": "code", "execution_count": 21, "id": "47db602d", "metadata": {}, "outputs": [], "source": [ "dataset_iterator = iter(datasets['train'])" ] }, { "cell_type": "code", "execution_count": 22, "id": "03a23a9c", "metadata": {}, "outputs": [ { "name": 
"stdout", "output_type": "stream", "text": [ "{'text': 'Welke school kiezen?\\nSchoolinformatie in je brievenbus ... graag of liever niet?\\nKinderen die geboren zijn tussen 15 november en 31 december 2016 stappen pas in op 1 september 2019.\\nVoor deze groep is inschrijving mogelijk tijdens de eerste voorrangsperiode: van 1 maart tot 16 maart 2018.\\n2: kinderen van personeel van de school.\\nVoor deze groep is inschrijving mogelijk tijdens de tweede voorrangsperiode: op 22 maart en 23 maart 2018.\\nVanaf 18 april tot 4mei 2018 loopt de algemene inschrijvingsperiode en dan kunnen alle kinderen ingeschreven worden.\\nTIP: Wacht niet tot de laatste dag om je kind in te schrijven. Hoe sneller je inschrijft, hoe groter de kans dat er plaats is in de school die je kiest.\\nKies bij voorkeur een school in de buurt van waar je woont; dat heeft enkel maar voordelen. Bezoek zeker vooraf de school van je keuze. Je kan er uitleg vragen en de schoolomgeving ontdekken.\\nIn welke school zijn er vrije plaatsen?\\nVanaf 30 maart 2018 vanaf 17u vind je informatie over de vrije plaatsen op www.lop.be Zo weet je hoeveel plaatsen in jouw school beschikbaar zijn.\\n• Wil je bepaalde scholen bezoeken samen met andere ouders uit je buurt? Neem contact op met School in zicht.\\nMeer informatie over inschrijven in een school vind je ook op de website van Onderwijs Vlaanderen.\\nBlijft het onduidelijk?\\nAlle scholen van Bonheiden, Mechelen, Sint-Katelijne-Waver en Zemst schrijven de kinderen in op hetzelfde moment.\\nIs je kind een jongere broer of zus die in 2016 geboren is? Maak dan een afspraak met de school.\\nIs je kind een oudere broer of zus die voor 2016 geboren is? Ga naar de school tijdens de voorrangsperiode. De inschrijvingen starten op 1 maart om 8u30 stipt en eindigen op 16 maart om 16u. Let op, het aantal plaatsen is in elke school beperkt en wie eerst komt wordt eerst ingeschreven.\\nLet op: vanaf 17 maart heeft de jongere broer/zus niet langer voorrang. 
Je kan hem/haar dan alleen tijdens de algemene inschrijvingsperiode inschrijven.\\nDe inschrijvingen starten op 18 april 2018 om 8u30 stipt en eindigen op 4 mei2018 om 16u. Let op, het aantal plaatsen is in elke school beperkt en wie eerst komt wordt eerst ingeschreven.\\nNeem de identiteitskaart van je kind of een ander identiteitsbewijs mee (bv. Kids-ID, paspoort, identiteitsbewijs voor kinderen onder 12 jaar).\\nSTAP 1; Je hebt een gesprek met de directeur. Je krijgt informatie over het schoolreglement en het pedagogisch project van de school. Alleen als je akkoord gaat met het schoolreglement en het pedagogisch project kan je je kind inschrijven.\\nHeeft de moeder van het kind een diploma van hoger secundair onderwijs?\\nHeeft het gezin een schooltoelage voor het schooljaar 2017-2018 en/of 2018-2019?\\nAan de hand van de antwoorden wordt je kind ingedeeld als een indicatorleerling of een niet-indicatorleerling.\\nSTAP 3; Op basis van het aantal beschikbare plaatsen wordt je kind wel of niet ingeschreven. Er zijn 3 mogelijkheden.\\nJe kind wordt onmiddellijk ingeschreven in het inschrijvingsregister van de school.\\nEr is nog wel plaats in de school maar niet meer in de groep waarvoor je kind in aanmerking komt.\\n- De directeur kan je niet onmiddellijk zeggen of er uiteindelijk plaats zal zijn of niet.\\n- Je kind komt op de wachtlijst van de school. De wachtlijst geeft geen garantie op een plaats. Als een ander ingeschreven kind de school verlaat, kan de eerste op de wachtlijst zich inschrijven.\\n- Wat moet je doen? Zoek zo snel mogelijk een andere school voor je kind.\\nMechelen heeft een uitgebreid onderwijsaanbod. Op www.mechelen.be/scholen vind je een overzicht.\\nHet stadsbestuur vindt het erg belangrijk dat ouders voldoende geïnformeerd worden over het scholenaanbod in Mechelen en de inschrijvingsprocedure in Mechelse scholen. 
Daarom krijgen de scholen en een aantal organisaties de mogelijkheid om aan inwoners van Mechelen met schoolgaande kinderen via de post informatie te bezorgen over het basis- of secundair onderwijs en de inschrijvingsprocedure in Mechelse scholen..\\nDeze informatie kan in functie zijn van de leeftijdsgroepen 2,5 jarigen (kleuters), 6 jarigen (lagere school) en de 12 jarigen (secundaire school).\\nGrote Markt 21, 2800 Mechelen.', 'timestamp': datetime.datetime(2018, 10, 23, 5, 45), 'url': 'https://www.mechelen.be/schrijf-je-kind-tijdig-in-op-school', 'id': 42}\n" ] } ], "source": [ "print(next(dataset_iterator))" ] }, { "cell_type": "code", "execution_count": 13, "id": "fc9519d2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Number of files 0 after adding /data/oscar_nl_cleaned2\n" ] }, { "ename": "NameError", "evalue": "name 'data_dir' is not defined", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m/tmp/ipykernel_371965/3501862563.py\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 18\u001b[0m \u001b[0;31m#add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\",\"*.gz\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 19\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 20\u001b[0;31m \u001b[0madd_jsonlines_dir\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34mf\"{data_dir}/c4_cleaned\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"*73*.gz\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 21\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 22\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n", "\u001b[0;31mNameError\u001b[0m: name 'data_dir' is not defined" ] } ], "source": [ "#59G c4_cleaned compressed\n", "#937M nrc_uniq_cleaned_20210223 
compressed\n", "#410M nu_uniq_cleaned_20210225 compressed\n", "#9.9G oscar_nl_cleaned compressed\n", "\n", "\n", "\n", "data_files = []\n", "SEED=42\n", "# Root of the corpus directories; this cell used `data_dir` without defining it (NameError).\n", "# NOTE(review): \"/data\" matches the other corpus paths used in this notebook -- confirm.\n", "data_dir = \"/data\"\n", "\n", "def add_jsonlines_dir(path,filespec):\n", "    \"\"\"Append every file in `path` matching the glob `filespec` to the global `data_files`.\"\"\"\n", "    global data_files\n", "    data_files += glob.glob(f\"{path}/{filespec}\")\n", "    print(f\"Number of files {len(data_files)} after adding {path}\")\n", "\n", "#add_jsonlines_dir(\"/home/dat/subset_c4_cleannl\",\"*.gz\") \n", "add_jsonlines_dir(\"/data/oscar_nl_cleaned2\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nrc_cleaned_idtextfmt\",\"*.gz\")\n", "#add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\",\"*.gz\")\n", "\n", "add_jsonlines_dir(f\"{data_dir}/c4_cleaned\", \"*73*.gz\")\n", "\n", "# Fail early with a clear message instead of passing empty file lists to load_dataset.\n", "assert data_files, \"No data files found; check the directories and glob patterns above\"\n", "# glob returns files in arbitrary order; sort first so the seeded shuffle is reproducible.\n", "data_files.sort()\n", "random.Random(SEED).shuffle(data_files)\n", "total = len(data_files)\n", "val_size = int(0.05 * total)\n", "train_size = total - val_size\n", "print(f\"95%: {train_size}\")\n", "train = data_files[:train_size]\n", "val = data_files[train_size:]\n", "print(f\"Got {len(train)} training files and {len(val)} validation files\")\n", "# Check the split once, before loading: a file in both lists would leak validation data into training.\n", "assert set(train).isdisjoint(val), \"Train overlaps with test\"\n", "# NOTE(review): this rebinds `datasets`, shadowing the `datasets` module imported in the first cell.\n", "datasets = load_dataset('json', data_files={'train': train, 'validation': val})\n" ] }, { "cell_type": "code", "execution_count": null, "id": "71cac0b7", "metadata": {}, "outputs": [], "source": [ "from datasets import Dataset\n", "datasets['train'] = Dataset.from_file(\"/home/dat/.cache/huggingface/datasets/json/default-3eb349358dcf6436/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-train.arrow\") \n", "datasets['validation'] = Dataset.from_file(\"/home/dat/.cache/huggingface/datasets/json/default-3eb349358dcf6436/0.0.0/f92a4de297ac644ad9781979b79064b0e222b3af766f8ea3bee32390dca23723/json-validation.arrow\") \n", "\n" ] }, { "cell_type": "code", "execution_count": 41, "id": "865a9642", "metadata": {}, "outputs": [], "source": [ "dataset_iterator = 
iter(datasets['train'])" ] }, { "cell_type": "code", "execution_count": 78, "id": "523b0fc2", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Zo stel ik het me voor. Tegen iedere conventie in. Och wat heeft de burgerij gemopperd en schande gesproken. Dat was in die dagen. Nu nog steeds, maar anders. Daarover later meer. En wat zullen ze van u gehouden hebben in de kleine kring van liefhebbers.\n", "Jaren geleden, toen ik nog op de academie zat bestudeerde ik uw werk. Vooral de paar overgebleven foto’s van uw Merzbau in Hannover troffen mij. Zo vrij en swingend en onconventioneel.\n", "Ze werden opgeslagen in een afgelegen kamer in mijn geheugen, want eigentijdse choreografen en filmmakers en schilders uit de vroege renaissance vroegen om voorrang.\n", "Toen u het huis van uw ouders in Hannover betrok transformeerde u acht kamers tot een betoverende sculptuur. Merzbau! Kathedrale des erotischen Elend.\n", "In abstracte vlakken en vormen kruipen de volumes chaotisch omhoog langs de muren. Meestal wit. Er vormen zich ruimtes en grotachtige structuren. Hier en daar een typografisch detail of een herkenbaar object, dat uit zijn context geslingerd, vooral vragen oproept. Met hier en daar een antwoord of een vermoeden daarvan.\n", "Soms verborg u zich in het kleine orgelkamertje bovenin als er gasten kwamen, om de reactie op hun gezichten te lezen als ze uw gedichten of het karnavals-achtige nummer Du lieber Augustin door de fantastische ruimte hoorden schallen, een lied vol humor en boerse middeleeuwse wreedheid, maar ook melancholie.\n", "Banale liedjes laten horen in een ruimte die verschillende betekenissen kan hebben. Ik herken dat zo. 
Wij deden dat ook in het theater.\n", "Ik vraag nu toch uw hand, zo’n beetje dwars door de tijd, om een paar pirouettes te draaien of misschien beter een twist.\n", "Het gewicht van de tapijten of het zeil waaronder ik zowat bezwijk, de inspanning om hoog in de opstelling een klosje op te hangen… Op een gegeven moment raak ik in een staat waarin ik niet meer nadenk. Dan doe ik de ingreep die een beeld uiteindelijk af maakt. Grappig niet?\n", "Ik vermoed dat u dat ook heeft, dat zware fysieke werken aan Merzbau; dat dat fijn is, dat het zo echt is daardoor en dat je uiteindelijk in trance raakt.\n", "Daar leefde u van werken in opdracht; portretten en landschappen. Beeldschoon werk, maar u deed niet anders dan erop mopperen.\n", "Ondertussen begon u een nieuwe Merzbau in een schuur op het platteland. U groef er een verdieping onder en begon daar te merzen. Weer die zware fysieke arbeid. Dat beschouwde u als uw echte werk. Daar legde u ‘connecties tussen alles in uw wereld’, al uw werk ‘een levenslange ervaring’.\n", "Maar uw landschappen hoorden daar niet bij. Dat is nu vreemd, jammer zelfs. Tenminste, gezien vanuit mijn perspectief, vanuit het heden. Ze komen immers uit dezelfde bron. Is het omdat ze niet abstract zijn?\n", "Per Kirkeby is een beroemd Deens schilder en beeldhouwer, graficus en dichter. Nu tachtig jaar oud. U zou hem weten te waarderen. Ook niet binnen een -isme te vangen. Hij heeft heel mooi over zuivere en onzuivere kunst gesproken. Dit klinkt een beetje eng maar ging over zuiver in de zin van kaal en zonder betekenis en in het onzuivere zaten alle associaties en verwijzingen.\n", "In míjn werk houd ik van de associaties en verwijzingen. Maar we leven nu in een andere tijd. Pure abstractie wordt zeker nog gevierd door sommige kunstenaars, en zeker niet de minsten, maar de revolutie die het in uw tijd ontketende is uitgewoed.\n", "Ik houd ervan dat in mijn werk niks helemaal lijkt te kloppen, maar er is wel samenhang. 
De objecten zijn volgens een innerlijke logica gekozen. Maar het mag geen surealisme worden. Daar houd ik niet van. Het is een smalle marge waarin ze mogen bestaan.\n", "Het gaat vreemd genoeg volgens schilderkunstige principes, al komt er geen verf aan te pas. Ik bouw mijn opstellingen laag voor laag op. Vanuit de achtergrond. Ik doe weg, of bedek wat te makkelijk te duiden is en daarmee het beeld plat slaat, of wat ik te mooi of esthetisch vind. Soms draait het zich om, behoud ik juist wat mooi of betekenisvol is. Ik zet voortdurend voetangels en klemmen voor mijzelf. En ik geloof dat dat de kwaliteit van het werk uitmaakt.\n", "Ik vraag me af in hoeverre dit een wet is die voor alle kunst opgaat. Ik geloof het wel. Al gebeurt het soms alleen in het denkproces dat vooraf gaat aan de uitvoering van het werk.\n", "Ik ken het in ieder geval heel goed uit mijn theaterwerk. Dat schaven aan een productie tot alle puzzelstukken op hun plaats vallen.\n", "Ik kan mij voorstellen dat dat zelfs bij Mondriaan gebeurde. Zijn Victory Boogy Woogy heeft zo iets magisch ongrijpbaars. En toch staan alle vlakken gewoon op hun plek. Daar is zoveel jaar werk voor nodig geweest!\n", "In zijn vroege werken, ook landschappen en bomen, proef je wat er allemaal in zit. In die man bedoel ik en in die doeken.\n", "Ik wil maar zeggen, die landschappen van u zijn denk ik toch met dezelfde mentaliteit gemaakt als uw dichtwerk of Merzbau. Ze zijn in ieder geval door u gemaakt. Met uw hand, uw geest, uw afwegingen tijdens het schilderen. Dit wel, dit niet.\n", "Maar niet mystiek of transcendent? Ik lees in andere bronnen over Dada’s grondslag; Boeddhisme, Taoisme, vroegchristelijke mystici, en over filosofen als Bergson, Nietzsche en Descartes. 
Nogal tegenstrijdig allemaal.\n", "En dat DaDa niets is, dat wil zeggen alles, of het niet-iets, of een vogel op vier poten, of een levensverzekering of een ladder zonder sporten….\n", "Ik heb een leven lang studie en kijken en nog eens kijken voor me, om dit alles te doorvorsen. Maar begrijpen doe ik het al. Op m’n intuïtie.\n" ] } ], "source": [ "print(next(dataset_iterator)['text'])" ] }, { "cell_type": "code", "execution_count": 31, "id": "b5839c79", "metadata": {}, "outputs": [ { "ename": "IndentationError", "evalue": "unexpected indent (1021262509.py, line 15)", "output_type": "error", "traceback": [ "\u001b[0;36m File \u001b[0;32m\"/tmp/ipykernel_309684/1021262509.py\"\u001b[0;36m, line \u001b[0;32m15\u001b[0m\n\u001b[0;31m train, val = train_val_files()\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mIndentationError\u001b[0m\u001b[0;31m:\u001b[0m unexpected indent\n" ] } ], "source": [ "def train_val_files():\n", "    \"\"\"Collect the corpus files and split them 95% / 5% into train and validation lists.\n", "\n", "    Relies on `add_jsonlines_dir`, `SEED` and the global `data_files` from an earlier cell.\n", "\n", "    Returns:\n", "        (train, val): two disjoint lists of file paths.\n", "    \"\"\"\n", "    global data_files\n", "    # Start from an empty list so re-running this cell does not accumulate duplicates,\n", "    # which would put the same file in both splits.\n", "    data_files = []\n", "    # NOTE(review): `add_jsonlines_dir` needs a glob pattern; \"*.gz\" is what the other\n", "    # cells use for these directories -- confirm.\n", "    add_jsonlines_dir(\"/home/dat/subset_c4_cleannl\", \"*.gz\")\n", "    add_jsonlines_dir(\"/data/oscar_nl_cleaned\", \"*.gz\")\n", "    add_jsonlines_dir(\"/data/nrc_cleaned_idtextfmt\", \"*.gz\")\n", "    add_jsonlines_dir(\"/data/nu_cleaned_idtextfmt\", \"*.gz\")\n", "    # glob order is arbitrary; sort before the seeded shuffle so the split is reproducible.\n", "    data_files.sort()\n", "    random.Random(SEED).shuffle(data_files)\n", "    total = len(data_files)\n", "    val_size = int(0.05 * total)\n", "    train_size = total - val_size\n", "    print(f\"95%: {train_size}\")\n", "    # Slice the shuffled list; binding both names to the whole list made the splits identical.\n", "    train = data_files[:train_size]\n", "    val = data_files[train_size:]\n", "    print(f\"Got {len(train)} training files and {len(val)} validation files\")\n", "    assert set(train).isdisjoint(val), \"Train overlaps with test\"\n", "    return train, val\n", "\n", "train, val = train_val_files()\n", "datasets = load_dataset('json', data_files={'train': train, 'validation': val})" ] }, { "cell_type": "code", "execution_count": 4, "id": "6685589f", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "from tokenizers import ByteLevelBPETokenizer\n", "tokenizer = ByteLevelBPETokenizer()\n", "\n", "def batch_iterator(batch_size=1000):\n", " for 
i in range(0, len(datasets[\"train\"]), batch_size):\n", "        # Step over the rows of the train split. `len(datasets)` counts the splits of the\n", "        # DatasetDict, not its rows, so the previous loop yielded only the first batch.\n", "        yield datasets[\"train\"][i: i + batch_size][\"text\"]\n", "\n", "# NOTE(review): the five special tokens below are identical empty strings, which looks like\n", "# lost content; confirm the intended special tokens and restore them before training.\n", "tokenizer.train_from_iterator(batch_iterator(), vocab_size=50358, min_frequency=2, special_tokens=[\n", "    \"\",\n", "    \"\",\n", "    \"\",\n", "    \"\",\n", "    \"\",\n", "])" ] }, { "cell_type": "code", "execution_count": 5, "id": "5fed49b4", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "39503" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "tokenizer.get_vocab_size()" ] }, { "cell_type": "code", "execution_count": 6, "id": "69401680", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/dat/pino-roberta-base\n" ] } ], "source": [ "cd ~/pino-roberta-base" ] }, { "cell_type": "code", "execution_count": 7, "id": "7a98d754", "metadata": {}, "outputs": [], "source": [ "tokenizer.save(\"tokenizer.json\")" ] }, { "cell_type": "code", "execution_count": null, "id": "e686b9c8", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Using custom data configuration nl-lang=nl\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Downloading and preparing dataset cc100/nl (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/dat/.cache/huggingface/datasets/cc100/nl-lang=nl/0.0.0/b583dd47b0dd43a3c3773075abd993be12d0eee93dbd2cfe15a0e4e94d481e80...\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8bb6155775084c42841d5a786a3f014c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Downloading: 0%| | 0.00/8.42G [00:00