versae committed on
Commit
75469bd
0 Parent(s):

Training dump

Files changed (49)
  1. .gitattributes +19 -0
  2. .gitignore +4 -0
  3. configs/base/config.json +25 -0
  4. configs/base/tokenizer.json +0 -0
  5. configs/large/config.json +25 -0
  6. configs/large/tokenizer.json +0 -0
  7. mc4/README.md +525 -0
  8. mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
  9. mc4/mc4.py +426 -0
  10. mc4/mc4.py.lock +0 -0
  11. outputs/checkpoints/checkpoint-140001/config.json +25 -0
  12. outputs/checkpoints/checkpoint-140001/data_collator.joblib +3 -0
  13. outputs/checkpoints/checkpoint-140001/flax_model.msgpack +3 -0
  14. outputs/checkpoints/checkpoint-140001/optimizer_state.msgpack +3 -0
  15. outputs/checkpoints/checkpoint-140001/training_args.joblib +3 -0
  16. outputs/checkpoints/checkpoint-140001/training_state.json +1 -0
  17. outputs/checkpoints/checkpoint-150001/config.json +25 -0
  18. outputs/checkpoints/checkpoint-150001/data_collator.joblib +3 -0
  19. outputs/checkpoints/checkpoint-150001/flax_model.msgpack +3 -0
  20. outputs/checkpoints/checkpoint-150001/optimizer_state.msgpack +3 -0
  21. outputs/checkpoints/checkpoint-150001/training_args.joblib +3 -0
  22. outputs/checkpoints/checkpoint-150001/training_state.json +1 -0
  23. outputs/checkpoints/checkpoint-160001/config.json +25 -0
  24. outputs/checkpoints/checkpoint-160001/data_collator.joblib +3 -0
  25. outputs/checkpoints/checkpoint-160001/flax_model.msgpack +3 -0
  26. outputs/checkpoints/checkpoint-160001/optimizer_state.msgpack +3 -0
  27. outputs/checkpoints/checkpoint-160001/training_args.joblib +3 -0
  28. outputs/checkpoints/checkpoint-160001/training_state.json +1 -0
  29. outputs/checkpoints/checkpoint-170001/config.json +25 -0
  30. outputs/checkpoints/checkpoint-170001/data_collator.joblib +3 -0
  31. outputs/checkpoints/checkpoint-170001/flax_model.msgpack +3 -0
  32. outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack +3 -0
  33. outputs/checkpoints/checkpoint-170001/training_args.joblib +3 -0
  34. outputs/checkpoints/checkpoint-170001/training_state.json +1 -0
  35. outputs/checkpoints/checkpoint-180001/config.json +25 -0
  36. outputs/checkpoints/checkpoint-180001/data_collator.joblib +3 -0
  37. outputs/checkpoints/checkpoint-180001/flax_model.msgpack +3 -0
  38. outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack +3 -0
  39. outputs/checkpoints/checkpoint-180001/training_args.joblib +3 -0
  40. outputs/checkpoints/checkpoint-180001/training_state.json +1 -0
  41. outputs/config.json +25 -0
  42. outputs/data_collator.joblib +3 -0
  43. outputs/events.out.tfevents.1626172316.underestimate.4022703.3.v2 +3 -0
  44. outputs/flax_model.msgpack +3 -0
  45. outputs/optimizer_state.msgpack +3 -0
  46. outputs/training_args.joblib +3 -0
  47. outputs/training_state.json +1 -0
  48. run_mlm_flax_stream.py +722 -0
  49. run_stream.sh +27 -0
.gitattributes ADDED
@@ -0,0 +1,19 @@
1
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
2
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.h5 filter=lfs diff=lfs merge=lfs -text
5
+ *.tflite filter=lfs diff=lfs merge=lfs -text
6
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
7
+ *.ot filter=lfs diff=lfs merge=lfs -text
8
+ *.onnx filter=lfs diff=lfs merge=lfs -text
9
+ *.arrow filter=lfs diff=lfs merge=lfs -text
10
+ *.ftz filter=lfs diff=lfs merge=lfs -text
11
+ *.joblib filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.pb filter=lfs diff=lfs merge=lfs -text
15
+ *.pt filter=lfs diff=lfs merge=lfs -text
16
+ *.pth filter=lfs diff=lfs merge=lfs -text
17
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
18
+ *.log filter=lfs diff=lfs merge=lfs -text
19
+ *.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ run*.log
2
+ debug*.log
3
+ run*.wandb
4
+ wandb/
configs/base/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
configs/base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
configs/large/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 1024,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 4096,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 16,
18
+ "num_hidden_layers": 24,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
configs/large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
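For orientation: the two configs above describe a RoBERTa-base MLM (12 layers, hidden size 768) and a RoBERTa-large MLM (24 layers, hidden size 1024), both with a 50,265-token vocabulary. Below is a minimal sketch, not part of this commit, of instantiating one of them from scratch with `transformers`; loading the tokenizer straight from the config directory assumes the fast-tokenizer fallback picks up `tokenizer.json` there.

```python
# Sketch only: build a randomly initialized Flax model from configs/base.
from transformers import AutoConfig, AutoTokenizer, FlaxAutoModelForMaskedLM

config = AutoConfig.from_pretrained("configs/base")        # RoBERTa-base hyperparameters
tokenizer = AutoTokenizer.from_pretrained("configs/base")  # reads configs/base/tokenizer.json
model = FlaxAutoModelForMaskedLM.from_config(config)       # fresh (untrained) parameters

print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 12 768 50265
```

Swapping `configs/base` for `configs/large` yields the 24-layer variant with the same vocabulary.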
mc4/README.md ADDED
@@ -0,0 +1,525 @@
1
+ ---
2
+ pretty_name: mC4
3
+ annotations_creators:
4
+ - no-annotation
5
+ language_creators:
6
+ - found
7
+ languages:
8
+ - af
9
+ - am
10
+ - ar
11
+ - az
12
+ - be
13
+ - bg
14
+ - bg-Latn
15
+ - bn
16
+ - ca
17
+ - ceb
18
+ - co
19
+ - cs
20
+ - cy
21
+ - da
22
+ - de
23
+ - el
24
+ - el-Latn
25
+ - en
26
+ - eo
27
+ - es
28
+ - et
29
+ - eu
30
+ - fa
31
+ - fi
32
+ - fil
33
+ - fr
34
+ - fy
35
+ - ga
36
+ - gd
37
+ - gl
38
+ - gu
39
+ - ha
40
+ - haw
41
+ - hi
42
+ - hi-Latn
43
+ - hmn
44
+ - ht
45
+ - hu
46
+ - hy
47
+ - id
48
+ - ig
49
+ - is
50
+ - it
51
+ - iw
52
+ - ja
53
+ - ja-Latn
54
+ - jv
55
+ - ka
56
+ - kk
57
+ - km
58
+ - kn
59
+ - ko
60
+ - ku
61
+ - ky
62
+ - la
63
+ - lb
64
+ - lo
65
+ - lt
66
+ - lv
67
+ - mg
68
+ - mi
69
+ - mk
70
+ - ml
71
+ - mn
72
+ - mr
73
+ - ms
74
+ - mt
75
+ - my
76
+ - ne
77
+ - nl
78
+ - "no"
79
+ - ny
80
+ - pa
81
+ - pl
82
+ - ps
83
+ - pt
84
+ - ro
85
+ - ru
86
+ - ru-Latn
87
+ - sd
88
+ - si
89
+ - sk
90
+ - sl
91
+ - sm
92
+ - sn
93
+ - so
94
+ - sq
95
+ - sr
96
+ - st
97
+ - su
98
+ - sv
99
+ - sw
100
+ - ta
101
+ - te
102
+ - tg
103
+ - th
104
+ - tr
105
+ - uk
106
+ - und
107
+ - ur
108
+ - uz
109
+ - vi
110
+ - xh
111
+ - yi
112
+ - yo
113
+ - zh
114
+ - zh-Latn
115
+ - zu
116
+ licenses:
117
+ - odc-by-1.0
118
+ multilinguality:
119
+ - multilingual
120
+ size_categories:
121
+ - n<1K
122
+ - 1K<n<10K
123
+ - 10K<n<100K
124
+ - 100K<n<1M
125
+ - 1M<n<10M
126
+ - 10M<n<100M
127
+ - 100M<n<1B
128
+ - 1B<n<10B
129
+ source_datasets:
130
+ - original
131
+ task_categories:
132
+ - sequence-modeling
133
+ task_ids:
134
+ - language-modeling
135
+ paperswithcode_id: mc4
136
+ ---
137
+
138
+ # Dataset Card for mC4
139
+
140
+ ## Table of Contents
141
+
142
+ - [Dataset Card for mC4](#dataset-card-for-mc4)
143
+ - [Table of Contents](#table-of-contents)
144
+ - [Dataset Description](#dataset-description)
145
+ - [Dataset Summary](#dataset-summary)
146
+ - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
147
+ - [Languages](#languages)
148
+ - [Dataset Structure](#dataset-structure)
149
+ - [Data Instances](#data-instances)
150
+ - [Data Fields](#data-fields)
151
+ - [Data Splits](#data-splits)
152
+ - [Dataset Creation](#dataset-creation)
153
+ - [Curation Rationale](#curation-rationale)
154
+ - [Source Data](#source-data)
155
+ - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
156
+ - [Who are the source language producers?](#who-are-the-source-language-producers)
157
+ - [Annotations](#annotations)
158
+ - [Annotation process](#annotation-process)
159
+ - [Who are the annotators?](#who-are-the-annotators)
160
+ - [Personal and Sensitive Information](#personal-and-sensitive-information)
161
+ - [Considerations for Using the Data](#considerations-for-using-the-data)
162
+ - [Social Impact of Dataset](#social-impact-of-dataset)
163
+ - [Discussion of Biases](#discussion-of-biases)
164
+ - [Other Known Limitations](#other-known-limitations)
165
+ - [Additional Information](#additional-information)
166
+ - [Dataset Curators](#dataset-curators)
167
+ - [Licensing Information](#licensing-information)
168
+ - [Citation Information](#citation-information)
169
+ - [Contributions](#contributions)
170
+
171
+ ## Dataset Description
172
+
173
+ - **Homepage:** https://huggingface.co/datasets/allenai/c4
174
+ - **Paper:** https://arxiv.org/abs/1910.10683
175
+
176
+ ### Dataset Summary
177
+
178
+ A multilingual, colossal, cleaned version of Common Crawl's web crawl corpus, based on the Common Crawl dataset (https://commoncrawl.org).
179
+
180
+ This is the version prepared by AllenAI, hosted at this address: https://huggingface.co/datasets/allenai/c4
181
+
182
+ 108 languages are available and are reported in the table below.
183
+
184
+ Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
185
+
186
+ | language code | language name |
187
+ |:----------------|:---------------------|
188
+ | af | Afrikaans |
189
+ | am | Amharic |
190
+ | ar | Arabic |
191
+ | az | Azerbaijani |
192
+ | be | Belarusian |
193
+ | bg | Bulgarian |
194
+ | bg-Latn | Bulgarian (Latin) |
195
+ | bn | Bangla |
196
+ | ca | Catalan |
197
+ | ceb | Cebuano |
198
+ | co | Corsican |
199
+ | cs | Czech |
200
+ | cy | Welsh |
201
+ | da | Danish |
202
+ | de | German |
203
+ | el | Greek |
204
+ | el-Latn | Greek (Latin) |
205
+ | en | English |
206
+ | eo | Esperanto |
207
+ | es | Spanish |
208
+ | et | Estonian |
209
+ | eu | Basque |
210
+ | fa | Persian |
211
+ | fi | Finnish |
212
+ | fil | Filipino |
213
+ | fr | French |
214
+ | fy | Western Frisian |
215
+ | ga | Irish |
216
+ | gd | Scottish Gaelic |
217
+ | gl | Galician |
218
+ | gu | Gujarati |
219
+ | ha | Hausa |
220
+ | haw | Hawaiian |
221
+ | hi | Hindi |
222
+ | hi-Latn | Hindi (Latin script) |
223
+ | hmn | Hmong, Mong |
224
+ | ht | Haitian |
225
+ | hu | Hungarian |
226
+ | hy | Armenian |
227
+ | id | Indonesian |
228
+ | ig | Igbo |
229
+ | is | Icelandic |
230
+ | it | Italian |
231
+ | iw | Hebrew (former code) |
232
+ | ja | Japanese |
233
+ | ja-Latn | Japanese (Latin) |
234
+ | jv | Javanese |
235
+ | ka | Georgian |
236
+ | kk | Kazakh |
237
+ | km | Khmer |
238
+ | kn | Kannada |
239
+ | ko | Korean |
240
+ | ku | Kurdish |
241
+ | ky | Kyrgyz |
242
+ | la | Latin |
243
+ | lb | Luxembourgish |
244
+ | lo | Lao |
245
+ | lt | Lithuanian |
246
+ | lv | Latvian |
247
+ | mg | Malagasy |
248
+ | mi | Maori |
249
+ | mk | Macedonian |
250
+ | ml | Malayalam |
251
+ | mn | Mongolian |
252
+ | mr | Marathi |
253
+ | ms | Malay |
254
+ | mt | Maltese |
255
+ | my | Burmese |
256
+ | ne | Nepali |
257
+ | nl | Dutch |
258
+ | no | Norwegian |
259
+ | ny | Nyanja |
260
+ | pa | Punjabi |
261
+ | pl | Polish |
262
+ | ps | Pashto |
263
+ | pt | Portuguese |
264
+ | ro | Romanian |
265
+ | ru | Russian |
266
+ | ru-Latn | Russian (Latin) |
267
+ | sd | Sindhi |
268
+ | si | Sinhala |
269
+ | sk | Slovak |
270
+ | sl | Slovenian |
271
+ | sm | San Marino |
272
+ | sn | Shona |
273
+ | so | Somali |
274
+ | sq | Albanian |
275
+ | sr | Serbian |
276
+ | st | Southern Sotho |
277
+ | su | Sundanese |
278
+ | sv | Swedish |
279
+ | sw | Swahili |
280
+ | ta | Tamil |
281
+ | te | Telugu |
282
+ | tg | Tajik |
283
+ | th | Thai |
284
+ | tr | Turkish |
285
+ | uk | Ukrainian |
286
+ | und | Unknown language |
287
+ | ur | Urdu |
288
+ | uz | Uzbek |
289
+ | vi | Vietnamese |
290
+ | xh | Xhosa |
291
+ | yi | Yiddish |
292
+ | yo | Yoruba |
293
+ | zh | Chinese |
294
+ | zh-Latn | Chinese (Latin) |
295
+ | zu | Zulu |
296
+
297
+ You can load the mC4 subset of any language like this:
298
+
299
+ ```python
300
+ from datasets import load_dataset
301
+
302
+ en_mc4 = load_dataset("mc4", "en")
303
+ ```
304
+
305
+ You can even specify a list of languages:
306
+
307
+ ```python
308
+ from datasets import load_dataset
309
+
310
+ mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
311
+ ```
312
+
313
+ ### Supported Tasks and Leaderboards
314
+
315
+ mC4 is mainly intended to pretrain language models and word representations.
316
+
317
+ ### Languages
318
+
319
+ The dataset supports 108 languages.
320
+
321
+ ## Dataset Structure
322
+
323
+ ### Data Instances
324
+
325
+ An example from the `en` config is:
326
+
327
+ ```
328
+ {'timestamp': '2018-06-24T01:32:39Z',
329
+ 'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
330
+ 'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
331
+ ```
332
+
333
+ ### Data Fields
334
+
335
+ The data have several fields:
336
+
337
+ - `url`: url of the source as a string
338
+ - `text`: text content as a string
339
+ - `timestamp`: timestamp as a string
340
+
341
+ ### Data Splits
342
+
343
+ To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
344
+
345
+ | config | train | validation |
346
+ |:---------|:--------|:-------------|
347
+ | af | ? | ? |
348
+ | am | ? | ? |
349
+ | ar | ? | ? |
350
+ | az | ? | ? |
351
+ | be | ? | ? |
352
+ | bg | ? | ? |
353
+ | bg-Latn | ? | ? |
354
+ | bn | ? | ? |
355
+ | ca | ? | ? |
356
+ | ceb | ? | ? |
357
+ | co | ? | ? |
358
+ | cs | ? | ? |
359
+ | cy | ? | ? |
360
+ | da | ? | ? |
361
+ | de | ? | ? |
362
+ | el | ? | ? |
363
+ | el-Latn | ? | ? |
364
+ | en | ? | ? |
365
+ | eo | ? | ? |
366
+ | es | ? | ? |
367
+ | et | ? | ? |
368
+ | eu | ? | ? |
369
+ | fa | ? | ? |
370
+ | fi | ? | ? |
371
+ | fil | ? | ? |
372
+ | fr | ? | ? |
373
+ | fy | ? | ? |
374
+ | ga | ? | ? |
375
+ | gd | ? | ? |
376
+ | gl | ? | ? |
377
+ | gu | ? | ? |
378
+ | ha | ? | ? |
379
+ | haw | ? | ? |
380
+ | hi | ? | ? |
381
+ | hi-Latn | ? | ? |
382
+ | hmn | ? | ? |
383
+ | ht | ? | ? |
384
+ | hu | ? | ? |
385
+ | hy | ? | ? |
386
+ | id | ? | ? |
387
+ | ig | ? | ? |
388
+ | is | ? | ? |
389
+ | it | ? | ? |
390
+ | iw | ? | ? |
391
+ | ja | ? | ? |
392
+ | ja-Latn | ? | ? |
393
+ | jv | ? | ? |
394
+ | ka | ? | ? |
395
+ | kk | ? | ? |
396
+ | km | ? | ? |
397
+ | kn | ? | ? |
398
+ | ko | ? | ? |
399
+ | ku | ? | ? |
400
+ | ky | ? | ? |
401
+ | la | ? | ? |
402
+ | lb | ? | ? |
403
+ | lo | ? | ? |
404
+ | lt | ? | ? |
405
+ | lv | ? | ? |
406
+ | mg | ? | ? |
407
+ | mi | ? | ? |
408
+ | mk | ? | ? |
409
+ | ml | ? | ? |
410
+ | mn | ? | ? |
411
+ | mr | ? | ? |
412
+ | ms | ? | ? |
413
+ | mt | ? | ? |
414
+ | my | ? | ? |
415
+ | ne | ? | ? |
416
+ | nl | ? | ? |
417
+ | no | ? | ? |
418
+ | ny | ? | ? |
419
+ | pa | ? | ? |
420
+ | pl | ? | ? |
421
+ | ps | ? | ? |
422
+ | pt | ? | ? |
423
+ | ro | ? | ? |
424
+ | ru | ? | ? |
425
+ | ru-Latn | ? | ? |
426
+ | sd | ? | ? |
427
+ | si | ? | ? |
428
+ | sk | ? | ? |
429
+ | sl | ? | ? |
430
+ | sm | ? | ? |
431
+ | sn | ? | ? |
432
+ | so | ? | ? |
433
+ | sq | ? | ? |
434
+ | sr | ? | ? |
435
+ | st | ? | ? |
436
+ | su | ? | ? |
437
+ | sv | ? | ? |
438
+ | sw | ? | ? |
439
+ | ta | ? | ? |
440
+ | te | ? | ? |
441
+ | tg | ? | ? |
442
+ | th | ? | ? |
443
+ | tr | ? | ? |
444
+ | uk | ? | ? |
445
+ | und | ? | ? |
446
+ | ur | ? | ? |
447
+ | uz | ? | ? |
448
+ | vi | ? | ? |
449
+ | xh | ? | ? |
450
+ | yi | ? | ? |
451
+ | yo | ? | ? |
452
+ | zh | ? | ? |
453
+ | zh-Latn | ? | ? |
454
+ | zu | ? | ? |
455
+
456
+ ## Dataset Creation
457
+
458
+ ### Curation Rationale
459
+
460
+ [More Information Needed]
461
+
462
+ ### Source Data
463
+
464
+ #### Initial Data Collection and Normalization
465
+
466
+ [More Information Needed]
467
+
468
+ #### Who are the source language producers?
469
+
470
+ [More Information Needed]
471
+
472
+ ### Annotations
473
+
474
+ #### Annotation process
475
+
476
+ [More Information Needed]
477
+
478
+ #### Who are the annotators?
479
+
480
+ [More Information Needed]
481
+
482
+ ### Personal and Sensitive Information
483
+
484
+ [More Information Needed]
485
+
486
+ ## Considerations for Using the Data
487
+
488
+ ### Social Impact of Dataset
489
+
490
+ [More Information Needed]
491
+
492
+ ### Discussion of Biases
493
+
494
+ [More Information Needed]
495
+
496
+ ### Other Known Limitations
497
+
498
+ [More Information Needed]
499
+
500
+ ## Additional Information
501
+
502
+ ### Dataset Curators
503
+
504
+ [More Information Needed]
505
+
506
+ ### Licensing Information
507
+
508
+ AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
509
+
510
+ ### Citation Information
511
+
512
+ ```
513
+ @article{2019t5,
514
+ author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
515
+ title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
516
+ journal = {arXiv e-prints},
517
+ year = {2019},
518
+ archivePrefix = {arXiv},
519
+ eprint = {1910.10683},
520
+ }
521
+ ```
522
+
523
+ ### Contributions
524
+
525
+ Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
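The `load_dataset` calls in the card above target the Hub's stock `mc4` loader. This training run instead streams the data through the local `mc4/mc4.py` script below, which accepts extra keyword arguments for perplexity-based sampling. A hedged sketch of that usage follows; the KenLM path and the sampling values are illustrative (they mirror the defaults in `run_mlm_flax_stream.py` and `mc4.py`), not a record of the exact command used.

```python
# Hedged sketch: stream Spanish mC4 through the local loader with Gaussian
# perplexity sampling. Paths and values are illustrative.
from itertools import islice
from datasets import load_dataset

mc4_es = load_dataset(
    "./mc4",                           # loading script added in this commit
    "es",
    split="train",
    streaming=True,
    sampling_method="gaussian",        # alternatives: "step", "random"
    sampling_factor=0.78,
    perplexity_model="./es.arpa.bin",  # 5-gram KenLM model, not included in the repo
)

for example in islice(iter(mc4_es), 3):
    print(example["url"], example["timestamp"], len(example["text"]))
```

The extra keyword arguments are popped in `Mc4.__init__` before the rest is forwarded to the standard builder, which is why they can be passed straight through `load_dataset`.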
mc4/dummy/af/0.0.0/dummy_data.zip ADDED
Binary file (8.54 kB).
mc4/mc4.py ADDED
@@ -0,0 +1,426 @@
1
+ """mC4 dataset based on Common Crawl."""
2
+
3
+
4
+ import gzip
5
+ import json
6
+
7
+ import datasets
8
+ import kenlm
9
+ import numpy as np
10
+ from numpy.random import default_rng
11
+
12
+
13
+ logger = datasets.logging.get_logger(__name__)
14
+
15
+
16
+ _DESCRIPTION = """\
17
+ A colossal, cleaned version of Common Crawl's web crawl corpus.
18
+
19
+ Based on Common Crawl dataset: "https://commoncrawl.org".
20
+
21
+ This is the processed version of Google's mC4 dataset by AllenAI.
22
+ """
23
+
24
+ _CITATION = """
25
+ @article{2019t5,
26
+ author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
27
+ title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
28
+ journal = {arXiv e-prints},
29
+ year = {2019},
30
+ archivePrefix = {arXiv},
31
+ eprint = {1910.10683},
32
+ }
33
+ """
34
+
35
+ _URL = "https://github.com/allenai/allennlp/discussions/5056"
36
+
37
+ _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
38
+
39
+ _LANGUAGES = [
40
+ "af",
41
+ "am",
42
+ "ar",
43
+ "az",
44
+ "be",
45
+ "bg",
46
+ "bg-Latn",
47
+ "bn",
48
+ "ca",
49
+ "ceb",
50
+ "co",
51
+ "cs",
52
+ "cy",
53
+ "da",
54
+ "de",
55
+ "el",
56
+ "el-Latn",
57
+ "en",
58
+ "eo",
59
+ "es",
60
+ "et",
61
+ "eu",
62
+ "fa",
63
+ "fi",
64
+ "fil",
65
+ "fr",
66
+ "fy",
67
+ "ga",
68
+ "gd",
69
+ "gl",
70
+ "gu",
71
+ "ha",
72
+ "haw",
73
+ "hi",
74
+ "hi-Latn",
75
+ "hmn",
76
+ "ht",
77
+ "hu",
78
+ "hy",
79
+ "id",
80
+ "ig",
81
+ "is",
82
+ "it",
83
+ "iw",
84
+ "ja",
85
+ "ja-Latn",
86
+ "jv",
87
+ "ka",
88
+ "kk",
89
+ "km",
90
+ "kn",
91
+ "ko",
92
+ "ku",
93
+ "ky",
94
+ "la",
95
+ "lb",
96
+ "lo",
97
+ "lt",
98
+ "lv",
99
+ "mg",
100
+ "mi",
101
+ "mk",
102
+ "ml",
103
+ "mn",
104
+ "mr",
105
+ "ms",
106
+ "mt",
107
+ "my",
108
+ "ne",
109
+ "nl",
110
+ "no",
111
+ "ny",
112
+ "pa",
113
+ "pl",
114
+ "ps",
115
+ "pt",
116
+ "ro",
117
+ "ru",
118
+ "ru-Latn",
119
+ "sd",
120
+ "si",
121
+ "sk",
122
+ "sl",
123
+ "sm",
124
+ "sn",
125
+ "so",
126
+ "sq",
127
+ "sr",
128
+ "st",
129
+ "su",
130
+ "sv",
131
+ "sw",
132
+ "ta",
133
+ "te",
134
+ "tg",
135
+ "th",
136
+ "tr",
137
+ "uk",
138
+ "und",
139
+ "ur",
140
+ "uz",
141
+ "vi",
142
+ "xh",
143
+ "yi",
144
+ "yo",
145
+ "zh",
146
+ "zh-Latn",
147
+ "zu",
148
+ ]
149
+
150
+ _N_SHARDS_PER_SPLIT = {
151
+ "af": {"train": 64, "validation": 1},
152
+ "am": {"train": 16, "validation": 1},
153
+ "ar": {"train": 1024, "validation": 4},
154
+ "az": {"train": 256, "validation": 1},
155
+ "be": {"train": 128, "validation": 1},
156
+ "bg": {"train": 1024, "validation": 1},
157
+ "bg-Latn": {"train": 4, "validation": 1},
158
+ "bn": {"train": 512, "validation": 1},
159
+ "ca": {"train": 512, "validation": 1},
160
+ "ceb": {"train": 8, "validation": 1},
161
+ "co": {"train": 8, "validation": 1},
162
+ "cs": {"train": 1024, "validation": 2},
163
+ "cy": {"train": 256, "validation": 1},
164
+ "da": {"train": 1024, "validation": 1},
165
+ "de": {"train": 2048, "validation": 16},
166
+ "el": {"train": 1024, "validation": 2},
167
+ "el-Latn": {"train": 16, "validation": 1},
168
+ "en": {"train": 11264, "validation": 128},
169
+ "eo": {"train": 32, "validation": 1},
170
+ "es": {"train": 2048, "validation": 16},
171
+ "et": {"train": 256, "validation": 1},
172
+ "eu": {"train": 64, "validation": 1},
173
+ "fa": {"train": 1024, "validation": 2},
174
+ "fi": {"train": 1024, "validation": 1},
175
+ "fil": {"train": 64, "validation": 1},
176
+ "fr": {"train": 2048, "validation": 16},
177
+ "fy": {"train": 16, "validation": 1},
178
+ "ga": {"train": 16, "validation": 1},
179
+ "gd": {"train": 16, "validation": 1},
180
+ "gl": {"train": 128, "validation": 1},
181
+ "gu": {"train": 64, "validation": 1},
182
+ "ha": {"train": 8, "validation": 1},
183
+ "haw": {"train": 2, "validation": 1},
184
+ "hi": {"train": 1024, "validation": 2},
185
+ "hi-Latn": {"train": 16, "validation": 1},
186
+ "hmn": {"train": 8, "validation": 1},
187
+ "ht": {"train": 8, "validation": 1},
188
+ "hu": {"train": 1024, "validation": 2},
189
+ "hy": {"train": 128, "validation": 1},
190
+ "id": {"train": 1024, "validation": 4},
191
+ "ig": {"train": 4, "validation": 1},
192
+ "is": {"train": 128, "validation": 1},
193
+ "it": {"train": 1024, "validation": 8},
194
+ "iw": {"train": 1024, "validation": 1},
195
+ "ja": {"train": 1024, "validation": 8},
196
+ "ja-Latn": {"train": 8, "validation": 1},
197
+ "jv": {"train": 8, "validation": 1},
198
+ "ka": {"train": 256, "validation": 1},
199
+ "kk": {"train": 256, "validation": 1},
200
+ "km": {"train": 64, "validation": 1},
201
+ "kn": {"train": 64, "validation": 1},
202
+ "ko": {"train": 1024, "validation": 1},
203
+ "ku": {"train": 16, "validation": 1},
204
+ "ky": {"train": 64, "validation": 1},
205
+ "la": {"train": 64, "validation": 1},
206
+ "lb": {"train": 32, "validation": 1},
207
+ "lo": {"train": 8, "validation": 1},
208
+ "lt": {"train": 512, "validation": 1},
209
+ "lv": {"train": 256, "validation": 1},
210
+ "mg": {"train": 8, "validation": 1},
211
+ "mi": {"train": 4, "validation": 1},
212
+ "mk": {"train": 128, "validation": 1},
213
+ "ml": {"train": 128, "validation": 1},
214
+ "mn": {"train": 128, "validation": 1},
215
+ "mr": {"train": 1024, "validation": 1},
216
+ "ms": {"train": 512, "validation": 1},
217
+ "mt": {"train": 128, "validation": 1},
218
+ "my": {"train": 64, "validation": 1},
219
+ "ne": {"train": 256, "validation": 1},
220
+ "nl": {"train": 1024, "validation": 4},
221
+ "no": {"train": 1024, "validation": 1},
222
+ "ny": {"train": 4, "validation": 1},
223
+ "pa": {"train": 32, "validation": 1},
224
+ "pl": {"train": 1024, "validation": 4},
225
+ "ps": {"train": 16, "validation": 1},
226
+ "pt": {"train": 1024, "validation": 4},
227
+ "ro": {"train": 1024, "validation": 2},
228
+ "ru": {"train": 4096, "validation": 32},
229
+ "ru-Latn": {"train": 32, "validation": 1},
230
+ "sd": {"train": 64, "validation": 1},
231
+ "si": {"train": 64, "validation": 1},
232
+ "sk": {"train": 512, "validation": 1},
233
+ "sl": {"train": 256, "validation": 1},
234
+ "sm": {"train": 4, "validation": 1},
235
+ "sn": {"train": 8, "validation": 1},
236
+ "so": {"train": 64, "validation": 1},
237
+ "sq": {"train": 128, "validation": 1},
238
+ "sr": {"train": 256, "validation": 1},
239
+ "st": {"train": 2, "validation": 1},
240
+ "su": {"train": 4, "validation": 1},
241
+ "sv": {"train": 1024, "validation": 2},
242
+ "sw": {"train": 32, "validation": 1},
243
+ "ta": {"train": 256, "validation": 1},
244
+ "te": {"train": 128, "validation": 1},
245
+ "tg": {"train": 64, "validation": 1},
246
+ "th": {"train": 1024, "validation": 1},
247
+ "tr": {"train": 1024, "validation": 4},
248
+ "uk": {"train": 1024, "validation": 2},
249
+ "und": {"train": 3072, "validation": 32},
250
+ "ur": {"train": 128, "validation": 1},
251
+ "uz": {"train": 32, "validation": 1},
252
+ "vi": {"train": 1024, "validation": 4},
253
+ "xh": {"train": 2, "validation": 1},
254
+ "yi": {"train": 16, "validation": 1},
255
+ "yo": {"train": 2, "validation": 1},
256
+ "zh": {"train": 1024, "validation": 2},
257
+ "zh-Latn": {"train": 8, "validation": 1},
258
+ "zu": {"train": 8, "validation": 1},
259
+ }
260
+
261
+
262
+ class Mc4Config(datasets.BuilderConfig):
263
+ """BuilderConfig for mC4."""
264
+
265
+ def __init__(self, *args, languages, **kwargs):
266
+ """BuilderConfig for mC4.
267
+ Args:
268
+ languages (:obj:`List[str]`): list of languages to load
269
+ **kwargs: keyword arguments forwarded to super.
270
+ """
271
+ super().__init__(
272
+ *args,
273
+ name="+".join(languages),
274
+ **kwargs,
275
+ )
276
+ self.languages = languages
277
+
278
+
279
+ class Mc4(datasets.GeneratorBasedBuilder):
280
+ """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
281
+
282
+ BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
283
+ BUILDER_CONFIG_CLASS = Mc4Config
284
+
285
+ def __init__(self, *args, writer_batch_size=None, **kwargs):
286
+ self.data_files = kwargs.pop("data_files", {})
287
+ self.sampling_method = kwargs.pop("sampling_method", None)
288
+ self.perplexity_model = kwargs.pop("perplexity_model", None)
289
+ self.sampling_factor = kwargs.pop("sampling_factor", None)
290
+ self.boundaries = kwargs.pop("boundaries", None)
291
+ self.seed = kwargs.pop("seed", None)
292
+ if self.sampling_method:
293
+ if self.seed is not None:
294
+ self.rng = default_rng(self.seed)
295
+ else:
296
+ self.rng = default_rng()
297
+ if self.sampling_method == "random":
298
+ self.should_keep_doc = self._should_keep_doc_random
299
+ else:
300
+ # Loading 5-gram model
301
+ # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
302
+ logger.info("loading model = %s", self.perplexity_model)
303
+ self.pp_model = kenlm.Model(self.perplexity_model)
304
+ if self.sampling_method == "gaussian":
305
+ self.should_keep_doc = self._should_keep_doc_gaussian
306
+ else:
307
+ self.should_keep_doc = self._should_keep_doc_step
308
+ super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
309
+
310
+ def get_perplexity(self, doc):
311
+ doc_log_score, doc_length = 0, 0
312
+ for line in doc.split("\n"):
313
+ log_score = self.pp_model.score(line)
314
+ length = len(line.split()) + 1
315
+ doc_log_score += log_score
316
+ doc_length += length
317
+ return 10.0 ** (-doc_log_score / doc_length)
318
+
319
+ def _should_keep_doc_step(self, doc, factor=1.5e5, boundaries=None):
320
+ perplexity = self.get_perplexity(doc)
321
+ if boundaries is None:
322
+ boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
323
+ if perplexity <= boundaries[0]:
324
+ quartile_range = boundaries[0]
325
+ elif perplexity <= boundaries[1]:
326
+ quartile_range = boundaries[1] - boundaries[0]
327
+ elif perplexity <= boundaries[2]:
328
+ quartile_range = boundaries[2] - boundaries[1]
329
+ else:
330
+ quartile_range = 10 * boundaries[2]
331
+ probability = factor / quartile_range
332
+ return self.rng.uniform() < probability
333
+
334
+ def _should_keep_doc_gaussian(self, doc, factor=0.78, boundaries=None):
335
+ perplexity = self.get_perplexity(doc)
336
+ if boundaries is not None:
337
+ m = boundaries[1]
338
+ else:
339
+ m = 662247.50212365
340
+ exponential = np.exp(-9/2 * ((perplexity - m) / m) ** 2)
341
+ weighted_perplexity = factor * exponential
342
+ return self.rng.uniform() < weighted_perplexity
343
+
344
+ def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
345
+ if factor is None:
346
+ factor = 0.5
347
+ return self.rng.uniform() <= factor
348
+
349
+ def _info(self):
350
+ return datasets.DatasetInfo(
351
+ description=_DESCRIPTION,
352
+ features=datasets.Features(
353
+ {
354
+ "text": datasets.Value("string"),
355
+ "timestamp": datasets.Value("string"),
356
+ "url": datasets.Value("string"),
357
+ }
358
+ ),
359
+ supervised_keys=None,
360
+ homepage=_URL,
361
+ citation=_CITATION,
362
+ )
363
+
364
+ def _split_generators(self, dl_manager):
365
+ data_urls = {}
366
+ for split in ["train", "validation"]:
367
+ data_urls[split] = [
368
+ _DATA_URL.format(
369
+ language=lang,
370
+ split_suffix="-validation" if split == "validation" else "",
371
+ index=index,
372
+ n_shards=_N_SHARDS_PER_SPLIT[lang][split],
373
+ )
374
+ for lang in self.config.languages
375
+ for index in range(_N_SHARDS_PER_SPLIT[lang][split])
376
+ ]
377
+ if "train" in self.data_files:
378
+ train_downloaded_files = self.data_files["train"]
379
+ if not isinstance(train_downloaded_files, (tuple, list)):
380
+ train_downloaded_files = [train_downloaded_files]
381
+ else:
382
+ train_downloaded_files = dl_manager.download(data_urls["train"])
383
+ if "validation" in self.data_files:
384
+ validation_downloaded_files = self.data_files["validation"]
385
+ if not isinstance(validation_downloaded_files, (tuple, list)):
386
+ validation_downloaded_files = [validation_downloaded_files]
387
+ else:
388
+ validation_downloaded_files = dl_manager.download(data_urls["validation"])
389
+ return [
390
+ datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
391
+ datasets.SplitGenerator(
392
+ name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
393
+ ),
394
+ ]
395
+
396
+ def _generate_examples(self, filepaths):
397
+ """This function returns the examples in the raw (text) form by iterating on all the files."""
398
+ id_ = 0
399
+ for filepath in filepaths:
400
+ logger.info("generating examples from = %s", filepath)
401
+ if filepath.endswith("jsonl"):
402
+ with open(filepath, "r", encoding="utf-8") as f:
403
+ for line in f:
404
+ if line:
405
+ example = json.loads(line)
406
+ yield id_, example
407
+ id_ += 1
408
+ else:
409
+ with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
410
+ if self.sampling_method:
411
+ logger.info("sampling method = %s", self.sampling_method)
412
+ for line in f:
413
+ if line:
414
+ example = json.loads(line)
415
+ if self.should_keep_doc(
416
+ example["text"],
417
+ factor=self.sampling_factor,
418
+ boundaries=self.boundaries):
419
+ yield id_, example
420
+ id_ += 1
421
+ else:
422
+ for line in f:
423
+ if line:
424
+ example = json.loads(line)
425
+ yield id_, example
426
+ id_ += 1
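The three `_should_keep_doc_*` methods above map a document's KenLM perplexity to a keep-probability: a constant for `random`, a per-quartile step for `step`, and for `gaussian` the weight `factor * exp(-9/2 * ((PP - m) / m)**2)` with `m` the middle boundary (roughly the reference corpus's median perplexity). A self-contained sketch with made-up perplexity values, just to show the shape of the Gaussian variant:

```python
# Illustrative only: the Gaussian keep-probability used by _should_keep_doc_gaussian.
# The perplexity values below are invented for the example.
import numpy as np

def keep_probability(perplexity, factor=0.78, median=662247.50212365):
    return factor * np.exp(-9 / 2 * ((perplexity - median) / median) ** 2)

for pp in (300_000.0, 662_247.5, 1_500_000.0):
    print(f"PP={pp:>12,.0f}  keep probability ~ {keep_probability(pp):.3f}")
```

Documents near the median perplexity are kept with probability close to `factor`, while very low- and very high-perplexity documents are mostly sampled away.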
mc4/mc4.py.lock ADDED
File without changes
outputs/checkpoints/checkpoint-140001/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-140001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/checkpoints/checkpoint-140001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bb3b6443b0b4e0fd6b95f7409525ddde51fb73dd99318041f2fecda9f547f5a6
3
+ size 249750019
outputs/checkpoints/checkpoint-140001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73ce4d1287008fdfac801ca7df44a0debe3e41f901970f3132f0cd49d2ad6bd0
3
+ size 499500278
outputs/checkpoints/checkpoint-140001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/checkpoints/checkpoint-140001/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 140001}
outputs/checkpoints/checkpoint-150001/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-150001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/checkpoints/checkpoint-150001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9f2a38ac6c111d01809dd28ae9078aab932064126a7de753ce0d88bd60421e4
3
+ size 249750019
outputs/checkpoints/checkpoint-150001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84f53f9b574ccfb97696f637d71903b9762ef2718c656bea201e5aeb9078c328
3
+ size 499500278
outputs/checkpoints/checkpoint-150001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/checkpoints/checkpoint-150001/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 150001}
outputs/checkpoints/checkpoint-160001/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-160001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/checkpoints/checkpoint-160001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b86d26169d8fb7bb58ae7fecd67ca557a0affc93bf2d5b5947af0070ee894ab9
3
+ size 249750019
outputs/checkpoints/checkpoint-160001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea3a8f65ea9c3c6c3606f1167c4e54049784fa8b2a5ee3f4936563ecd4f811b6
3
+ size 499500278
outputs/checkpoints/checkpoint-160001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/checkpoints/checkpoint-160001/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 160001}
outputs/checkpoints/checkpoint-170001/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-170001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/checkpoints/checkpoint-170001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c40291527e2cf6e418cf78bb9cd4eec53ac716230987ad7a0a447bf0ce041d4c
3
+ size 249750019
outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:90dbe4fe7d7694dd86d13e9b075953620aa4dabb4fdc2023b6ede17aa720848e
3
+ size 499500278
outputs/checkpoints/checkpoint-170001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/checkpoints/checkpoint-170001/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 170001}
outputs/checkpoints/checkpoint-180001/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/checkpoints/checkpoint-180001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/checkpoints/checkpoint-180001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:393c37966461709fe51a3b3f84befb7fa7e5030025856d171308efd40dbbc7da
3
+ size 249750019
outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a33cad417a7e78eaafc1c041f93fd54ad9f63869d01e1351bac4abcd58e4eeb
3
+ size 499500278
outputs/checkpoints/checkpoint-180001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/checkpoints/checkpoint-180001/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 180001}
outputs/config.json ADDED
@@ -0,0 +1,25 @@
1
+ {
2
+ "architectures": [
3
+ "RobertaForMaskedLM"
4
+ ],
5
+ "attention_probs_dropout_prob": 0.1,
6
+ "bos_token_id": 0,
7
+ "eos_token_id": 2,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 768,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 3072,
14
+ "layer_norm_eps": 1e-05,
15
+ "max_position_embeddings": 514,
16
+ "model_type": "roberta",
17
+ "num_attention_heads": 12,
18
+ "num_hidden_layers": 12,
19
+ "pad_token_id": 1,
20
+ "position_embedding_type": "absolute",
21
+ "transformers_version": "4.9.0.dev0",
22
+ "type_vocab_size": 1,
23
+ "use_cache": true,
24
+ "vocab_size": 50265
25
+ }
outputs/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
3
+ size 1471394
outputs/events.out.tfevents.1626172316.underestimate.4022703.3.v2 ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:54e7a88ae2dc3c9128df68ad99b735f3ae87946bc9753da8eb080eb7379dc4d3
3
+ size 26964023
outputs/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:393c37966461709fe51a3b3f84befb7fa7e5030025856d171308efd40dbbc7da
3
+ size 249750019
outputs/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a33cad417a7e78eaafc1c041f93fd54ad9f63869d01e1351bac4abcd58e4eeb
3
+ size 499500278
outputs/training_args.joblib ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8bc14fe16573d318dd510c7cfb42ebb7cc87b4dcf77e99247a2d1605cffd772b
3
+ size 1876
outputs/training_state.json ADDED
@@ -0,0 +1 @@
1
+ {"step": 180001}
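The top-level `outputs/` directory mirrors the most recent checkpoint (step 180001): Flax weights in `flax_model.msgpack`, the optimizer state, the pickled training arguments and data collator, plus the step counter above. A hedged sketch of reloading the weights for inference follows; taking the tokenizer from `configs/base` is an assumption about which tokenizer this run used.

```python
# Hedged sketch: reload the committed Flax weights for masked-LM inference.
# Assumes outputs/ holds config.json + flax_model.msgpack (as in this commit)
# and that configs/base/tokenizer.json is the tokenizer of this run.
from transformers import AutoTokenizer, FlaxAutoModelForMaskedLM

model = FlaxAutoModelForMaskedLM.from_pretrained("outputs")
tokenizer = AutoTokenizer.from_pretrained("configs/base")

inputs = tokenizer("Hola <mask>.", return_tensors="np")
logits = model(**inputs).logits   # (batch, seq_len, vocab_size)
print(logits.shape)
```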
run_mlm_flax_stream.py ADDED
@@ -0,0 +1,722 @@
1
+ #!/usr/bin/env python
2
+ # coding=utf-8
3
+ # Copyright 2021 The HuggingFace Team All rights reserved.
4
+ #
5
+ # Licensed under the Apache License, Version 2.0 (the "License");
6
+ # you may not use this file except in compliance with the License.
7
+ # You may obtain a copy of the License at
8
+ #
9
+ # http://www.apache.org/licenses/LICENSE-2.0
10
+ #
11
+ # Unless required by applicable law or agreed to in writing, software
12
+ # distributed under the License is distributed on an "AS IS" BASIS,
13
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ # See the License for the specific language governing permissions and
15
+ # limitations under the License.
16
+ """
17
+ Fine-tuning the library models for masked language modeling (BERT, ALBERT, RoBERTa...) with whole word masking on a
18
+ text file or a dataset.
19
+
20
+ Here is the full list of checkpoints on the hub that can be fine-tuned by this script:
21
+ https://huggingface.co/models?filter=masked-lm
22
+ """
23
+ import logging
24
+ import json
25
+ import os
26
+ import shutil
27
+ import sys
28
+ import time
29
+ from collections import defaultdict
30
+ from dataclasses import dataclass, field
31
+
32
+ # You can also adapt this script on your own masked language modeling task. Pointers for this are left as comments.
33
+ import joblib
34
+ from pathlib import Path
35
+ from typing import Dict, List, Optional, Tuple
36
+
37
+ import datasets
38
+ import numpy as np
39
+ from datasets import load_dataset
40
+ from tqdm import tqdm
41
+
42
+ import flax
43
+ import jax
44
+ import jax.numpy as jnp
45
+ import kenlm # pip install https://github.com/kpu/kenlm/archive/master.zip
46
+ import optax
47
+ from flax import jax_utils, traverse_util
48
+ from flax.serialization import from_bytes, to_bytes
49
+ from flax.training import train_state
50
+ from flax.training.common_utils import get_metrics, onehot, shard
51
+ from transformers import (
52
+ CONFIG_MAPPING,
53
+ FLAX_MODEL_FOR_MASKED_LM_MAPPING,
54
+ AutoConfig,
55
+ AutoTokenizer,
56
+ FlaxAutoModelForMaskedLM,
57
+ HfArgumentParser,
58
+ PreTrainedTokenizerBase,
59
+ TensorType,
60
+ TrainingArguments,
61
+ is_tensorboard_available,
62
+ set_seed,
63
+ )
64
+
65
+
66
+ if datasets.__version__ <= "1.8.0":
67
+ raise ValueError("Make sure to upgrade `datasets` to a version >= 1.9.0 to use dataset streaming")
68
+
69
+
70
+ MODEL_CONFIG_CLASSES = list(FLAX_MODEL_FOR_MASKED_LM_MAPPING.keys())
71
+ MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)
72
+
73
+
74
+ @dataclass
75
+ class ModelArguments:
76
+ """
77
+ Arguments pertaining to which model/config/tokenizer we are going to fine-tune, or train from scratch.
78
+ """
79
+
80
+ model_name_or_path: Optional[str] = field(
81
+ default=None,
82
+ metadata={
83
+ "help": "The model checkpoint for weights initialization."
84
+ "Don't set if you want to train a model from scratch."
85
+ },
86
+ )
87
+ model_type: Optional[str] = field(
88
+ default=None,
89
+ metadata={"help": "If training from scratch, pass a model type from the list: " + ", ".join(MODEL_TYPES)},
90
+ )
91
+ config_name: Optional[str] = field(
92
+ default=None, metadata={"help": "Pretrained config name or path if not the same as model_name"}
93
+ )
94
+ tokenizer_name: Optional[str] = field(
95
+ default=None, metadata={"help": "Pretrained tokenizer name or path if not the same as model_name"}
96
+ )
97
+ cache_dir: Optional[str] = field(
98
+ default=None, metadata={"help": "Where do you want to store the pretrained models downloaded from s3"}
99
+ )
100
+ use_fast_tokenizer: bool = field(
101
+ default=True,
102
+ metadata={"help": "Whether to use one of the fast tokenizer (backed by the tokenizers library) or not."},
103
+ )
104
+ dtype: Optional[str] = field(
105
+ default="float32",
106
+ metadata={
107
+ "help": "Floating-point format in which the model weights should be initialized and trained. Choose one of `[float32, float16, bfloat16]`."
108
+ },
109
+ )
110
+
111
+ @dataclass
112
+ class DataTrainingArguments:
113
+ """
114
+ Arguments pertaining to what data we are going to input our model for training and eval.
115
+ """
116
+
117
+ dataset_name: Optional[str] = field(
118
+ default=None, metadata={"help": "The name of the dataset to use (via the datasets library)."}
119
+ )
120
+ dataset_config_name: Optional[str] = field(
121
+ default=None, metadata={"help": "The configuration name of the dataset to use (via the datasets library)."}
122
+ )
123
+ train_file: Optional[str] = field(default=None, metadata={"help": "The input training data file (a text file)."})
124
+ validation_file: Optional[str] = field(
125
+ default=None,
126
+ metadata={"help": "An optional input evaluation data file to evaluate the perplexity on (a text file)."},
127
+ )
128
+ train_ref_file: Optional[str] = field(
129
+ default=None,
130
+ metadata={"help": "An optional input train ref data file for whole word masking in Chinese."},
131
+ )
132
+ validation_ref_file: Optional[str] = field(
133
+ default=None,
134
+ metadata={"help": "An optional input validation ref data file for whole word masking in Chinese."},
135
+ )
136
+ overwrite_cache: bool = field(
137
+ default=False, metadata={"help": "Overwrite the cached training and evaluation sets"}
138
+ )
139
+ validation_split_percentage: Optional[int] = field(
140
+ default=5,
141
+ metadata={
142
+ "help": "The percentage of the train set used as validation set in case there's no validation split"
143
+ },
144
+ )
145
+ max_seq_length: Optional[int] = field(
146
+ default=None,
147
+ metadata={
148
+ "help": "The maximum total input sequence length after tokenization. Sequences longer "
149
+ "than this will be truncated. Default to the max input length of the model."
150
+ },
151
+ )
152
+ preprocessing_num_workers: Optional[int] = field(
153
+ default=None,
154
+ metadata={"help": "The number of processes to use for the preprocessing."},
155
+ )
156
+ mlm_probability: float = field(
157
+ default=0.15, metadata={"help": "Ratio of tokens to mask for masked language modeling loss"}
158
+ )
159
+ pad_to_max_length: bool = field(
160
+ default=False,
161
+ metadata={
162
+ "help": "Whether to pad all samples to `max_seq_length`. "
163
+ "If False, will pad the samples dynamically when batching to the maximum length in the batch."
164
+ },
165
+ )
166
+ line_by_line: bool = field(
167
+ default=False,
168
+ metadata={"help": "Whether distinct lines of text in the dataset are to be handled as distinct sequences."},
169
+ )
170
+ text_column_name: str = field(
171
+ default="text", metadata={"help": "The name of the column to retrieve the training text."}
172
+ )
173
+ shuffle_buffer_size: int = field(
174
+ default=10000, metadata={"help": "The number of examples to pre-load for shuffling."}
175
+ )
176
+ num_train_steps: int = field(default=50000, metadata={"help": "The number of training steps."})
177
+ num_eval_samples: int = field(default=50000, metadata={"help": "The number of samples to be used for evaluation"})
178
+
179
+ def __post_init__(self):
180
+ if self.dataset_name is None and self.train_file is None and self.validation_file is None:
181
+ raise ValueError("Need either a dataset name or a training/validation file.")
182
+ else:
183
+ if self.train_file is not None:
184
+ extension = self.train_file.split(".")[-1]
185
+ assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`train_file` should be a csv, a json (lines) or a txt file."
186
+ if self.validation_file is not None:
187
+ extension = self.validation_file.split(".")[-1]
188
+ assert extension in ["csv", "json", "jsonl", "txt", "gz"], "`validation_file` should be a csv, a json (lines) or a txt file."
189
+
190
+
191
+ @flax.struct.dataclass
192
+ class FlaxDataCollatorForLanguageModeling:
193
+ """
194
+ Data collator used for language modeling. Inputs are dynamically padded to the maximum length of a batch if they
195
+ are not all of the same length.
196
+
197
+ Args:
198
+ tokenizer (:class:`~transformers.PreTrainedTokenizer` or :class:`~transformers.PreTrainedTokenizerFast`):
199
+ The tokenizer used for encoding the data.
200
+ mlm_probability (:obj:`float`, `optional`, defaults to 0.15):
201
+ The probability with which to (randomly) mask tokens in the input.
202
+
203
+ .. note::
204
+
205
+ For best performance, this data collator should be used with a dataset having items that are dictionaries or
206
+ BatchEncoding, with the :obj:`"special_tokens_mask"` key, as returned by a
207
+ :class:`~transformers.PreTrainedTokenizer` or a :class:`~transformers.PreTrainedTokenizerFast` with the
208
+ argument :obj:`return_special_tokens_mask=True`.
209
+ """
210
+
211
+ tokenizer: PreTrainedTokenizerBase
212
+ mlm_probability: float = 0.15
213
+
214
+ def __post_init__(self):
215
+ if self.tokenizer.mask_token is None:
216
+ raise ValueError(
217
+ "This tokenizer does not have a mask token which is necessary for masked language modeling. "
218
+ "You should pass `mlm=False` to train on causal language modeling instead."
219
+ )
220
+
221
+ def __call__(self, examples: List[Dict[str, np.ndarray]], pad_to_multiple_of: int) -> Dict[str, np.ndarray]:
222
+ # Handle dict or lists with proper padding and conversion to tensor.
223
+ batch = self.tokenizer.pad(examples, pad_to_multiple_of=pad_to_multiple_of, return_tensors=TensorType.NUMPY)
224
+
225
+ # If special token mask has been preprocessed, pop it from the dict.
226
+ special_tokens_mask = batch.pop("special_tokens_mask", None)
227
+
228
+ batch["input_ids"], batch["labels"] = self.mask_tokens(
229
+ batch["input_ids"], special_tokens_mask=special_tokens_mask
230
+ )
231
+ return batch
232
+
233
+ def mask_tokens(
234
+ self, inputs: np.ndarray, special_tokens_mask: Optional[np.ndarray]
235
+ ) -> Tuple[jnp.ndarray, jnp.ndarray]:
236
+ """
237
+ Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
238
+ """
239
+ labels = inputs.copy()
240
+ # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
241
+ probability_matrix = np.full(labels.shape, self.mlm_probability)
242
+ special_tokens_mask = special_tokens_mask.astype("bool")
243
+
244
+ probability_matrix[special_tokens_mask] = 0.0
245
+ masked_indices = np.random.binomial(1, probability_matrix).astype("bool")
246
+ labels[~masked_indices] = -100 # We only compute loss on masked tokens
247
+
248
+ # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
249
+ indices_replaced = np.random.binomial(1, np.full(labels.shape, 0.8)).astype("bool") & masked_indices
250
+ inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)
251
+
252
+ # 10% of the time, we replace masked input tokens with random word
253
+ indices_random = np.random.binomial(1, np.full(labels.shape, 0.5)).astype("bool")
254
+ indices_random &= masked_indices & ~indices_replaced
255
+
256
+ random_words = np.random.randint(self.tokenizer.vocab_size, size=labels.shape, dtype="i4")
257
+ inputs[indices_random] = random_words[indices_random]
258
+
259
+ # The rest of the time (10% of the time) we keep the masked input tokens unchanged
260
+ return inputs, labels
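The two binomial draws above reproduce BERT's 80/10/10 split: of the positions selected for masking, about 80% become the mask token, half of the remaining 20% (the 0.5 draw) get a random token, and the rest stay unchanged. A standalone numpy check of those proportions (illustrative only):

import numpy as np

np.random.seed(0)
n = 1_000_000
masked = np.ones(n, dtype=bool)  # pretend every position was selected for MLM
replaced = np.random.binomial(1, 0.8, size=n).astype(bool) & masked
randomized = np.random.binomial(1, 0.5, size=n).astype(bool) & masked & ~replaced
unchanged = masked & ~replaced & ~randomized
print(replaced.mean(), randomized.mean(), unchanged.mean())  # ~0.80, ~0.10, ~0.10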
261
+
262
+
263
+ @dataclass
264
+ class SamplingArguments:
265
+ """
266
+ Arguments pertaining to how to perform sampling of the dataset.
267
+ """
268
+
269
+ perplexity_model: Optional[str] = field(
270
+ default="./es.arpa.bin", metadata={"help": "Path to KenLM model to use to get perplexity values."}
271
+ )
272
+ sampling_method: Optional[str] = field(
273
+ default=None, metadata={"help": "Sample using a 'step' or 'gaussian' perplexity function per document, or 'random'."}
274
+ )
275
+ sampling_factor: Optional[float] = field(
276
+ default=None, metadata={"help": "Sampling factor. Integers for step function, decimals for gaussian."}
277
+ )
278
+ boundaries: Optional[str] = field(
279
+ default="536394.99320948,662247.50212365,919250.87225178", metadata={"help": "Quartile boundaries"}
280
+ )
281
+
282
+ def __post_init__(self):
283
+ self.boundaries = [float(q.strip()) for q in self.boundaries.split(",")]
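The boundaries string parses into three perplexity cut points, i.e. quartile boundaries for the document perplexities. The actual perplexity-sampling logic lives in the bundled ./mc4/mc4.py loader and is not shown here; the snippet below only illustrates how a document's perplexity maps onto a quartile index given those boundaries (hypothetical helper, not from the script):

boundaries = [float(q.strip()) for q in
              "536394.99320948,662247.50212365,919250.87225178".split(",")]

def quartile(perplexity):
    # 0..3: how many boundaries the document's perplexity exceeds
    return sum(perplexity > b for b in boundaries)

print(quartile(500_000.0), quartile(700_000.0), quartile(1_000_000.0))  # 0 2 3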
284
+
285
+
286
+ def generate_batch_splits(samples_idx: jnp.ndarray, batch_size: int) -> jnp.ndarray:
287
+ num_samples = len(samples_idx)
288
+ samples_to_remove = num_samples % batch_size
289
+
290
+ if samples_to_remove != 0:
291
+ samples_idx = samples_idx[:-samples_to_remove]
292
+ sections_split = num_samples // batch_size
293
+ batch_idx = np.split(samples_idx, sections_split)
294
+ return batch_idx
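generate_batch_splits drops the incomplete trailing batch so every split has exactly batch_size indices. A numpy-only restatement makes the behavior concrete (illustrative):

import numpy as np

def generate_batch_splits(samples_idx, batch_size):
    num_samples = len(samples_idx)
    samples_to_remove = num_samples % batch_size
    if samples_to_remove != 0:
        samples_idx = samples_idx[:-samples_to_remove]
    return np.split(samples_idx, num_samples // batch_size)

print(generate_batch_splits(np.arange(10), 4))
# [array([0, 1, 2, 3]), array([4, 5, 6, 7])]  -- indices 8 and 9 are dropped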
295
+
296
+
297
+ def advance_iter_and_group_samples(train_iterator, num_samples, max_seq_length):
298
+ """
299
+ The training iterator is advanced so that, after grouping the samples,
300
+ `num_samples` sequences of length `max_seq_length` are returned.
301
+ """
302
+ num_total_tokens = max_seq_length * num_samples
303
+ samples = defaultdict(list)
304
+
305
+ i = 0
306
+ while i < num_total_tokens:
307
+ tokenized_samples = next(train_iterator)
308
+ i += len(tokenized_samples["input_ids"])
309
+
310
+ # concatenate tokenized samples to list
311
+ samples = {k: samples[k] + tokenized_samples[k] for k in tokenized_samples.keys()}
312
+
313
+ # Concatenated tokens are split to lists of length `max_seq_length`.
314
+ # Note that any remainder modulo max_seq_length is thrown away.
315
+ def group_texts(examples):
316
+ result = {
317
+ k: [t[i : i + max_seq_length] for i in range(0, num_total_tokens, max_seq_length)]
318
+ for k, t in examples.items()
319
+ }
320
+ return result
321
+
322
+ grouped_samples = group_texts(samples)
323
+ return grouped_samples
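advance_iter_and_group_samples keeps pulling tokenized examples until at least num_samples * max_seq_length tokens have accumulated, then slices the concatenation into fixed-length blocks; leftover tokens past the last full block are discarded. A toy run against a fake stream (assuming the helper above is in scope):

def fake_stream():
    # Hypothetical stand-in for the tokenized streaming dataset: 5 tokens per document.
    n = 0
    while True:
        yield {"input_ids": list(range(n, n + 5)), "attention_mask": [1] * 5}
        n += 5

grouped = advance_iter_and_group_samples(fake_stream(), num_samples=3, max_seq_length=4)
print(grouped["input_ids"])
# [[0, 1, 2, 3], [4, 5, 6, 7], [8, 9, 10, 11]]  -- tokens 12-14 are thrown away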
324
+
325
+
326
+ def write_train_metric(summary_writer, train_metrics, train_time, step):
327
+ summary_writer.scalar("train_time", train_time, step)
328
+
329
+ train_metrics = get_metrics(train_metrics)
330
+ for key, vals in train_metrics.items():
331
+ tag = f"train_{key}"
332
+ for i, val in enumerate(vals):
333
+ summary_writer.scalar(tag, val, step - len(vals) + i + 1)
334
+
335
+
336
+ def write_eval_metric(summary_writer, eval_metrics, step):
337
+ for metric_name, value in eval_metrics.items():
338
+ summary_writer.scalar(f"eval_{metric_name}", value, step)
339
+
340
+
341
+ def save_checkpoint_files(state, data_collator, training_args, save_dir):
342
+ unreplicated_state = jax_utils.unreplicate(state)
343
+ with open(os.path.join(save_dir, "optimizer_state.msgpack"), "wb") as f:
344
+ f.write(to_bytes(unreplicated_state.opt_state))
345
+ joblib.dump(training_args, os.path.join(save_dir, "training_args.joblib"))
346
+ joblib.dump(data_collator, os.path.join(save_dir, "data_collator.joblib"))
347
+ with open(os.path.join(save_dir, "training_state.json"), "w") as f:
348
+ json.dump({"step": unreplicated_state.step.item()}, f)
349
+
350
+
351
+ def rotate_checkpoints(path, max_checkpoints=5):
352
+ paths = sorted(Path(path).iterdir(), key=os.path.getmtime)[::-1]
353
+ if len(paths) > max_checkpoints:
354
+ for path_to_delete in paths[max_checkpoints:]:
355
+ try:
356
+ shutil.rmtree(path_to_delete)
357
+ except OSError:
358
+ os.remove(path_to_delete)
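rotate_checkpoints keeps only the max_checkpoints most recently modified entries under the given directory and removes the rest. A self-contained check in a temporary directory (illustrative; assumes the function above is in scope):

import tempfile
import time
from pathlib import Path

with tempfile.TemporaryDirectory() as tmp:
    for i in range(7):
        (Path(tmp) / f"checkpoint-{i}").mkdir()
        time.sleep(0.01)  # give each directory a distinct mtime
    rotate_checkpoints(tmp, max_checkpoints=5)
    print(sorted(p.name for p in Path(tmp).iterdir()))
    # ['checkpoint-2', 'checkpoint-3', 'checkpoint-4', 'checkpoint-5', 'checkpoint-6']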
359
+
360
+
361
+ if __name__ == "__main__":
362
+ # See all possible arguments in src/transformers/training_args.py
363
+ # or by passing the --help flag to this script.
364
+ # We now keep distinct sets of args, for a cleaner separation of concerns.
365
+
366
+ parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments, SamplingArguments))
367
+ if len(sys.argv) == 2 and sys.argv[1].endswith(".json"):
368
+ # If we pass only one argument to the script and it's the path to a json file,
369
+ # let's parse it to get our arguments.
370
+ model_args, data_args, training_args, sampling_args = parser.parse_json_file(json_file=os.path.abspath(sys.argv[1]))
371
+ else:
372
+ model_args, data_args, training_args, sampling_args = parser.parse_args_into_dataclasses()
373
+
374
+ if (
375
+ os.path.exists(training_args.output_dir)
376
+ and os.listdir(training_args.output_dir)
377
+ and training_args.do_train
378
+ and not training_args.overwrite_output_dir
379
+ ):
380
+ raise ValueError(
381
+ f"Output directory ({training_args.output_dir}) already exists and is not empty."
382
+ "Use --overwrite_output_dir to overcome."
383
+ )
384
+
385
+ # Setup logging
386
+ logging.basicConfig(
387
+ format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
388
+ level="INFO",
389
+ datefmt="[%X]",
390
+ )
391
+
392
+ # Log on each process the small summary:
393
+ logger = logging.getLogger(__name__)
394
+ logger.warning(
395
+ f"Process rank: {training_args.local_rank}, device: {training_args.device}, n_gpu: {training_args.n_gpu}"
396
+ + f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
397
+ )
398
+
399
+ # Set the verbosity to info of the Transformers logger (on main process only):
400
+ logger.info(f"Training/evaluation parameters {training_args}")
401
+
402
+ # Set seed before initializing model.
403
+ set_seed(training_args.seed)
404
+
405
+ # Get the datasets: you can either provide your own CSV/JSON/TXT training and evaluation files (see below)
406
+ # or just provide the name of one of the public datasets available on the hub at https://huggingface.co/datasets/
407
+ # (the dataset will be downloaded automatically from the datasets Hub).
408
+ #
409
+ # For CSV/JSON files, this script will use the column called 'text' or the first column if no column called
410
+ # 'text' is found. You can easily tweak this behavior (see below).
411
+ if data_args.dataset_name is not None:
412
+ # Downloading and loading a dataset from the hub.
413
+ filepaths = {}
414
+ if data_args.train_file:
415
+ filepaths["train"] = data_args.train_file
416
+ if data_args.validation_file:
417
+ filepaths["validation"] = data_args.validation_file
418
+ try:
419
+ dataset = load_dataset(
420
+ data_args.dataset_name,
421
+ data_args.dataset_config_name,
422
+ cache_dir=model_args.cache_dir,
423
+ streaming=True,
424
+ split="train",
425
+ sampling_method=sampling_args.sampling_method,
426
+ sampling_factor=sampling_args.sampling_factor,
427
+ boundaries=sampling_args.boundaries,
428
+ perplexity_model=sampling_args.perplexity_model,
429
+ seed=training_args.seed,
430
+ data_files=filepaths,
431
+ )
432
+ except Exception as exc:
433
+ logger.warning(
434
+ f"Unable to load local dataset with perplexity sampling support. Using huggingface.co/datasets/{data_args.dataset_name}: {exc}"
435
+ )
436
+ dataset = load_dataset(
437
+ data_args.dataset_name,
438
+ data_args.dataset_config_name,
439
+ cache_dir=model_args.cache_dir,
440
+ streaming=True,
441
+ split="train",
442
+ )
443
+
444
+ if model_args.config_name:
445
+ config = AutoConfig.from_pretrained(model_args.config_name, cache_dir=model_args.cache_dir)
446
+ elif model_args.model_name_or_path:
447
+ config = AutoConfig.from_pretrained(model_args.model_name_or_path, cache_dir=model_args.cache_dir)
448
+ else:
449
+ config = CONFIG_MAPPING[model_args.model_type]()
450
+ logger.warning("You are instantiating a new config instance from scratch.")
451
+
452
+ if model_args.tokenizer_name:
453
+ tokenizer = AutoTokenizer.from_pretrained(
454
+ model_args.tokenizer_name, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
455
+ )
456
+ elif model_args.model_name_or_path:
457
+ tokenizer = AutoTokenizer.from_pretrained(
458
+ model_args.model_name_or_path, cache_dir=model_args.cache_dir, use_fast=model_args.use_fast_tokenizer
459
+ )
460
+ else:
461
+ raise ValueError(
462
+ "You are instantiating a new tokenizer from scratch. This is not supported by this script."
463
+ "You can do it from another script, save it, and load it from here, using --tokenizer_name."
464
+ )
465
+
466
+ # We tokenize every text, then concatenate them together before splitting them into smaller parts.
467
+ # We use `return_special_tokens_mask=True` because DataCollatorForLanguageModeling (see below) is more
468
+ # efficient when it receives the `special_tokens_mask`.
469
+ def tokenize_function(examples):
470
+ return tokenizer(
471
+ examples[data_args.text_column_name],
472
+ return_special_tokens_mask=True
473
+ )
474
+
475
+ tokenized_datasets = dataset.map(
476
+ tokenize_function,
477
+ batched=True,
478
+ )
479
+
480
+ shuffle_seed = training_args.seed
481
+ tokenized_datasets = tokenized_datasets.shuffle(buffer_size=data_args.shuffle_buffer_size, seed=shuffle_seed)
482
+
483
+ # Enable tensorboard only on the master node
484
+ has_tensorboard = is_tensorboard_available()
485
+ if has_tensorboard and jax.process_index() == 0:
486
+ try:
487
+ from flax.metrics.tensorboard import SummaryWriter
488
+ summary_writer = SummaryWriter(log_dir=Path(training_args.output_dir))
489
+ # Enable Weight&Biases
490
+ import wandb
491
+ wandb.init(
492
+ entity='wandb',
493
+ project='hf-flax-bertin-roberta-es',
494
+ sync_tensorboard=True,
495
+ )
496
+ wandb.config.update(training_args)
497
+ wandb.config.update(model_args)
498
+ wandb.config.update(data_args)
499
+ except ImportError as ie:
500
+ has_tensorboard = False
501
+ logger.warning(
502
+ f"Unable to display metrics through TensorBoard because some package are not installed: {ie}"
503
+ )
504
+ else:
505
+ logger.warning(
506
+ "Unable to display metrics through TensorBoard because the package is not installed: "
507
+ "Please run pip install tensorboard to enable."
508
+ )
509
+
510
+ # Data collator
511
+ # This one will take care of randomly masking the tokens.
512
+ data_collator = FlaxDataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=data_args.mlm_probability)
513
+
514
+ # Initialize our training
515
+ rng = jax.random.PRNGKey(training_args.seed)
516
+ dropout_rngs = jax.random.split(rng, jax.local_device_count())
517
+
518
+ if model_args.model_name_or_path:
519
+ model = FlaxAutoModelForMaskedLM.from_pretrained(
520
+ model_args.model_name_or_path, config=config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
521
+ )
522
+ else:
523
+ model = FlaxAutoModelForMaskedLM.from_config(
524
+ config, seed=training_args.seed, dtype=getattr(jnp, model_args.dtype)
525
+ )
526
+
527
+ # Store some constants
528
+ num_epochs = int(training_args.num_train_epochs)
529
+ train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
530
+ eval_batch_size = int(training_args.per_device_eval_batch_size) * jax.device_count()
531
+
532
+ # define the number of steps per stream epoch
533
+ num_train_steps = data_args.num_train_steps
534
+
535
+ # Create learning rate schedule
536
+ warmup_fn = optax.linear_schedule(
537
+ init_value=0.0, end_value=training_args.learning_rate, transition_steps=training_args.warmup_steps
538
+ )
539
+ decay_fn = optax.linear_schedule(
540
+ init_value=training_args.learning_rate,
541
+ end_value=0,
542
+ transition_steps=num_train_steps - training_args.warmup_steps,
543
+ )
544
+ linear_decay_lr_schedule_fn = optax.join_schedules(
545
+ schedules=[warmup_fn, decay_fn], boundaries=[training_args.warmup_steps]
546
+ )
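The joined schedule is the usual triangular shape: a linear warm-up from 0 to the peak learning rate over warmup_steps, then a linear decay back to 0 at num_train_steps. Plugging in the values from run_stream.sh below (6e-4, 24k warm-up, 250k steps) shows the shape (illustrative):

import optax

lr, warmup, total = 6e-4, 24_000, 250_000
warmup_fn = optax.linear_schedule(init_value=0.0, end_value=lr, transition_steps=warmup)
decay_fn = optax.linear_schedule(init_value=lr, end_value=0.0, transition_steps=total - warmup)
schedule = optax.join_schedules(schedules=[warmup_fn, decay_fn], boundaries=[warmup])

for step in [0, 12_000, 24_000, 137_000, 250_000]:
    print(step, float(schedule(step)))
# 0.0 at step 0, 3e-4 halfway up the warm-up, 6e-4 at the peak, 3e-4 halfway down, 0.0 at the end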
547
+
548
+ # We use Optax's "masking" functionality to not apply weight decay
549
+ # to bias and LayerNorm scale parameters. decay_mask_fn returns a
550
+ # mask boolean with the same structure as the parameters.
551
+ # The mask is True for parameters that should be decayed.
552
+ # Note that this mask is specifically adapted for FlaxBERT-like models.
553
+ # For other models, one should correct the layer norm parameter naming
554
+ # accordingly.
555
+ def decay_mask_fn(params):
556
+ flat_params = traverse_util.flatten_dict(params)
557
+ flat_mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat_params}
558
+ return traverse_util.unflatten_dict(flat_mask)
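traverse_util.flatten_dict turns the nested parameter tree into a dict keyed by path tuples, so the mask can test the trailing path components: anything ending in "bias" or ("LayerNorm", "scale") is excluded from weight decay. A toy pytree makes the selection visible (illustrative):

from flax import traverse_util

params = {
    "encoder": {
        "attention": {"kernel": 0.0, "bias": 0.0},
        "LayerNorm": {"scale": 0.0, "bias": 0.0},
    }
}
flat = traverse_util.flatten_dict(params)
mask = {path: (path[-1] != "bias" and path[-2:] != ("LayerNorm", "scale")) for path in flat}
print(mask)
# {('encoder', 'attention', 'kernel'): True,  <- decayed
#  ('encoder', 'attention', 'bias'): False,
#  ('encoder', 'LayerNorm', 'scale'): False,
#  ('encoder', 'LayerNorm', 'bias'): False}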
559
+
560
+ # create the AdamW optimizer
561
+ adamw = optax.adamw(
562
+ learning_rate=linear_decay_lr_schedule_fn,
563
+ b1=training_args.adam_beta1,
564
+ b2=training_args.adam_beta2,
565
+ eps=training_args.adam_epsilon,
566
+ weight_decay=training_args.weight_decay,
567
+ mask=decay_mask_fn,
568
+ )
569
+
570
+ # Setup train state
571
+ state = train_state.TrainState.create(apply_fn=model.__call__, params=model.params, tx=adamw)
572
+
573
+ # Define gradient update step fn
574
+ def train_step(state, batch, dropout_rng):
575
+ dropout_rng, new_dropout_rng = jax.random.split(dropout_rng)
576
+
577
+ def loss_fn(params):
578
+ labels = batch.pop("labels")
579
+
580
+ logits = state.apply_fn(**batch, params=params, dropout_rng=dropout_rng, train=True)[0]
581
+
582
+ # compute loss, ignore padded input tokens
583
+ label_mask = jnp.where(labels > 0, 1.0, 0.0)
584
+ loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
585
+
586
+ # take average
587
+ loss = loss.sum() / label_mask.sum()
588
+
589
+ return loss
590
+
591
+ grad_fn = jax.value_and_grad(loss_fn)
592
+ loss, grad = grad_fn(state.params)
593
+ grad = jax.lax.pmean(grad, "batch")
594
+ new_state = state.apply_gradients(grads=grad)
595
+
596
+ metrics = jax.lax.pmean(
597
+ {"loss": loss, "learning_rate": linear_decay_lr_schedule_fn(state.step)}, axis_name="batch"
598
+ )
599
+
600
+ return new_state, metrics, new_dropout_rng
601
+
602
+ # Create parallel version of the train step
603
+ p_train_step = jax.pmap(train_step, "batch", donate_argnums=(0,))
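jax.pmap runs one replica of train_step per local device, and jax.lax.pmean over the named "batch" axis averages the per-replica gradients and metrics so every device applies the identical update (donate_argnums=(0,) additionally lets XLA reuse the old state's buffers). A minimal standalone illustration of the cross-device mean (the printed shape depends on how many devices the host has):

import jax
import jax.numpy as jnp

n = jax.local_device_count()
p_mean = jax.pmap(lambda x: jax.lax.pmean(x, axis_name="batch"), axis_name="batch")
per_device_losses = jnp.arange(n, dtype=jnp.float32)  # pretend each device computed a different loss
print(p_mean(per_device_losses))  # every entry equals the mean across devices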
604
+
605
+ # Define eval fn
606
+ def eval_step(params, batch):
607
+ labels = batch.pop("labels")
608
+
609
+ logits = model(**batch, params=params, train=False)[0]
610
+
611
+ # compute loss, ignore padded input tokens
612
+ label_mask = jnp.where(labels > 0, 1.0, 0.0)
613
+ loss = optax.softmax_cross_entropy(logits, onehot(labels, logits.shape[-1])) * label_mask
614
+
615
+ # compute accuracy
616
+ accuracy = jnp.equal(jnp.argmax(logits, axis=-1), labels) * label_mask
617
+
618
+ # summarize metrics
619
+ metrics = {"loss": loss.sum(), "accuracy": accuracy.sum(), "normalizer": label_mask.sum()}
620
+ metrics = jax.lax.psum(metrics, axis_name="batch")
621
+
622
+ return metrics
623
+
624
+ p_eval_step = jax.pmap(eval_step, "batch", donate_argnums=(0,))
625
+
626
+ # Replicate the train state on each device
627
+ state = jax_utils.replicate(state)
628
+
629
+ train_time = 0
630
+ train_start = time.time()
631
+ train_metrics = []
632
+ eval_metrics = []
633
+
634
+ training_iter = iter(tokenized_datasets)
635
+
636
+ max_seq_length = min(data_args.max_seq_length, tokenizer.model_max_length)
637
+ eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
638
+
639
+ steps = tqdm(range(num_train_steps), desc="Training...", position=0)
640
+ for step in range(num_train_steps):
641
+ # ======================== Training ================================
642
+ try:
643
+ samples = advance_iter_and_group_samples(training_iter, train_batch_size, max_seq_length)
644
+ except StopIteration:
645
+ # Once the end of the dataset stream is reached, the training iterator
646
+ # is reinitialized and reshuffled, and a new eval dataset is randomly chosen.
647
+ shuffle_seed += 1
648
+ tokenized_datasets.set_epoch(shuffle_seed)
649
+
650
+ training_iter = iter(tokenized_datasets)
651
+
652
+ eval_samples = advance_iter_and_group_samples(training_iter, data_args.num_eval_samples, max_seq_length)
653
+ samples = advance_iter_and_group_samples(training_iter, train_batch_size, max_seq_length)
654
+
655
+ # process input samples
656
+ model_inputs = data_collator(samples, pad_to_multiple_of=16)
657
+
658
+ # Model forward
659
+ model_inputs = shard(model_inputs.data)
660
+ state, train_metric, dropout_rngs = p_train_step(state, model_inputs, dropout_rngs)
661
+
662
+ train_metrics.append(train_metric)
663
+
664
+ if step % training_args.logging_steps == 0 and step > 0:
665
+ steps.write(
666
+ f"Step... ({step} | Loss: {train_metric['loss'].mean()}, Learning Rate: {train_metric['learning_rate'].mean()})"
667
+ )
668
+ train_time += time.time() - train_start
669
+ if has_tensorboard and jax.process_index() == 0:
670
+ write_train_metric(summary_writer, train_metrics, train_time, step)
671
+ train_metrics = []
672
+
673
+ # ======================== Evaluating ==============================
674
+ if step % training_args.eval_steps == 0 and step > 0:
675
+ eval_samples_idx = jnp.arange(data_args.num_eval_samples)
676
+ eval_batch_idx = generate_batch_splits(eval_samples_idx, eval_batch_size)
677
+
678
+ for i, batch_idx in enumerate(tqdm(eval_batch_idx, desc="Evaluating ...", position=1)):
679
+ # process input samples
680
+ batch_eval_samples = {k: [v[idx] for idx in batch_idx] for k, v in eval_samples.items()}
681
+ model_inputs = data_collator(batch_eval_samples, pad_to_multiple_of=16)
682
+
683
+ # Model forward
684
+ model_inputs = shard(model_inputs.data)
685
+ metrics = p_eval_step(state.params, model_inputs)
686
+ eval_metrics.append(metrics)
687
+
688
+ # normalize eval metrics
689
+ eval_metrics = get_metrics(eval_metrics)
690
+ eval_metrics = jax.tree_map(jnp.sum, eval_metrics)
691
+ eval_normalizer = eval_metrics.pop("normalizer")
692
+ eval_metrics = jax.tree_map(lambda x: x / eval_normalizer, eval_metrics)
693
+
694
+ # Update progress bar
695
+ steps.desc = f"Step... ({step + 1}/{num_train_steps} | Loss: {eval_metrics['loss']}, Acc: {eval_metrics['accuracy']})"
696
+
697
+ if has_tensorboard and jax.process_index() == 0:
698
+ write_eval_metric(summary_writer, eval_metrics, step)
699
+ eval_metrics = []
700
+
701
+ # save checkpoint every save_steps
702
+ if step % training_args.save_steps == 0 and step > 0 and jax.process_index() == 0:
703
+ logger.info(f"Saving checkpoint at {step + 1} steps")
704
+ params = jax.device_get(jax.tree_map(lambda x: x[0], state.params))
705
+ model.save_pretrained(
706
+ training_args.output_dir,
707
+ params=params,
708
+ push_to_hub=training_args.push_to_hub,
709
+ commit_message=f"Saving weights and logs of step {step + 1}",
710
+ )
711
+ save_checkpoint_files(state, data_collator, training_args, training_args.output_dir)
712
+ checkpoints_dir = Path(training_args.output_dir) / "checkpoints" / f"checkpoint-{step + 1}"
713
+ checkpoints_dir.mkdir(parents=True, exist_ok=True)
714
+ model.save_pretrained(checkpoints_dir, params=params)
715
+ save_checkpoint_files(state, data_collator, training_args, checkpoints_dir)
716
+ rotate_checkpoints(
717
+ Path(training_args.output_dir) / "checkpoints",
718
+ max_checkpoints=training_args.save_total_limit
719
+ )
720
+
721
+ # update tqdm bar
722
+ steps.update(1)
run_stream.sh ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # From https://arxiv.org/pdf/1907.11692.pdf for base model
2
+ python -c "import jax; print('TPUs', jax.device_count())"
3
+ python ./run_mlm_flax_stream.py \
4
+ --output_dir="./outputs" \
5
+ --model_type="roberta" \
6
+ --config_name="./configs/base" \
7
+ --tokenizer_name="./configs/base" \
8
+ --dataset_name="./mc4" \
9
+ --dataset_config_name="es" \
10
+ --train_file="../mc4-es-train-50M-steps.jsonl" \
11
+ --max_seq_length="128" \
12
+ --pad_to_max_length \
13
+ --per_device_train_batch_size="256" \
14
+ --per_device_eval_batch_size="256" \
15
+ --adam_beta1="0.9" \
16
+ --adam_beta2="0.98" \
17
+ --adam_epsilon="1e-6" \
18
+ --learning_rate="6e-4" \
19
+ --weight_decay="0.01" \
20
+ --save_steps="10000" \
21
+ --save_total_limit="5" \
22
+ --warmup_steps="24000" \
23
+ --overwrite_output_dir \
24
+ --num_train_steps="250000" \
25
+ --eval_steps="10000" \
26
+ --dtype="bfloat16" \
27
+ --logging_steps="500" 2>&1 | tee run_stream.log