versae committed on
Commit
d6c5011
0 Parent(s):

Model at 210k steps, mlm acc 0.6509

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the rest.
Files changed (50)
  1. .gitattributes +19 -0
  2. .gitignore +4 -0
  3. config.json +25 -0
  4. configs/base/config.json +25 -0
  5. configs/base/tokenizer.json +0 -0
  6. configs/large/config.json +25 -0
  7. configs/large/tokenizer.json +0 -0
  8. convert.py +29 -0
  9. flax_model.msgpack +3 -0
  10. mc4/README.md +525 -0
  11. mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
  12. mc4/mc4.py +426 -0
  13. mc4/mc4.py.lock +0 -0
  14. merges.txt +0 -0
  15. outputs/checkpoints/checkpoint-170001/config.json +25 -0
  16. outputs/checkpoints/checkpoint-170001/data_collator.joblib +3 -0
  17. outputs/checkpoints/checkpoint-170001/flax_model.msgpack +3 -0
  18. outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack +3 -0
  19. outputs/checkpoints/checkpoint-170001/training_args.joblib +3 -0
  20. outputs/checkpoints/checkpoint-170001/training_state.json +1 -0
  21. outputs/checkpoints/checkpoint-180001/config.json +25 -0
  22. outputs/checkpoints/checkpoint-180001/data_collator.joblib +3 -0
  23. outputs/checkpoints/checkpoint-180001/flax_model.msgpack +3 -0
  24. outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack +3 -0
  25. outputs/checkpoints/checkpoint-180001/training_args.joblib +3 -0
  26. outputs/checkpoints/checkpoint-180001/training_state.json +1 -0
  27. outputs/checkpoints/checkpoint-190001/config.json +25 -0
  28. outputs/checkpoints/checkpoint-190001/data_collator.joblib +3 -0
  29. outputs/checkpoints/checkpoint-190001/flax_model.msgpack +3 -0
  30. outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack +3 -0
  31. outputs/checkpoints/checkpoint-190001/training_args.joblib +3 -0
  32. outputs/checkpoints/checkpoint-190001/training_state.json +1 -0
  33. outputs/checkpoints/checkpoint-200001/config.json +25 -0
  34. outputs/checkpoints/checkpoint-200001/data_collator.joblib +3 -0
  35. outputs/checkpoints/checkpoint-200001/flax_model.msgpack +3 -0
  36. outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack +3 -0
  37. outputs/checkpoints/checkpoint-200001/training_args.joblib +3 -0
  38. outputs/checkpoints/checkpoint-200001/training_state.json +1 -0
  39. outputs/checkpoints/checkpoint-210001/config.json +25 -0
  40. outputs/checkpoints/checkpoint-210001/data_collator.joblib +3 -0
  41. outputs/checkpoints/checkpoint-210001/flax_model.msgpack +3 -0
  42. outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack +3 -0
  43. outputs/checkpoints/checkpoint-210001/training_args.joblib +3 -0
  44. outputs/checkpoints/checkpoint-210001/training_state.json +1 -0
  45. outputs/config.json +25 -0
  46. outputs/data_collator.joblib +3 -0
  47. outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 +3 -0
  48. outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 +3 -0
  49. outputs/flax_model.msgpack +3 -0
  50. outputs/optimizer_state.msgpack +3 -0
.gitattributes ADDED
@@ -0,0 +1,19 @@
+ *.bin.* filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tar.gz filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.log filter=lfs diff=lfs merge=lfs -text
+ *.wandb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
+ run*.log
+ debug*.log
+ run*.wandb
+ wandb/
config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
configs/base/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
configs/base/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
configs/large/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 1024,
+   "initializer_range": 0.02,
+   "intermediate_size": 4096,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 16,
+   "num_hidden_layers": 24,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
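The two configs differ only in model size: base is 12 layers, hidden size 768, 12 heads (roughly 125M parameters), while large is 24 layers, hidden size 1024, 16 heads. As a minimal sketch (not part of this commit), a freshly initialized Flax model can be built from either config:

```python
from transformers import RobertaConfig, FlaxRobertaForMaskedLM

# Load the RoBERTa-base sized config committed under configs/base/
config = RobertaConfig.from_pretrained("./configs/base")
model = FlaxRobertaForMaskedLM(config, seed=0)  # randomly initialized weights
```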
configs/large/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
convert.py ADDED
@@ -0,0 +1,29 @@
+ #!/usr/bin/env python
+ import tempfile
+ 
+ import jax
+ from jax import numpy as jnp
+ from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
+ 
+ 
+ def to_f32(t):
+     # Cast any bfloat16 leaves of the parameter tree to float32
+     return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
+ 
+ 
+ def main():
+     # Save the tokenizer files derived from the local config.json and tokenizer.json
+     tokenizer = AutoTokenizer.from_pretrained("./")
+     tokenizer.save_pretrained("./")
+ 
+     # Temporarily save the bfloat16 Flax model as float32
+     tmp = tempfile.mkdtemp()
+     flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
+     flax_model.params = to_f32(flax_model.params)
+     flax_model.save_pretrained(tmp)
+     # Convert the float32 Flax model to PyTorch
+     model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)
+     model.save_pretrained("./", save_config=False)
+ 
+ 
+ if __name__ == "__main__":
+     main()
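After running convert.py, the repo root should hold both the Flax checkpoint and a float32 PyTorch checkpoint. A quick sanity check (not part of this commit; assumes the script was run from the repo root):

```python
import torch
from transformers import RobertaForMaskedLM

# All exported PyTorch weights should be float32 after conversion
model = RobertaForMaskedLM.from_pretrained("./")
assert all(p.dtype == torch.float32 for p in model.parameters())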
flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
mc4/README.md ADDED
@@ -0,0 +1,525 @@
+ ---
+ pretty_name: mC4
+ annotations_creators:
+ - no-annotation
+ language_creators:
+ - found
+ languages:
+ - af
+ - am
+ - ar
+ - az
+ - be
+ - bg
+ - bg-Latn
+ - bn
+ - ca
+ - ceb
+ - co
+ - cs
+ - cy
+ - da
+ - de
+ - el
+ - el-Latn
+ - en
+ - eo
+ - es
+ - et
+ - eu
+ - fa
+ - fi
+ - fil
+ - fr
+ - fy
+ - ga
+ - gd
+ - gl
+ - gu
+ - ha
+ - haw
+ - hi
+ - hi-Latn
+ - hmn
+ - ht
+ - hu
+ - hy
+ - id
+ - ig
+ - is
+ - it
+ - iw
+ - ja
+ - ja-Latn
+ - jv
+ - ka
+ - kk
+ - km
+ - kn
+ - ko
+ - ku
+ - ky
+ - la
+ - lb
+ - lo
+ - lt
+ - lv
+ - mg
+ - mi
+ - mk
+ - ml
+ - mn
+ - mr
+ - ms
+ - mt
+ - my
+ - ne
+ - nl
+ - "no"
+ - ny
+ - pa
+ - pl
+ - ps
+ - pt
+ - ro
+ - ru
+ - ru-Latn
+ - sd
+ - si
+ - sk
+ - sl
+ - sm
+ - sn
+ - so
+ - sq
+ - sr
+ - st
+ - su
+ - sv
+ - sw
+ - ta
+ - te
+ - tg
+ - th
+ - tr
+ - uk
+ - und
+ - ur
+ - uz
+ - vi
+ - xh
+ - yi
+ - yo
+ - zh
+ - zh-Latn
+ - zu
+ licenses:
+ - odc-by-1.0
+ multilinguality:
+ - multilingual
+ size_categories:
+ - n<1K
+ - 1K<n<10K
+ - 10K<n<100K
+ - 100K<n<1M
+ - 1M<n<10M
+ - 10M<n<100M
+ - 100M<n<1B
+ - 1B<n<10B
+ source_datasets:
+ - original
+ task_categories:
+ - sequence-modeling
+ task_ids:
+ - language-modeling
+ paperswithcode_id: mc4
+ ---
+ 
+ # Dataset Card for mC4
+ 
+ ## Table of Contents
+ 
+ - [Dataset Card for mC4](#dataset-card-for-mc4)
+   - [Table of Contents](#table-of-contents)
+   - [Dataset Description](#dataset-description)
+     - [Dataset Summary](#dataset-summary)
+     - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+     - [Languages](#languages)
+   - [Dataset Structure](#dataset-structure)
+     - [Data Instances](#data-instances)
+     - [Data Fields](#data-fields)
+     - [Data Splits](#data-splits)
+   - [Dataset Creation](#dataset-creation)
+     - [Curation Rationale](#curation-rationale)
+     - [Source Data](#source-data)
+       - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
+       - [Who are the source language producers?](#who-are-the-source-language-producers)
+     - [Annotations](#annotations)
+       - [Annotation process](#annotation-process)
+       - [Who are the annotators?](#who-are-the-annotators)
+     - [Personal and Sensitive Information](#personal-and-sensitive-information)
+   - [Considerations for Using the Data](#considerations-for-using-the-data)
+     - [Social Impact of Dataset](#social-impact-of-dataset)
+     - [Discussion of Biases](#discussion-of-biases)
+     - [Other Known Limitations](#other-known-limitations)
+   - [Additional Information](#additional-information)
+     - [Dataset Curators](#dataset-curators)
+     - [Licensing Information](#licensing-information)
+     - [Citation Information](#citation-information)
+     - [Contributions](#contributions)
+ 
+ ## Dataset Description
+ 
+ - **Homepage:** https://huggingface.co/datasets/allenai/c4
+ - **Paper:** https://arxiv.org/abs/1910.10683
+ 
+ ### Dataset Summary
+ 
+ A colossal, cleaned, multilingual version of Common Crawl's web crawl corpus, based on the Common Crawl dataset: https://commoncrawl.org.
+ 
+ This is the version prepared by AllenAI, hosted at https://huggingface.co/datasets/allenai/c4.
+ 
+ 108 languages are available and are reported in the table below.
+ 
+ Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
+ 
+ | language code | language name |
+ |:--------------|:--------------|
+ | af | Afrikaans |
+ | am | Amharic |
+ | ar | Arabic |
+ | az | Azerbaijani |
+ | be | Belarusian |
+ | bg | Bulgarian |
+ | bg-Latn | Bulgarian (Latin) |
+ | bn | Bangla |
+ | ca | Catalan |
+ | ceb | Cebuano |
+ | co | Corsican |
+ | cs | Czech |
+ | cy | Welsh |
+ | da | Danish |
+ | de | German |
+ | el | Greek |
+ | el-Latn | Greek (Latin) |
+ | en | English |
+ | eo | Esperanto |
+ | es | Spanish |
+ | et | Estonian |
+ | eu | Basque |
+ | fa | Persian |
+ | fi | Finnish |
+ | fil | Filipino |
+ | fr | French |
+ | fy | Western Frisian |
+ | ga | Irish |
+ | gd | Scottish Gaelic |
+ | gl | Galician |
+ | gu | Gujarati |
+ | ha | Hausa |
+ | haw | Hawaiian |
+ | hi | Hindi |
+ | hi-Latn | Hindi (Latin) |
+ | hmn | Hmong, Mong |
+ | ht | Haitian |
+ | hu | Hungarian |
+ | hy | Armenian |
+ | id | Indonesian |
+ | ig | Igbo |
+ | is | Icelandic |
+ | it | Italian |
+ | iw | Hebrew (former code) |
+ | ja | Japanese |
+ | ja-Latn | Japanese (Latin) |
+ | jv | Javanese |
+ | ka | Georgian |
+ | kk | Kazakh |
+ | km | Khmer |
+ | kn | Kannada |
+ | ko | Korean |
+ | ku | Kurdish |
+ | ky | Kyrgyz |
+ | la | Latin |
+ | lb | Luxembourgish |
+ | lo | Lao |
+ | lt | Lithuanian |
+ | lv | Latvian |
+ | mg | Malagasy |
+ | mi | Maori |
+ | mk | Macedonian |
+ | ml | Malayalam |
+ | mn | Mongolian |
+ | mr | Marathi |
+ | ms | Malay |
+ | mt | Maltese |
+ | my | Burmese |
+ | ne | Nepali |
+ | nl | Dutch |
+ | no | Norwegian |
+ | ny | Nyanja |
+ | pa | Punjabi |
+ | pl | Polish |
+ | ps | Pashto |
+ | pt | Portuguese |
+ | ro | Romanian |
+ | ru | Russian |
+ | ru-Latn | Russian (Latin) |
+ | sd | Sindhi |
+ | si | Sinhala |
+ | sk | Slovak |
+ | sl | Slovenian |
+ | sm | Samoan |
+ | sn | Shona |
+ | so | Somali |
+ | sq | Albanian |
+ | sr | Serbian |
+ | st | Southern Sotho |
+ | su | Sundanese |
+ | sv | Swedish |
+ | sw | Swahili |
+ | ta | Tamil |
+ | te | Telugu |
+ | tg | Tajik |
+ | th | Thai |
+ | tr | Turkish |
+ | uk | Ukrainian |
+ | und | Unknown language |
+ | ur | Urdu |
+ | uz | Uzbek |
+ | vi | Vietnamese |
+ | xh | Xhosa |
+ | yi | Yiddish |
+ | yo | Yoruba |
+ | zh | Chinese |
+ | zh-Latn | Chinese (Latin) |
+ | zu | Zulu |
+ 
+ You can load the mC4 subset of any language like this:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ en_mc4 = load_dataset("mc4", "en")
+ ```
+ 
+ You can even specify a list of languages:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
+ ```
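+ 
+ The copy of the loader shipped in this repository (`mc4/mc4.py`) additionally accepts sampling keyword arguments (`sampling_method`, `sampling_factor`, `boundaries`, `perplexity_model`, `seed`, `data_files`), which are consumed in `Mc4.__init__`. A minimal sketch, assuming the script is loaded from its local path and a KenLM model file is available locally (the `es.arpa.bin` path below is illustrative):
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ # Perplexity-based Gaussian sampling of the Spanish subset
+ mc4_es_sampled = load_dataset(
+     "./mc4",
+     "es",
+     split="train",
+     sampling_method="gaussian",
+     sampling_factor=0.78,
+     perplexity_model="./es.arpa.bin",
+ )
+ ```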
+ 
+ ### Supported Tasks and Leaderboards
+ 
+ mC4 is mainly intended to pretrain language models and word representations.
+ 
+ ### Languages
+ 
+ The dataset supports 108 languages.
+ 
+ ## Dataset Structure
+ 
+ ### Data Instances
+ 
+ An example from the `en` config is:
+ 
+ ```
+ {'timestamp': '2018-06-24T01:32:39Z',
+ 'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
+ 'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
+ ```
+ 
+ ### Data Fields
+ 
+ The data have several fields:
+ 
+ - `url`: url of the source as a string
+ - `text`: text content as a string
+ - `timestamp`: timestamp as a string
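+ 
+ For illustration only (not part of the original card), the fields can be inspected by streaming a few documents, assuming a `datasets` version with streaming support:
+ 
+ ```python
+ from datasets import load_dataset
+ 
+ af_mc4 = load_dataset("mc4", "af", split="validation", streaming=True)
+ for i, example in enumerate(af_mc4):
+     # Each example is a dict with exactly these three string fields
+     print(example["url"], example["timestamp"], example["text"][:80])
+     if i == 2:
+         break
+ ```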
+ 
+ ### Data Splits
+ 
+ To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
+ 
+ | config | train | validation |
+ |:-------|:------|:-----------|
+ | af | ? | ? |
+ | am | ? | ? |
+ | ar | ? | ? |
+ | az | ? | ? |
+ | be | ? | ? |
+ | bg | ? | ? |
+ | bg-Latn | ? | ? |
+ | bn | ? | ? |
+ | ca | ? | ? |
+ | ceb | ? | ? |
+ | co | ? | ? |
+ | cs | ? | ? |
+ | cy | ? | ? |
+ | da | ? | ? |
+ | de | ? | ? |
+ | el | ? | ? |
+ | el-Latn | ? | ? |
+ | en | ? | ? |
+ | eo | ? | ? |
+ | es | ? | ? |
+ | et | ? | ? |
+ | eu | ? | ? |
+ | fa | ? | ? |
+ | fi | ? | ? |
+ | fil | ? | ? |
+ | fr | ? | ? |
+ | fy | ? | ? |
+ | ga | ? | ? |
+ | gd | ? | ? |
+ | gl | ? | ? |
+ | gu | ? | ? |
+ | ha | ? | ? |
+ | haw | ? | ? |
+ | hi | ? | ? |
+ | hi-Latn | ? | ? |
+ | hmn | ? | ? |
+ | ht | ? | ? |
+ | hu | ? | ? |
+ | hy | ? | ? |
+ | id | ? | ? |
+ | ig | ? | ? |
+ | is | ? | ? |
+ | it | ? | ? |
+ | iw | ? | ? |
+ | ja | ? | ? |
+ | ja-Latn | ? | ? |
+ | jv | ? | ? |
+ | ka | ? | ? |
+ | kk | ? | ? |
+ | km | ? | ? |
+ | kn | ? | ? |
+ | ko | ? | ? |
+ | ku | ? | ? |
+ | ky | ? | ? |
+ | la | ? | ? |
+ | lb | ? | ? |
+ | lo | ? | ? |
+ | lt | ? | ? |
+ | lv | ? | ? |
+ | mg | ? | ? |
+ | mi | ? | ? |
+ | mk | ? | ? |
+ | ml | ? | ? |
+ | mn | ? | ? |
+ | mr | ? | ? |
+ | ms | ? | ? |
+ | mt | ? | ? |
+ | my | ? | ? |
+ | ne | ? | ? |
+ | nl | ? | ? |
+ | no | ? | ? |
+ | ny | ? | ? |
+ | pa | ? | ? |
+ | pl | ? | ? |
+ | ps | ? | ? |
+ | pt | ? | ? |
+ | ro | ? | ? |
+ | ru | ? | ? |
+ | ru-Latn | ? | ? |
+ | sd | ? | ? |
+ | si | ? | ? |
+ | sk | ? | ? |
+ | sl | ? | ? |
+ | sm | ? | ? |
+ | sn | ? | ? |
+ | so | ? | ? |
+ | sq | ? | ? |
+ | sr | ? | ? |
+ | st | ? | ? |
+ | su | ? | ? |
+ | sv | ? | ? |
+ | sw | ? | ? |
+ | ta | ? | ? |
+ | te | ? | ? |
+ | tg | ? | ? |
+ | th | ? | ? |
+ | tr | ? | ? |
+ | uk | ? | ? |
+ | und | ? | ? |
+ | ur | ? | ? |
+ | uz | ? | ? |
+ | vi | ? | ? |
+ | xh | ? | ? |
+ | yi | ? | ? |
+ | yo | ? | ? |
+ | zh | ? | ? |
+ | zh-Latn | ? | ? |
+ | zu | ? | ? |
+ 
+ ## Dataset Creation
+ 
+ ### Curation Rationale
+ 
+ [More Information Needed]
+ 
+ ### Source Data
+ 
+ #### Initial Data Collection and Normalization
+ 
+ [More Information Needed]
+ 
+ #### Who are the source language producers?
+ 
+ [More Information Needed]
+ 
+ ### Annotations
+ 
+ #### Annotation process
+ 
+ [More Information Needed]
+ 
+ #### Who are the annotators?
+ 
+ [More Information Needed]
+ 
+ ### Personal and Sensitive Information
+ 
+ [More Information Needed]
+ 
+ ## Considerations for Using the Data
+ 
+ ### Social Impact of Dataset
+ 
+ [More Information Needed]
+ 
+ ### Discussion of Biases
+ 
+ [More Information Needed]
+ 
+ ### Other Known Limitations
+ 
+ [More Information Needed]
+ 
+ ## Additional Information
+ 
+ ### Dataset Curators
+ 
+ [More Information Needed]
+ 
+ ### Licensing Information
+ 
+ AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
+ 
+ ### Citation Information
+ 
+ ```
+ @article{2019t5,
+     author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+     title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+     journal = {arXiv e-prints},
+     year = {2019},
+     archivePrefix = {arXiv},
+     eprint = {1910.10683},
+ }
+ ```
+ 
+ ### Contributions
+ 
+ Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.
mc4/dummy/af/0.0.0/dummy_data.zip ADDED
Binary file (8.54 kB).
 
mc4/mc4.py ADDED
@@ -0,0 +1,426 @@
+ """mC4 dataset based on Common Crawl."""
+ 
+ 
+ import gzip
+ import json
+ 
+ import datasets
+ import kenlm
+ import numpy as np
+ from numpy.random import default_rng
+ 
+ 
+ logger = datasets.logging.get_logger(__name__)
+ 
+ 
+ _DESCRIPTION = """\
+ A colossal, cleaned version of Common Crawl's web crawl corpus.
+ 
+ Based on Common Crawl dataset: "https://commoncrawl.org".
+ 
+ This is the processed version of Google's mC4 dataset by AllenAI.
+ """
+ 
+ _CITATION = """
+ @article{2019t5,
+     author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+     title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+     journal = {arXiv e-prints},
+     year = {2019},
+     archivePrefix = {arXiv},
+     eprint = {1910.10683},
+ }
+ """
+ 
+ _URL = "https://github.com/allenai/allennlp/discussions/5056"
+ 
+ _DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
+ 
+ _LANGUAGES = [
+     "af",
+     "am",
+     "ar",
+     "az",
+     "be",
+     "bg",
+     "bg-Latn",
+     "bn",
+     "ca",
+     "ceb",
+     "co",
+     "cs",
+     "cy",
+     "da",
+     "de",
+     "el",
+     "el-Latn",
+     "en",
+     "eo",
+     "es",
+     "et",
+     "eu",
+     "fa",
+     "fi",
+     "fil",
+     "fr",
+     "fy",
+     "ga",
+     "gd",
+     "gl",
+     "gu",
+     "ha",
+     "haw",
+     "hi",
+     "hi-Latn",
+     "hmn",
+     "ht",
+     "hu",
+     "hy",
+     "id",
+     "ig",
+     "is",
+     "it",
+     "iw",
+     "ja",
+     "ja-Latn",
+     "jv",
+     "ka",
+     "kk",
+     "km",
+     "kn",
+     "ko",
+     "ku",
+     "ky",
+     "la",
+     "lb",
+     "lo",
+     "lt",
+     "lv",
+     "mg",
+     "mi",
+     "mk",
+     "ml",
+     "mn",
+     "mr",
+     "ms",
+     "mt",
+     "my",
+     "ne",
+     "nl",
+     "no",
+     "ny",
+     "pa",
+     "pl",
+     "ps",
+     "pt",
+     "ro",
+     "ru",
+     "ru-Latn",
+     "sd",
+     "si",
+     "sk",
+     "sl",
+     "sm",
+     "sn",
+     "so",
+     "sq",
+     "sr",
+     "st",
+     "su",
+     "sv",
+     "sw",
+     "ta",
+     "te",
+     "tg",
+     "th",
+     "tr",
+     "uk",
+     "und",
+     "ur",
+     "uz",
+     "vi",
+     "xh",
+     "yi",
+     "yo",
+     "zh",
+     "zh-Latn",
+     "zu",
+ ]
+ 
+ _N_SHARDS_PER_SPLIT = {
+     "af": {"train": 64, "validation": 1},
+     "am": {"train": 16, "validation": 1},
+     "ar": {"train": 1024, "validation": 4},
+     "az": {"train": 256, "validation": 1},
+     "be": {"train": 128, "validation": 1},
+     "bg": {"train": 1024, "validation": 1},
+     "bg-Latn": {"train": 4, "validation": 1},
+     "bn": {"train": 512, "validation": 1},
+     "ca": {"train": 512, "validation": 1},
+     "ceb": {"train": 8, "validation": 1},
+     "co": {"train": 8, "validation": 1},
+     "cs": {"train": 1024, "validation": 2},
+     "cy": {"train": 256, "validation": 1},
+     "da": {"train": 1024, "validation": 1},
+     "de": {"train": 2048, "validation": 16},
+     "el": {"train": 1024, "validation": 2},
+     "el-Latn": {"train": 16, "validation": 1},
+     "en": {"train": 11264, "validation": 128},
+     "eo": {"train": 32, "validation": 1},
+     "es": {"train": 2048, "validation": 16},
+     "et": {"train": 256, "validation": 1},
+     "eu": {"train": 64, "validation": 1},
+     "fa": {"train": 1024, "validation": 2},
+     "fi": {"train": 1024, "validation": 1},
+     "fil": {"train": 64, "validation": 1},
+     "fr": {"train": 2048, "validation": 16},
+     "fy": {"train": 16, "validation": 1},
+     "ga": {"train": 16, "validation": 1},
+     "gd": {"train": 16, "validation": 1},
+     "gl": {"train": 128, "validation": 1},
+     "gu": {"train": 64, "validation": 1},
+     "ha": {"train": 8, "validation": 1},
+     "haw": {"train": 2, "validation": 1},
+     "hi": {"train": 1024, "validation": 2},
+     "hi-Latn": {"train": 16, "validation": 1},
+     "hmn": {"train": 8, "validation": 1},
+     "ht": {"train": 8, "validation": 1},
+     "hu": {"train": 1024, "validation": 2},
+     "hy": {"train": 128, "validation": 1},
+     "id": {"train": 1024, "validation": 4},
+     "ig": {"train": 4, "validation": 1},
+     "is": {"train": 128, "validation": 1},
+     "it": {"train": 1024, "validation": 8},
+     "iw": {"train": 1024, "validation": 1},
+     "ja": {"train": 1024, "validation": 8},
+     "ja-Latn": {"train": 8, "validation": 1},
+     "jv": {"train": 8, "validation": 1},
+     "ka": {"train": 256, "validation": 1},
+     "kk": {"train": 256, "validation": 1},
+     "km": {"train": 64, "validation": 1},
+     "kn": {"train": 64, "validation": 1},
+     "ko": {"train": 1024, "validation": 1},
+     "ku": {"train": 16, "validation": 1},
+     "ky": {"train": 64, "validation": 1},
+     "la": {"train": 64, "validation": 1},
+     "lb": {"train": 32, "validation": 1},
+     "lo": {"train": 8, "validation": 1},
+     "lt": {"train": 512, "validation": 1},
+     "lv": {"train": 256, "validation": 1},
+     "mg": {"train": 8, "validation": 1},
+     "mi": {"train": 4, "validation": 1},
+     "mk": {"train": 128, "validation": 1},
+     "ml": {"train": 128, "validation": 1},
+     "mn": {"train": 128, "validation": 1},
+     "mr": {"train": 1024, "validation": 1},
+     "ms": {"train": 512, "validation": 1},
+     "mt": {"train": 128, "validation": 1},
+     "my": {"train": 64, "validation": 1},
+     "ne": {"train": 256, "validation": 1},
+     "nl": {"train": 1024, "validation": 4},
+     "no": {"train": 1024, "validation": 1},
+     "ny": {"train": 4, "validation": 1},
+     "pa": {"train": 32, "validation": 1},
+     "pl": {"train": 1024, "validation": 4},
+     "ps": {"train": 16, "validation": 1},
+     "pt": {"train": 1024, "validation": 4},
+     "ro": {"train": 1024, "validation": 2},
+     "ru": {"train": 4096, "validation": 32},
+     "ru-Latn": {"train": 32, "validation": 1},
+     "sd": {"train": 64, "validation": 1},
+     "si": {"train": 64, "validation": 1},
+     "sk": {"train": 512, "validation": 1},
+     "sl": {"train": 256, "validation": 1},
+     "sm": {"train": 4, "validation": 1},
+     "sn": {"train": 8, "validation": 1},
+     "so": {"train": 64, "validation": 1},
+     "sq": {"train": 128, "validation": 1},
+     "sr": {"train": 256, "validation": 1},
+     "st": {"train": 2, "validation": 1},
+     "su": {"train": 4, "validation": 1},
+     "sv": {"train": 1024, "validation": 2},
+     "sw": {"train": 32, "validation": 1},
+     "ta": {"train": 256, "validation": 1},
+     "te": {"train": 128, "validation": 1},
+     "tg": {"train": 64, "validation": 1},
+     "th": {"train": 1024, "validation": 1},
+     "tr": {"train": 1024, "validation": 4},
+     "uk": {"train": 1024, "validation": 2},
+     "und": {"train": 3072, "validation": 32},
+     "ur": {"train": 128, "validation": 1},
+     "uz": {"train": 32, "validation": 1},
+     "vi": {"train": 1024, "validation": 4},
+     "xh": {"train": 2, "validation": 1},
+     "yi": {"train": 16, "validation": 1},
+     "yo": {"train": 2, "validation": 1},
+     "zh": {"train": 1024, "validation": 2},
+     "zh-Latn": {"train": 8, "validation": 1},
+     "zu": {"train": 8, "validation": 1},
+ }
+ 
+ 
+ class Mc4Config(datasets.BuilderConfig):
+     """BuilderConfig for mC4."""
+ 
+     def __init__(self, *args, languages, **kwargs):
+         """BuilderConfig for mC4.
+         Args:
+             languages (:obj:`List[str]`): list of languages to load
+             **kwargs: keyword arguments forwarded to super.
+         """
+         super().__init__(
+             *args,
+             name="+".join(languages),
+             **kwargs,
+         )
+         self.languages = languages
+ 
+ 
+ class Mc4(datasets.GeneratorBasedBuilder):
+     """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
+ 
+     BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
+     BUILDER_CONFIG_CLASS = Mc4Config
+ 
+     def __init__(self, *args, writer_batch_size=None, **kwargs):
+         self.data_files = kwargs.pop("data_files", {})
+         self.sampling_method = kwargs.pop("sampling_method", None)
+         self.perplexity_model = kwargs.pop("perplexity_model", None)
+         self.sampling_factor = kwargs.pop("sampling_factor", None)
+         self.boundaries = kwargs.pop("boundaries", None)
+         self.seed = kwargs.pop("seed", None)
+         if self.sampling_method:
+             if self.seed is not None:
+                 self.rng = default_rng(self.seed)
+             else:
+                 self.rng = default_rng()
+             if self.sampling_method == "random":
+                 self.should_keep_doc = self._should_keep_doc_random
+             else:
+                 # Load the 5-gram KenLM model used for perplexity scoring,
+                 # e.g. http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
+                 logger.info("loading model = %s", self.perplexity_model)
+                 self.pp_model = kenlm.Model(self.perplexity_model)
+                 if self.sampling_method == "gaussian":
+                     self.should_keep_doc = self._should_keep_doc_gaussian
+                 else:
+                     self.should_keep_doc = self._should_keep_doc_step
+         super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
+ 
+     def get_perplexity(self, doc):
+         # Document perplexity: 10 ** (-sum of log10 scores / token count)
+         doc_log_score, doc_length = 0, 0
+         for line in doc.split("\n"):
+             log_score = self.pp_model.score(line)
+             length = len(line.split()) + 1
+             doc_log_score += log_score
+             doc_length += length
+         return 10.0 ** (-doc_log_score / doc_length)
+ 
+     def _should_keep_doc_step(self, doc, factor=None, boundaries=None):
+         perplexity = self.get_perplexity(doc)
+         if factor is None:
+             factor = 1.5e5
+         if boundaries is None:
+             boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
+         # Piecewise-constant keep probability per perplexity quartile
+         if perplexity <= boundaries[0]:
+             quartile_range = boundaries[0]
+         elif perplexity < boundaries[1]:
+             quartile_range = boundaries[1] - boundaries[0]
+         elif perplexity < boundaries[2]:
+             quartile_range = boundaries[2] - boundaries[1]
+         else:
+             quartile_range = 10 * boundaries[2]
+         probability = factor / quartile_range
+         return self.rng.uniform() < probability
+ 
+     def _should_keep_doc_gaussian(self, doc, factor=None, boundaries=None):
+         perplexity = self.get_perplexity(doc)
+         if factor is None:
+             factor = 0.78
+         if boundaries is not None:
+             m = boundaries[1]
+         else:
+             m = 662247.50212365
+         # Keep probability peaks at `factor` when perplexity equals the midpoint m
+         exponential = np.exp(-9 / 2 * ((perplexity - m) / m) ** 2)
+         weighted_perplexity = factor * exponential
+         return self.rng.uniform() < weighted_perplexity
+ 
+     def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
+         if factor is None:
+             factor = 0.5
+         return self.rng.uniform() <= factor
+ 
+     def _info(self):
+         return datasets.DatasetInfo(
+             description=_DESCRIPTION,
+             features=datasets.Features(
+                 {
+                     "text": datasets.Value("string"),
+                     "timestamp": datasets.Value("string"),
+                     "url": datasets.Value("string"),
+                 }
+             ),
+             supervised_keys=None,
+             homepage=_URL,
+             citation=_CITATION,
+         )
+ 
+     def _split_generators(self, dl_manager):
+         data_urls = {}
+         for split in ["train", "validation"]:
+             data_urls[split] = [
+                 _DATA_URL.format(
+                     language=lang,  # one URL per language and shard
+                     split_suffix="-validation" if split == "validation" else "",
+                     index=index,
+                     n_shards=_N_SHARDS_PER_SPLIT[lang][split],
+                 )
+                 for lang in self.config.languages
+                 for index in range(_N_SHARDS_PER_SPLIT[lang][split])
+             ]
+         if "train" in self.data_files:
+             train_downloaded_files = self.data_files["train"]
+             if not isinstance(train_downloaded_files, (tuple, list)):
+                 train_downloaded_files = [train_downloaded_files]
+         else:
+             train_downloaded_files = dl_manager.download(data_urls["train"])
+         if "validation" in self.data_files:
+             validation_downloaded_files = self.data_files["validation"]
+             if not isinstance(validation_downloaded_files, (tuple, list)):
+                 validation_downloaded_files = [validation_downloaded_files]
+         else:
+             validation_downloaded_files = dl_manager.download(data_urls["validation"])
+         return [
+             datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
+             datasets.SplitGenerator(
+                 name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
+             ),
+         ]
+ 
+     def _generate_examples(self, filepaths):
+         """This function returns the examples in the raw (text) form by iterating on all the files."""
+         id_ = 0
+         for filepath in filepaths:
+             logger.info("generating examples from = %s", filepath)
+             if filepath.endswith("jsonl"):
+                 # Plain JSONL files (e.g. locally preprocessed data) are read as-is
+                 with open(filepath, "r", encoding="utf-8") as f:
+                     for line in f:
+                         if line:
+                             example = json.loads(line)
+                             yield id_, example
+                             id_ += 1
+             else:
+                 with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+                     if self.sampling_method:
+                         logger.info("sampling method = %s", self.sampling_method)
+                         for line in f:
+                             if line:
+                                 example = json.loads(line)
+                                 if self.should_keep_doc(
+                                     example["text"],
+                                     factor=self.sampling_factor,
+                                     boundaries=self.boundaries,
+                                 ):
+                                     yield id_, example
+                                     id_ += 1
+                     else:
+                         for line in f:
+                             if line:
+                                 example = json.loads(line)
+                                 yield id_, example
+                                 id_ += 1
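For reference, `get_perplexity` computes a document-level perplexity `PP(d) = 10 ** (-sum(log10_scores) / length)`, and `_should_keep_doc_gaussian` keeps a document with probability `factor * exp(-9/2 * ((PP - m) / m) ** 2)`, which peaks at `factor` when PP equals the midpoint boundary `m`. A self-contained sketch of that weighting with made-up perplexity values (no KenLM required; the real loader scores documents with a 5-gram model):

```python
import numpy as np

def gaussian_keep_probability(perplexity, factor=0.78, midpoint=662247.50212365):
    # Same weighting as Mc4._should_keep_doc_gaussian: documents whose
    # perplexity is near the midpoint are kept with probability ~factor,
    # and the keep probability decays smoothly for outliers.
    return factor * np.exp(-9 / 2 * ((perplexity - midpoint) / midpoint) ** 2)

for pp in [300_000.0, 662_247.5, 1_200_000.0]:
    print(f"perplexity={pp:>11,.1f} -> keep probability {gaussian_keep_probability(pp):.3f}")
```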
mc4/mc4.py.lock ADDED
File without changes
merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
outputs/checkpoints/checkpoint-170001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-170001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-170001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:82acddc9d2695672d6083ebf0bcde93b488f7aba953505236ac3f977e6f70284
+ size 249750019
outputs/checkpoints/checkpoint-170001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:f44ea5ea6a253566d5b813a6ddda12274ce966aa2da9b84c2886126f906f50ac
+ size 499500278
outputs/checkpoints/checkpoint-170001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-170001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 170001}
outputs/checkpoints/checkpoint-180001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-180001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-180001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:d4cdfd93674fdb84b689fe6ce896b29a0afecb90e270f95cc718df2fca4b59b8
+ size 249750019
outputs/checkpoints/checkpoint-180001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2576e99225ff5d89b059b8cb255049597f29bfd7c29a8a19ce83be9439aad468
+ size 499500278
outputs/checkpoints/checkpoint-180001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-180001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 180001}
outputs/checkpoints/checkpoint-190001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-190001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-190001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:101266d4390596691c16eca919169708aee4fa432bd7973bf51885daf6ea5b75
+ size 249750019
outputs/checkpoints/checkpoint-190001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:fe2b0c85e7f2988ae9d5ea7856586fe2486a5297592019dd384b01e2ba59e95b
+ size 499500278
outputs/checkpoints/checkpoint-190001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-190001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 190001}
outputs/checkpoints/checkpoint-200001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-200001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-200001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:88e78ae26ac4d3f4891d12ce9c6856b25907020e2aa4a3a833b95a37746d25c6
+ size 249750019
outputs/checkpoints/checkpoint-200001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0f5c4c1ff0bae24abf15f6eeb6bedadc4cad96c37845d7f1d92f56959c671002
+ size 499500278
outputs/checkpoints/checkpoint-200001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-200001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 200001}
outputs/checkpoints/checkpoint-210001/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/checkpoints/checkpoint-210001/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/checkpoints/checkpoint-210001/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
outputs/checkpoints/checkpoint-210001/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
+ size 499500278
outputs/checkpoints/checkpoint-210001/training_args.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:992b48eed1c6d2f7f720200cd91ad0097c32fde0bda7f370b8b6bee448eaede7
+ size 1873
outputs/checkpoints/checkpoint-210001/training_state.json ADDED
@@ -0,0 +1 @@
+ {"step": 210001}
outputs/config.json ADDED
@@ -0,0 +1,25 @@
+ {
+   "architectures": [
+     "RobertaForMaskedLM"
+   ],
+   "attention_probs_dropout_prob": 0.1,
+   "bos_token_id": 0,
+   "eos_token_id": 2,
+   "gradient_checkpointing": false,
+   "hidden_act": "gelu",
+   "hidden_dropout_prob": 0.1,
+   "hidden_size": 768,
+   "initializer_range": 0.02,
+   "intermediate_size": 3072,
+   "layer_norm_eps": 1e-05,
+   "max_position_embeddings": 514,
+   "model_type": "roberta",
+   "num_attention_heads": 12,
+   "num_hidden_layers": 12,
+   "pad_token_id": 1,
+   "position_embedding_type": "absolute",
+   "transformers_version": "4.9.0.dev0",
+   "type_vocab_size": 1,
+   "use_cache": true,
+   "vocab_size": 50265
+ }
outputs/data_collator.joblib ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e02a6e9cfa63cb321cac9402efd29841b652999fcbf787800ae050e747b161ee
+ size 1471394
outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:dcf653e7864d18167096d734dbd860d15fbba06384015f694de1099fc39f95de
+ size 40
outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:eede710c8b36371055e5299d3f6797af265dfcbd6b5034c4aff6d2ee6402d900
+ size 32408059
outputs/flax_model.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9f64c5b858c8917d0c7c86909e61da0c0564704de3797a9d43c2570cac4b0247
+ size 249750019
outputs/optimizer_state.msgpack ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:edd1079d9598c4890bc8e4a7222bcf12fb70e9c4e00fc33499e01a01e37915e3
+ size 499500278