versae commited on Jul 17, 2021

Commit

c725c08

•

1 Parent(s): 8045dd9

Step... (240001/250000 | Loss: 2.1932833194732666, Acc: 0.5893170833587646): 4%|▉ | 10063/250000 [3:28:31<88:33:32, 1.33s/it]

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +2 -0
.gitignore +4 -0
config.json +25 -0
configs/base/config.json +25 -0
configs/base/tokenizer.json +0 -0
configs/large/config.json +25 -0
configs/large/tokenizer.json +0 -0
convert.py +29 -0
flax_model.msgpack +3 -0
mc4/README.md +525 -0
mc4/dummy/af/0.0.0/dummy_data.zip +0 -0
mc4/mc4.py +426 -0
mc4/mc4.py.lock +0 -0
merges.txt +0 -0
outputs/checkpoints/checkpoint-236000/config.json +25 -0
outputs/checkpoints/checkpoint-236000/data_collator.joblib +3 -0
outputs/checkpoints/checkpoint-236000/flax_model.msgpack +3 -0
outputs/checkpoints/checkpoint-236000/optimizer_state.msgpack +3 -0
outputs/checkpoints/checkpoint-236000/training_args.joblib +3 -0
outputs/checkpoints/checkpoint-236000/training_state.json +1 -0
outputs/checkpoints/checkpoint-237000/config.json +25 -0
outputs/checkpoints/checkpoint-237000/data_collator.joblib +3 -0
outputs/checkpoints/checkpoint-237000/flax_model.msgpack +3 -0
outputs/checkpoints/checkpoint-237000/optimizer_state.msgpack +3 -0
outputs/checkpoints/checkpoint-237000/training_args.joblib +3 -0
outputs/checkpoints/checkpoint-237000/training_state.json +1 -0
outputs/checkpoints/checkpoint-238000/config.json +25 -0
outputs/checkpoints/checkpoint-238000/data_collator.joblib +3 -0
outputs/checkpoints/checkpoint-238000/flax_model.msgpack +3 -0
outputs/checkpoints/checkpoint-238000/optimizer_state.msgpack +3 -0
outputs/checkpoints/checkpoint-238000/training_args.joblib +3 -0
outputs/checkpoints/checkpoint-238000/training_state.json +1 -0
outputs/checkpoints/checkpoint-239000/config.json +25 -0
outputs/checkpoints/checkpoint-239000/data_collator.joblib +3 -0
outputs/checkpoints/checkpoint-239000/flax_model.msgpack +3 -0
outputs/checkpoints/checkpoint-239000/optimizer_state.msgpack +3 -0
outputs/checkpoints/checkpoint-239000/training_args.joblib +3 -0
outputs/checkpoints/checkpoint-239000/training_state.json +1 -0
outputs/checkpoints/checkpoint-240000/config.json +25 -0
outputs/checkpoints/checkpoint-240000/data_collator.joblib +3 -0
outputs/checkpoints/checkpoint-240000/flax_model.msgpack +3 -0
outputs/checkpoints/checkpoint-240000/optimizer_state.msgpack +3 -0
outputs/checkpoints/checkpoint-240000/training_args.joblib +3 -0
outputs/checkpoints/checkpoint-240000/training_state.json +1 -0
outputs/config.json +25 -0
outputs/data_collator.joblib +3 -0
outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 +3 -0
outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 +3 -0
outputs/events.out.tfevents.1626535665.tablespoon.2656403.3.v2 +3 -0
outputs/events.out.tfevents.1626537915.tablespoon.2714825.3.v2 +3 -0

.gitattributes CHANGED Viewed

@@ -15,3 +15,5 @@
 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.pt filter=lfs diff=lfs merge=lfs -text
 *.pth filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.log filter=lfs diff=lfs merge=lfs -text
+*.wandb filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+#run*.log
+debug*.log
+run*.wandb
+wandb/

config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

configs/base/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+    "architectures": [
+      "RobertaForMaskedLM"
+    ],
+    "attention_probs_dropout_prob": 0.1,
+    "bos_token_id": 0,
+    "eos_token_id": 2,
+    "gradient_checkpointing": false,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.1,
+    "hidden_size": 768,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "layer_norm_eps": 1e-05,
+    "max_position_embeddings": 514,
+    "model_type": "roberta",
+    "num_attention_heads": 12,
+    "num_hidden_layers": 12,
+    "pad_token_id": 1,
+    "position_embedding_type": "absolute",
+    "transformers_version": "4.9.0.dev0",
+    "type_vocab_size": 1,
+    "use_cache": true,
+    "vocab_size": 50265
+  }

configs/base/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

configs/large/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 1024,
+  "initializer_range": 0.02,
+  "intermediate_size": 4096,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 16,
+  "num_hidden_layers": 24,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

configs/large/tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

convert.py ADDED Viewed

	@@ -0,0 +1,29 @@

+#!/usr/bin/env python
+import tempfile
+import jax
+from jax import numpy as jnp
+from transformers import AutoTokenizer, FlaxRobertaForMaskedLM, RobertaForMaskedLM
+def to_f32(t):
+    return jax.tree_map(lambda x: x.astype(jnp.float32) if x.dtype == jnp.bfloat16 else x, t)
+def main():
+    # Saving extra files from config.json and tokenizer.json files
+    tokenizer = AutoTokenizer.from_pretrained("./")
+    tokenizer.save_pretrained("./")
+    # Temporary saving bfloat16 Flax model into float32
+    tmp = tempfile.mkdtemp()
+    flax_model = FlaxRobertaForMaskedLM.from_pretrained("./")
+    flax_model.params = to_f32(flax_model.params)
+    flax_model.save_pretrained(tmp)
+    # Converting float32 Flax to PyTorch
+    model = RobertaForMaskedLM.from_pretrained(tmp, from_flax=True)
+    model.save_pretrained("./", save_config=False)
+if __name__ == "__main__":
+    main()

flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae5937c808e6e457c600f1da0eb5f8f38f6a3137b2d59828bc675b0103214ca
+size 249750019

mc4/README.md ADDED Viewed

	@@ -0,0 +1,525 @@

+---
+pretty_name: mC4
+annotations_creators:
+- no-annotation
+language_creators:
+- found
+languages:
+- af
+- am
+- ar
+- az
+- be
+- bg
+- bg-Latn
+- bn
+- ca
+- ceb
+- co
+- cs
+- cy
+- da
+- de
+- el
+- el-Latn
+- en
+- eo
+- es
+- et
+- eu
+- fa
+- fi
+- fil
+- fr
+- fy
+- ga
+- gd
+- gl
+- gu
+- ha
+- haw
+- hi
+- hi-Latn
+- hmn
+- ht
+- hu
+- hy
+- id
+- ig
+- is
+- it
+- iw
+- ja
+- ja-Latn
+- jv
+- ka
+- kk
+- km
+- kn
+- ko
+- ku
+- ky
+- la
+- lb
+- lo
+- lt
+- lv
+- mg
+- mi
+- mk
+- ml
+- mn
+- mr
+- ms
+- mt
+- my
+- ne
+- nl
+- "no"
+- ny
+- pa
+- pl
+- ps
+- pt
+- ro
+- ru
+- ru-Latn
+- sd
+- si
+- sk
+- sl
+- sm
+- sn
+- so
+- sq
+- sr
+- st
+- su
+- sv
+- sw
+- ta
+- te
+- tg
+- th
+- tr
+- uk
+- und
+- ur
+- uz
+- vi
+- xh
+- yi
+- yo
+- zh
+- zh-Latn
+- zu
+licenses:
+- odc-by-1.0
+multilinguality:
+- multilingual
+size_categories:
+- n<1K
+- 1K<n<10K
+- 10K<n<100K
+- 100K<n<1M
+- 1M<n<10M
+- 10M<n<100M
+- 100M<n<1B
+- 1B<n<10B
+source_datasets:
+- original
+task_categories:
+- sequence-modeling
+task_ids:
+- language-modeling
+paperswithcode_id: mc4
+---
+# Dataset Card for mC4
+## Table of Contents
+- [Dataset Card for mC4](#dataset-card-for-mc4)
+  - [Table of Contents](#table-of-contents)
+  - [Dataset Description](#dataset-description)
+    - [Dataset Summary](#dataset-summary)
+    - [Supported Tasks and Leaderboards](#supported-tasks-and-leaderboards)
+    - [Languages](#languages)
+  - [Dataset Structure](#dataset-structure)
+    - [Data Instances](#data-instances)
+    - [Data Fields](#data-fields)
+    - [Data Splits](#data-splits)
+  - [Dataset Creation](#dataset-creation)
+    - [Curation Rationale](#curation-rationale)
+    - [Source Data](#source-data)
+      - [Initial Data Collection and Normalization](#initial-data-collection-and-normalization)
+      - [Who are the source language producers?](#who-are-the-source-language-producers)
+    - [Annotations](#annotations)
+      - [Annotation process](#annotation-process)
+      - [Who are the annotators?](#who-are-the-annotators)
+    - [Personal and Sensitive Information](#personal-and-sensitive-information)
+  - [Considerations for Using the Data](#considerations-for-using-the-data)
+    - [Social Impact of Dataset](#social-impact-of-dataset)
+    - [Discussion of Biases](#discussion-of-biases)
+    - [Other Known Limitations](#other-known-limitations)
+  - [Additional Information](#additional-information)
+    - [Dataset Curators](#dataset-curators)
+    - [Licensing Information](#licensing-information)
+    - [Citation Information](#citation-information)
+    - [Contributions](#contributions)
+## Dataset Description
+- **Homepage:** https://huggingface.co/datasets/allenai/c4
+- **Paper:** https://arxiv.org/abs/1910.10683
+### Dataset Summary
+A multilingual colossal, cleaned version of Common Crawl's web crawl corpus. Based on Common Crawl dataset: "https://commoncrawl.org".
+This is the version prepared by AllenAI, hosted at this address: https://huggingface.co/datasets/allenai/c4
+108 languages are available and are reported in the table below.
+Note that the languages that end with "-Latn" are simply romanized variants, i.e. written using the Latin script.
+| language code   | language name        |
+|:----------------|:---------------------|
+| af              | Afrikaans            |
+| am              | Amharic              |
+| ar              | Arabic               |
+| az              | Azerbaijani          |
+| be              | Belarusian           |
+| bg              | Bulgarian            |
+| bg-Latn         | Bulgarian (Latin)    |
+| bn              | Bangla               |
+| ca              | Catalan              |
+| ceb             | Cebuano              |
+| co              | Corsican             |
+| cs              | Czech                |
+| cy              | Welsh                |
+| da              | Danish               |
+| de              | German               |
+| el              | Greek                |
+| el-Latn         | Greek (Latin)        |
+| en              | English              |
+| eo              | Esperanto            |
+| es              | Spanish              |
+| et              | Estonian             |
+| eu              | Basque               |
+| fa              | Persian              |
+| fi              | Finnish              |
+| fil             | Filipino             |
+| fr              | French               |
+| fy              | Western Frisian      |
+| ga              | Irish                |
+| gd              | Scottish Gaelic      |
+| gl              | Galician             |
+| gu              | Gujarati             |
+| ha              | Hausa                |
+| haw             | Hawaiian             |
+| hi              | Hindi                |
+| hi-Latn         | Hindi (Latin script) |
+| hmn             | Hmong, Mong          |
+| ht              | Haitian              |
+| hu              | Hungarian            |
+| hy              | Armenian             |
+| id              | Indonesian           |
+| ig              | Igbo                 |
+| is              | Icelandic            |
+| it              | Italian              |
+| iw              | former Hebrew        |
+| ja              | Japanese             |
+| ja-Latn         | Japanese (Latin)     |
+| jv              | Javanese             |
+| ka              | Georgian             |
+| kk              | Kazakh               |
+| km              | Khmer                |
+| kn              | Kannada              |
+| ko              | Korean               |
+| ku              | Kurdish              |
+| ky              | Kyrgyz               |
+| la              | Latin                |
+| lb              | Luxembourgish        |
+| lo              | Lao                  |
+| lt              | Lithuanian           |
+| lv              | Latvian              |
+| mg              | Malagasy             |
+| mi              | Maori                |
+| mk              | Macedonian           |
+| ml              | Malayalam            |
+| mn              | Mongolian            |
+| mr              | Marathi              |
+| ms              | Malay                |
+| mt              | Maltese              |
+| my              | Burmese              |
+| ne              | Nepali               |
+| nl              | Dutch                |
+| no              | Norwegian            |
+| ny              | Nyanja               |
+| pa              | Punjabi              |
+| pl              | Polish               |
+| ps              | Pashto               |
+| pt              | Portuguese           |
+| ro              | Romanian             |
+| ru              | Russian              |
+| ru-Latn         | Russian (Latin)      |
+| sd              | Sindhi               |
+| si              | Sinhala              |
+| sk              | Slovak               |
+| sl              | Slovenian            |
+| sm              | San Marino           |
+| sn              | Shona                |
+| so              | Somali               |
+| sq              | Albanian             |
+| sr              | Serbian              |
+| st              | Southern Sotho       |
+| su              | Sundanese            |
+| sv              | Swedish              |
+| sw              | Swahili              |
+| ta              | Tamil                |
+| te              | Telugu               |
+| tg              | Tajik                |
+| th              | Thai                 |
+| tr              | Turkish              |
+| uk              | Ukrainian            |
+| und             | Unknown language     |
+| ur              | Urdu                 |
+| uz              | Uzbek                |
+| vi              | Vietnamese           |
+| xh              | Xhosa                |
+| yi              | Yiddish              |
+| yo              | Yoruba               |
+| zh              | Chinese              |
+| zh-Latn         | Chinese (Latin)      |
+| zu              | Zulu                 |
+You can load the mC4 subset of any language like this:
+```python
+from datasets import load_dataset
+en_mc4 = load_dataset("mc4", "en")
+```
+And if you can even specify a list of languages:
+```python
+from datasets import load_dataset
+mc4_subset_with_five_languages = load_dataset("mc4", languages=["en", "fr", "es", "de", "zh"])
+```
+### Supported Tasks and Leaderboards
+mC4 is mainly intended to pretrain language models and word representations.
+### Languages
+The dataset supports 108 languages.
+## Dataset Structure
+### Data Instances
+An example form the `en` config is:
+```
+{'timestamp': '2018-06-24T01:32:39Z',
+ 'text': 'Farm Resources in Plumas County\nShow Beginning Farmer Organizations & Professionals (304)\nThere are 304 resources serving Plumas County in the following categories:\nMap of Beginning Farmer Organizations & Professionals serving Plumas County\nVictoria Fisher - Office Manager - Loyalton, CA\nAmy Lynn Rasband - UCCE Plumas-Sierra Administrative Assistant II - Quincy , CA\nShow Farm Income Opportunities Organizations & Professionals (353)\nThere are 353 resources serving Plumas County in the following categories:\nFarm Ranch And Forest Retailers (18)\nMap of Farm Income Opportunities Organizations & Professionals serving Plumas County\nWarner Valley Wildlife Area - Plumas County\nShow Farm Resources Organizations & Professionals (297)\nThere are 297 resources serving Plumas County in the following categories:\nMap of Farm Resources Organizations & Professionals serving Plumas County\nThere are 57 resources serving Plumas County in the following categories:\nMap of Organic Certification Organizations & Professionals serving Plumas County',
+ 'url': 'http://www.californialandcan.org/Plumas/Farm-Resources/'}
+```
+### Data Fields
+The data have several fields:
+- `url`: url of the source as a string
+- `text`: text content as a string
+- `timestamp`: timestamp as a string
+### Data Splits
+To build mC4, the authors used [CLD3](https://github.com/google/cld3) to identify over 100 languages. The resulting mC4 subsets for each language are reported in this table:
+| config   | train   | validation   |
+|:---------|:--------|:-------------|
+| af       | ?       | ?            |
+| am       | ?       | ?            |
+| ar       | ?       | ?            |
+| az       | ?       | ?            |
+| be       | ?       | ?            |
+| bg       | ?       | ?            |
+| bg-Latn  | ?       | ?            |
+| bn       | ?       | ?            |
+| ca       | ?       | ?            |
+| ceb      | ?       | ?            |
+| co       | ?       | ?            |
+| cs       | ?       | ?            |
+| cy       | ?       | ?            |
+| da       | ?       | ?            |
+| de       | ?       | ?            |
+| el       | ?       | ?            |
+| el-Latn  | ?       | ?            |
+| en       | ?       | ?            |
+| eo       | ?       | ?            |
+| es       | ?       | ?            |
+| et       | ?       | ?            |
+| eu       | ?       | ?            |
+| fa       | ?       | ?            |
+| fi       | ?       | ?            |
+| fil      | ?       | ?            |
+| fr       | ?       | ?            |
+| fy       | ?       | ?            |
+| ga       | ?       | ?            |
+| gd       | ?       | ?            |
+| gl       | ?       | ?            |
+| gu       | ?       | ?            |
+| ha       | ?       | ?            |
+| haw      | ?       | ?            |
+| hi       | ?       | ?            |
+| hi-Latn  | ?       | ?            |
+| hmn      | ?       | ?            |
+| ht       | ?       | ?            |
+| hu       | ?       | ?            |
+| hy       | ?       | ?            |
+| id       | ?       | ?            |
+| ig       | ?       | ?            |
+| is       | ?       | ?            |
+| it       | ?       | ?            |
+| iw       | ?       | ?            |
+| ja       | ?       | ?            |
+| ja-Latn  | ?       | ?            |
+| jv       | ?       | ?            |
+| ka       | ?       | ?            |
+| kk       | ?       | ?            |
+| km       | ?       | ?            |
+| kn       | ?       | ?            |
+| ko       | ?       | ?            |
+| ku       | ?       | ?            |
+| ky       | ?       | ?            |
+| la       | ?       | ?            |
+| lb       | ?       | ?            |
+| lo       | ?       | ?            |
+| lt       | ?       | ?            |
+| lv       | ?       | ?            |
+| mg       | ?       | ?            |
+| mi       | ?       | ?            |
+| mk       | ?       | ?            |
+| ml       | ?       | ?            |
+| mn       | ?       | ?            |
+| mr       | ?       | ?            |
+| ms       | ?       | ?            |
+| mt       | ?       | ?            |
+| my       | ?       | ?            |
+| ne       | ?       | ?            |
+| nl       | ?       | ?            |
+| no       | ?       | ?            |
+| ny       | ?       | ?            |
+| pa       | ?       | ?            |
+| pl       | ?       | ?            |
+| ps       | ?       | ?            |
+| pt       | ?       | ?            |
+| ro       | ?       | ?            |
+| ru       | ?       | ?            |
+| ru-Latn  | ?       | ?            |
+| sd       | ?       | ?            |
+| si       | ?       | ?            |
+| sk       | ?       | ?            |
+| sl       | ?       | ?            |
+| sm       | ?       | ?            |
+| sn       | ?       | ?            |
+| so       | ?       | ?            |
+| sq       | ?       | ?            |
+| sr       | ?       | ?            |
+| st       | ?       | ?            |
+| su       | ?       | ?            |
+| sv       | ?       | ?            |
+| sw       | ?       | ?            |
+| ta       | ?       | ?            |
+| te       | ?       | ?            |
+| tg       | ?       | ?            |
+| th       | ?       | ?            |
+| tr       | ?       | ?            |
+| uk       | ?       | ?            |
+| und      | ?       | ?            |
+| ur       | ?       | ?            |
+| uz       | ?       | ?            |
+| vi       | ?       | ?            |
+| xh       | ?       | ?            |
+| yi       | ?       | ?            |
+| yo       | ?       | ?            |
+| zh       | ?       | ?            |
+| zh-Latn  | ?       | ?            |
+| zu       | ?       | ?            |
+## Dataset Creation
+### Curation Rationale
+[More Information Needed]
+### Source Data
+#### Initial Data Collection and Normalization
+[More Information Needed]
+#### Who are the source language producers?
+[More Information Needed]
+### Annotations
+#### Annotation process
+[More Information Needed]
+#### Who are the annotators?
+[More Information Needed]
+### Personal and Sensitive Information
+[More Information Needed]
+## Considerations for Using the Data
+### Social Impact of Dataset
+[More Information Needed]
+### Discussion of Biases
+[More Information Needed]
+### Other Known Limitations
+[More Information Needed]
+## Additional Information
+### Dataset Curators
+[More Information Needed]
+### Licensing Information
+AllenAI are releasing this dataset under the terms of ODC-BY. By using this, you are also bound by the Common Crawl terms of use in respect of the content contained in the dataset.
+### Citation Information
+```
+@article{2019t5,
+    author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+    title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+    journal = {arXiv e-prints},
+    year = {2019},
+    archivePrefix = {arXiv},
+    eprint = {1910.10683},
+}
+```
+### Contributions
+Thanks to [@dirkgr](https://github.com/dirkgr) and [@lhoestq](https://github.com/lhoestq) for adding this dataset.

mc4/dummy/af/0.0.0/dummy_data.zip ADDED Viewed

Binary file (8.54 kB). View file

mc4/mc4.py ADDED Viewed

	@@ -0,0 +1,426 @@

+"""mC4 dataset based on Common Crawl."""
+import gzip
+import json
+import datasets
+import kenlm
+import numpy as np
+from numpy.random import default_rng
+logger = datasets.logging.get_logger(__name__)
+_DESCRIPTION = """\
+A colossal, cleaned version of Common Crawl's web crawl corpus.
+Based on Common Crawl dataset: "https://commoncrawl.org".
+This is the processed version of Google's mC4 dataset by AllenAI.
+"""
+_CITATION = """
+@article{2019t5,
+    author = {Colin Raffel and Noam Shazeer and Adam Roberts and Katherine Lee and Sharan Narang and Michael Matena and Yanqi Zhou and Wei Li and Peter J. Liu},
+    title = {Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer},
+    journal = {arXiv e-prints},
+    year = {2019},
+    archivePrefix = {arXiv},
+    eprint = {1910.10683},
+}
+"""
+# Reference link for this dataset script (an AllenNLP GitHub discussion).
+_URL = "https://github.com/allenai/allennlp/discussions/5056"
+# URL template for one gzipped JSON shard, pinned to a fixed revision of the
+# allenai/c4 repository. Placeholders: ``language`` (language code),
+# ``split_suffix`` (inserted right after the language code), and ``index`` /
+# ``n_shards`` (shard number and shard count, each zero-padded to 5 digits).
+_DATA_URL = "https://huggingface.co/datasets/allenai/c4/resolve/1ddc917116b730e1859edef32896ec5c16be51d0/multilingual/c4-{language}{split_suffix}.tfrecord-{index:05d}-of-{n_shards:05d}.json.gz"
+_LANGUAGES = [
+    "af",
+    "am",
+    "ar",
+    "az",
+    "be",
+    "bg",
+    "bg-Latn",
+    "bn",
+    "ca",
+    "ceb",
+    "co",
+    "cs",
+    "cy",
+    "da",
+    "de",
+    "el",
+    "el-Latn",
+    "en",
+    "eo",
+    "es",
+    "et",
+    "eu",
+    "fa",
+    "fi",
+    "fil",
+    "fr",
+    "fy",
+    "ga",
+    "gd",
+    "gl",
+    "gu",
+    "ha",
+    "haw",
+    "hi",
+    "hi-Latn",
+    "hmn",
+    "ht",
+    "hu",
+    "hy",
+    "id",
+    "ig",
+    "is",
+    "it",
+    "iw",
+    "ja",
+    "ja-Latn",
+    "jv",
+    "ka",
+    "kk",
+    "km",
+    "kn",
+    "ko",
+    "ku",
+    "ky",
+    "la",
+    "lb",
+    "lo",
+    "lt",
+    "lv",
+    "mg",
+    "mi",
+    "mk",
+    "ml",
+    "mn",
+    "mr",
+    "ms",
+    "mt",
+    "my",
+    "ne",
+    "nl",
+    "no",
+    "ny",
+    "pa",
+    "pl",
+    "ps",
+    "pt",
+    "ro",
+    "ru",
+    "ru-Latn",
+    "sd",
+    "si",
+    "sk",
+    "sl",
+    "sm",
+    "sn",
+    "so",
+    "sq",
+    "sr",
+    "st",
+    "su",
+    "sv",
+    "sw",
+    "ta",
+    "te",
+    "tg",
+    "th",
+    "tr",
+    "uk",
+    "und",
+    "ur",
+    "uz",
+    "vi",
+    "xh",
+    "yi",
+    "yo",
+    "zh",
+    "zh-Latn",
+    "zu",
+]
+_N_SHARDS_PER_SPLIT = {
+    "af": {"train": 64, "validation": 1},
+    "am": {"train": 16, "validation": 1},
+    "ar": {"train": 1024, "validation": 4},
+    "az": {"train": 256, "validation": 1},
+    "be": {"train": 128, "validation": 1},
+    "bg": {"train": 1024, "validation": 1},
+    "bg-Latn": {"train": 4, "validation": 1},
+    "bn": {"train": 512, "validation": 1},
+    "ca": {"train": 512, "validation": 1},
+    "ceb": {"train": 8, "validation": 1},
+    "co": {"train": 8, "validation": 1},
+    "cs": {"train": 1024, "validation": 2},
+    "cy": {"train": 256, "validation": 1},
+    "da": {"train": 1024, "validation": 1},
+    "de": {"train": 2048, "validation": 16},
+    "el": {"train": 1024, "validation": 2},
+    "el-Latn": {"train": 16, "validation": 1},
+    "en": {"train": 11264, "validation": 128},
+    "eo": {"train": 32, "validation": 1},
+    "es": {"train": 2048, "validation": 16},
+    "et": {"train": 256, "validation": 1},
+    "eu": {"train": 64, "validation": 1},
+    "fa": {"train": 1024, "validation": 2},
+    "fi": {"train": 1024, "validation": 1},
+    "fil": {"train": 64, "validation": 1},
+    "fr": {"train": 2048, "validation": 16},
+    "fy": {"train": 16, "validation": 1},
+    "ga": {"train": 16, "validation": 1},
+    "gd": {"train": 16, "validation": 1},
+    "gl": {"train": 128, "validation": 1},
+    "gu": {"train": 64, "validation": 1},
+    "ha": {"train": 8, "validation": 1},
+    "haw": {"train": 2, "validation": 1},
+    "hi": {"train": 1024, "validation": 2},
+    "hi-Latn": {"train": 16, "validation": 1},
+    "hmn": {"train": 8, "validation": 1},
+    "ht": {"train": 8, "validation": 1},
+    "hu": {"train": 1024, "validation": 2},
+    "hy": {"train": 128, "validation": 1},
+    "id": {"train": 1024, "validation": 4},
+    "ig": {"train": 4, "validation": 1},
+    "is": {"train": 128, "validation": 1},
+    "it": {"train": 1024, "validation": 8},
+    "iw": {"train": 1024, "validation": 1},
+    "ja": {"train": 1024, "validation": 8},
+    "ja-Latn": {"train": 8, "validation": 1},
+    "jv": {"train": 8, "validation": 1},
+    "ka": {"train": 256, "validation": 1},
+    "kk": {"train": 256, "validation": 1},
+    "km": {"train": 64, "validation": 1},
+    "kn": {"train": 64, "validation": 1},
+    "ko": {"train": 1024, "validation": 1},
+    "ku": {"train": 16, "validation": 1},
+    "ky": {"train": 64, "validation": 1},
+    "la": {"train": 64, "validation": 1},
+    "lb": {"train": 32, "validation": 1},
+    "lo": {"train": 8, "validation": 1},
+    "lt": {"train": 512, "validation": 1},
+    "lv": {"train": 256, "validation": 1},
+    "mg": {"train": 8, "validation": 1},
+    "mi": {"train": 4, "validation": 1},
+    "mk": {"train": 128, "validation": 1},
+    "ml": {"train": 128, "validation": 1},
+    "mn": {"train": 128, "validation": 1},
+    "mr": {"train": 1024, "validation": 1},
+    "ms": {"train": 512, "validation": 1},
+    "mt": {"train": 128, "validation": 1},
+    "my": {"train": 64, "validation": 1},
+    "ne": {"train": 256, "validation": 1},
+    "nl": {"train": 1024, "validation": 4},
+    "no": {"train": 1024, "validation": 1},
+    "ny": {"train": 4, "validation": 1},
+    "pa": {"train": 32, "validation": 1},
+    "pl": {"train": 1024, "validation": 4},
+    "ps": {"train": 16, "validation": 1},
+    "pt": {"train": 1024, "validation": 4},
+    "ro": {"train": 1024, "validation": 2},
+    "ru": {"train": 4096, "validation": 32},
+    "ru-Latn": {"train": 32, "validation": 1},
+    "sd": {"train": 64, "validation": 1},
+    "si": {"train": 64, "validation": 1},
+    "sk": {"train": 512, "validation": 1},
+    "sl": {"train": 256, "validation": 1},
+    "sm": {"train": 4, "validation": 1},
+    "sn": {"train": 8, "validation": 1},
+    "so": {"train": 64, "validation": 1},
+    "sq": {"train": 128, "validation": 1},
+    "sr": {"train": 256, "validation": 1},
+    "st": {"train": 2, "validation": 1},
+    "su": {"train": 4, "validation": 1},
+    "sv": {"train": 1024, "validation": 2},
+    "sw": {"train": 32, "validation": 1},
+    "ta": {"train": 256, "validation": 1},
+    "te": {"train": 128, "validation": 1},
+    "tg": {"train": 64, "validation": 1},
+    "th": {"train": 1024, "validation": 1},
+    "tr": {"train": 1024, "validation": 4},
+    "uk": {"train": 1024, "validation": 2},
+    "und": {"train": 3072, "validation": 32},
+    "ur": {"train": 128, "validation": 1},
+    "uz": {"train": 32, "validation": 1},
+    "vi": {"train": 1024, "validation": 4},
+    "xh": {"train": 2, "validation": 1},
+    "yi": {"train": 16, "validation": 1},
+    "yo": {"train": 2, "validation": 1},
+    "zh": {"train": 1024, "validation": 2},
+    "zh-Latn": {"train": 8, "validation": 1},
+    "zu": {"train": 8, "validation": 1},
+}
+class Mc4Config(datasets.BuilderConfig):
+    """BuilderConfig for mC4."""
+    def __init__(self, *args, languages, **kwargs):
+        """BuilderConfig for mC4.
+        Args:
+            languages (:obj:`List[str]`): list of languages to load
+            **kwargs: keyword arguments forwarded to super.
+        """
+        super().__init__(
+            *args,
+            name="+".join(languages),
+            **kwargs,
+        )
+        self.languages = languages
+class Mc4(datasets.GeneratorBasedBuilder):
+    """mC4, a colossal, cleaned version of Common Crawl's web crawl corpus."""
+    BUILDER_CONFIGS = [Mc4Config(languages=[lang]) for lang in _LANGUAGES]
+    BUILDER_CONFIG_CLASS = Mc4Config
+    def __init__(self, *args, writer_batch_size=None, **kwargs):
+        self.data_files = kwargs.pop("data_files", {})
+        self.sampling_method = kwargs.pop("sampling_method", None)
+        self.perplexity_model = kwargs.pop("perplexity_model", None)
+        self.sampling_factor = kwargs.pop("sampling_factor", None)
+        self.boundaries = kwargs.pop("boundaries", None)
+        self.seed = kwargs.pop("seed", None)
+        if self.sampling_method:
+            if self.seed is not None:
+                self.rng = default_rng(self.seed)
+            else:
+                self.rng = default_rng()
+            if self.sampling_method == "random":
+                self.should_keep_doc = self._should_keep_doc_random
+            else:
+                # Loading 5-gram model
+                # http://dl.fbaipublicfiles.com/cc_net/lm/es.arpa.bin
+                logger.info("loading model = %s", self.perplexity_model)
+                self.pp_model = kenlm.Model(self.perplexity_model)
+                if self.sampling_method == "gaussian":
+                    self.should_keep_doc = self._should_keep_doc_gaussian
+                else:
+                    self.should_keep_doc = self._should_keep_doc_step
+        super().__init__(*args, writer_batch_size=writer_batch_size, **kwargs)
+    def get_perplexity(self, doc):
+        doc_log_score, doc_length = 0, 0
+        for line in doc.split("\n"):
+            log_score = self.pp_model.score(line)
+            length = len(line.split()) + 1
+            doc_log_score += log_score
+            doc_length += length
+        return 10.0 ** (-doc_log_score / doc_length)
+    def _should_keep_doc_step(self, doc, factor=1.5e5, boundaries=None):
+        perplexity = self.get_perplexity(doc)
+        if boundaries is None:
+            boundaries = [536394.99320948, 662247.50212365, 919250.87225178]
+        if perplexity <= boundaries[0]:
+            quartile_range = boundaries[0]
+        elif boundaries[0] < perplexity < boundaries[1]:
+            quartile_range = boundaries[1] - boundaries[0]
+        elif boundaries[1] < perplexity < boundaries[2]:
+            quartile_range = boundaries[2] - boundaries[1]
+        elif perplexity >= boundaries[2]:
+            quartile_range = 10 * boundaries[2]
+        probability = factor / quartile_range
+        return self.rng.uniform() < probability
+    def _should_keep_doc_gaussian(self, doc, factor=0.78, boundaries=None):
+        perplexity = self.get_perplexity(doc)
+        if boundaries is not None:
+            m = boundaries[1]
+        else:
+            m = 662247.50212365
+        exponential = np.exp(-9/2 * ((perplexity - m) / m) ** 2)
+        weighted_perplexity = factor * exponential
+        return self.rng.uniform() < weighted_perplexity
+    def _should_keep_doc_random(self, doc, factor=None, boundaries=None):
+        if factor is None:
+            factor = 0.5
+        return self.rng.uniform() <= factor
+    def _info(self):
+        return datasets.DatasetInfo(
+            description=_DESCRIPTION,
+            features=datasets.Features(
+                {
+                    "text": datasets.Value("string"),
+                    "timestamp": datasets.Value("string"),
+                    "url": datasets.Value("string"),
+                }
+            ),
+            supervised_keys=None,
+            homepage=_URL,
+            citation=_CITATION,
+        )
+    def _split_generators(self, dl_manager):
+        data_urls = {}
+        for split in ["train", "validation"]:
+            data_urls[split] = [
+                _DATA_URL.format(
+                    language=self.config.name,
+                    split_suffix="-validation" if split == "validation" else "",
+                    index=index,
+                    n_shards=_N_SHARDS_PER_SPLIT[lang][split],
+                )
+                for lang in self.config.languages
+                for index in range(_N_SHARDS_PER_SPLIT[lang][split])
+            ]
+        if "train" in self.data_files:
+            train_downloaded_files = self.data_files["train"]
+            if not isinstance(train_downloaded_files, (tuple, list)):
+                train_downloaded_files = [train_downloaded_files]
+        else:
+            train_downloaded_files = dl_manager.download(data_urls["train"])
+        if "validation" in self.data_files:
+            validation_downloaded_files = self.data_files["validation"]
+            if not isinstance(validation_downloaded_files, (tuple, list)):
+                validation_downloaded_files = [validation_downloaded_files]
+        else:
+            validation_downloaded_files = dl_manager.download(data_urls["validation"])
+        return [
+            datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"filepaths": train_downloaded_files}),
+            datasets.SplitGenerator(
+                name=datasets.Split.VALIDATION, gen_kwargs={"filepaths": validation_downloaded_files}
+            ),
+        ]
+    def _generate_examples(self, filepaths):
+        """This function returns the examples in the raw (text) form by iterating on all the files."""
+        id_ = 0
+        for filepath in filepaths:
+            logger.info("generating examples from = %s", filepath)
+            if filepath.endswith("jsonl"):
+                with open(filepath, "r", encoding="utf-8") as f:
+                    for line in f:
+                        if line:
+                            example = json.loads(line)
+                            yield id_, example
+                            id_ += 1
+            else:
+                with gzip.open(open(filepath, "rb"), "rt", encoding="utf-8") as f:
+                    if self.sampling_method:
+                        logger.info("sampling method = %s", self.sampling_method)
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
+                                if self.should_keep_doc(
+                                    example["text"],
+                                    factor=self.sampling_factor,
+                                    boundaries=self.boundaries):
+                                    yield id_, example
+                                    id_ += 1
+                    else:
+                        for line in f:
+                            if line:
+                                example = json.loads(line)
+                                yield id_, example
+                                id_ += 1

mc4/mc4.py.lock ADDED Viewed

File without changes

merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

outputs/checkpoints/checkpoint-236000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/checkpoints/checkpoint-236000/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/checkpoints/checkpoint-236000/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:99845155f064d68ed7235bec28f18280d889893696b16d39d5060ccc1565fe7c
+size 249750019

outputs/checkpoints/checkpoint-236000/optimizer_state.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6f1bb7accdbbac41048ab318cdbd0a0298793a5ca4a203aad1e0404dd23dd7bf
+size 499500278

outputs/checkpoints/checkpoint-236000/training_args.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
+size 1871

outputs/checkpoints/checkpoint-236000/training_state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"step": 236001}

outputs/checkpoints/checkpoint-237000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/checkpoints/checkpoint-237000/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/checkpoints/checkpoint-237000/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:755f3c2edda86bdb78a5b091d753ed5c1a3eee3a4bc01fc92e97a4e00d44666b
+size 249750019

outputs/checkpoints/checkpoint-237000/optimizer_state.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1b092b38f151ded8fc7c3904381f4043b42d21461bc277acfb423ff6c288c662
+size 499500278

outputs/checkpoints/checkpoint-237000/training_args.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
+size 1871

outputs/checkpoints/checkpoint-237000/training_state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"step": 237001}

outputs/checkpoints/checkpoint-238000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/checkpoints/checkpoint-238000/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/checkpoints/checkpoint-238000/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2318ad2b972dbe362e6ed41efb0d5d26e3fa8b0d854d6f396f8133203977fc13
+size 249750019

outputs/checkpoints/checkpoint-238000/optimizer_state.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:2a6fc7fda05af98fa8d25993f5adf8e54f8ba210dc0f0c455e55706c5b9e932c
+size 499500278

outputs/checkpoints/checkpoint-238000/training_args.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
+size 1871

outputs/checkpoints/checkpoint-238000/training_state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"step": 238001}

outputs/checkpoints/checkpoint-239000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/checkpoints/checkpoint-239000/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/checkpoints/checkpoint-239000/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a91241143d68cab5eaabcd673afdbeb921b9bd7421f64e916068a9bbbb9f3f61
+size 249750019

outputs/checkpoints/checkpoint-239000/optimizer_state.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:da4afed84b087814ce34440c407482a6d72fd5b561c6f9abc97c6a9169702c6a
+size 499500278

outputs/checkpoints/checkpoint-239000/training_args.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
+size 1871

outputs/checkpoints/checkpoint-239000/training_state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"step": 239001}

outputs/checkpoints/checkpoint-240000/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/checkpoints/checkpoint-240000/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/checkpoints/checkpoint-240000/flax_model.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ae5937c808e6e457c600f1da0eb5f8f38f6a3137b2d59828bc675b0103214ca
+size 249750019

outputs/checkpoints/checkpoint-240000/optimizer_state.msgpack ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88c8e3403699be77d3def6ff91b6fa612b9df84c14ca7c447c1fefbc9c626b17
+size 499500278

outputs/checkpoints/checkpoint-240000/training_args.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:750e4dfee62b3647178ef58384b169e0bfc4aff8738f70be19c457301459d630
+size 1871

outputs/checkpoints/checkpoint-240000/training_state.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"step": 240001}

outputs/config.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "architectures": [
+    "RobertaForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": 0,
+  "eos_token_id": 2,
+  "gradient_checkpointing": false,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 768,
+  "initializer_range": 0.02,
+  "intermediate_size": 3072,
+  "layer_norm_eps": 1e-05,
+  "max_position_embeddings": 514,
+  "model_type": "roberta",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "absolute",
+  "transformers_version": "4.9.0.dev0",
+  "type_vocab_size": 1,
+  "use_cache": true,
+  "vocab_size": 50265
+}

outputs/data_collator.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0321b1a9629e1be122045cd72470365a63c8496fec109fdeec34827f01ffbb9e
+size 1471424

outputs/events.out.tfevents.1626132703.tablespoon.2534518.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:dcf653e7864d18167096d734dbd860d15fbba06384015f694de1099fc39f95de
+size 40

outputs/events.out.tfevents.1626132842.tablespoon.2540265.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cb1b3e5c4ef3d4d9f4f8c03d897f931b27c715e76908f1738efc5b18d5684daf
+size 34421733

outputs/events.out.tfevents.1626535665.tablespoon.2656403.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:95635a084003eb01fefc99ecfe5a77a4c924fd2cee2f1a511c504dc1d63feaf5
+size 40

outputs/events.out.tfevents.1626537915.tablespoon.2714825.3.v2 ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09c2bcb1c66711eed2ae348defb56f15368a5da3205c64a1b686d29c595ad408
+size 40