# occiglot-7b-eu5 / lm-datasets-config.yml
# Page header captured with the file: malteos, "uploading model files", e2ef0ce (verified)
# Config file for https://github.com/malteos/lm-datasets
#
# EU top-5 (en,fr,es,de,it) + code
# target size: 300B tokens (train first for 200B tokens)
# a fixed random seed for shuffling etc.
seed: 0

# data split settings
#
# Integers are written without `_` digit separators: `1_000` is an integer
# only for YAML 1.1 resolvers; YAML 1.2 core-schema loaders read it as the
# string "1_000".

# number of documents in the validation split: len(dataset) * ratio
validation_ratio: 0.005
# to be used as a validation set, the dataset must have at least this many docs
validation_min_total_docs: 1000
# the number of documents in a validation split is capped at this number
validation_max_split_docs: 1000
# a split must have at least this many documents; otherwise it is discarded
validation_min_split_docs: 10

# NOTE(review): presumably the fraction of the data used for tokenizer
# training; confirm against the lm-datasets documentation.
tokenizer_train_ratio: 0.1
# Sources selected as a whole.
# NOTE(review): presumably every dataset of a listed source is included
# (the starcoder_* dataset ids); confirm against lm-datasets.
selected_source_ids:
  - starcoder
# Individually selected datasets, grouped by language.
selected_dataset_ids:
  # --- English ---
  - pes2o
  - math_amps
  - eurlex_en
  - wikipedia_20231101_en
  - wikibooks_en
  - wikiquote_en
  - wikinews_en
  - wikisource_en
  - wikivoyage_en
  - colossal_oscar_2015-14_en
  - colossal_oscar_2016-40_en
  - colossal_oscar_2017-43_en
  - colossal_oscar_2018-47_en
  - colossal_oscar_2019-22_en
  - colossal_oscar_2020-24_en
  - colossal_oscar_2020-45_en
  - colossal_oscar_2021-49_en
  - colossal_oscar_2022-27_en
  - colossal_oscar_2022-49_en
  - colossal_oscar_2023-14_en
  - colossal_oscar_2023-23_en
  - pile_of_law_r_legaladvice
  - pile_of_law_atticus_contracts
  - pile_of_law_un_debates
  - proof_pile2_open_web_math
  - parlamint_gb
  - redpajama_stackexchange
  # --- French ---
  - cabernet
  - eurlex_fr
  - legal_mc4_fr
  - wikipedia_20231101_fr
  - wikibooks_fr
  - wikiquote_fr
  - wikinews_fr
  - wikisource_fr
  - wikivoyage_fr
  - colossal_oscar_2015-14_fr
  - colossal_oscar_2016-40_fr
  - colossal_oscar_2017-43_fr
  - colossal_oscar_2018-47_fr
  - colossal_oscar_2019-22_fr
  - colossal_oscar_2020-24_fr
  - colossal_oscar_2020-45_fr
  - colossal_oscar_2021-49_fr
  - colossal_oscar_2022-27_fr
  - colossal_oscar_2022-49_fr
  - colossal_oscar_2023-14_fr
  - colossal_oscar_2023-23_fr
  - opensubtitles_fr
  - parlamint_fr
  # --- Spanish ---
  - spanish_legal
  - eurlex_es
  - legal_mc4_es
  - wikipedia_20231101_es
  - wikibooks_es
  - wikiquote_es
  - wikinews_es
  - wikisource_es
  - wikivoyage_es
  - colossal_oscar_2015-14_es
  - colossal_oscar_2016-40_es
  - colossal_oscar_2017-43_es
  - colossal_oscar_2018-47_es
  - colossal_oscar_2019-22_es
  - colossal_oscar_2020-24_es
  - colossal_oscar_2020-45_es
  - colossal_oscar_2021-49_es
  - colossal_oscar_2022-27_es
  - colossal_oscar_2022-49_es
  - colossal_oscar_2023-14_es
  - colossal_oscar_2023-23_es
  - opensubtitles_es
  - parlamint_es
  # --- German ---
  - openlegaldata
  - dewac
  - eurlex_de
  - legal_mc4_de
  - wikipedia_20231101_de
  - wikibooks_de
  - wikiquote_de
  - wikinews_de
  - wikisource_de
  - wikivoyage_de
  - colossal_oscar_2015-14_de
  - colossal_oscar_2016-40_de
  - colossal_oscar_2017-43_de
  - colossal_oscar_2018-47_de
  - colossal_oscar_2019-22_de
  - colossal_oscar_2020-24_de
  - colossal_oscar_2020-45_de
  - colossal_oscar_2021-49_de
  - colossal_oscar_2022-27_de
  - colossal_oscar_2022-49_de
  - colossal_oscar_2023-14_de
  - colossal_oscar_2023-23_de
  - open_discourse_bundestag
  - tagesschau_2018_2023
  - opensubtitles_de
  - parlamint_at
  # --- Italian ---
  - itwac
  - eurlex_it
  - legal_mc4_it
  - wikipedia_20231101_it
  - wikibooks_it
  - wikiquote_it
  - wikinews_it
  - wikisource_it
  - wikivoyage_it
  - colossal_oscar_2015-14_it
  - colossal_oscar_2016-40_it
  - colossal_oscar_2017-43_it
  - colossal_oscar_2018-47_it
  - colossal_oscar_2019-22_it
  - colossal_oscar_2020-24_it
  - colossal_oscar_2020-45_it
  - colossal_oscar_2021-49_it
  - colossal_oscar_2022-27_it
  - colossal_oscar_2022-49_it
  - colossal_oscar_2023-14_it
  - colossal_oscar_2023-23_it
  - opensubtitles_it
  - parlamint_it
  # --- Translation (language-pair datasets) ---
  - tatoeba_translation_en_fr
  - tatoeba_translation_en_es
  - tatoeba_translation_en_it
  - tatoeba_translation_fr_it
  - tatoeba_translation_es_fr
  - tatoeba_translation_es_it
  - tatoeba_translation_de_en
  - tatoeba_translation_de_fr
  - tatoeba_translation_de_es
  - tatoeba_translation_de_it
  - opus100_translation_de_en
  - opus100_translation_en_es
  - opus100_translation_en_fr
  - opus100_translation_en_it
  - wmt19_translation_de_en
  - wmt19_translation_fr_de
# Sampling factor per dataset id.
#
# Every entry below is indented so that it is a child of
# `sampling_factor_by_dataset_id`; at column 0 the key would parse as null and
# each dataset id would become an unrelated top-level key.
#
# NOTE(review): presumably a multiplier on how much of a dataset is drawn
# (values < 1 down-sample, values > 1 up-sample); confirm against the
# lm-datasets documentation.
sampling_factor_by_dataset_id:
  # curated / domain-specific corpora
  redpajama_stackexchange: 0.1
  pes2o: 0.1
  math_amps: 0.1
  openlegaldata: 0.75
  dewac: 0.05
  itwac: 1
  cabernet: 1
  spanish_legal: 0.1
  # EUR-Lex
  eurlex_de: 0.5
  eurlex_en: 0.5
  eurlex_es: 1
  eurlex_fr: 1
  eurlex_it: 1
  # legal mC4
  legal_mc4_de: 0.1
  legal_mc4_es: 0.25
  legal_mc4_fr: 0.25
  legal_mc4_it: 1
  # wiki projects: German
  wikipedia_20231101_de: 2
  wikibooks_de: 1
  wikiquote_de: 1
  wikinews_de: 2
  wikisource_de: 1
  wikivoyage_de: 1
  # wiki projects: English
  wikipedia_20231101_en: 1
  wikibooks_en: 1
  wikiquote_en: 0.25
  wikinews_en: 1
  wikisource_en: 1
  wikivoyage_en: 1
  # wiki projects: Spanish
  wikipedia_20231101_es: 2
  wikibooks_es: 1
  wikiquote_es: 1
  wikinews_es: 2
  wikisource_es: 1
  wikivoyage_es: 1
  # wiki projects: French
  wikipedia_20231101_fr: 2
  wikibooks_fr: 1
  wikiquote_fr: 1
  wikinews_fr: 2
  wikisource_fr: 1
  wikivoyage_fr: 1
  # wiki projects: Italian
  wikipedia_20231101_it: 2
  wikibooks_it: 1
  wikiquote_it: 1
  wikinews_it: 2
  wikisource_it: 1
  wikivoyage_it: 1
  # Colossal OSCAR: German
  colossal_oscar_2015-14_de: 1
  colossal_oscar_2016-40_de: 0.95
  colossal_oscar_2017-43_de: 0.1
  colossal_oscar_2018-47_de: 0.1
  colossal_oscar_2019-22_de: 0.1
  colossal_oscar_2020-24_de: 0.1
  colossal_oscar_2020-45_de: 0.1
  colossal_oscar_2021-49_de: 0.1
  colossal_oscar_2022-27_de: 0.1
  colossal_oscar_2022-49_de: 0.1
  colossal_oscar_2023-14_de: 0.95
  colossal_oscar_2023-23_de: 1
  # Colossal OSCAR: English
  colossal_oscar_2015-14_en: 0.05
  colossal_oscar_2016-40_en: 0.05
  colossal_oscar_2017-43_en: 0.001
  colossal_oscar_2018-47_en: 0.001
  colossal_oscar_2019-22_en: 0.001
  colossal_oscar_2020-24_en: 0.001
  colossal_oscar_2020-45_en: 0.001
  colossal_oscar_2021-49_en: 0.001
  colossal_oscar_2022-27_en: 0.001
  colossal_oscar_2022-49_en: 0.001
  colossal_oscar_2023-14_en: 0.05
  colossal_oscar_2023-23_en: 0.05
  # Colossal OSCAR: Spanish
  colossal_oscar_2015-14_es: 1
  colossal_oscar_2016-40_es: 1
  colossal_oscar_2017-43_es: 0.25
  colossal_oscar_2018-47_es: 0.1
  colossal_oscar_2019-22_es: 0.1
  colossal_oscar_2020-24_es: 0.1
  colossal_oscar_2020-45_es: 0.1
  colossal_oscar_2021-49_es: 0.1
  colossal_oscar_2022-27_es: 0.1
  colossal_oscar_2022-49_es: 0.3
  colossal_oscar_2023-14_es: 1
  colossal_oscar_2023-23_es: 1
  # Colossal OSCAR: French
  colossal_oscar_2015-14_fr: 1
  colossal_oscar_2016-40_fr: 1
  colossal_oscar_2017-43_fr: 0.25
  colossal_oscar_2018-47_fr: 0.25
  colossal_oscar_2019-22_fr: 0.1
  colossal_oscar_2020-24_fr: 0.1
  colossal_oscar_2020-45_fr: 0.1
  colossal_oscar_2021-49_fr: 0.1
  colossal_oscar_2022-27_fr: 0.1
  colossal_oscar_2022-49_fr: 0.75
  colossal_oscar_2023-14_fr: 1
  colossal_oscar_2023-23_fr: 1
  # code: starcoder_* dataset ids
  starcoder_emacs-lisp: 0.1
  starcoder_literate-haskell: 0.1
  starcoder_shell: 0.1
  starcoder_ada: 0.1
  starcoder_erlang: 0.1
  starcoder_lua: 0.1
  starcoder_smalltalk: 0.1
  starcoder_agda: 0.1
  starcoder_f-sharp: 0.1
  starcoder_makefile: 0.1
  starcoder_solidity: 0.1
  starcoder_alloy: 0.1
  starcoder_fortran: 0.1
  starcoder_maple: 0.1
  starcoder_sparql: 0.1
  starcoder_antlr: 0.1
  starcoder_git-commits-cleaned: 0.05
  starcoder_markdown: 0.05
  starcoder_sql: 0.1
  starcoder_applescript: 0.1
  starcoder_github-issues-filtered-structured: 0.075
  starcoder_mathematica: 0.1
  starcoder_stan: 0.1
  starcoder_assembly: 0.1
  starcoder_glsl: 0.1
  starcoder_matlab: 0.1
  starcoder_standard-ml: 0.1
  starcoder_augeas: 0.1
  starcoder_go: 0.05
  starcoder_ocaml: 0.1
  starcoder_stata: 0.1
  starcoder_awk: 0.1
  starcoder_groovy: 0.1
  starcoder_pascal: 0.1
  starcoder_systemverilog: 0.1
  starcoder_batchfile: 0.1
  starcoder_haskell: 0.1
  starcoder_perl: 0.1
  starcoder_tcl: 0.1
  starcoder_bluespec: 0.1
  starcoder_html: 0.05
  starcoder_php: 0.05
  starcoder_tcsh: 0.1
  starcoder_c: 0.05
  starcoder_idris: 0.1
  starcoder_powershell: 0.1
  starcoder_tex: 0.1
  starcoder_c-sharp: 0.05
  starcoder_isabelle: 0.1
  starcoder_prolog: 0.1
  starcoder_thrift: 0.1
  starcoder_clojure: 0.1
  starcoder_java: 0.05
  starcoder_protocol-buffer: 0.1
  starcoder_typescript: 0.05
  starcoder_cmake: 0.1
  starcoder_java-server-pages: 0.1
  starcoder_python: 0.05
  starcoder_verilog: 0.1
  starcoder_coffeescript: 0.1
  starcoder_javascript: 0.05
  starcoder_r: 0.1
  starcoder_vhdl: 0.1
  starcoder_common-lisp: 0.1
  starcoder_json: 0.1
  starcoder_racket: 0.1
  starcoder_visual-basic: 0.1
  starcoder_cpp: 0.05
  starcoder_julia: 0.1
  starcoder_restructuredtext: 0.1
  starcoder_xslt: 0.1
  starcoder_css: 0.1
  starcoder_jupyter-scripts-dedup-filtered: 0.1
  starcoder_rmarkdown: 0.1
  starcoder_yacc: 0.1
  starcoder_cuda: 0.1
  starcoder_jupyter-structured-clean-dedup: 0.1
  starcoder_ruby: 0.1
  starcoder_yaml: 0.1
  starcoder_dart: 0.1
  starcoder_kotlin: 0.1
  starcoder_rust: 0.1
  starcoder_zig: 0.1
  starcoder_dockerfile: 0.1
  starcoder_lean: 0.1
  starcoder_sas: 0.1
  starcoder_elixir: 0.1
  starcoder_literate-agda: 0.1
  starcoder_scala: 0.1
  starcoder_elm: 0.1
  starcoder_literate-coffeescript: 0.1
  starcoder_scheme: 0.1
  # Pile of Law, German news/parliament, web math
  pile_of_law_r_legaladvice: 1
  pile_of_law_atticus_contracts: 0.25
  pile_of_law_un_debates: 1
  open_discourse_bundestag: 0.5
  tagesschau_2018_2023: 1
  proof_pile2_open_web_math: 0.25
  # translation (language-pair datasets)
  tatoeba_translation_en_fr: 1
  tatoeba_translation_en_es: 1
  tatoeba_translation_en_it: 1
  tatoeba_translation_fr_it: 1
  tatoeba_translation_es_fr: 1
  tatoeba_translation_es_it: 1
  tatoeba_translation_de_en: 1
  tatoeba_translation_de_fr: 1
  tatoeba_translation_de_es: 1
  tatoeba_translation_de_it: 1
  opus100_translation_de_en: 1
  opus100_translation_en_es: 1
  opus100_translation_en_fr: 1
  opus100_translation_en_it: 1
  wmt19_translation_de_en: 1
  wmt19_translation_fr_de: 1
  # OpenSubtitles
  opensubtitles_es: 1
  opensubtitles_fr: 1
  opensubtitles_de: 1
  opensubtitles_it: 1
  # ParlaMint
  parlamint_es: 1
  parlamint_it: 1
  parlamint_at: 1
  parlamint_fr: 1
  parlamint_gb: 1
  # Colossal OSCAR: Italian
  colossal_oscar_2015-14_it: 1
  colossal_oscar_2016-40_it: 1
  colossal_oscar_2017-43_it: 0.75
  colossal_oscar_2018-47_it: 0.75
  colossal_oscar_2019-22_it: 0.75
  colossal_oscar_2020-24_it: 0.75
  colossal_oscar_2020-45_it: 0.75
  colossal_oscar_2021-49_it: 0.75
  colossal_oscar_2022-27_it: 0.75
  colossal_oscar_2022-49_it: 0.75
  colossal_oscar_2023-14_it: 0.9
  colossal_oscar_2023-23_it: 1