---
# Config file for https://github.com/malteos/lm-datasets
#
# EU top-5 (en,fr,es,de,it) + code
# target size: 300B tokens (train first for 200B tokens)

# a fixed random seed for shuffling etc.
seed: 0

# data split settings

# number of documents in the validation split: len(dataset) * ratio
validation_ratio: 0.005
# to be used as a validation set, the dataset must have at least n docs
validation_min_total_docs: 1000
# the number of documents in a validation split is capped at this number
validation_max_split_docs: 1000
# a split must have at least this number of documents,
# otherwise it will be discarded
validation_min_split_docs: 10

# NOTE(review): presumably the fraction of data used for tokenizer
# training - confirm against the lm-datasets documentation
tokenizer_train_ratio: 0.1

# NOTE(review): presumably selects every dataset of the listed sources; the
# `starcoder_*` sampling factors below appear to belong to this source
selected_source_ids:
  - starcoder

selected_dataset_ids:
  # english
  - pes2o
  - math_amps
  - eurlex_en
  - wikipedia_20231101_en
  - wikibooks_en
  - wikiquote_en
  - wikinews_en
  - wikisource_en
  - wikivoyage_en
  - colossal_oscar_2015-14_en
  - colossal_oscar_2016-40_en
  - colossal_oscar_2017-43_en
  - colossal_oscar_2018-47_en
  - colossal_oscar_2019-22_en
  - colossal_oscar_2020-24_en
  - colossal_oscar_2020-45_en
  - colossal_oscar_2021-49_en
  - colossal_oscar_2022-27_en
  - colossal_oscar_2022-49_en
  - colossal_oscar_2023-14_en
  - colossal_oscar_2023-23_en
  - pile_of_law_r_legaladvice
  - pile_of_law_atticus_contracts
  - pile_of_law_un_debates
  - proof_pile2_open_web_math
  - parlamint_gb
  - redpajama_stackexchange

  # french
  - cabernet
  - eurlex_fr
  - legal_mc4_fr
  - wikipedia_20231101_fr
  - wikibooks_fr
  - wikiquote_fr
  - wikinews_fr
  - wikisource_fr
  - wikivoyage_fr
  - colossal_oscar_2015-14_fr
  - colossal_oscar_2016-40_fr
  - colossal_oscar_2017-43_fr
  - colossal_oscar_2018-47_fr
  - colossal_oscar_2019-22_fr
  - colossal_oscar_2020-24_fr
  - colossal_oscar_2020-45_fr
  - colossal_oscar_2021-49_fr
  - colossal_oscar_2022-27_fr
  - colossal_oscar_2022-49_fr
  - colossal_oscar_2023-14_fr
  - colossal_oscar_2023-23_fr
  - opensubtitles_fr
  - parlamint_fr

  # spanish
  - spanish_legal
  - eurlex_es
  - legal_mc4_es
  - wikipedia_20231101_es
  - wikibooks_es
  - wikiquote_es
  - wikinews_es
  - wikisource_es
  - wikivoyage_es
  - colossal_oscar_2015-14_es
  - colossal_oscar_2016-40_es
  - colossal_oscar_2017-43_es
  - colossal_oscar_2018-47_es
  - colossal_oscar_2019-22_es
  - colossal_oscar_2020-24_es
  - colossal_oscar_2020-45_es
  - colossal_oscar_2021-49_es
  - colossal_oscar_2022-27_es
  - colossal_oscar_2022-49_es
  - colossal_oscar_2023-14_es
  - colossal_oscar_2023-23_es
  - opensubtitles_es
  - parlamint_es

  # german
  - openlegaldata
  - dewac
  - eurlex_de
  - legal_mc4_de
  - wikipedia_20231101_de
  - wikibooks_de
  - wikiquote_de
  - wikinews_de
  - wikisource_de
  - wikivoyage_de
  - colossal_oscar_2015-14_de
  - colossal_oscar_2016-40_de
  - colossal_oscar_2017-43_de
  - colossal_oscar_2018-47_de
  - colossal_oscar_2019-22_de
  - colossal_oscar_2020-24_de
  - colossal_oscar_2020-45_de
  - colossal_oscar_2021-49_de
  - colossal_oscar_2022-27_de
  - colossal_oscar_2022-49_de
  - colossal_oscar_2023-14_de
  - colossal_oscar_2023-23_de
  - open_discourse_bundestag
  - tagesschau_2018_2023
  - opensubtitles_de
  - parlamint_at

  # italian
  - itwac
  - eurlex_it
  - legal_mc4_it
  - wikipedia_20231101_it
  - wikibooks_it
  - wikiquote_it
  - wikinews_it
  - wikisource_it
  - wikivoyage_it
  - colossal_oscar_2015-14_it
  - colossal_oscar_2016-40_it
  - colossal_oscar_2017-43_it
  - colossal_oscar_2018-47_it
  - colossal_oscar_2019-22_it
  - colossal_oscar_2020-24_it
  - colossal_oscar_2020-45_it
  - colossal_oscar_2021-49_it
  - colossal_oscar_2022-27_it
  - colossal_oscar_2022-49_it
  - colossal_oscar_2023-14_it
  - colossal_oscar_2023-23_it
  - opensubtitles_it
  - parlamint_it

  # translation datasets (language pairs)
  - tatoeba_translation_en_fr
  - tatoeba_translation_en_es
  - tatoeba_translation_en_it
  - tatoeba_translation_fr_it
  - tatoeba_translation_es_fr
  - tatoeba_translation_es_it
  - tatoeba_translation_de_en
  - tatoeba_translation_de_fr
  - tatoeba_translation_de_es
  - tatoeba_translation_de_it
  - opus100_translation_de_en
  - opus100_translation_en_es
  - opus100_translation_en_fr
  - opus100_translation_en_it
  - wmt19_translation_de_en
  - wmt19_translation_fr_de

# Per-dataset sampling factors, keyed by dataset ID.
# NOTE(review): presumably a multiplier on how much of each dataset is
# sampled (values below 1 down-sample, above 1 up-sample) - confirm against
# the lm-datasets documentation.
sampling_factor_by_dataset_id:
  redpajama_stackexchange: 0.1
  pes2o: 0.1
  math_amps: 0.1
  openlegaldata: 0.75
  dewac: 0.05
  itwac: 1
  cabernet: 1
  spanish_legal: 0.1
  eurlex_de: 0.5
  eurlex_en: 0.5
  eurlex_es: 1
  eurlex_fr: 1
  eurlex_it: 1
  legal_mc4_de: 0.1
  legal_mc4_es: 0.25
  legal_mc4_fr: 0.25
  legal_mc4_it: 1

  wikipedia_20231101_de: 2
  wikibooks_de: 1
  wikiquote_de: 1
  wikinews_de: 2
  wikisource_de: 1
  wikivoyage_de: 1
  wikipedia_20231101_en: 1
  wikibooks_en: 1
  wikiquote_en: 0.25
  wikinews_en: 1
  wikisource_en: 1
  wikivoyage_en: 1
  wikipedia_20231101_es: 2
  wikibooks_es: 1
  wikiquote_es: 1
  wikinews_es: 2
  wikisource_es: 1
  wikivoyage_es: 1
  wikipedia_20231101_fr: 2
  wikibooks_fr: 1
  wikiquote_fr: 1
  wikinews_fr: 2
  wikisource_fr: 1
  wikivoyage_fr: 1
  wikipedia_20231101_it: 2
  wikibooks_it: 1
  wikiquote_it: 1
  wikinews_it: 2
  wikisource_it: 1
  wikivoyage_it: 1

  colossal_oscar_2015-14_de: 1
  colossal_oscar_2016-40_de: 0.95
  colossal_oscar_2017-43_de: 0.1
  colossal_oscar_2018-47_de: 0.1
  colossal_oscar_2019-22_de: 0.1
  colossal_oscar_2020-24_de: 0.1
  colossal_oscar_2020-45_de: 0.1
  colossal_oscar_2021-49_de: 0.1
  colossal_oscar_2022-27_de: 0.1
  colossal_oscar_2022-49_de: 0.1
  colossal_oscar_2023-14_de: 0.95
  colossal_oscar_2023-23_de: 1

  colossal_oscar_2015-14_en: 0.05
  colossal_oscar_2016-40_en: 0.05
  colossal_oscar_2017-43_en: 0.001
  colossal_oscar_2018-47_en: 0.001
  colossal_oscar_2019-22_en: 0.001
  colossal_oscar_2020-24_en: 0.001
  colossal_oscar_2020-45_en: 0.001
  colossal_oscar_2021-49_en: 0.001
  colossal_oscar_2022-27_en: 0.001
  colossal_oscar_2022-49_en: 0.001
  colossal_oscar_2023-14_en: 0.05
  colossal_oscar_2023-23_en: 0.05

  colossal_oscar_2015-14_es: 1
  colossal_oscar_2016-40_es: 1
  colossal_oscar_2017-43_es: 0.25
  colossal_oscar_2018-47_es: 0.1
  colossal_oscar_2019-22_es: 0.1
  colossal_oscar_2020-24_es: 0.1
  colossal_oscar_2020-45_es: 0.1
  colossal_oscar_2021-49_es: 0.1
  colossal_oscar_2022-27_es: 0.1
  colossal_oscar_2022-49_es: 0.3
  colossal_oscar_2023-14_es: 1
  colossal_oscar_2023-23_es: 1

  colossal_oscar_2015-14_fr: 1
  colossal_oscar_2016-40_fr: 1
  colossal_oscar_2017-43_fr: 0.25
  colossal_oscar_2018-47_fr: 0.25
  colossal_oscar_2019-22_fr: 0.1
  colossal_oscar_2020-24_fr: 0.1
  colossal_oscar_2020-45_fr: 0.1
  colossal_oscar_2021-49_fr: 0.1
  colossal_oscar_2022-27_fr: 0.1
  colossal_oscar_2022-49_fr: 0.75
  colossal_oscar_2023-14_fr: 1
  colossal_oscar_2023-23_fr: 1

  starcoder_emacs-lisp: 0.1
  starcoder_literate-haskell: 0.1
  starcoder_shell: 0.1
  starcoder_ada: 0.1
  starcoder_erlang: 0.1
  starcoder_lua: 0.1
  starcoder_smalltalk: 0.1
  starcoder_agda: 0.1
  starcoder_f-sharp: 0.1
  starcoder_makefile: 0.1
  starcoder_solidity: 0.1
  starcoder_alloy: 0.1
  starcoder_fortran: 0.1
  starcoder_maple: 0.1
  starcoder_sparql: 0.1
  starcoder_antlr: 0.1
  starcoder_git-commits-cleaned: 0.05
  starcoder_markdown: 0.05
  starcoder_sql: 0.1
  starcoder_applescript: 0.1
  starcoder_github-issues-filtered-structured: 0.075
  starcoder_mathematica: 0.1
  starcoder_stan: 0.1
  starcoder_assembly: 0.1
  starcoder_glsl: 0.1
  starcoder_matlab: 0.1
  starcoder_standard-ml: 0.1
  starcoder_augeas: 0.1
  starcoder_go: 0.05
  starcoder_ocaml: 0.1
  starcoder_stata: 0.1
  starcoder_awk: 0.1
  starcoder_groovy: 0.1
  starcoder_pascal: 0.1
  starcoder_systemverilog: 0.1
  starcoder_batchfile: 0.1
  starcoder_haskell: 0.1
  starcoder_perl: 0.1
  starcoder_tcl: 0.1
  starcoder_bluespec: 0.1
  starcoder_html: 0.05
  starcoder_php: 0.05
  starcoder_tcsh: 0.1
  starcoder_c: 0.05
  starcoder_idris: 0.1
  starcoder_powershell: 0.1
  starcoder_tex: 0.1
  starcoder_c-sharp: 0.05
  starcoder_isabelle: 0.1
  starcoder_prolog: 0.1
  starcoder_thrift: 0.1
  starcoder_clojure: 0.1
  starcoder_java: 0.05
  starcoder_protocol-buffer: 0.1
  starcoder_typescript: 0.05
  starcoder_cmake: 0.1
  starcoder_java-server-pages: 0.1
  starcoder_python: 0.05
  starcoder_verilog: 0.1
  starcoder_coffeescript: 0.1
  starcoder_javascript: 0.05
  starcoder_r: 0.1
  starcoder_vhdl: 0.1
  starcoder_common-lisp: 0.1
  starcoder_json: 0.1
  starcoder_racket: 0.1
  starcoder_visual-basic: 0.1
  starcoder_cpp: 0.05
  starcoder_julia: 0.1
  starcoder_restructuredtext: 0.1
  starcoder_xslt: 0.1
  starcoder_css: 0.1
  starcoder_jupyter-scripts-dedup-filtered: 0.1
  starcoder_rmarkdown: 0.1
  starcoder_yacc: 0.1
  starcoder_cuda: 0.1
  starcoder_jupyter-structured-clean-dedup: 0.1
  starcoder_ruby: 0.1
  starcoder_yaml: 0.1
  starcoder_dart: 0.1
  starcoder_kotlin: 0.1
  starcoder_rust: 0.1
  starcoder_zig: 0.1
  starcoder_dockerfile: 0.1
  starcoder_lean: 0.1
  starcoder_sas: 0.1
  starcoder_elixir: 0.1
  starcoder_literate-agda: 0.1
  starcoder_scala: 0.1
  starcoder_elm: 0.1
  starcoder_literate-coffeescript: 0.1
  starcoder_scheme: 0.1

  pile_of_law_r_legaladvice: 1
  pile_of_law_atticus_contracts: 0.25
  pile_of_law_un_debates: 1
  open_discourse_bundestag: 0.5
  tagesschau_2018_2023: 1
  proof_pile2_open_web_math: 0.25

  tatoeba_translation_en_fr: 1
  tatoeba_translation_en_es: 1
  tatoeba_translation_en_it: 1
  tatoeba_translation_fr_it: 1
  tatoeba_translation_es_fr: 1
  tatoeba_translation_es_it: 1
  tatoeba_translation_de_en: 1
  tatoeba_translation_de_fr: 1
  tatoeba_translation_de_es: 1
  tatoeba_translation_de_it: 1
  opus100_translation_de_en: 1
  opus100_translation_en_es: 1
  opus100_translation_en_fr: 1
  opus100_translation_en_it: 1
  wmt19_translation_de_en: 1
  wmt19_translation_fr_de: 1

  opensubtitles_es: 1
  opensubtitles_fr: 1
  opensubtitles_de: 1
  opensubtitles_it: 1
  parlamint_es: 1
  parlamint_it: 1
  parlamint_at: 1
  parlamint_fr: 1
  parlamint_gb: 1

  colossal_oscar_2015-14_it: 1
  colossal_oscar_2016-40_it: 1
  colossal_oscar_2017-43_it: 0.75
  colossal_oscar_2018-47_it: 0.75
  colossal_oscar_2019-22_it: 0.75
  colossal_oscar_2020-24_it: 0.75
  colossal_oscar_2020-45_it: 0.75
  colossal_oscar_2021-49_it: 0.75
  colossal_oscar_2022-27_it: 0.75
  colossal_oscar_2022-49_it: 0.75
  colossal_oscar_2023-14_it: 0.9
  colossal_oscar_2023-23_it: 1