data_config:
  streaming: true
  validation_size_max: 1024
  metadata_config:
    random_sample_metadata: true
    random_sample_metadata_calculate_size: 16384
    random_sample_metadata_weights:
      html: 0.5
      timestamp: 11.56111563110182
      website_desc: 11.033764368362439
      title: 1.0644297987874418
      generation_datasource: 1.0
      entity_paragraph: 11.077104653627899
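    # The weights above appear to be relative sampling weights used when
    # random_sample_metadata is enabled, deciding which metadata types get attached
    # to an example; html is strongly down-weighted here, matching the
    # 'lower-html-weight' tag in out_dir.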
metadata_list: |
|
- html |
|
- timestamp |
|
- website_description |
|
- title |
|
- url |
|
- datasource |
|
- length |
|
- entity_paragraph |
|
metadata_column_list: |
|
- html |
|
- timestamp |
|
- website_desc |
|
- title |
|
- generation_datasource |
|
- entity_paragraph |
|
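    # Reading of the two lists above (an assumption from the key names):
    # metadata_list enumerates the metadata types used for conditioning, while
    # metadata_column_list names the dataset columns they are read from, so
    # website_description and datasource correspond to the website_desc and
    # generation_datasource columns.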
    local_metadata_special_tokens:
      entity_paragraph: entity
    metadata_sep: ' | '
    metadata_key_value_sep: ': '
    metadata_probability: 0.5
    treat_local_metadata_as_regular_text: true
    add_local_metadata_special_tokens_in_prefix: true
    metadata_prefix_sep: ' |||'
    metadata_prefix_start_seq: ''
    max_seq_len: 1024
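    # Illustrative sketch (not taken from the source) of how a global-metadata prefix
    # would be serialized with the separators above: key-value pairs joined by
    # metadata_key_value_sep and metadata_sep, with metadata_prefix_sep before the text,
    # e.g.
    #   website_description: A news site. | timestamp: 2019-04-18 ||| <document text>
    # metadata_probability: 0.5 suggests roughly half of the training examples receive
    # such metadata.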
    html_parser_config:
      all_tags_rules:
        attributes_to_keep:
          - class
          - id
        txt_max_chr_len: 0
        txt_min_chr_len: -.inf
        tags_exceptions_to_txt_max_min_chr_len:
          - table
          - tr
          - th
          - td
          - colgroup
          - thead
          - tfoot
          - tbody
      tags_to_remove_alone_tag_name:
        - body
      tags_to_remove_alone_txt_max_chr_len:
        - .inf
      tags_to_remove_alone_txt_min_chr_len:
        - 0.0
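    # Interpretation of the parser rules above, inferred from the key names and not
    # verified against the parser code: only class and id attributes are kept on HTML
    # tags, the txt_max/min_chr_len thresholds are waived for the listed table tags,
    # and a <body> tag standing alone is removed.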
    local_metadata_special_token_start:
      entity_paragraph: <ENTITY_CHAIN>
    local_metadata_special_token_end:
      entity_paragraph: ' </ENTITY_CHAIN> '
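    # Illustrative sketch (not from the source): a local entity_paragraph annotation
    # would be wrapped inline in the special tokens above, e.g.
    #   <ENTITY_CHAIN> Entity A | Entity B </ENTITY_CHAIN> <paragraph text>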
  experiment: with_metadata_datasetv2
  per_device_eval_batch_size: 32
  per_device_train_batch_size: 32
  dataset_name: bs-modeling-metadata/c4-en-html-with-metadata
  dataset_config_name: null
  train_file: '*.jsonl.gz'
  validation_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
  overwrite_cache: false
  cache_dir: null
  extension: null
  preprocessing_num_workers: 48
  validation_split_percentage: 5
  block_size: null
  map_batch_size: 1
weight_decay: 0.01
learning_rate: 1.0e-05
num_train_epochs: 1
max_train_steps: 100000
lr_scheduler_type: linear
num_warmup_steps: 6000
seed: 42
out_dir: /mnt/ssd-1/bigscience-metadata/lower-lr-2-lower-html-weight
model_name: gpt2-xl
project_name: metadata_lm
jobid: ''
start_with_eval: false
extra_steps_to_eval_save_at:
  - 2
evaluation_strategy: STEPS
eval_num_per_epoch: 3
eval_steps: 2000
save_strategy: STEPS
save_num_per_epoch: 3
save_steps: 2000
do_train: true
do_eval: true
gradient_checkpointing: true
resume_from_checkpoint_dir: null
gradient_accumulation_steps: 1
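# With per_device_train_batch_size 32 and gradient_accumulation_steps 1, the effective
# batch size is 32 per device times the number of devices (device count is not part of
# this config). Training runs for at most 100000 steps with a 6000-step linear warmup,
# evaluating and saving every 2000 steps plus the extra step listed in
# extra_steps_to_eval_save_at.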