data_config:
  streaming: true
  validation_size_max: 1024
  metadata_config:
    random_sample_metadata: true
    random_sample_metadata_calculate_size: 16384
    random_sample_metadata_weights:
      html: 0.5
      timestamp: 11.56111563110182
      website_desc: 11.033764368362439
      title: 1.0644297987874418
      generation_datasource: 1.0
      entity_paragraph: 11.077104653627899
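    # The weights above appear to be relative sampling weights used when
    # random_sample_metadata is enabled, deciding which metadata types get attached
    # to an example; html is strongly down-weighted here, matching the
    # 'lower-html-weight' tag in out_dir.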
metadata_list: |
|
- html |
|
- timestamp |
|
- website_description |
|
- title |
|
- url |
|
- datasource |
|
- length |
|
- entity_paragraph |
|
metadata_column_list: |
|
- html |
|
- timestamp |
|
- website_desc |
|
- title |
|
- generation_datasource |
|
- entity_paragraph |
|
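    # Reading of the two lists above (an assumption from the key names):
    # metadata_list enumerates the metadata types used for conditioning, while
    # metadata_column_list names the dataset columns they are read from, so
    # website_description and datasource correspond to the website_desc and
    # generation_datasource columns.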
    local_metadata_special_tokens:
      entity_paragraph: entity
    metadata_sep: ' | '
    metadata_key_value_sep: ': '
    metadata_probability: 0.5
    treat_local_metadata_as_regular_text: true
    add_local_metadata_special_tokens_in_prefix: true
    metadata_prefix_sep: ' |||'
    metadata_prefix_start_seq: ''
    max_seq_len: 1024
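    # Illustrative sketch (not taken from the source) of how a global-metadata prefix
    # would be serialized with the separators above: key-value pairs joined by
    # metadata_key_value_sep and metadata_sep, with metadata_prefix_sep before the text,
    # e.g.
    #   website_description: A news site. | timestamp: 2019-04-18 ||| <document text>
    # metadata_probability: 0.5 suggests roughly half of the training examples receive
    # such metadata.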
    html_parser_config:
      all_tags_rules:
        attributes_to_keep:
          - class
          - id
        txt_max_chr_len: 0
        txt_min_chr_len: -.inf
        tags_exceptions_to_txt_max_min_chr_len:
          - table
          - tr
          - th
          - td
          - colgroup
          - thead
          - tfoot
          - tbody
      tags_to_remove_alone_tag_name:
        - body
      tags_to_remove_alone_txt_max_chr_len:
        - .inf
      tags_to_remove_alone_txt_min_chr_len:
        - 0.0
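    # Interpretation of the parser rules above, inferred from the key names and not
    # verified against the parser code: only class and id attributes are kept on HTML
    # tags, the txt_max/min_chr_len thresholds are waived for the listed table tags,
    # and a <body> tag standing alone is removed.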
    local_metadata_special_token_start:
      entity_paragraph: <ENTITY_CHAIN>
    local_metadata_special_token_end:
      entity_paragraph: ' </ENTITY_CHAIN> '
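    # Illustrative sketch (not from the source): a local entity_paragraph annotation
    # would be wrapped inline in the special tokens above, e.g.
    #   <ENTITY_CHAIN> Entity A | Entity B </ENTITY_CHAIN> <paragraph text>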
  experiment: with_metadata_datasetv2
  per_device_eval_batch_size: 32
  per_device_train_batch_size: 32
  dataset_name: bs-modeling-metadata/c4-en-html-with-metadata
  dataset_config_name: null
  train_file: '*.jsonl.gz'
  validation_file: c4-en-html_cc-main-2019-18_pq00-000.jsonl.gz
  overwrite_cache: false
  cache_dir: null
  extension: null
  preprocessing_num_workers: 48
  validation_split_percentage: 5
  block_size: null
  map_batch_size: 1
weight_decay: 0.01
learning_rate: 1.0e-05
num_train_epochs: 1
max_train_steps: 100000
lr_scheduler_type: linear
num_warmup_steps: 6000
seed: 42
out_dir: /mnt/ssd-1/bigscience-metadata/lower-lr-2-lower-html-weight
model_name: gpt2-xl
project_name: metadata_lm
jobid: ''
start_with_eval: false
extra_steps_to_eval_save_at:
  - 2
evaluation_strategy: STEPS
eval_num_per_epoch: 3
eval_steps: 2000
save_strategy: STEPS
save_num_per_epoch: 3
save_steps: 2000
do_train: true
do_eval: true
gradient_checkpointing: true
resume_from_checkpoint_dir: null
gradient_accumulation_steps: 1
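# With per_device_train_batch_size 32 and gradient_accumulation_steps 1, the effective
# batch size is 32 per device times the number of devices (device count is not part of
# this config). Training runs for at most 100000 steps with a 6000-step linear warmup,
# evaluating and saving every 2000 steps plus the extra step listed in
# extra_steps_to_eval_save_at.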