kleinay's picture
Upload T5ForConditionalGeneration
b13892a
{
"_name_or_path": "trained_models/t5_qanom-joint-23.03.22",
"append_verb_form": true,
"architectures": [
"T5ForConditionalGeneration"
],
"d_ff": 2048,
"d_kv": 64,
"d_model": 512,
"debug_mode": false,
"decoder_start_token_id": 0,
"dense_act_fn": "relu",
"description": "optimal joint config from sweep, mainly for qanom",
"dir_switch": "joint_optimal",
"do_eval_on": "validation",
"dropout_rate": 0.1,
"eos_token_id": 1,
"eval_steps": 500,
"evaluation_strategy": "steps",
"feed_forward_proj": "relu",
"fp16": true,
"gradient_accumulation_steps": 14,
"initializer_factor": 1.0,
"is_encoder_decoder": true,
"is_gated_act": false,
"layer_norm_epsilon": 1e-06,
"learning_rate": 0.001,
"load_best_model_at_end": true,
"logging_steps": 500,
"logging_strategy": "steps",
"metric_for_best_model": "eval_loss",
"model_type": "t5",
"n_positions": 512,
"num_beams": 5,
"num_decoder_layers": 6,
"num_heads": 8,
"num_layers": 6,
"output_past": true,
"overwrite_output_dir": true,
"pad_token_id": 0,
"per_device_eval_batch_size": 12,
"per_device_train_batch_size": 12,
"predicate_marker_type": "generic",
"predict_with_generate": true,
"preprocess_input_func": "input_predicate_marker",
"preprocessing_kwargs": {
"append_verb_form": true,
"debug_mode": false,
"description": "optimal joint config from sweep, mainly for qanom",
"dir_switch": "joint_optimal",
"do_eval_on": "validation",
"dropout_rate": 0.1,
"eval_steps": 500,
"evaluation_strategy": "steps",
"fp16": true,
"gradient_accumulation_steps": 14,
"learning_rate": 0.001,
"load_best_model_at_end": true,
"logging_steps": 500,
"logging_strategy": "steps",
"metric_for_best_model": "eval_loss",
"model_type": "t5",
"num_beams": 5,
"overwrite_output_dir": true,
"per_device_eval_batch_size": 12,
"per_device_train_batch_size": 12,
"predicate_marker_type": "generic",
"predict_with_generate": true,
"preprocess_input_func": "input_predicate_marker",
"qanom_joint_factor": 14,
"save_steps": 500,
"save_strategy": "steps",
"seed": 44,
"source_prefix": "parse: ",
"train_dataset": "joint_qanom",
"train_epochs": 20,
"use_bilateral_predicate_marker": true
},
"qanom_joint_factor": 14,
"relative_attention_max_distance": 128,
"relative_attention_num_buckets": 32,
"save_steps": 500,
"save_strategy": "steps",
"seed": 44,
"source_prefix": "parse: ",
"task_specific_params": {
"summarization": {
"early_stopping": true,
"length_penalty": 2.0,
"max_length": 200,
"min_length": 30,
"no_repeat_ngram_size": 3,
"num_beams": 4,
"prefix": "summarize: "
},
"translation_en_to_de": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to German: "
},
"translation_en_to_fr": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to French: "
},
"translation_en_to_ro": {
"early_stopping": true,
"max_length": 300,
"num_beams": 4,
"prefix": "translate English to Romanian: "
}
},
"torch_dtype": "float32",
"train_dataset": "joint_qanom",
"train_epochs": 20,
"transformers_version": "4.26.1",
"use_bilateral_predicate_marker": true,
"use_cache": true,
"vocab_size": 32101
}