## Overview
T5-Base v1.1 model fine-tuned to generate a hypothesis given a premise and a label. The settings used to train it are reported below.
```yaml
Experiment configurations
├── datasets
│   └── snli_train:
│         dataset_name: snli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names: null
│         val_subset_names: validation
│         test_subset_names: none
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│       anli_train:
│         dataset_name: anli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names:
│         - train_r1
│         - train_r2
│         - train_r3
│         val_subset_names:
│         - dev_r1
│         - dev_r2
│         - dev_r3
│         test_subset_names: none
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│       mnli_train:
│         dataset_name: multi_nli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names: null
│         val_subset_names: validation_matched
│         test_subset_names: none
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│       snli:
│         dataset_name: snli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names: none
│         val_subset_names: none
│         test_subset_names: null
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│       anli:
│         dataset_name: anli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names: none
│         val_subset_names: none
│         test_subset_names:
│         - test_r1
│         - test_r2
│         - test_r3
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│       mnli:
│         dataset_name: multi_nli
│         dataset_config_name: null
│         cache_dir: null
│         input_fields:
│         - premise
│         - hypothesis
│         target_field: label
│         train_subset_names: none
│         val_subset_names: none
│         test_subset_names: validation_mismatched
│         train_val_split: null
│         limit_train_samples: null
│         limit_val_samples: null
│         limit_test_samples: null
│         sampling_kwargs:
│           sampling_strategy: random
│           seed: 42
│           replace: false
│         align_labels_with_mapping: null
│         avoid_consistency_check: false
│         predict_label_mapping: null
│
├── data
│   └── _target_: src.task.nli.data.NLIGenerationData.from_config
│       main_dataset_name: null
│       use_additional_as_test: null
│       dataloader:
│         batch_size: 96
│         eval_batch_size: 96
│         num_workers: 8
│         pin_memory: true
│         drop_last: false
│         persistent_workers: false
│         shuffle: true
│         seed_dataloader: 42
│         replacement: false
│       processing:
│         preprocessing_num_workers: 8
│         preprocessing_batch_size: 1000
│         load_from_cache_file: true
│         padding: longest
│         truncation: longest_first
│         max_source_length: 128
│         max_target_length: 128
│         template: 'premise: $premise $label hypothesis: '
│       tokenizer:
│         _target_: transformers.AutoTokenizer.from_pretrained
│         pretrained_model_name_or_path: pietrolesci/t5-v1_1-base_nli_gen
│         use_fast: true
│
├── task
│   └── optimizer:
│         name: Adafactor
│         lr: 0.001
│         weight_decay: 0.0
│         no_decay:
│         - bias
│         - LayerNorm.weight
│         decay_rate: -0.8
│         clip_threshold: 1.0
│         relative_step: false
│         scale_parameter: false
│         warmup_init: false
│       scheduler:
│         name: constant_schedule
│       model:
│         model_name_or_path: pietrolesci/t5-v1_1-base_nli_gen
│         checkpoint_path: null
│         freeze: false
│         seed_init_weight: 42
│       _target_: src.task.nli.NLIGenerationTask.from_config
│       generation:
│         generation_max_length: 128
│         generation_min_length: 3
│         do_sample: true
│         early_stopping: false
│         num_beams: 1
│         temperature: 1.0
│         top_k: 50
│         top_p: 0.95
│         repetition_penalty: null
│         length_penalty: null
│         no_repeat_ngram_size: null
│         encoder_no_repeat_ngram_size: null
│         num_return_sequences: 1
│         max_time: null
│         max_new_tokens: null
│         decoder_start_token_id: null
│         use_cache: null
│         num_beam_groups: null
│         diversity_penalty: null
│
├── trainer
│   └── _target_: pytorch_lightning.Trainer
│       callbacks:
│         lr_monitor:
│           _target_: pytorch_lightning.callbacks.LearningRateMonitor
│           logging_interval: step
│           log_momentum: false
│         model_checkpoint:
│           _target_: pytorch_lightning.callbacks.ModelCheckpoint
│           dirpath: ./checkpoints/
│           filename: nli_generator_sma-epoch={epoch:02d}-val_loss={val/aggregat
│           monitor: val/aggregated_loss
│           mode: min
│           verbose: false
│           save_last: true
│           save_top_k: 1
│           auto_insert_metric_name: false
│           save_on_train_epoch_end: false
│         rich_model_summary:
│           _target_: pytorch_lightning.callbacks.RichModelSummary
│           max_depth: 1
│         log_grad_norm:
│           _target_: src.core.callbacks.LogGradNorm
│           norm_type: 2
│           group_separator: /
│           only_total: true
│           on_step: true
│           on_epoch: false
│           prog_bar: true
│         log_generated_text:
│           _target_: src.core.callbacks.GenerateAndLogText
│           dirpath: ./generated_text
│           type: generated_text
│           pop_keys_after_logging: true
│           on_train: false
│           on_validation: false
│           on_test: true
│           log_to_wandb: true
│         wandb_log_dataset_sizes:
│           _target_: src.core.callbacks.WandbLogDatasetSizes
│       logger:
│         wandb:
│           _target_: pytorch_lightning.loggers.WandbLogger
│           project: nli_debiasing
│           entity: team_brushino
│           name: nli_generator_sma
│           save_dir: ./
│           offline: false
│           log_model: false
│           group: generator
│           job_type: genearator_training
│           tags:
│           - nli_generator_sma
│           - seed=42
│           - seed_dataloader=42
│           notes: nli_generator_sma_time=01-37-04
│       enable_checkpointing: true
│       enable_progress_bar: true
│       enable_model_summary: true
│       gradient_clip_val: 6
│       gradient_clip_algorithm: null
│       accelerator: gpu
│       devices: auto
│       gpus: null
│       auto_select_gpus: true
│       accumulate_grad_batches: 1
│       max_epochs: 2
│       min_epochs: 1
│       max_steps: -1
│       min_steps: null
│       max_time: null
│       num_sanity_val_steps: 2
│       overfit_batches: 0.0
│       fast_dev_run: false
│       limit_train_batches: 1.0
│       limit_val_batches: 1.0
│       limit_test_batches: 1.0
│       profiler: null
│       detect_anomaly: false
│       deterministic: false
│       check_val_every_n_epoch: 1
│       val_check_interval: 0.5
│       log_every_n_steps: 1
│       move_metrics_to_cpu: false
│
└── training
    └── run_val_before_fit: false
        run_val_after_fit: false
        run_test_before_fit: false
        run_test_after_fit: true
        lr: 0.001
        seed: 42
        show_batch: false
        batch_size: 96
        eval_batch_size: 96
        num_workers: 8
        pin_memory: true
        drop_last: false
        persistent_workers: false
        shuffle: true
        seed_dataloader: 42
        ignore_warnings: true
        experiment_name: nli_generator_sma
```
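
For reference, here is a minimal generation sketch with 🤗 Transformers. It mirrors the `template` and `generation` settings in the config above; passing the label as plain text (e.g. `entailment`) is an assumption, so check how the data module serializes labels before relying on this exact prompt format.

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

model_name = "pietrolesci/t5-v1_1-base_nli_gen"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

# Fill the training template: 'premise: $premise $label hypothesis: '
# NOTE: the plain-text label ("entailment") is an assumption for illustration.
premise = "A man is playing a guitar on stage."
label = "entailment"
prompt = f"premise: {premise} {label} hypothesis: "

inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(
    **inputs,
    do_sample=True,   # sampling, as in the training-time generation config
    top_k=50,
    top_p=0.95,
    max_length=128,
    min_length=3,
    num_return_sequences=1,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```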
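
The `optimizer` and `scheduler` blocks correspond to the Adafactor implementation shipped with 🤗 Transformers. A simplified sketch of an equivalent setup (it omits the `no_decay` parameter grouping from the config) could look like this:

```python
from transformers import Adafactor, AutoModelForSeq2SeqLM, get_constant_schedule

model = AutoModelForSeq2SeqLM.from_pretrained("pietrolesci/t5-v1_1-base_nli_gen")

# Fixed learning rate, no relative-step updates, constant schedule,
# matching the hyperparameters listed in the config above.
optimizer = Adafactor(
    model.parameters(),
    lr=1e-3,
    weight_decay=0.0,
    clip_threshold=1.0,
    decay_rate=-0.8,
    relative_step=False,
    scale_parameter=False,
    warmup_init=False,
)
scheduler = get_constant_schedule(optimizer)
```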