2023-05-13 15:32:42,395 - INFO - allennlp.common.params - random_seed = 13370
2023-05-13 15:32:42,395 - INFO - allennlp.common.params - numpy_seed = 1337
2023-05-13 15:32:42,395 - INFO - allennlp.common.params - pytorch_seed = 133
2023-05-13 15:32:42,396 - INFO - allennlp.common.checks - Pytorch version: 1.11.0+cu102
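
The three seeds above are applied the way allennlp.common.util.prepare_environment does; a minimal sketch (seed values copied from the log):

import random
import numpy
import torch

random.seed(13370)        # random_seed
numpy.random.seed(1337)   # numpy_seed
torch.manual_seed(133)    # pytorch_seed
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(133)
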
2023-05-13 15:32:42,396 - INFO - allennlp.common.params - type = default
2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.type = seq2rel
2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.max_instances = None
2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.manual_distributed_sharding = False
2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.manual_multiprocess_sharding = False
2023-05-13 15:32:42,397 - INFO - allennlp.common.params - dataset_reader.target_namespace = target_tokens
2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.type = pretrained_transformer
2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.add_special_tokens = True
2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.max_length = 512
2023-05-13 15:32:42,398 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@']
2023-05-13 15:32:42,399 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.tokenizer_kwargs.do_lower_case = True
2023-05-13 15:32:42,399 - INFO - allennlp.common.params - dataset_reader.source_tokenizer.verification_tokens = None
2023-05-13 15:32:48,173 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.type = pretrained_transformer
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.add_special_tokens = False
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.max_length = None
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@', '@OSP@', '@start@', '@end@', ';']
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.tokenizer_kwargs.do_lower_case = True
2023-05-13 15:32:48,174 - INFO - allennlp.common.params - dataset_reader.target_tokenizer.verification_tokens = None
2023-05-13 15:32:55,200 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.type = pretrained_transformer
2023-05-13 15:32:55,200 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.token_min_padding_length = 0
2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.namespace = tags
2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.max_length = None
2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@']
2023-05-13 15:32:55,201 - INFO - allennlp.common.params - dataset_reader.source_token_indexers.tokens.tokenizer_kwargs.do_lower_case = True
2023-05-13 15:32:55,202 - INFO - allennlp.common.params - dataset_reader.max_length = 512
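
The dataset_reader block above fixes both tokenizers. A minimal sketch of the source side with AllenNLP, with the model name, special tokens, and max_length copied from the log (the target tokenizer differs only in add_special_tokens=False, no max_length, and the extra '@OSP@', '@start@', '@end@', ';' tokens):

from allennlp.data.tokenizers import PretrainedTransformerTokenizer

# Source tokenizer, as logged under dataset_reader.source_tokenizer.
source_tokenizer = PretrainedTransformerTokenizer(
    model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
    add_special_tokens=True,
    max_length=512,
    tokenizer_kwargs={
        "additional_special_tokens": ["@ARG@", "@TRIGGER@"],
        "do_lower_case": True,
    },
)
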
2023-05-13 15:32:55,202 - INFO - allennlp.common.params - train_data_path = ../granular/train_transform.tsv
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - datasets_for_vocab_creation = None
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - validation_dataset_reader = None
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - validation_data_path = ../granular/dev_transform.tsv
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - test_data_path = None
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - evaluate_on_test = False
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - batch_weight_key =
2023-05-13 15:32:55,203 - INFO - allennlp.common.params - data_loader.type = multiprocess
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_size = None
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.drop_last = False
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.shuffle = False
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.type = bucket
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.batch_size = 4
2023-05-13 15:32:55,204 - INFO - allennlp.common.params - data_loader.batch_sampler.sorting_keys = ['source_tokens']
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.padding_noise = 0.1
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.drop_last = False
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batch_sampler.shuffle = True
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.batches_per_epoch = None
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.num_workers = 0
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.max_instances_in_memory = None
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.start_method = fork
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.cuda_device = None
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.quiet = False
2023-05-13 15:32:55,205 - INFO - allennlp.common.params - data_loader.collate_fn = <allennlp.data.data_loaders.data_collator.DefaultDataCollator object at 0x7f57313cfc10>
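
The data_loader block above pairs a multiprocess loader with a bucket batch sampler; a minimal sketch, assuming `reader` is the already-constructed seq2rel dataset reader described earlier in the log:

from allennlp.data.data_loaders import MultiProcessDataLoader
from allennlp.data.samplers import BucketBatchSampler

# Batches of 4, bucketed by source_tokens length with 10% padding noise.
# Shuffling happens at the sampler level (data_loader.shuffle is False).
sampler = BucketBatchSampler(
    batch_size=4,
    sorting_keys=["source_tokens"],
    padding_noise=0.1,
    shuffle=True,
)
loader = MultiProcessDataLoader(
    reader,  # assumed: the seq2rel dataset reader configured above
    "../granular/train_transform.tsv",
    batch_sampler=sampler,
)
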
2023-05-13 15:32:55,206 - INFO - tqdm - loading instances: 0it [00:00, ?it/s]
2023-05-13 15:32:55,206 - INFO - seq2rel.dataset_reader - Reading instances from lines in file at: ../granular/train_transform.tsv
2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.type = multiprocess
2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.batch_size = None
2023-05-13 15:32:59,867 - INFO - allennlp.common.params - validation_data_loader.drop_last = False
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.shuffle = False
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.type = bucket
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.batch_size = 128
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.sorting_keys = ['source_tokens']
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.padding_noise = 0
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.drop_last = False
2023-05-13 15:32:59,868 - INFO - allennlp.common.params - validation_data_loader.batch_sampler.shuffle = True
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.batches_per_epoch = None
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.num_workers = 0
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.max_instances_in_memory = None
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.start_method = fork
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.cuda_device = None
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.quiet = False
2023-05-13 15:32:59,869 - INFO - allennlp.common.params - validation_data_loader.collate_fn = <allennlp.data.data_loaders.data_collator.DefaultDataCollator object at 0x7f57313cfc10>
2023-05-13 15:32:59,869 - INFO - tqdm - loading instances: 0it [00:00, ?it/s]
2023-05-13 15:32:59,870 - INFO - seq2rel.dataset_reader - Reading instances from lines in file at: ../granular/dev_transform.tsv
2023-05-13 15:33:01,994 - INFO - allennlp.common.params - vocabulary.type = from_instances
2023-05-13 15:33:01,994 - INFO - allennlp.common.params - vocabulary.min_count = None
2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.non_padded_namespaces = ('*tags', '*labels')
2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.pretrained_files = None
2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.only_include_pretrained_words = False
2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.min_pretrained_embeddings = None
2023-05-13 15:33:01,995 - INFO - allennlp.common.params - vocabulary.padding_token = @@PADDING@@
2023-05-13 15:33:01,996 - INFO - allennlp.common.params - vocabulary.oov_token = @@UNKNOWN@@
2023-05-13 15:33:01,996 - INFO - allennlp.data.vocabulary - Fitting token dictionary from dataset.
2023-05-13 15:33:01,996 - INFO - tqdm - building vocab: 0it [00:00, ?it/s]
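
vocabulary.type = from_instances above means the token dictionary is fitted over the loaded data; a minimal sketch, where `instances` is assumed to be the instances read above:

from allennlp.data import Vocabulary

vocab = Vocabulary.from_instances(
    instances,
    non_padded_namespaces=("*tags", "*labels"),
    padding_token="@@PADDING@@",
    oov_token="@@UNKNOWN@@",
)
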
2023-05-13 15:33:02,129 - INFO - allennlp.common.params - model.type = copynet_seq2rel
2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.regularizer = None
2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.source_embedder.type = basic
2023-05-13 15:33:02,130 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.type = pretrained_transformer
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.max_length = None
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.sub_module = None
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.train_parameters = True
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.eval_mode = False
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.last_layer_only = True
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.override_weights_file = None
2023-05-13 15:33:02,131 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.override_weights_strip_prefix = None
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.reinit_modules = 2
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.load_weights = True
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.gradient_checkpointing = None
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@']
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.tokenizer_kwargs.do_lower_case = True
2023-05-13 15:33:02,132 - INFO - allennlp.common.params - model.source_embedder.token_embedders.tokens.transformer_kwargs = None
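
The source_embedder block above wraps PubMedBERT in a basic text-field embedder, with the top two encoder layers re-initialized (reinit_modules = 2) and all weights trainable; a minimal sketch:

from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders import PretrainedTransformerEmbedder

source_embedder = BasicTextFieldEmbedder(
    {
        "tokens": PretrainedTransformerEmbedder(
            model_name="microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext",
            train_parameters=True,
            reinit_modules=2,  # re-initialize the last two transformer layers
            tokenizer_kwargs={
                "additional_special_tokens": ["@ARG@", "@TRIGGER@"],
                "do_lower_case": True,
            },
        )
    }
)
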
2023-05-13 15:33:12,743 - INFO - allennlp.common.params - model.encoder = None
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.label_smoothing = None
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.target_embedding_dim = 256
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.scheduled_sampling_ratio = 0.0
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.copy_token = @COPY@
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.target_namespace = target_tokens
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.tensor_based_metric = None
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.token_based_metric = None
2023-05-13 15:33:12,744 - INFO - allennlp.common.params - model.initializer = <allennlp.nn.initializers.InitializerApplicator object at 0x7f571cca0250>
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.type = pretrained_transformer
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.model_name = microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.add_special_tokens = False
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.max_length = None
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.tokenizer_kwargs.additional_special_tokens = ['@ARG@', '@TRIGGER@', '@OSP@', '@start@', '@end@', ';']
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.tokenizer_kwargs.do_lower_case = True
2023-05-13 15:33:12,745 - INFO - allennlp.common.params - model.target_tokenizer.verification_tokens = None
2023-05-13 15:33:12,746 - INFO - allennlp.common.params - model.dropout = 0.1
2023-05-13 15:33:12,746 - INFO - allennlp.common.params - model.weight_dropout = 0.5
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.type = f1_seq2rel
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.labels = ['OSP']
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.threshold = None
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.ordered_ents = False
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.remove_duplicate_ents = True
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.sequence_based_metrics.0.average = micro
2023-05-13 15:33:12,747 - INFO - allennlp.common.params - model.init_decoder_state_strategy = mean
2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.type = multihead_attention
2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.normalize = True
2023-05-13 15:33:12,748 - INFO - allennlp.common.params - model.attention.num_heads = 6
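
model.attention above is registered as multihead_attention with 6 heads; the underlying PyTorch module looks roughly like this (the 768 embedding dimension is an assumption based on the BERT-base encoder, not something the log states):

import torch

attention = torch.nn.MultiheadAttention(embed_dim=768, num_heads=6)
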
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.type = beam_search
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.max_steps = 96
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.beam_size = 1
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.per_node_beam_size = None
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.sampler = None
2023-05-13 15:33:12,815 - INFO - allennlp.common.params - model.beam_search.min_steps = None
2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.final_sequence_scorer.type = length-normalized-sequence-log-prob
2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.final_sequence_scorer.length_penalty = 1
2023-05-13 15:33:12,816 - INFO - allennlp.common.params - model.beam_search.constraints = None
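
With beam_size = 1, the beam search above reduces to greedy decoding, scored by length-normalized log-probability. A minimal sketch, where `end_index` (the id of the @end@ token) is assumed to come from the vocabulary:

from allennlp.nn.beam_search import (
    BeamSearch,
    LengthNormalizedSequenceLogProbabilityScorer,
)

beam_search = BeamSearch(
    end_index,  # assumed: vocab id of the @end@ token
    max_steps=96,
    beam_size=1,
    final_sequence_scorer=LengthNormalizedSequenceLogProbabilityScorer(
        length_penalty=1.0
    ),
)
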
2023-05-13 15:33:12,816 - INFO - allennlp.nn.initializers - Initializing parameters
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - Done initializing parameters; the following parameters are using their default initialization from their code
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.in_proj_bias
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.in_proj_weight
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.out_proj.bias
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _attention._multihead_attn.out_proj.weight
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.bias_hh
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.bias_ih
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.weight_hh
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _decoder_cell.weight_ih
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _input_projection_layer.bias
2023-05-13 15:33:12,817 - INFO - allennlp.nn.initializers - _input_projection_layer.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_copying_layer.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_copying_layer.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_generation_layer.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _output_generation_layer.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight
2023-05-13 15:33:12,818 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias
2023-05-13 15:33:12,819 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight
2023-05-13 15:33:12,820 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias
2023-05-13 15:33:12,821 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias
2023-05-13 15:33:12,822 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight
2023-05-13 15:33:12,830 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight
2023-05-13 15:33:12,831 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias
2023-05-13 15:33:12,832 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight
2023-05-13 15:33:12,833 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias
2023-05-13 15:33:12,834 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias
2023-05-13 15:33:12,835 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias
2023-05-13 15:33:12,836 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight
2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias
2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight
2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias
2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight
2023-05-13 15:33:12,837 - INFO - allennlp.nn.initializers - _target_embedder.weight
2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.type = gradient_descent
2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.cuda_device = None
2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.distributed = False
2023-05-13 15:33:13,241 - INFO - allennlp.common.params - trainer.world_size = 1
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.patience = None
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.validation_metric = +fscore
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.num_epochs = 25
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.grad_norm = 1
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.grad_clipping = None
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.num_gradient_accumulation_steps = 1
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.use_amp = True
2023-05-13 15:33:13,242 - INFO - allennlp.common.params - trainer.no_grad = None
2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.momentum_scheduler = None
2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.moving_average = None
2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.enable_default_callbacks = True
2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.run_confidence_checks = True
2023-05-13 15:33:13,243 - INFO - allennlp.common.params - trainer.grad_scaling = True
2023-05-13 15:33:16,509 - INFO - allennlp.common.params - trainer.optimizer.type = huggingface_adamw
2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.lr = 0.0004
2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.betas = (0.9, 0.999)
2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.eps = 1e-08
2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.weight_decay = 0
2023-05-13 15:33:16,510 - INFO - allennlp.common.params - trainer.optimizer.correct_bias = True
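
trainer.optimizer above is AllenNLP's wrapper around the Hugging Face AdamW implementation. A minimal sketch, where `model` is the copynet_seq2rel model constructed earlier and the parameter-group regex is illustrative (the log prints only the resulting groups below, not the patterns that produced them):

from allennlp.training.optimizers import HuggingfaceAdamWOptimizer

optimizer = HuggingfaceAdamWOptimizer(
    model_parameters=list(model.named_parameters()),
    # Transformer weights get the smaller fine-tuning lr and the weight
    # decay seen in Group 0 below; everything else uses the top-level lr.
    parameter_groups=[
        (["_source_embedder.*"], {"lr": 2e-05, "weight_decay": 0.01}),
    ],
    lr=0.0004,
    betas=(0.9, 0.999),
    eps=1e-08,
    weight_decay=0.0,
    correct_bias=True,
)
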
2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Done constructing parameter groups.
2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Group 0: ['_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight'], {'lr': 2e-05, 'weight_decay': 0.01}
2023-05-13 15:33:16,511 - INFO - allennlp.training.optimizers - Group 1: ['_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias', 
'_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias', 
'_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight', 
'_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight', '_source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias'], {'lr': 2e-05, 'weight_decay': 0}
2023-05-13 15:33:16,548 - INFO - allennlp.training.optimizers - Group 2: ['_output_generation_layer.bias', '_attention._multihead_attn.out_proj.bias', '_decoder_cell.weight_hh_raw', '_output_copying_layer.bias', '_target_embedder.weight', '_input_projection_layer.weight', '_decoder_cell.module.bias_hh', '_attention._multihead_attn.in_proj_bias', '_output_generation_layer.weight', '_output_copying_layer.weight', '_decoder_cell.module.weight_ih', '_input_projection_layer.bias', '_attention._multihead_attn.out_proj.weight', '_decoder_cell.module.bias_ih', '_attention._multihead_attn.in_proj_weight'], {}
2023-05-13 15:33:16,548 - INFO - allennlp.training.optimizers - Number of trainable parameters: 118547721
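The three optimizer groups above follow the standard BERT fine-tuning recipe: the transformer's weight matrices and embeddings get weight decay 0.01 (Group 0), biases and LayerNorm parameters are exempted from decay (Group 1), and the decoder and copy-mechanism parameters fall through to the optimizer defaults (Group 2). A minimal sketch of how such groups are typically built in PyTorch follows; the TinyEncoder model and helper names are hypothetical stand-ins, not taken from this run, which groups PubMedBERT's parameters instead.

import torch
from torch import nn

# Hypothetical stand-in model with the two parameter kinds the log separates.
class TinyEncoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(16, 16)
        self.LayerNorm = nn.LayerNorm(16)

model = TinyEncoder()
no_decay = ("bias", "LayerNorm.weight", "LayerNorm.bias")

optimizer = torch.optim.AdamW([
    # Group 0: weight matrices, decayed.
    {"params": [p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)],
     "lr": 2e-5, "weight_decay": 0.01},
    # Group 1: biases and LayerNorm parameters, no decay.
    {"params": [p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)],
     "lr": 2e-5, "weight_decay": 0.0},
])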
2023-05-13 15:33:16,551 - INFO - allennlp.common.util - The following parameters are Frozen (without gradient):
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - The following parameters are Tunable (with gradient):
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.word_embeddings.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.position_embeddings.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.token_type_embeddings.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.embeddings.LayerNorm.bias
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.query.bias
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.key.bias
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.weight
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.self.value.bias
2023-05-13 15:33:16,552 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.dense.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.attention.output.LayerNorm.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.intermediate.dense.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.dense.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.0.output.LayerNorm.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.query.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.key.bias
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.weight
2023-05-13 15:33:16,553 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.self.value.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.dense.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.attention.output.LayerNorm.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.intermediate.dense.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.dense.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.1.output.LayerNorm.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.query.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.key.bias
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.weight
2023-05-13 15:33:16,554 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.self.value.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.dense.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.attention.output.LayerNorm.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.intermediate.dense.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.dense.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.2.output.LayerNorm.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.query.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.key.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.weight
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.self.value.bias
2023-05-13 15:33:16,555 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.dense.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.attention.output.LayerNorm.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.intermediate.dense.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.dense.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.3.output.LayerNorm.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.query.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.key.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.weight
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.self.value.bias
2023-05-13 15:33:16,556 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.dense.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.attention.output.LayerNorm.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.intermediate.dense.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.dense.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.4.output.LayerNorm.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.query.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.key.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.self.value.bias
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.weight
2023-05-13 15:33:16,557 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.dense.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.attention.output.LayerNorm.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.intermediate.dense.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.dense.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.5.output.LayerNorm.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.query.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.key.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.self.value.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.weight
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.dense.bias
2023-05-13 15:33:16,558 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.weight
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.attention.output.LayerNorm.bias
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.weight
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.intermediate.dense.bias
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.weight
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.dense.bias
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.weight
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.6.output.LayerNorm.bias
2023-05-13 15:33:16,559 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.weight
2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.query.bias
2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.weight
2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.key.bias
2023-05-13 15:33:16,650 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.self.value.bias
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.dense.bias
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.attention.output.LayerNorm.bias
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.intermediate.dense.bias
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.dense.bias
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.weight
2023-05-13 15:33:16,651 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.7.output.LayerNorm.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.weight
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.query.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.weight
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.key.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.weight
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.self.value.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.weight
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.dense.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.weight
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.attention.output.LayerNorm.bias
2023-05-13 15:33:16,652 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.intermediate.dense.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.dense.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.8.output.LayerNorm.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.query.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.key.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.self.value.bias
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.weight
2023-05-13 15:33:16,653 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.dense.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.weight
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.attention.output.LayerNorm.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.weight
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.intermediate.dense.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.weight
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.dense.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.weight
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.9.output.LayerNorm.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.weight
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.query.bias
2023-05-13 15:33:16,654 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.key.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.self.value.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.dense.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.attention.output.LayerNorm.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.intermediate.dense.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.weight
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.dense.bias
2023-05-13 15:33:16,655 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.10.output.LayerNorm.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.query.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.key.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.self.value.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.dense.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.attention.output.LayerNorm.bias
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.weight
2023-05-13 15:33:16,656 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.intermediate.dense.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.dense.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.encoder.layer.11.output.LayerNorm.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _source_embedder.token_embedder_tokens.transformer_model.pooler.dense.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _target_embedder.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.in_proj_weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.in_proj_bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.out_proj.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _attention._multihead_attn.out_proj.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _input_projection_layer.weight
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _input_projection_layer.bias
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.weight_hh_raw
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.weight_ih
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.bias_ih
2023-05-13 15:33:16,657 - INFO - allennlp.common.util - _decoder_cell.module.bias_hh
2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_generation_layer.weight
2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_generation_layer.bias
2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_copying_layer.weight
2023-05-13 15:33:16,658 - INFO - allennlp.common.util - _output_copying_layer.bias
2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.type = linear_with_warmup
2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.warmup_steps = 2906
2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.learning_rate_scheduler.last_epoch = -1
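With 1163 batches per epoch and 25 epochs (the run below counts Epoch 0/24), training takes 29,075 optimizer steps, so warmup_steps = 2906 is roughly 10% of the schedule. A minimal sketch of the linear-with-warmup factor applied to the base learning rate of 2e-5; the function name and the total-step arithmetic are assumptions derived from the counts above, not read directly from this log.

def linear_with_warmup_factor(step, warmup_steps=2906, total_steps=29075):
    # Linear ramp from 0 to 1 over the warmup, then linear decay back to 0.
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (total_steps - step) / max(1, total_steps - warmup_steps))

# Halfway through warmup the effective lr is ~1e-5; it peaks at 2e-5 at step 2906.
print(linear_with_warmup_factor(1453) * 2e-5)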
2023-05-13 15:33:16,658 - INFO - allennlp.common.params - trainer.checkpointer.type = default
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_completed_epochs = True
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_every_num_seconds = None
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.save_every_num_batches = None
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.keep_most_recent_by_count = 1
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.checkpointer.keep_most_recent_by_age = None
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.callbacks.0.type = should_validate_callback
2023-05-13 15:33:16,659 - INFO - allennlp.common.params - trainer.callbacks.0.validation_start = 15
2023-05-13 15:33:16,660 - INFO - allennlp.common.params - trainer.callbacks.0.validation_interval = 1
2023-05-13 15:33:16,660 - WARNING - allennlp.training.gradient_descent_trainer - You provided a validation dataset but patience was set to None, meaning that early stopping is disabled
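Taken together, the callback configuration and the warning above mean all 25 epochs will train to completion: validation only begins at epoch 15 and then repeats every epoch, and with patience set to None its metrics are reported but never trigger early stopping. This is why the Validation column in the per-epoch summaries below reads N/A. The following is an illustration of the configured behavior, not the library's own code.

def should_validate(epoch, validation_start=15, validation_interval=1):
    # Skip validation for epochs 0-14; from epoch 15 on, validate every
    # `validation_interval` epochs.
    return (epoch >= validation_start
            and (epoch - validation_start) % validation_interval == 0)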
2023-05-13 15:33:16,661 - INFO - allennlp.training.gradient_descent_trainer - Beginning training.
2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - Epoch 0/24
2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.3G
2023-05-13 15:33:16,662 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 453M
2023-05-13 15:33:16,663 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:33:16,663 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:33:16,783 - INFO - allennlp.training.callbacks.console_logger - Batch inputs
2023-05-13 15:33:16,784 - INFO - allennlp.training.callbacks.console_logger - batch_input/source_tokens/tokens/token_ids (Shape: 4 x 83)
tensor([[ 2, 23253, 30522, ..., 18, 3, 0],
[ 2, 13553, 2428, ..., 0, 0, 0],
[ 2, 16672, 1025, ..., 0, 0, 0],
[ 2, 5302, 17, ..., 26979, 18, 3]], device='cuda:0')
2023-05-13 15:33:16,785 - INFO - allennlp.training.callbacks.console_logger - batch_input/source_tokens/tokens/mask (Shape: 4 x 83)
tensor([[ True, True, True, ..., True, True, False],
[ True, True, True, ..., False, False, False],
[ True, True, True, ..., False, False, False],
[ True, True, True, ..., True, True, True]], device='cuda:0')
2023-05-13 15:33:16,787 - INFO - allennlp.training.callbacks.console_logger - batch_input/source_tokens/tokens/type_ids (Shape: 4 x 83)
tensor([[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0],
[0, 0, 0, ..., 0, 0, 0]], device='cuda:0')
2023-05-13 15:33:16,789 - INFO - allennlp.training.callbacks.console_logger - batch_input/source_to_target (Shape: 4 x 83)
tensor([[1, 1, 3, ..., 1, 1, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 0, 0, 0],
[1, 1, 1, ..., 1, 1, 1]], device='cuda:0')
2023-05-13 15:33:16,790 - INFO - allennlp.training.callbacks.console_logger - batch_input/target_tokens/tokens/tokens (Shape: 4 x 2)
tensor([[2, 6],
[2, 6],
[2, 6],
[2, 6]], device='cuda:0')
2023-05-13 15:33:16,791 - INFO - allennlp.training.callbacks.console_logger - batch_input/source_token_ids (Shape: 4 x 83)
tensor([[ 0, 1, 2, ..., 6, 49, 0],
[ 0, 1, 2, ..., 0, 0, 0],
[ 0, 1, 2, ..., 0, 0, 0],
[ 0, 1, 2, ..., 57, 58, 59]], device='cuda:0')
2023-05-13 15:33:16,793 - INFO - allennlp.training.callbacks.console_logger - batch_input/target_token_ids (Shape: 4 x 2)
tensor([[50, 51],
[39, 40],
[49, 50],
[60, 61]], device='cuda:0')
2023-05-13 15:33:16,794 - INFO - allennlp.training.callbacks.console_logger - Field : "batch_input/metadata" : (Length 4 of type "<class 'dict'>")
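The batch fields logged above are the usual inputs of a CopyNet-style seq2seq model: source_tokens carries wordpiece ids with a padding mask (a batch of 4, padded to length 83), source_to_target maps each source position into the target vocabulary, and source_token_ids assigns instance-local ids so repeated source tokens can be tied together by the copy mechanism; the 4 x 2 target tensors are consistent with sequences holding only the @start@ and @end@ markers. A minimal sketch of the instance-local id convention, using a hypothetical helper rather than the library's implementation:

def instance_token_ids(tokens):
    # Identical tokens within one instance share an id, so the copy
    # mechanism can pool attention over repeated mentions.
    first_seen = {}
    return [first_seen.setdefault(tok, len(first_seen)) for tok in tokens]

print(instance_token_ids(["the", "gene", "activates", "the", "receptor"]))
# -> [0, 1, 2, 0, 3]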
2023-05-13 15:33:26,761 - INFO - tqdm - batch_loss: 22.0751, loss: 13.0001 ||: 6%|5 | 64/1163 [00:10<03:36, 5.07it/s]
2023-05-13 15:33:36,821 - INFO - tqdm - batch_loss: 1.7625, loss: 11.5015 ||: 12%|#2 | 140/1163 [00:20<02:02, 8.35it/s]
2023-05-13 15:33:46,936 - INFO - tqdm - batch_loss: 8.3574, loss: 11.3590 ||: 17%|#7 | 201/1163 [00:30<02:18, 6.95it/s]
2023-05-13 15:33:57,017 - INFO - tqdm - batch_loss: 28.9927, loss: 10.0826 ||: 24%|##3 | 277/1163 [00:40<02:44, 5.38it/s]
2023-05-13 15:34:07,119 - INFO - tqdm - batch_loss: 9.8202, loss: 9.8024 ||: 29%|##9 | 340/1163 [00:50<02:01, 6.77it/s]
2023-05-13 15:34:17,238 - INFO - tqdm - batch_loss: 11.5271, loss: 9.1016 ||: 35%|###5 | 408/1163 [01:00<02:15, 5.58it/s]
2023-05-13 15:34:27,387 - INFO - tqdm - batch_loss: 7.2027, loss: 8.6231 ||: 41%|####1 | 480/1163 [01:10<01:28, 7.73it/s]
2023-05-13 15:34:37,404 - INFO - tqdm - batch_loss: 10.3921, loss: 8.1658 ||: 47%|####6 | 544/1163 [01:20<02:01, 5.10it/s]
2023-05-13 15:34:47,524 - INFO - tqdm - batch_loss: 2.4879, loss: 7.8877 ||: 53%|#####3 | 620/1163 [01:30<01:25, 6.32it/s]
2023-05-13 15:34:57,608 - INFO - tqdm - batch_loss: 0.0219, loss: 7.5193 ||: 59%|#####9 | 687/1163 [01:40<01:00, 7.81it/s]
2023-05-13 15:35:07,708 - INFO - tqdm - batch_loss: 1.8938, loss: 7.1715 ||: 66%|######5 | 762/1163 [01:51<01:05, 6.16it/s]
2023-05-13 15:35:17,805 - INFO - tqdm - batch_loss: 0.0029, loss: 6.9154 ||: 71%|#######1 | 830/1163 [02:01<00:38, 8.63it/s]
2023-05-13 15:35:27,904 - INFO - tqdm - batch_loss: 0.0070, loss: 6.7275 ||: 77%|#######7 | 896/1163 [02:11<00:41, 6.50it/s]
2023-05-13 15:35:37,987 - INFO - tqdm - batch_loss: 1.5801, loss: 6.3753 ||: 83%|########3 | 969/1163 [02:21<00:27, 7.07it/s]
2023-05-13 15:35:48,133 - INFO - tqdm - batch_loss: 0.8383, loss: 6.1441 ||: 89%|########9 | 1036/1163 [02:31<00:19, 6.60it/s]
2023-05-13 15:35:58,316 - INFO - tqdm - batch_loss: 1.5910, loss: 5.8352 ||: 96%|#########6| 1117/1163 [02:41<00:06, 6.68it/s]
2023-05-13 15:36:05,705 - INFO - tqdm - batch_loss: 2.2784, loss: 5.7149 ||: 100%|#########9| 1158/1163 [02:49<00:00, 5.30it/s]
2023-05-13 15:36:05,844 - INFO - tqdm - batch_loss: 3.6173, loss: 5.7131 ||: 100%|#########9| 1159/1163 [02:49<00:00, 5.75it/s]
2023-05-13 15:36:06,018 - INFO - tqdm - batch_loss: 3.0704, loss: 5.7108 ||: 100%|#########9| 1160/1163 [02:49<00:00, 5.76it/s]
2023-05-13 15:36:06,252 - INFO - tqdm - batch_loss: 6.7589, loss: 5.7117 ||: 100%|#########9| 1161/1163 [02:49<00:00, 5.21it/s]
2023-05-13 15:36:06,518 - INFO - tqdm - batch_loss: 2.4498, loss: 5.7040 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.08it/s]
2023-05-13 15:36:06,519 - INFO - tqdm - batch_loss: 2.4498, loss: 5.7040 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.85it/s]
2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 453.306 | N/A
2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - loss | 5.704 | N/A
2023-05-13 15:36:06,521 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4427.691 | N/A
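In the progress lines above, batch_loss is the loss of the most recent batch while loss is the running mean over all batches so far in the epoch, which is why the epoch settles at 5.704 even though individual batches swing between ~0.002 and ~29. A minimal sketch of that bookkeeping, mirroring what the trainer reports rather than quoting its code:

losses = []

def report(batch_loss):
    # Running mean over every batch seen so far this epoch.
    losses.append(batch_loss)
    running = sum(losses) / len(losses)
    return f"batch_loss: {batch_loss:.4f}, loss: {running:.4f}"

print(report(22.0751))  # first batch: running loss equals the batch loss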
2023-05-13 15:36:11,824 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:55.162598
2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 1:07:56
2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Epoch 1/24
2023-05-13 15:36:11,825 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:36:11,826 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:36:11,829 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:36:11,829 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:36:21,927 - INFO - tqdm - batch_loss: 3.1022, loss: 1.9278 ||: 6%|5 | 68/1163 [00:10<03:59, 4.57it/s]
2023-05-13 15:36:31,985 - INFO - tqdm - batch_loss: 3.9886, loss: 2.0158 ||: 12%|#1 | 139/1163 [00:20<02:29, 6.87it/s]
2023-05-13 15:36:42,044 - INFO - tqdm - batch_loss: 2.8147, loss: 1.8155 ||: 17%|#7 | 203/1163 [00:30<02:16, 7.06it/s]
2023-05-13 15:36:52,083 - INFO - tqdm - batch_loss: 1.2734, loss: 1.9126 ||: 24%|##3 | 276/1163 [00:40<02:24, 6.15it/s]
2023-05-13 15:37:02,117 - INFO - tqdm - batch_loss: 0.0053, loss: 2.0287 ||: 29%|##9 | 341/1163 [00:50<01:43, 7.98it/s]
2023-05-13 15:37:12,117 - INFO - tqdm - batch_loss: 2.5151, loss: 1.9266 ||: 35%|###5 | 409/1163 [01:00<02:28, 5.06it/s]
2023-05-13 15:37:22,247 - INFO - tqdm - batch_loss: 2.7551, loss: 1.9363 ||: 41%|####1 | 478/1163 [01:10<01:20, 8.52it/s]
2023-05-13 15:37:32,792 - INFO - tqdm - batch_loss: 0.5965, loss: 1.9136 ||: 46%|####6 | 539/1163 [01:20<03:42, 2.80it/s]
2023-05-13 15:37:42,872 - INFO - tqdm - batch_loss: 2.1789, loss: 1.8849 ||: 52%|#####2 | 610/1163 [01:31<01:11, 7.70it/s]
2023-05-13 15:37:53,022 - INFO - tqdm - batch_loss: 1.6421, loss: 1.8488 ||: 58%|#####8 | 677/1163 [01:41<01:00, 8.03it/s]
2023-05-13 15:38:03,125 - INFO - tqdm - batch_loss: 3.7823, loss: 1.7822 ||: 65%|######4 | 755/1163 [01:51<01:05, 6.20it/s]
2023-05-13 15:38:13,217 - INFO - tqdm - batch_loss: 0.0020, loss: 1.7548 ||: 71%|####### | 820/1163 [02:01<00:39, 8.75it/s]
2023-05-13 15:38:23,275 - INFO - tqdm - batch_loss: 2.2008, loss: 1.7355 ||: 77%|#######6 | 891/1163 [02:11<00:38, 7.06it/s]
2023-05-13 15:38:33,368 - INFO - tqdm - batch_loss: 0.5050, loss: 1.7332 ||: 83%|########2 | 963/1163 [02:21<00:24, 8.11it/s]
2023-05-13 15:38:43,469 - INFO - tqdm - batch_loss: 1.1960, loss: 1.6841 ||: 89%|########8 | 1030/1163 [02:31<00:22, 5.91it/s]
2023-05-13 15:38:53,502 - INFO - tqdm - batch_loss: 1.3306, loss: 1.6633 ||: 95%|#########5| 1106/1163 [02:41<00:08, 6.90it/s]
2023-05-13 15:39:02,812 - INFO - tqdm - batch_loss: 1.6812, loss: 1.6570 ||: 100%|#########9| 1158/1163 [02:50<00:00, 5.64it/s]
2023-05-13 15:39:02,912 - INFO - tqdm - batch_loss: 0.3756, loss: 1.6559 ||: 100%|#########9| 1159/1163 [02:51<00:00, 6.37it/s]
2023-05-13 15:39:03,020 - INFO - tqdm - batch_loss: 0.0042, loss: 1.6544 ||: 100%|#########9| 1160/1163 [02:51<00:00, 6.95it/s]
2023-05-13 15:39:03,146 - INFO - tqdm - batch_loss: 0.1927, loss: 1.6532 ||: 100%|#########9| 1161/1163 [02:51<00:00, 7.21it/s]
2023-05-13 15:39:03,368 - INFO - tqdm - batch_loss: 0.8270, loss: 1.6511 ||: 100%|##########| 1163/1163 [02:51<00:00, 7.91it/s]
2023-05-13 15:39:03,370 - INFO - tqdm - batch_loss: 0.8270, loss: 1.6511 ||: 100%|##########| 1163/1163 [02:51<00:00, 6.78it/s]
2023-05-13 15:39:03,372 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:39:03,372 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2972.189 | N/A
2023-05-13 15:39:03,372 - INFO - allennlp.training.callbacks.console_logger - loss | 1.651 | N/A
2023-05-13 15:39:03,372 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4512.148 | N/A
2023-05-13 15:39:08,959 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:57.133503
2023-05-13 15:39:08,959 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 1:06:27
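The "Estimated training time remaining" line is recomputed after every epoch. A formula consistent with the logged values is elapsed wall-clock time since training started, scaled by the fraction of epochs still to run; the elapsed value below is back-calculated from the log purely for illustration:

    # ETA sketch consistent with the log lines: elapsed * (total/done - 1).
    # 25 epochs total ("Epoch 2/24" is about to start, so 2 are done);
    # `elapsed` (seconds since training start) is inferred, not logged.
    import datetime

    num_epochs, epochs_done = 25, 2
    elapsed = 346.7
    eta = elapsed * (num_epochs / epochs_done - 1)
    print(datetime.timedelta(seconds=int(eta)))  # -> 1:06:27, as above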
2023-05-13 15:39:08,959 - INFO - allennlp.training.gradient_descent_trainer - Epoch 2/24
2023-05-13 15:39:08,959 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:39:08,959 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:39:08,961 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:39:08,961 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:39:18,971 - INFO - tqdm - batch_loss: 0.0020, loss: 0.9638 ||: 6%|5 | 67/1163 [00:10<02:47, 6.54it/s]
2023-05-13 15:39:28,995 - INFO - tqdm - batch_loss: 0.0026, loss: 0.8696 ||: 13%|#2 | 146/1163 [00:20<01:50, 9.21it/s]
2023-05-13 15:39:39,105 - INFO - tqdm - batch_loss: 0.0228, loss: 0.8725 ||: 18%|#8 | 211/1163 [00:30<01:52, 8.46it/s]
2023-05-13 15:39:49,149 - INFO - tqdm - batch_loss: 0.5112, loss: 0.8596 ||: 24%|##4 | 280/1163 [00:40<02:34, 5.72it/s]
2023-05-13 15:39:59,304 - INFO - tqdm - batch_loss: 4.0485, loss: 0.9190 ||: 30%|##9 | 344/1163 [00:50<01:58, 6.93it/s]
2023-05-13 15:40:09,355 - INFO - tqdm - batch_loss: 0.0131, loss: 0.8956 ||: 36%|###6 | 420/1163 [01:00<02:05, 5.93it/s]
2023-05-13 15:40:19,565 - INFO - tqdm - batch_loss: 3.8358, loss: 0.9005 ||: 42%|####2 | 492/1163 [01:10<01:29, 7.50it/s]
2023-05-13 15:40:29,741 - INFO - tqdm - batch_loss: 2.0694, loss: 0.9236 ||: 48%|####7 | 556/1163 [01:20<01:48, 5.60it/s]
2023-05-13 15:40:39,867 - INFO - tqdm - batch_loss: 0.0614, loss: 0.9419 ||: 54%|#####3 | 628/1163 [01:30<01:26, 6.17it/s]
2023-05-13 15:40:49,930 - INFO - tqdm - batch_loss: 0.5679, loss: 0.9414 ||: 59%|#####8 | 685/1163 [01:40<00:55, 8.54it/s]
2023-05-13 15:40:59,940 - INFO - tqdm - batch_loss: 0.3709, loss: 0.9530 ||: 66%|######5 | 762/1163 [01:50<01:08, 5.88it/s]
2023-05-13 15:41:09,947 - INFO - tqdm - batch_loss: 1.7134, loss: 0.9445 ||: 71%|####### | 824/1163 [02:00<00:47, 7.15it/s]
2023-05-13 15:41:20,041 - INFO - tqdm - batch_loss: 0.0033, loss: 0.9258 ||: 77%|#######7 | 896/1163 [02:11<00:36, 7.39it/s]
2023-05-13 15:41:30,051 - INFO - tqdm - batch_loss: 0.0887, loss: 0.9268 ||: 83%|########2 | 965/1163 [02:21<00:23, 8.33it/s]
2023-05-13 15:41:40,113 - INFO - tqdm - batch_loss: 0.0039, loss: 0.9172 ||: 89%|########8 | 1035/1163 [02:31<00:19, 6.64it/s]
2023-05-13 15:41:50,198 - INFO - tqdm - batch_loss: 0.0069, loss: 0.9062 ||: 95%|#########5| 1109/1163 [02:41<00:06, 8.01it/s]
2023-05-13 15:41:58,974 - INFO - tqdm - batch_loss: 2.8006, loss: 0.8927 ||: 100%|#########9| 1158/1163 [02:50<00:00, 5.44it/s]
2023-05-13 15:41:59,171 - INFO - tqdm - batch_loss: 3.1582, loss: 0.8946 ||: 100%|#########9| 1159/1163 [02:50<00:00, 5.33it/s]
2023-05-13 15:41:59,314 - INFO - tqdm - batch_loss: 1.3047, loss: 0.8950 ||: 100%|#########9| 1160/1163 [02:50<00:00, 5.74it/s]
2023-05-13 15:41:59,532 - INFO - tqdm - batch_loss: 2.9583, loss: 0.8967 ||: 100%|#########9| 1161/1163 [02:50<00:00, 5.33it/s]
2023-05-13 15:41:59,673 - INFO - tqdm - batch_loss: 2.1014, loss: 0.8978 ||: 100%|#########9| 1162/1163 [02:50<00:00, 5.76it/s]
2023-05-13 15:41:59,851 - INFO - tqdm - batch_loss: 0.9919, loss: 0.8979 ||: 100%|##########| 1163/1163 [02:50<00:00, 5.72it/s]
2023-05-13 15:41:59,852 - INFO - tqdm - batch_loss: 0.9919, loss: 0.8979 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.81it/s]
2023-05-13 15:41:59,854 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:41:59,854 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2981.632 | N/A
2023-05-13 15:41:59,854 - INFO - allennlp.training.callbacks.console_logger - loss | 0.898 | N/A
2023-05-13 15:41:59,854 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4520.766 | N/A
2023-05-13 15:42:09,694 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:00.734998
2023-05-13 15:42:09,694 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 1:03:56
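The worker_0_memory_MB and gpu_0_memory_MB columns in the per-epoch tables track host and GPU memory for the training process. Rough equivalents can be sampled with generic psutil/torch calls; AllenNLP has its own utilities for this, so the snippet below is only illustrative:

    # Illustrative memory sampling, not AllenNLP's exact implementation.
    # psutil is a third-party dependency assumed to be installed.
    import os
    import psutil
    import torch

    rss_mb = psutil.Process(os.getpid()).memory_info().rss / 1024 ** 2
    gpu_mb = (torch.cuda.max_memory_allocated(0) / 1024 ** 2
              if torch.cuda.is_available() else 0.0)
    print(f"worker_0_memory_MB: {rss_mb:.3f}, gpu_0_memory_MB: {gpu_mb:.3f}")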
2023-05-13 15:42:09,694 - INFO - allennlp.training.gradient_descent_trainer - Epoch 3/24
2023-05-13 15:42:09,695 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:42:09,695 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:42:09,696 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:42:09,697 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:42:19,707 - INFO - tqdm - batch_loss: 0.1321, loss: 0.8518 ||: 5%|5 | 60/1163 [00:10<02:07, 8.64it/s]
2023-05-13 15:42:29,915 - INFO - tqdm - batch_loss: 0.0099, loss: 0.7577 ||: 11%|#1 | 131/1163 [00:20<03:18, 5.19it/s]
2023-05-13 15:42:40,103 - INFO - tqdm - batch_loss: 1.2177, loss: 0.8163 ||: 17%|#7 | 201/1163 [00:30<01:55, 8.30it/s]
2023-05-13 15:42:50,154 - INFO - tqdm - batch_loss: 1.4085, loss: 0.7834 ||: 23%|##3 | 269/1163 [00:40<02:50, 5.24it/s]
2023-05-13 15:43:00,167 - INFO - tqdm - batch_loss: 0.0064, loss: 0.8225 ||: 29%|##9 | 340/1163 [00:50<01:53, 7.22it/s]
2023-05-13 15:43:10,207 - INFO - tqdm - batch_loss: 0.0007, loss: 0.8205 ||: 35%|###4 | 403/1163 [01:00<01:54, 6.63it/s]
2023-05-13 15:43:20,280 - INFO - tqdm - batch_loss: 0.6622, loss: 0.7835 ||: 41%|####1 | 480/1163 [01:10<01:42, 6.66it/s]
2023-05-13 15:43:30,392 - INFO - tqdm - batch_loss: 0.9375, loss: 0.7658 ||: 47%|####6 | 546/1163 [01:20<01:32, 6.70it/s]
2023-05-13 15:43:40,452 - INFO - tqdm - batch_loss: 0.0024, loss: 0.7747 ||: 53%|#####3 | 621/1163 [01:30<01:21, 6.67it/s]
2023-05-13 15:43:50,650 - INFO - tqdm - batch_loss: 7.5373, loss: 0.8327 ||: 58%|#####8 | 675/1163 [01:40<02:08, 3.80it/s]
2023-05-13 15:44:00,703 - INFO - tqdm - batch_loss: 1.9389, loss: 0.8549 ||: 63%|######2 | 729/1163 [01:51<00:57, 7.55it/s]
2023-05-13 15:44:10,737 - INFO - tqdm - batch_loss: 0.0432, loss: 0.8849 ||: 67%|######7 | 785/1163 [02:01<00:53, 7.09it/s]
2023-05-13 15:44:20,873 - INFO - tqdm - batch_loss: 0.1694, loss: 0.8780 ||: 73%|#######2 | 844/1163 [02:11<01:03, 5.03it/s]
2023-05-13 15:44:30,946 - INFO - tqdm - batch_loss: 0.0016, loss: 0.8702 ||: 79%|#######8 | 918/1163 [02:21<00:28, 8.69it/s]
2023-05-13 15:44:41,079 - INFO - tqdm - batch_loss: 0.0059, loss: 0.8428 ||: 84%|########4 | 981/1163 [02:31<00:22, 8.05it/s]
2023-05-13 15:44:51,202 - INFO - tqdm - batch_loss: 1.9624, loss: 0.8394 ||: 91%|######### | 1053/1163 [02:41<00:19, 5.61it/s]
2023-05-13 15:45:01,348 - INFO - tqdm - batch_loss: 0.2098, loss: 0.8274 ||: 96%|#########6| 1120/1163 [02:51<00:06, 6.99it/s]
2023-05-13 15:45:06,268 - INFO - tqdm - batch_loss: 0.0022, loss: 0.8217 ||: 100%|#########9| 1158/1163 [02:56<00:00, 8.77it/s]
2023-05-13 15:45:06,480 - INFO - tqdm - batch_loss: 0.0004, loss: 0.8203 ||: 100%|#########9| 1160/1163 [02:56<00:00, 9.02it/s]
2023-05-13 15:45:06,673 - INFO - tqdm - batch_loss: 0.0028, loss: 0.8196 ||: 100%|#########9| 1161/1163 [02:56<00:00, 7.76it/s]
2023-05-13 15:45:06,780 - INFO - tqdm - batch_loss: 0.7839, loss: 0.8196 ||: 100%|#########9| 1162/1163 [02:57<00:00, 8.09it/s]
2023-05-13 15:45:06,952 - INFO - tqdm - batch_loss: 0.0108, loss: 0.8189 ||: 100%|##########| 1163/1163 [02:57<00:00, 7.36it/s]
2023-05-13 15:45:06,954 - INFO - tqdm - batch_loss: 0.0108, loss: 0.8189 ||: 100%|##########| 1163/1163 [02:57<00:00, 6.56it/s]
2023-05-13 15:45:06,957 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:45:06,957 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2960.472 | N/A
2023-05-13 15:45:06,957 - INFO - allennlp.training.callbacks.console_logger - loss | 0.819 | N/A
2023-05-13 15:45:06,957 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 15:45:12,903 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:03.208645
2023-05-13 15:45:12,904 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 1:02:09
2023-05-13 15:45:12,904 - INFO - allennlp.training.gradient_descent_trainer - Epoch 4/24
2023-05-13 15:45:12,904 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:45:12,905 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:45:12,906 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:45:12,906 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:45:23,047 - INFO - tqdm - batch_loss: 0.0007, loss: 0.5548 ||: 6%|6 | 73/1163 [00:10<02:05, 8.72it/s]
2023-05-13 15:45:33,103 - INFO - tqdm - batch_loss: 0.0010, loss: 0.6308 ||: 12%|#1 | 137/1163 [00:20<02:02, 8.34it/s]
2023-05-13 15:45:43,215 - INFO - tqdm - batch_loss: 1.5309, loss: 0.6118 ||: 18%|#8 | 211/1163 [00:30<02:12, 7.20it/s]
2023-05-13 15:45:53,362 - INFO - tqdm - batch_loss: 0.0074, loss: 0.5664 ||: 23%|##3 | 273/1163 [00:40<02:26, 6.08it/s]
2023-05-13 15:46:03,457 - INFO - tqdm - batch_loss: 0.0002, loss: 0.5693 ||: 30%|##9 | 345/1163 [00:50<01:56, 7.05it/s]
2023-05-13 15:46:13,560 - INFO - tqdm - batch_loss: 0.0071, loss: 0.5809 ||: 36%|###5 | 414/1163 [01:00<01:44, 7.20it/s]
2023-05-13 15:46:23,642 - INFO - tqdm - batch_loss: 0.0012, loss: 0.6165 ||: 41%|####1 | 479/1163 [01:10<01:52, 6.08it/s]
2023-05-13 15:46:33,782 - INFO - tqdm - batch_loss: 0.0074, loss: 0.6431 ||: 48%|####7 | 553/1163 [01:20<01:15, 8.07it/s]
2023-05-13 15:46:43,901 - INFO - tqdm - batch_loss: 0.0009, loss: 0.6505 ||: 53%|#####2 | 614/1163 [01:30<01:06, 8.23it/s]
2023-05-13 15:46:54,081 - INFO - tqdm - batch_loss: 2.1600, loss: 0.6307 ||: 60%|#####9 | 694/1163 [01:41<01:29, 5.23it/s]
2023-05-13 15:47:04,267 - INFO - tqdm - batch_loss: 0.0001, loss: 0.6158 ||: 65%|######5 | 761/1163 [01:51<00:43, 9.27it/s]
2023-05-13 15:47:14,310 - INFO - tqdm - batch_loss: 0.0009, loss: 0.6149 ||: 72%|#######1 | 832/1163 [02:01<00:49, 6.64it/s]
2023-05-13 15:47:24,349 - INFO - tqdm - batch_loss: 1.7165, loss: 0.6514 ||: 77%|#######6 | 895/1163 [02:11<00:39, 6.76it/s]
2023-05-13 15:47:34,374 - INFO - tqdm - batch_loss: 1.2323, loss: 0.6400 ||: 83%|########3 | 967/1163 [02:21<00:38, 5.10it/s]
2023-05-13 15:47:44,395 - INFO - tqdm - batch_loss: 0.9610, loss: 0.6428 ||: 89%|########9 | 1040/1163 [02:31<00:15, 7.84it/s]
2023-05-13 15:47:54,485 - INFO - tqdm - batch_loss: 1.0679, loss: 0.6326 ||: 95%|#########5| 1105/1163 [02:41<00:08, 7.02it/s]
2023-05-13 15:48:01,760 - INFO - tqdm - batch_loss: 3.1928, loss: 0.6294 ||: 100%|#########9| 1158/1163 [02:48<00:00, 7.48it/s]
2023-05-13 15:48:02,000 - INFO - tqdm - batch_loss: 0.0008, loss: 0.6283 ||: 100%|#########9| 1160/1163 [02:49<00:00, 7.78it/s]
2023-05-13 15:48:02,200 - INFO - tqdm - batch_loss: 0.0026, loss: 0.6272 ||: 100%|#########9| 1162/1163 [02:49<00:00, 8.43it/s]
2023-05-13 15:48:02,330 - INFO - tqdm - batch_loss: 1.3183, loss: 0.6278 ||: 100%|##########| 1163/1163 [02:49<00:00, 8.27it/s]
2023-05-13 15:48:02,331 - INFO - tqdm - batch_loss: 1.3183, loss: 0.6278 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.86it/s]
2023-05-13 15:48:02,333 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:48:02,333 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.544 | N/A
2023-05-13 15:48:02,333 - INFO - allennlp.training.callbacks.console_logger - loss | 0.628 | N/A
2023-05-13 15:48:02,333 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 15:48:08,030 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:55.126089
2023-05-13 15:48:08,030 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:59:02
2023-05-13 15:48:08,030 - INFO - allennlp.training.gradient_descent_trainer - Epoch 5/24
2023-05-13 15:48:08,030 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:48:08,031 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:48:08,032 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:48:08,033 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:48:18,160 - INFO - tqdm - batch_loss: 0.0001, loss: 0.7271 ||: 6%|6 | 72/1163 [00:10<02:16, 7.97it/s]
2023-05-13 15:48:28,391 - INFO - tqdm - batch_loss: 0.0028, loss: 0.6748 ||: 12%|#2 | 140/1163 [00:20<03:35, 4.75it/s]
2023-05-13 15:48:38,466 - INFO - tqdm - batch_loss: 0.0519, loss: 0.7680 ||: 19%|#8 | 216/1163 [00:30<02:06, 7.49it/s]
2023-05-13 15:48:48,512 - INFO - tqdm - batch_loss: 1.5679, loss: 0.6851 ||: 24%|##4 | 280/1163 [00:40<01:57, 7.53it/s]
2023-05-13 15:48:58,559 - INFO - tqdm - batch_loss: 0.0131, loss: 0.6697 ||: 31%|### | 357/1163 [00:50<02:05, 6.43it/s]
2023-05-13 15:49:08,721 - INFO - tqdm - batch_loss: 0.0003, loss: 0.6302 ||: 37%|###6 | 426/1163 [01:00<01:48, 6.81it/s]
2023-05-13 15:49:18,881 - INFO - tqdm - batch_loss: 1.0566, loss: 0.6580 ||: 42%|####2 | 491/1163 [01:10<02:08, 5.23it/s]
2023-05-13 15:49:29,067 - INFO - tqdm - batch_loss: 0.0016, loss: 0.6630 ||: 48%|####8 | 559/1163 [01:21<01:09, 8.73it/s]
2023-05-13 15:49:39,171 - INFO - tqdm - batch_loss: 0.2029, loss: 0.6347 ||: 53%|#####3 | 622/1163 [01:31<01:38, 5.51it/s]
2023-05-13 15:49:49,211 - INFO - tqdm - batch_loss: 0.0018, loss: 0.6467 ||: 60%|###### | 698/1163 [01:41<01:05, 7.10it/s]
2023-05-13 15:49:59,421 - INFO - tqdm - batch_loss: 0.0008, loss: 0.6476 ||: 65%|######5 | 759/1163 [01:51<00:48, 8.26it/s]
2023-05-13 15:50:09,438 - INFO - tqdm - batch_loss: 0.7330, loss: 0.6190 ||: 72%|#######1 | 833/1163 [02:01<00:59, 5.57it/s]
2023-05-13 15:50:19,676 - INFO - tqdm - batch_loss: 0.0019, loss: 0.6030 ||: 77%|#######7 | 900/1163 [02:11<00:34, 7.56it/s]
2023-05-13 15:50:29,800 - INFO - tqdm - batch_loss: 0.0004, loss: 0.5872 ||: 83%|########3 | 966/1163 [02:21<00:33, 5.85it/s]
2023-05-13 15:50:39,945 - INFO - tqdm - batch_loss: 1.4379, loss: 0.5834 ||: 89%|########9 | 1040/1163 [02:31<00:17, 7.04it/s]
2023-05-13 15:50:49,960 - INFO - tqdm - batch_loss: 0.3236, loss: 0.5911 ||: 95%|#########5| 1105/1163 [02:41<00:10, 5.77it/s]
2023-05-13 15:50:57,210 - INFO - tqdm - batch_loss: 0.0233, loss: 0.5959 ||: 100%|#########9| 1158/1163 [02:49<00:00, 6.49it/s]
2023-05-13 15:50:57,388 - INFO - tqdm - batch_loss: 1.2748, loss: 0.5965 ||: 100%|#########9| 1159/1163 [02:49<00:00, 6.21it/s]
2023-05-13 15:50:57,493 - INFO - tqdm - batch_loss: 0.0014, loss: 0.5960 ||: 100%|#########9| 1160/1163 [02:49<00:00, 6.91it/s]
2023-05-13 15:50:57,609 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5955 ||: 100%|#########9| 1161/1163 [02:49<00:00, 7.34it/s]
2023-05-13 15:50:57,739 - INFO - tqdm - batch_loss: 0.0160, loss: 0.5950 ||: 100%|#########9| 1162/1163 [02:49<00:00, 7.44it/s]
2023-05-13 15:50:57,842 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5945 ||: 100%|##########| 1163/1163 [02:49<00:00, 8.00it/s]
2023-05-13 15:50:57,843 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5945 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.85it/s]
2023-05-13 15:50:57,844 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:50:57,844 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2934.998 | N/A
2023-05-13 15:50:57,844 - INFO - allennlp.training.callbacks.console_logger - loss | 0.594 | N/A
2023-05-13 15:50:57,844 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 15:51:03,609 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:55.578941
2023-05-13 15:51:03,610 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:56:00
2023-05-13 15:51:03,610 - INFO - allennlp.training.gradient_descent_trainer - Epoch 6/24
2023-05-13 15:51:03,610 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:51:03,611 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:51:03,612 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:51:03,613 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:51:13,643 - INFO - tqdm - batch_loss: 1.7388, loss: 0.4483 ||: 6%|5 | 67/1163 [00:10<02:40, 6.85it/s]
2023-05-13 15:51:23,757 - INFO - tqdm - batch_loss: 1.7117, loss: 0.6091 ||: 12%|#1 | 136/1163 [00:20<03:01, 5.67it/s]
2023-05-13 15:51:33,954 - INFO - tqdm - batch_loss: 0.0032, loss: 0.5447 ||: 18%|#7 | 208/1163 [00:30<01:58, 8.07it/s]
2023-05-13 15:51:44,069 - INFO - tqdm - batch_loss: 0.0013, loss: 0.5508 ||: 23%|##3 | 269/1163 [00:40<02:12, 6.74it/s]
2023-05-13 15:51:54,086 - INFO - tqdm - batch_loss: 0.0021, loss: 0.5191 ||: 30%|##9 | 344/1163 [00:50<02:20, 5.85it/s]
2023-05-13 15:52:04,228 - INFO - tqdm - batch_loss: 1.3746, loss: 0.5569 ||: 35%|###5 | 409/1163 [01:00<01:48, 6.92it/s]
2023-05-13 15:52:14,275 - INFO - tqdm - batch_loss: 0.0004, loss: 0.5285 ||: 41%|####1 | 479/1163 [01:10<02:24, 4.74it/s]
2023-05-13 15:52:24,338 - INFO - tqdm - batch_loss: 0.0025, loss: 0.5593 ||: 47%|####7 | 549/1163 [01:20<01:23, 7.39it/s]
2023-05-13 15:52:34,401 - INFO - tqdm - batch_loss: 1.4059, loss: 0.5484 ||: 53%|#####2 | 614/1163 [01:30<02:00, 4.57it/s]
2023-05-13 15:52:44,499 - INFO - tqdm - batch_loss: 0.0000, loss: 0.5341 ||: 59%|#####9 | 689/1163 [01:40<00:55, 8.56it/s]
2023-05-13 15:52:54,528 - INFO - tqdm - batch_loss: 0.0006, loss: 0.5468 ||: 65%|######4 | 751/1163 [01:50<01:03, 6.47it/s]
2023-05-13 15:53:04,593 - INFO - tqdm - batch_loss: 0.0041, loss: 0.5594 ||: 71%|####### | 825/1163 [02:00<00:52, 6.38it/s]
2023-05-13 15:53:14,607 - INFO - tqdm - batch_loss: 0.0007, loss: 0.5706 ||: 76%|#######6 | 886/1163 [02:10<00:32, 8.56it/s]
2023-05-13 15:53:24,647 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5520 ||: 83%|########2 | 962/1163 [02:21<00:35, 5.68it/s]
2023-05-13 15:53:34,743 - INFO - tqdm - batch_loss: 1.6315, loss: 0.5401 ||: 89%|########8 | 1030/1163 [02:31<00:17, 7.56it/s]
2023-05-13 15:53:44,828 - INFO - tqdm - batch_loss: 6.3714, loss: 0.5514 ||: 94%|#########4| 1097/1163 [02:41<00:16, 4.10it/s]
2023-05-13 15:53:53,570 - INFO - tqdm - batch_loss: 0.0000, loss: 0.5627 ||: 100%|#########9| 1158/1163 [02:49<00:00, 7.58it/s]
2023-05-13 15:53:53,796 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5618 ||: 100%|#########9| 1160/1163 [02:50<00:00, 8.03it/s]
2023-05-13 15:53:53,984 - INFO - tqdm - batch_loss: 0.0004, loss: 0.5608 ||: 100%|#########9| 1162/1163 [02:50<00:00, 8.80it/s]
2023-05-13 15:53:54,083 - INFO - tqdm - batch_loss: 0.0051, loss: 0.5603 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.82it/s]
2023-05-13 15:53:54,084 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:53:54,084 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.544 | N/A
2023-05-13 15:53:54,084 - INFO - allennlp.training.callbacks.console_logger - loss | 0.560 | N/A
2023-05-13 15:53:54,084 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 15:54:01,822 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:58.212220
2023-05-13 15:54:01,822 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:53:01
2023-05-13 15:54:01,823 - INFO - allennlp.training.gradient_descent_trainer - Epoch 7/24
2023-05-13 15:54:01,823 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:54:01,823 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:54:01,825 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:54:01,825 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:54:11,903 - INFO - tqdm - batch_loss: 1.2019, loss: 0.3721 ||: 6%|6 | 74/1163 [00:10<02:34, 7.04it/s]
2023-05-13 15:54:21,941 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5702 ||: 12%|#1 | 136/1163 [00:20<02:56, 5.82it/s]
2023-05-13 15:54:32,028 - INFO - tqdm - batch_loss: 2.3881, loss: 0.6989 ||: 18%|#7 | 205/1163 [00:30<03:18, 4.82it/s]
2023-05-13 15:54:42,075 - INFO - tqdm - batch_loss: 0.0025, loss: 0.6953 ||: 23%|##3 | 269/1163 [00:40<01:57, 7.59it/s]
2023-05-13 15:54:52,232 - INFO - tqdm - batch_loss: 0.0002, loss: 0.6319 ||: 29%|##9 | 341/1163 [00:50<03:00, 4.56it/s]
2023-05-13 15:55:02,308 - INFO - tqdm - batch_loss: 0.0062, loss: 0.6513 ||: 35%|###4 | 407/1163 [01:00<02:02, 6.15it/s]
2023-05-13 15:55:12,377 - INFO - tqdm - batch_loss: 0.0027, loss: 0.6308 ||: 41%|#### | 473/1163 [01:10<01:46, 6.48it/s]
2023-05-13 15:55:22,609 - INFO - tqdm - batch_loss: 0.0024, loss: 0.6152 ||: 47%|####7 | 547/1163 [01:20<01:23, 7.38it/s]
2023-05-13 15:55:32,751 - INFO - tqdm - batch_loss: 0.0002, loss: 0.5959 ||: 53%|#####2 | 613/1163 [01:30<01:14, 7.34it/s]
2023-05-13 15:55:42,869 - INFO - tqdm - batch_loss: 0.0005, loss: 0.5768 ||: 59%|#####9 | 691/1163 [01:41<01:13, 6.43it/s]
2023-05-13 15:55:52,963 - INFO - tqdm - batch_loss: 0.0008, loss: 0.5970 ||: 65%|######4 | 752/1163 [01:51<00:50, 8.22it/s]
2023-05-13 15:56:03,146 - INFO - tqdm - batch_loss: 2.3033, loss: 0.5900 ||: 70%|####### | 818/1163 [02:01<01:11, 4.85it/s]
2023-05-13 15:56:13,182 - INFO - tqdm - batch_loss: 0.0002, loss: 0.5649 ||: 77%|#######6 | 890/1163 [02:11<00:32, 8.50it/s]
2023-05-13 15:56:23,202 - INFO - tqdm - batch_loss: 0.0008, loss: 0.5669 ||: 83%|########2 | 960/1163 [02:21<00:41, 4.91it/s]
2023-05-13 15:56:33,361 - INFO - tqdm - batch_loss: 1.9409, loss: 0.5561 ||: 89%|########8 | 1034/1163 [02:31<00:16, 7.73it/s]
2023-05-13 15:56:43,417 - INFO - tqdm - batch_loss: 0.6550, loss: 0.5436 ||: 94%|#########4| 1094/1163 [02:41<00:08, 8.03it/s]
2023-05-13 15:56:51,687 - INFO - tqdm - batch_loss: 0.0020, loss: 0.5399 ||: 100%|#########9| 1158/1163 [02:49<00:00, 7.18it/s]
2023-05-13 15:56:51,857 - INFO - tqdm - batch_loss: 0.0003, loss: 0.5394 ||: 100%|#########9| 1159/1163 [02:50<00:00, 6.75it/s]
2023-05-13 15:56:51,999 - INFO - tqdm - batch_loss: 0.0001, loss: 0.5390 ||: 100%|#########9| 1160/1163 [02:50<00:00, 6.83it/s]
2023-05-13 15:56:52,272 - INFO - tqdm - batch_loss: 0.0018, loss: 0.5383 ||: 100%|#########9| 1162/1163 [02:50<00:00, 7.05it/s]
2023-05-13 15:56:52,367 - INFO - tqdm - batch_loss: 1.1968, loss: 0.5388 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.82it/s]
2023-05-13 15:56:52,368 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:56:52,368 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2934.998 | N/A
2023-05-13 15:56:52,368 - INFO - allennlp.training.callbacks.console_logger - loss | 0.539 | N/A
2023-05-13 15:56:52,369 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 15:57:02,820 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:00.997079
2023-05-13 15:57:02,820 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:50:08
2023-05-13 15:57:02,820 - INFO - allennlp.training.gradient_descent_trainer - Epoch 8/24
2023-05-13 15:57:02,820 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 15:57:02,821 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 15:57:02,822 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 15:57:02,822 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 15:57:12,855 - INFO - tqdm - batch_loss: 0.1027, loss: 0.6887 ||: 6%|5 | 68/1163 [00:10<02:58, 6.14it/s]
2023-05-13 15:57:22,883 - INFO - tqdm - batch_loss: 1.9845, loss: 0.6276 ||: 12%|#1 | 136/1163 [00:20<02:14, 7.62it/s]
2023-05-13 15:57:33,016 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4995 ||: 17%|#7 | 203/1163 [00:30<02:52, 5.58it/s]
2023-05-13 15:57:43,159 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4736 ||: 24%|##3 | 275/1163 [00:40<02:09, 6.88it/s]
2023-05-13 15:57:53,323 - INFO - tqdm - batch_loss: 0.1735, loss: 0.4528 ||: 29%|##9 | 339/1163 [00:50<01:59, 6.88it/s]
2023-05-13 15:58:03,462 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4489 ||: 36%|###5 | 413/1163 [01:00<01:55, 6.47it/s]
2023-05-13 15:58:13,584 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4201 ||: 41%|####1 | 480/1163 [01:10<01:22, 8.30it/s]
2023-05-13 15:58:23,716 - INFO - tqdm - batch_loss: 0.0016, loss: 0.4203 ||: 47%|####7 | 549/1163 [01:20<02:12, 4.64it/s]
2023-05-13 15:58:33,769 - INFO - tqdm - batch_loss: 0.0405, loss: 0.4151 ||: 53%|#####3 | 619/1163 [01:30<01:20, 6.74it/s]
2023-05-13 15:58:43,918 - INFO - tqdm - batch_loss: 1.0922, loss: 0.4000 ||: 59%|#####9 | 689/1163 [01:41<01:16, 6.19it/s]
2023-05-13 15:58:54,097 - INFO - tqdm - batch_loss: 0.0010, loss: 0.3906 ||: 66%|######5 | 766/1163 [01:51<00:50, 7.88it/s]
2023-05-13 15:59:04,182 - INFO - tqdm - batch_loss: 0.0010, loss: 0.3880 ||: 71%|#######1 | 830/1163 [02:01<00:54, 6.13it/s]
2023-05-13 15:59:14,191 - INFO - tqdm - batch_loss: 8.1618, loss: 0.4068 ||: 78%|#######7 | 903/1163 [02:11<00:45, 5.68it/s]
2023-05-13 15:59:24,352 - INFO - tqdm - batch_loss: 0.0006, loss: 0.4172 ||: 83%|########3 | 966/1163 [02:21<00:25, 7.58it/s]
2023-05-13 15:59:34,367 - INFO - tqdm - batch_loss: 0.0001, loss: 0.4116 ||: 89%|########9 | 1038/1163 [02:31<00:15, 7.86it/s]
2023-05-13 15:59:44,540 - INFO - tqdm - batch_loss: 2.1485, loss: 0.4315 ||: 95%|#########5| 1107/1163 [02:41<00:08, 6.83it/s]
2023-05-13 15:59:51,984 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4333 ||: 100%|#########9| 1158/1163 [02:49<00:00, 5.78it/s]
2023-05-13 15:59:52,214 - INFO - tqdm - batch_loss: 0.0069, loss: 0.4329 ||: 100%|#########9| 1159/1163 [02:49<00:00, 5.26it/s]
2023-05-13 15:59:52,326 - INFO - tqdm - batch_loss: 0.0004, loss: 0.4326 ||: 100%|#########9| 1160/1163 [02:49<00:00, 6.00it/s]
2023-05-13 15:59:52,533 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4322 ||: 100%|#########9| 1161/1163 [02:49<00:00, 5.59it/s]
2023-05-13 15:59:52,651 - INFO - tqdm - batch_loss: 2.2003, loss: 0.4337 ||: 100%|#########9| 1162/1163 [02:49<00:00, 6.23it/s]
2023-05-13 15:59:52,841 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4333 ||: 100%|##########| 1163/1163 [02:50<00:00, 5.90it/s]
2023-05-13 15:59:52,843 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4333 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.84it/s]
2023-05-13 15:59:52,845 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 15:59:52,845 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 3001.037 | N/A
2023-05-13 15:59:52,845 - INFO - allennlp.training.callbacks.console_logger - loss | 0.433 | N/A
2023-05-13 15:59:52,845 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:00:03,290 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:00.470288
2023-05-13 16:00:03,291 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:47:17
2023-05-13 16:00:03,291 - INFO - allennlp.training.gradient_descent_trainer - Epoch 9/24
2023-05-13 16:00:03,291 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:00:03,291 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:00:03,293 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:00:03,293 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:00:13,369 - INFO - tqdm - batch_loss: 0.0055, loss: 0.6884 ||: 5%|5 | 63/1163 [00:10<03:32, 5.17it/s]
2023-05-13 16:00:23,383 - INFO - tqdm - batch_loss: 0.0001, loss: 0.6140 ||: 12%|#1 | 138/1163 [00:20<02:18, 7.43it/s]
2023-05-13 16:00:33,428 - INFO - tqdm - batch_loss: 0.0003, loss: 0.5652 ||: 17%|#7 | 201/1163 [00:30<02:11, 7.31it/s]
2023-05-13 16:00:43,503 - INFO - tqdm - batch_loss: 0.0065, loss: 0.4630 ||: 23%|##3 | 273/1163 [00:40<02:36, 5.69it/s]
2023-05-13 16:00:53,585 - INFO - tqdm - batch_loss: 0.2724, loss: 0.4683 ||: 29%|##8 | 337/1163 [00:50<01:51, 7.38it/s]
2023-05-13 16:01:03,644 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4572 ||: 35%|###4 | 406/1163 [01:00<02:00, 6.29it/s]
2023-05-13 16:01:13,753 - INFO - tqdm - batch_loss: 0.0001, loss: 0.4654 ||: 41%|#### | 474/1163 [01:10<01:24, 8.13it/s]
2023-05-13 16:01:23,805 - INFO - tqdm - batch_loss: 0.2441, loss: 0.4388 ||: 46%|####6 | 539/1163 [01:20<02:03, 5.04it/s]
2023-05-13 16:01:34,094 - INFO - tqdm - batch_loss: 24.6751, loss: 0.4852 ||: 53%|#####2 | 612/1163 [01:30<01:38, 5.60it/s]
2023-05-13 16:01:44,210 - INFO - tqdm - batch_loss: 0.0001, loss: 0.4554 ||: 58%|#####8 | 678/1163 [01:40<01:02, 7.74it/s]
2023-05-13 16:01:54,249 - INFO - tqdm - batch_loss: 0.0009, loss: 0.4693 ||: 65%|######4 | 755/1163 [01:50<01:04, 6.37it/s]
2023-05-13 16:02:04,271 - INFO - tqdm - batch_loss: 3.7973, loss: 0.4722 ||: 70%|####### | 816/1163 [02:00<00:51, 6.70it/s]
2023-05-13 16:02:14,476 - INFO - tqdm - batch_loss: 0.0068, loss: 0.4500 ||: 77%|#######6 | 890/1163 [02:11<00:54, 5.00it/s]
2023-05-13 16:02:24,539 - INFO - tqdm - batch_loss: 0.0009, loss: 0.4401 ||: 83%|########2 | 960/1163 [02:21<00:23, 8.54it/s]
2023-05-13 16:02:34,676 - INFO - tqdm - batch_loss: 0.0003, loss: 0.4367 ||: 88%|########8 | 1026/1163 [02:31<00:24, 5.70it/s]
2023-05-13 16:02:44,793 - INFO - tqdm - batch_loss: 1.7678, loss: 0.4235 ||: 95%|#########4| 1103/1163 [02:41<00:07, 8.14it/s]
2023-05-13 16:02:54,475 - INFO - tqdm - batch_loss: 1.7925, loss: 0.4252 ||: 100%|#########9| 1159/1163 [02:51<00:00, 7.57it/s]
2023-05-13 16:02:54,580 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4248 ||: 100%|#########9| 1160/1163 [02:51<00:00, 7.97it/s]
2023-05-13 16:02:54,797 - INFO - tqdm - batch_loss: 2.7110, loss: 0.4264 ||: 100%|#########9| 1162/1163 [02:51<00:00, 8.43it/s]
2023-05-13 16:02:54,959 - INFO - tqdm - batch_loss: 0.0006, loss: 0.4260 ||: 100%|##########| 1163/1163 [02:51<00:00, 7.78it/s]
2023-05-13 16:02:54,961 - INFO - tqdm - batch_loss: 0.0006, loss: 0.4260 ||: 100%|##########| 1163/1163 [02:51<00:00, 6.77it/s]
2023-05-13 16:02:54,963 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:02:54,963 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2968.007 | N/A
2023-05-13 16:02:54,963 - INFO - allennlp.training.callbacks.console_logger - loss | 0.426 | N/A
2023-05-13 16:02:54,963 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:03:07,542 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:04.251681
2023-05-13 16:03:07,543 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:44:27
2023-05-13 16:03:07,543 - INFO - allennlp.training.gradient_descent_trainer - Epoch 10/24
2023-05-13 16:03:07,543 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:03:07,548 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:03:07,549 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:03:07,549 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:03:17,699 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2338 ||: 6%|5 | 67/1163 [00:10<03:02, 6.01it/s]
2023-05-13 16:03:27,919 - INFO - tqdm - batch_loss: 0.0005, loss: 0.2412 ||: 11%|# | 126/1163 [00:20<03:15, 5.31it/s]
2023-05-13 16:03:38,013 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1988 ||: 18%|#7 | 206/1163 [00:30<02:01, 7.88it/s]
2023-05-13 16:03:48,102 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2550 ||: 23%|##2 | 265/1163 [00:40<01:54, 7.84it/s]
2023-05-13 16:03:58,163 - INFO - tqdm - batch_loss: 0.0010, loss: 0.2544 ||: 29%|##8 | 337/1163 [00:50<02:06, 6.52it/s]
2023-05-13 16:04:08,236 - INFO - tqdm - batch_loss: 0.0006, loss: 0.2859 ||: 34%|###4 | 401/1163 [01:00<01:35, 7.96it/s]
2023-05-13 16:04:18,292 - INFO - tqdm - batch_loss: 0.0007, loss: 0.2876 ||: 40%|#### | 466/1163 [01:10<01:36, 7.22it/s]
2023-05-13 16:04:28,335 - INFO - tqdm - batch_loss: 0.7189, loss: 0.2897 ||: 47%|####6 | 541/1163 [01:20<01:30, 6.90it/s]
2023-05-13 16:04:38,398 - INFO - tqdm - batch_loss: 7.6967, loss: 0.3028 ||: 52%|#####2 | 606/1163 [01:30<01:49, 5.08it/s]
2023-05-13 16:04:48,488 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3158 ||: 59%|#####8 | 682/1163 [01:40<01:03, 7.57it/s]
2023-05-13 16:04:58,643 - INFO - tqdm - batch_loss: 0.6838, loss: 0.3267 ||: 65%|######4 | 752/1163 [01:51<00:53, 7.64it/s]
2023-05-13 16:05:08,754 - INFO - tqdm - batch_loss: 0.0004, loss: 0.3143 ||: 71%|####### | 823/1163 [02:01<00:53, 6.31it/s]
2023-05-13 16:05:18,951 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3064 ||: 77%|#######6 | 890/1163 [02:11<00:33, 8.06it/s]
2023-05-13 16:05:29,068 - INFO - tqdm - batch_loss: 0.3256, loss: 0.3049 ||: 83%|########2 | 960/1163 [02:21<00:41, 4.94it/s]
2023-05-13 16:05:39,151 - INFO - tqdm - batch_loss: 1.1842, loss: 0.3222 ||: 89%|########8 | 1034/1163 [02:31<00:17, 7.44it/s]
2023-05-13 16:05:49,216 - INFO - tqdm - batch_loss: 1.0719, loss: 0.3313 ||: 94%|#########4| 1095/1163 [02:41<00:12, 5.29it/s]
2023-05-13 16:05:57,829 - INFO - tqdm - batch_loss: 1.3395, loss: 0.3246 ||: 100%|#########9| 1158/1163 [02:50<00:00, 6.94it/s]
2023-05-13 16:05:57,948 - INFO - tqdm - batch_loss: 0.2997, loss: 0.3246 ||: 100%|#########9| 1159/1163 [02:50<00:00, 7.23it/s]
2023-05-13 16:05:58,135 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3240 ||: 100%|#########9| 1161/1163 [02:50<00:00, 8.29it/s]
2023-05-13 16:05:58,360 - INFO - tqdm - batch_loss: 0.0370, loss: 0.3235 ||: 100%|##########| 1163/1163 [02:50<00:00, 8.49it/s]
2023-05-13 16:05:58,362 - INFO - tqdm - batch_loss: 0.0370, loss: 0.3235 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.81it/s]
2023-05-13 16:05:58,363 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:05:58,364 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2938.032 | N/A
2023-05-13 16:05:58,364 - INFO - allennlp.training.callbacks.console_logger - loss | 0.324 | N/A
2023-05-13 16:05:58,364 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:06:09,908 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:02.364773
2023-05-13 16:06:09,908 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:41:36
2023-05-13 16:06:09,908 - INFO - allennlp.training.gradient_descent_trainer - Epoch 11/24
2023-05-13 16:06:09,908 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:06:09,909 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:06:09,910 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:06:09,910 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:06:19,925 - INFO - tqdm - batch_loss: 0.0001, loss: 0.6903 ||: 6%|6 | 70/1163 [00:10<02:58, 6.12it/s]
2023-05-13 16:06:29,983 - INFO - tqdm - batch_loss: 0.1486, loss: 0.5818 ||: 12%|#1 | 139/1163 [00:20<02:16, 7.50it/s]
2023-05-13 16:06:40,178 - INFO - tqdm - batch_loss: 0.0005, loss: 0.4726 ||: 17%|#7 | 202/1163 [00:30<03:36, 4.44it/s]
2023-05-13 16:06:50,355 - INFO - tqdm - batch_loss: 1.7188, loss: 0.3929 ||: 24%|##3 | 278/1163 [00:40<02:00, 7.35it/s]
2023-05-13 16:07:00,489 - INFO - tqdm - batch_loss: 0.0005, loss: 0.3744 ||: 29%|##9 | 340/1163 [00:50<01:56, 7.04it/s]
2023-05-13 16:07:10,636 - INFO - tqdm - batch_loss: 1.5825, loss: 0.3845 ||: 36%|###5 | 413/1163 [01:00<02:10, 5.74it/s]
2023-05-13 16:07:20,817 - INFO - tqdm - batch_loss: 3.5484, loss: 0.3665 ||: 41%|####1 | 479/1163 [01:10<01:43, 6.64it/s]
2023-05-13 16:07:30,866 - INFO - tqdm - batch_loss: 0.0021, loss: 0.3863 ||: 47%|####7 | 547/1163 [01:20<01:42, 6.01it/s]
2023-05-13 16:07:41,113 - INFO - tqdm - batch_loss: 0.0016, loss: 0.3873 ||: 53%|#####3 | 620/1163 [01:31<01:05, 8.24it/s]
2023-05-13 16:07:51,198 - INFO - tqdm - batch_loss: 0.0020, loss: 0.3672 ||: 59%|#####8 | 684/1163 [01:41<01:08, 7.02it/s]
2023-05-13 16:08:01,268 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3819 ||: 65%|######5 | 761/1163 [01:51<00:52, 7.69it/s]
2023-05-13 16:08:11,326 - INFO - tqdm - batch_loss: 0.0002, loss: 0.4024 ||: 71%|####### | 825/1163 [02:01<00:44, 7.52it/s]
2023-05-13 16:08:21,364 - INFO - tqdm - batch_loss: 0.0000, loss: 0.4020 ||: 77%|#######7 | 896/1163 [02:11<00:46, 5.78it/s]
2023-05-13 16:08:31,417 - INFO - tqdm - batch_loss: 0.0125, loss: 0.3933 ||: 83%|########2 | 965/1163 [02:21<00:26, 7.58it/s]
2023-05-13 16:08:41,447 - INFO - tqdm - batch_loss: 0.0005, loss: 0.3872 ||: 88%|########8 | 1029/1163 [02:31<00:21, 6.36it/s]
2023-05-13 16:08:51,504 - INFO - tqdm - batch_loss: 0.0009, loss: 0.3861 ||: 95%|#########5| 1106/1163 [02:41<00:06, 8.97it/s]
2023-05-13 16:08:59,723 - INFO - tqdm - batch_loss: 1.6099, loss: 0.3823 ||: 100%|#########9| 1158/1163 [02:49<00:00, 8.01it/s]
2023-05-13 16:08:59,826 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3820 ||: 100%|#########9| 1159/1163 [02:49<00:00, 8.33it/s]
2023-05-13 16:09:00,100 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3813 ||: 100%|#########9| 1161/1163 [02:50<00:00, 7.90it/s]
2023-05-13 16:09:00,240 - INFO - tqdm - batch_loss: 0.8730, loss: 0.3817 ||: 100%|#########9| 1162/1163 [02:50<00:00, 7.74it/s]
2023-05-13 16:09:00,367 - INFO - tqdm - batch_loss: 0.0004, loss: 0.3814 ||: 100%|##########| 1163/1163 [02:50<00:00, 7.76it/s]
2023-05-13 16:09:00,369 - INFO - tqdm - batch_loss: 0.0004, loss: 0.3814 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.82it/s]
2023-05-13 16:09:00,371 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:09:00,371 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2934.998 | N/A
2023-05-13 16:09:00,371 - INFO - allennlp.training.callbacks.console_logger - loss | 0.381 | N/A
2023-05-13 16:09:00,371 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:09:05,954 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:56.045346
2023-05-13 16:09:05,954 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:38:42
2023-05-13 16:09:05,954 - INFO - allennlp.training.gradient_descent_trainer - Epoch 12/24
2023-05-13 16:09:05,954 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:09:05,954 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:09:05,956 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:09:05,956 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:09:16,072 - INFO - tqdm - batch_loss: 0.0027, loss: 0.3461 ||: 6%|5 | 65/1163 [00:10<02:38, 6.92it/s]
2023-05-13 16:09:26,139 - INFO - tqdm - batch_loss: 0.0013, loss: 0.3808 ||: 12%|#2 | 140/1163 [00:20<02:30, 6.79it/s]
2023-05-13 16:09:36,255 - INFO - tqdm - batch_loss: 0.0013, loss: 0.3614 ||: 18%|#7 | 204/1163 [00:30<02:02, 7.85it/s]
2023-05-13 16:09:46,300 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3293 ||: 23%|##3 | 272/1163 [00:40<02:15, 6.55it/s]
2023-05-13 16:09:56,424 - INFO - tqdm - batch_loss: 0.0008, loss: 0.3344 ||: 30%|##9 | 347/1163 [00:50<01:38, 8.25it/s]
2023-05-13 16:10:06,552 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3372 ||: 35%|###5 | 411/1163 [01:00<01:34, 7.96it/s]
2023-05-13 16:10:16,609 - INFO - tqdm - batch_loss: 0.0005, loss: 0.3169 ||: 42%|####2 | 489/1163 [01:10<01:40, 6.73it/s]
2023-05-13 16:10:26,657 - INFO - tqdm - batch_loss: 0.0021, loss: 0.3294 ||: 47%|####7 | 552/1163 [01:20<01:23, 7.34it/s]
2023-05-13 16:10:36,668 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3124 ||: 53%|#####3 | 621/1163 [01:30<01:21, 6.64it/s]
2023-05-13 16:10:46,815 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3128 ||: 60%|###### | 699/1163 [01:40<01:00, 7.71it/s]
2023-05-13 16:10:56,966 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3486 ||: 66%|######5 | 763/1163 [01:51<01:06, 6.05it/s]
2023-05-13 16:11:07,069 - INFO - tqdm - batch_loss: 0.0007, loss: 0.3365 ||: 72%|#######2 | 840/1163 [02:01<00:42, 7.61it/s]
2023-05-13 16:11:17,089 - INFO - tqdm - batch_loss: 1.5970, loss: 0.3518 ||: 78%|#######7 | 903/1163 [02:11<00:43, 5.94it/s]
2023-05-13 16:11:27,305 - INFO - tqdm - batch_loss: 0.8028, loss: 0.3495 ||: 83%|########2 | 965/1163 [02:21<00:49, 3.98it/s]
2023-05-13 16:11:37,341 - INFO - tqdm - batch_loss: 0.0020, loss: 0.3517 ||: 89%|########8 | 1034/1163 [02:31<00:17, 7.45it/s]
2023-05-13 16:11:47,517 - INFO - tqdm - batch_loss: 0.0002, loss: 0.3544 ||: 95%|#########4| 1102/1163 [02:41<00:07, 8.47it/s]
2023-05-13 16:11:54,792 - INFO - tqdm - batch_loss: 0.0004, loss: 0.3493 ||: 100%|#########9| 1158/1163 [02:48<00:00, 8.12it/s]
2023-05-13 16:11:54,924 - INFO - tqdm - batch_loss: 0.0002, loss: 0.3490 ||: 100%|#########9| 1159/1163 [02:48<00:00, 7.96it/s]
2023-05-13 16:11:55,070 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3487 ||: 100%|#########9| 1160/1163 [02:49<00:00, 7.63it/s]
2023-05-13 16:11:55,228 - INFO - tqdm - batch_loss: 0.0019, loss: 0.3484 ||: 100%|#########9| 1161/1163 [02:49<00:00, 7.20it/s]
2023-05-13 16:11:55,514 - INFO - tqdm - batch_loss: 1.7003, loss: 0.3493 ||: 100%|##########| 1163/1163 [02:49<00:00, 7.10it/s]
2023-05-13 16:11:55,516 - INFO - tqdm - batch_loss: 1.7003, loss: 0.3493 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.86it/s]
2023-05-13 16:11:55,519 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:11:55,519 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2999.276 | N/A
2023-05-13 16:11:55,519 - INFO - allennlp.training.callbacks.console_logger - loss | 0.349 | N/A
2023-05-13 16:11:55,519 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:12:05,134 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:59.179658
2023-05-13 16:12:05,134 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:35:40
2023-05-13 16:12:05,134 - INFO - allennlp.training.gradient_descent_trainer - Epoch 13/24
2023-05-13 16:12:05,134 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:12:05,135 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:12:05,136 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:12:05,136 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:12:15,281 - INFO - tqdm - batch_loss: 2.1133, loss: 0.3438 ||: 6%|6 | 73/1163 [00:10<03:02, 5.98it/s]
2023-05-13 16:12:25,357 - INFO - tqdm - batch_loss: 0.0035, loss: 0.3443 ||: 12%|#2 | 145/1163 [00:20<02:09, 7.87it/s]
2023-05-13 16:12:35,380 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3166 ||: 18%|#8 | 210/1163 [00:30<02:50, 5.57it/s]
2023-05-13 16:12:45,526 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2853 ||: 24%|##4 | 284/1163 [00:40<01:48, 8.14it/s]
2023-05-13 16:12:55,574 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2773 ||: 30%|##9 | 346/1163 [00:50<02:02, 6.68it/s]
2023-05-13 16:13:05,739 - INFO - tqdm - batch_loss: 2.9146, loss: 0.2635 ||: 36%|###6 | 420/1163 [01:00<02:22, 5.22it/s]
2023-05-13 16:13:15,803 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2708 ||: 42%|####2 | 491/1163 [01:10<01:33, 7.17it/s]
2023-05-13 16:13:25,902 - INFO - tqdm - batch_loss: 3.0192, loss: 0.2721 ||: 48%|####7 | 558/1163 [01:20<02:02, 4.94it/s]
2023-05-13 16:13:35,958 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2635 ||: 55%|#####4 | 635/1163 [01:30<01:16, 6.90it/s]
2023-05-13 16:13:45,981 - INFO - tqdm - batch_loss: 0.0006, loss: 0.2787 ||: 60%|#####9 | 695/1163 [01:40<01:15, 6.20it/s]
2023-05-13 16:13:55,994 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2841 ||: 66%|######5 | 767/1163 [01:50<00:58, 6.77it/s]
2023-05-13 16:14:06,135 - INFO - tqdm - batch_loss: 0.0131, loss: 0.2909 ||: 72%|#######2 | 840/1163 [02:00<00:44, 7.33it/s]
2023-05-13 16:14:16,158 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3007 ||: 78%|#######7 | 902/1163 [02:11<00:39, 6.58it/s]
2023-05-13 16:14:26,264 - INFO - tqdm - batch_loss: 0.0002, loss: 0.3035 ||: 84%|########4 | 977/1163 [02:21<00:25, 7.27it/s]
2023-05-13 16:14:36,327 - INFO - tqdm - batch_loss: 0.0001, loss: 0.3083 ||: 89%|########9 | 1040/1163 [02:31<00:17, 7.08it/s]
2023-05-13 16:14:46,435 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3042 ||: 96%|#########5| 1115/1163 [02:41<00:07, 6.73it/s]
2023-05-13 16:14:52,613 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3117 ||: 100%|#########9| 1158/1163 [02:47<00:00, 8.46it/s]
2023-05-13 16:14:52,758 - INFO - tqdm - batch_loss: 0.0002, loss: 0.3114 ||: 100%|#########9| 1159/1163 [02:47<00:00, 8.10it/s]
2023-05-13 16:14:52,929 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3111 ||: 100%|#########9| 1160/1163 [02:47<00:00, 7.47it/s]
2023-05-13 16:14:53,127 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3106 ||: 100%|#########9| 1162/1163 [02:47<00:00, 8.30it/s]
2023-05-13 16:14:53,314 - INFO - tqdm - batch_loss: 2.9943, loss: 0.3129 ||: 100%|##########| 1163/1163 [02:48<00:00, 7.42it/s]
2023-05-13 16:14:53,315 - INFO - tqdm - batch_loss: 2.9943, loss: 0.3129 ||: 100%|##########| 1163/1163 [02:48<00:00, 6.92it/s]
2023-05-13 16:14:53,317 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:14:53,317 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2985.339 | N/A
2023-05-13 16:14:53,317 - INFO - allennlp.training.callbacks.console_logger - loss | 0.313 | N/A
2023-05-13 16:14:53,317 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:15:01,161 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:02:56.026618
2023-05-13 16:15:01,161 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:32:41
2023-05-13 16:15:01,161 - INFO - allennlp.training.gradient_descent_trainer - Epoch 14/24
2023-05-13 16:15:01,161 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:15:01,162 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:15:01,163 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:15:01,164 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:15:11,186 - INFO - tqdm - batch_loss: 0.0024, loss: 0.1017 ||: 5%|5 | 62/1163 [00:10<02:25, 7.55it/s]
2023-05-13 16:15:21,323 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1507 ||: 11%|#1 | 129/1163 [00:20<03:03, 5.62it/s]
2023-05-13 16:15:31,361 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1582 ||: 17%|#7 | 200/1163 [00:30<02:34, 6.24it/s]
2023-05-13 16:15:41,368 - INFO - tqdm - batch_loss: 0.0425, loss: 0.1548 ||: 23%|##2 | 262/1163 [00:40<02:18, 6.49it/s]
2023-05-13 16:15:51,402 - INFO - tqdm - batch_loss: 0.0978, loss: 0.1943 ||: 29%|##9 | 342/1163 [00:50<02:07, 6.45it/s]
2023-05-13 16:16:01,484 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1848 ||: 36%|###5 | 414/1163 [01:00<01:16, 9.73it/s]
2023-05-13 16:16:11,522 - INFO - tqdm - batch_loss: 0.0015, loss: 0.2171 ||: 41%|####1 | 482/1163 [01:10<02:39, 4.28it/s]
2023-05-13 16:16:21,677 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2159 ||: 48%|####7 | 553/1163 [01:20<01:41, 6.00it/s]
2023-05-13 16:16:31,868 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2102 ||: 53%|#####2 | 615/1163 [01:30<01:35, 5.76it/s]
2023-05-13 16:16:42,049 - INFO - tqdm - batch_loss: 1.1751, loss: 0.2188 ||: 59%|#####9 | 689/1163 [01:40<01:13, 6.49it/s]
2023-05-13 16:16:52,224 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2278 ||: 65%|######4 | 754/1163 [01:51<00:49, 8.26it/s]
2023-05-13 16:17:02,351 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2280 ||: 71%|#######1 | 826/1163 [02:01<00:54, 6.20it/s]
2023-05-13 16:17:12,436 - INFO - tqdm - batch_loss: 1.1750, loss: 0.2286 ||: 77%|#######6 | 894/1163 [02:11<00:42, 6.26it/s]
2023-05-13 16:17:22,476 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2333 ||: 82%|########2 | 959/1163 [02:21<00:29, 6.98it/s]
2023-05-13 16:17:32,574 - INFO - tqdm - batch_loss: 0.1215, loss: 0.2354 ||: 89%|########8 | 1031/1163 [02:31<00:16, 7.97it/s]
2023-05-13 16:17:42,611 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2441 ||: 94%|#########4| 1095/1163 [02:41<00:07, 8.61it/s]
2023-05-13 16:17:51,386 - INFO - tqdm - batch_loss: 0.0004, loss: 0.2385 ||: 100%|#########9| 1158/1163 [02:50<00:01, 4.65it/s]
2023-05-13 16:17:51,513 - INFO - tqdm - batch_loss: 0.0023, loss: 0.2383 ||: 100%|#########9| 1159/1163 [02:50<00:00, 5.29it/s]
2023-05-13 16:17:51,751 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2381 ||: 100%|#########9| 1160/1163 [02:50<00:00, 4.92it/s]
2023-05-13 16:17:51,870 - INFO - tqdm - batch_loss: 0.0009, loss: 0.2379 ||: 100%|#########9| 1161/1163 [02:50<00:00, 5.61it/s]
2023-05-13 16:17:52,075 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2377 ||: 100%|#########9| 1162/1163 [02:50<00:00, 5.37it/s]
2023-05-13 16:17:52,320 - INFO - tqdm - batch_loss: 0.4424, loss: 0.2378 ||: 100%|##########| 1163/1163 [02:51<00:00, 4.90it/s]
2023-05-13 16:17:52,324 - INFO - tqdm - batch_loss: 0.4424, loss: 0.2378 ||: 100%|##########| 1163/1163 [02:51<00:00, 6.79it/s]
2023-05-13 16:17:52,327 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:17:52,327 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2939.405 | N/A
2023-05-13 16:17:52,327 - INFO - allennlp.training.callbacks.console_logger - loss | 0.238 | N/A
2023-05-13 16:17:52,327 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:18:12,399 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:11.237693
2023-05-13 16:18:12,399 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:29:43
2023-05-13 16:18:12,399 - INFO - allennlp.training.gradient_descent_trainer - Epoch 15/24
2023-05-13 16:18:12,399 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:18:12,400 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:18:12,401 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:18:12,401 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:18:22,451 - INFO - tqdm - batch_loss: 0.0172, loss: 0.2289 ||: 6%|6 | 74/1163 [00:10<02:29, 7.30it/s]
2023-05-13 16:18:32,668 - INFO - tqdm - batch_loss: 0.3757, loss: 0.2143 ||: 13%|#2 | 146/1163 [00:20<02:16, 7.48it/s]
2023-05-13 16:18:42,675 - INFO - tqdm - batch_loss: 0.8465, loss: 0.1944 ||: 18%|#8 | 213/1163 [00:30<02:47, 5.66it/s]
2023-05-13 16:18:52,689 - INFO - tqdm - batch_loss: 0.0027, loss: 0.2022 ||: 24%|##4 | 281/1163 [00:40<02:12, 6.64it/s]
2023-05-13 16:19:02,734 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1995 ||: 29%|##9 | 339/1163 [00:50<01:56, 7.05it/s]
2023-05-13 16:19:12,767 - INFO - tqdm - batch_loss: 1.8995, loss: 0.2124 ||: 35%|###5 | 411/1163 [01:00<02:41, 4.66it/s]
2023-05-13 16:19:22,856 - INFO - tqdm - batch_loss: 0.0005, loss: 0.1994 ||: 41%|#### | 474/1163 [01:10<01:43, 6.67it/s]
2023-05-13 16:19:32,924 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1954 ||: 47%|####6 | 546/1163 [01:20<01:45, 5.85it/s]
2023-05-13 16:19:43,117 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2033 ||: 53%|#####3 | 621/1163 [01:30<01:01, 8.76it/s]
2023-05-13 16:19:53,263 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2032 ||: 59%|#####9 | 691/1163 [01:40<00:58, 8.11it/s]
2023-05-13 16:20:03,348 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2010 ||: 66%|######6 | 769/1163 [01:50<00:50, 7.76it/s]
2023-05-13 16:20:13,365 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2028 ||: 72%|#######1 | 833/1163 [02:00<00:40, 8.15it/s]
2023-05-13 16:20:23,395 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2064 ||: 77%|#######7 | 897/1163 [02:10<00:55, 4.76it/s]
2023-05-13 16:20:33,441 - INFO - tqdm - batch_loss: 0.0040, loss: 0.2048 ||: 83%|########3 | 968/1163 [02:21<00:26, 7.34it/s]
2023-05-13 16:20:43,454 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2185 ||: 89%|########8 | 1032/1163 [02:31<00:19, 6.68it/s]
2023-05-13 16:20:53,517 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2254 ||: 95%|#########5| 1109/1163 [02:41<00:06, 8.36it/s]
2023-05-13 16:21:01,532 - INFO - tqdm - batch_loss: 0.0005, loss: 0.2249 ||: 100%|#########9| 1158/1163 [02:49<00:00, 7.04it/s]
2023-05-13 16:21:01,696 - INFO - tqdm - batch_loss: 1.4130, loss: 0.2259 ||: 100%|#########9| 1159/1163 [02:49<00:00, 6.76it/s]
2023-05-13 16:21:01,837 - INFO - tqdm - batch_loss: 1.6726, loss: 0.2271 ||: 100%|#########9| 1160/1163 [02:49<00:00, 6.85it/s]
2023-05-13 16:21:01,937 - INFO - tqdm - batch_loss: 0.0003, loss: 0.2269 ||: 100%|#########9| 1161/1163 [02:49<00:00, 7.51it/s]
2023-05-13 16:21:02,089 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2267 ||: 100%|#########9| 1162/1163 [02:49<00:00, 7.22it/s]
2023-05-13 16:21:02,191 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2265 ||: 100%|##########| 1163/1163 [02:49<00:00, 7.82it/s]
2023-05-13 16:21:02,193 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2265 ||: 100%|##########| 1163/1163 [02:49<00:00, 6.85it/s]
2023-05-13 16:21:02,193 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:21:02,195 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:21:14,289 - INFO - tqdm - precision: 0.6667, recall: 0.5385, fscore: 0.5957, batch_loss: 0.1850, loss: 0.5564 ||: 38%|###7 | 6/16 [00:12<00:22, 2.21s/it]
2023-05-13 16:21:26,496 - INFO - tqdm - precision: 0.7724, recall: 0.7216, fscore: 0.7461, batch_loss: 0.8622, loss: 0.6154 ||: 94%|#########3| 15/16 [00:24<00:01, 1.85s/it]
2023-05-13 16:21:27,736 - INFO - tqdm - precision: 0.7706, recall: 0.7039, fscore: 0.7358, batch_loss: 0.8425, loss: 0.6296 ||: 100%|##########| 16/16 [00:25<00:00, 1.67s/it]
2023-05-13 16:21:27,736 - INFO - tqdm - precision: 0.7706, recall: 0.7039, fscore: 0.7358, batch_loss: 0.8425, loss: 0.6296 ||: 100%|##########| 16/16 [00:25<00:00, 1.60s/it]
2023-05-13 16:21:27,737 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:21:27,737 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.736
2023-05-13 16:21:27,737 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2977.857 | N/A
2023-05-13 16:21:27,738 - INFO - allennlp.training.callbacks.console_logger - loss | 0.227 | 0.630
2023-05-13 16:21:27,738 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.771
2023-05-13 16:21:27,738 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.704
2023-05-13 16:21:27,738 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:21:33,381 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:20.981616
2023-05-13 16:21:33,381 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:27:06
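Validation runs for the first time at this epoch, which is why every earlier table shows N/A in the Validation column. The reported fscore is the harmonic mean of precision and recall; recomputing it from the logged values is a quick sanity check (the inputs here are already rounded, hence the last-digit difference):

    # F1 (fscore) as the harmonic mean of the logged precision and recall.
    precision, recall = 0.7706, 0.7039
    fscore = 2 * precision * recall / (precision + recall)
    print(round(fscore, 4))  # -> 0.7357 (logged 0.7358 before rounding)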
2023-05-13 16:21:33,381 - INFO - allennlp.training.gradient_descent_trainer - Epoch 16/24
2023-05-13 16:21:33,381 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:21:33,382 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:21:33,384 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:21:33,385 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:21:43,485 - INFO - tqdm - batch_loss: 0.0015, loss: 0.2244 ||: 6%|6 | 75/1163 [00:10<01:58, 9.17it/s]
2023-05-13 16:21:53,592 - INFO - tqdm - batch_loss: 0.5899, loss: 0.3880 ||: 12%|#1 | 134/1163 [00:20<02:37, 6.54it/s]
2023-05-13 16:22:03,604 - INFO - tqdm - batch_loss: 0.0002, loss: 0.3171 ||: 18%|#7 | 205/1163 [00:30<02:30, 6.37it/s]
2023-05-13 16:22:13,608 - INFO - tqdm - batch_loss: 0.0010, loss: 0.3271 ||: 23%|##3 | 270/1163 [00:40<02:09, 6.88it/s]
2023-05-13 16:22:23,684 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3120 ||: 29%|##9 | 339/1163 [00:50<02:02, 6.73it/s]
2023-05-13 16:22:33,822 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2733 ||: 36%|###5 | 413/1163 [01:00<01:39, 7.52it/s]
2023-05-13 16:22:44,046 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2698 ||: 42%|####1 | 483/1163 [01:10<01:21, 8.38it/s]
2023-05-13 16:22:54,048 - INFO - tqdm - batch_loss: 0.0024, loss: 0.2644 ||: 48%|####7 | 555/1163 [01:20<02:00, 5.03it/s]
2023-05-13 16:23:04,112 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2659 ||: 53%|#####3 | 620/1163 [01:30<01:21, 6.69it/s]
2023-05-13 16:23:14,143 - INFO - tqdm - batch_loss: 0.6394, loss: 0.2576 ||: 59%|#####8 | 681/1163 [01:40<01:53, 4.23it/s]
2023-05-13 16:23:24,271 - INFO - tqdm - batch_loss: 0.9153, loss: 0.2590 ||: 65%|######4 | 753/1163 [01:50<01:01, 6.69it/s]
2023-05-13 16:23:34,401 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2510 ||: 70%|####### | 815/1163 [02:01<00:43, 7.92it/s]
2023-05-13 16:23:44,483 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2405 ||: 77%|#######6 | 892/1163 [02:11<00:37, 7.26it/s]
2023-05-13 16:23:54,523 - INFO - tqdm - batch_loss: 0.0058, loss: 0.2339 ||: 82%|########2 | 958/1163 [02:21<00:25, 8.02it/s]
2023-05-13 16:24:04,568 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2345 ||: 88%|########8 | 1027/1163 [02:31<00:23, 5.86it/s]
2023-05-13 16:24:14,619 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2363 ||: 95%|#########4| 1100/1163 [02:41<00:07, 8.89it/s]
2023-05-13 16:24:23,803 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2315 ||: 100%|#########9| 1158/1163 [02:50<00:00, 6.69it/s]
2023-05-13 16:24:24,033 - INFO - tqdm - batch_loss: 0.1536, loss: 0.2315 ||: 100%|#########9| 1160/1163 [02:50<00:00, 7.39it/s]
2023-05-13 16:24:24,287 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2311 ||: 100%|#########9| 1162/1163 [02:50<00:00, 7.56it/s]
2023-05-13 16:24:24,433 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2309 ||: 100%|##########| 1163/1163 [02:51<00:00, 7.40it/s]
2023-05-13 16:24:24,434 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2309 ||: 100%|##########| 1163/1163 [02:51<00:00, 6.80it/s]
2023-05-13 16:24:24,436 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:24:24,437 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:24:34,512 - INFO - tqdm - precision: 0.7240, recall: 0.8528, fscore: 0.7831, batch_loss: 0.6197, loss: 0.6982 ||: 44%|####3 | 7/16 [00:10<00:12, 1.38s/it]
2023-05-13 16:24:46,763 - INFO - tqdm - precision: 0.6440, recall: 0.8384, fscore: 0.7285, batch_loss: 1.0722, loss: 0.8667 ||: 88%|########7 | 14/16 [00:22<00:04, 2.23s/it]
2023-05-13 16:24:49,364 - INFO - tqdm - precision: 0.6269, recall: 0.8212, fscore: 0.7110, batch_loss: 1.0929, loss: 0.8720 ||: 100%|##########| 16/16 [00:24<00:00, 1.72s/it]
2023-05-13 16:24:49,365 - INFO - tqdm - precision: 0.6269, recall: 0.8212, fscore: 0.7110, batch_loss: 1.0929, loss: 0.8720 ||: 100%|##########| 16/16 [00:24<00:00, 1.56s/it]
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.711
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2936.810 | N/A
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - loss | 0.231 | 0.872
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.627
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.821
2023-05-13 16:24:49,366 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:25:08,133 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:34.751223
2023-05-13 16:25:08,133 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:24:15
2023-05-13 16:25:08,133 - INFO - allennlp.training.gradient_descent_trainer - Epoch 17/24
2023-05-13 16:25:08,133 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:25:08,134 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:25:08,135 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:25:08,136 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:25:18,205 - INFO - tqdm - batch_loss: 0.0010, loss: 0.0676 ||: 6%|5 | 67/1163 [00:10<02:33, 7.16it/s]
2023-05-13 16:25:28,414 - INFO - tqdm - batch_loss: 1.0718, loss: 0.1822 ||: 11%|#1 | 132/1163 [00:20<03:14, 5.31it/s]
2023-05-13 16:25:38,611 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2045 ||: 18%|#7 | 205/1163 [00:30<02:23, 6.66it/s]
2023-05-13 16:25:48,735 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2208 ||: 23%|##3 | 269/1163 [00:40<01:44, 8.57it/s]
2023-05-13 16:25:58,850 - INFO - tqdm - batch_loss: 0.0011, loss: 0.2095 ||: 29%|##9 | 339/1163 [00:50<02:59, 4.60it/s]
2023-05-13 16:26:08,912 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2048 ||: 35%|###4 | 406/1163 [01:00<01:25, 8.81it/s]
2023-05-13 16:26:18,960 - INFO - tqdm - batch_loss: 0.0036, loss: 0.1986 ||: 41%|#### | 473/1163 [01:10<01:54, 6.04it/s]
2023-05-13 16:26:29,034 - INFO - tqdm - batch_loss: 0.3503, loss: 0.2054 ||: 48%|####7 | 553/1163 [01:20<01:08, 8.87it/s]
2023-05-13 16:26:39,063 - INFO - tqdm - batch_loss: 0.0004, loss: 0.2047 ||: 53%|#####2 | 616/1163 [01:30<01:30, 6.02it/s]
2023-05-13 16:26:49,124 - INFO - tqdm - batch_loss: 0.0657, loss: 0.1985 ||: 60%|#####9 | 692/1163 [01:40<01:23, 5.63it/s]
2023-05-13 16:26:59,157 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2078 ||: 65%|######5 | 758/1163 [01:51<00:48, 8.41it/s]
2023-05-13 16:27:09,235 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2141 ||: 71%|#######1 | 827/1163 [02:01<00:50, 6.60it/s]
2023-05-13 16:27:19,327 - INFO - tqdm - batch_loss: 0.2969, loss: 0.2098 ||: 78%|#######7 | 902/1163 [02:11<00:39, 6.69it/s]
2023-05-13 16:27:29,421 - INFO - tqdm - batch_loss: 0.0006, loss: 0.2017 ||: 83%|########3 | 967/1163 [02:21<00:26, 7.29it/s]
2023-05-13 16:27:39,460 - INFO - tqdm - batch_loss: 2.4084, loss: 0.2157 ||: 89%|########9 | 1036/1163 [02:31<00:24, 5.08it/s]
2023-05-13 16:27:49,557 - INFO - tqdm - batch_loss: 2.0941, loss: 0.2162 ||: 95%|#########4| 1104/1163 [02:41<00:08, 6.58it/s]
2023-05-13 16:27:57,316 - INFO - tqdm - batch_loss: 1.3887, loss: 0.2092 ||: 100%|#########9| 1158/1163 [02:49<00:00, 6.64it/s]
2023-05-13 16:27:57,615 - INFO - tqdm - batch_loss: 5.4107, loss: 0.2137 ||: 100%|#########9| 1159/1163 [02:49<00:00, 5.13it/s]
2023-05-13 16:27:57,816 - INFO - tqdm - batch_loss: 0.0018, loss: 0.2135 ||: 100%|#########9| 1160/1163 [02:49<00:00, 5.09it/s]
2023-05-13 16:27:57,951 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2133 ||: 100%|#########9| 1161/1163 [02:49<00:00, 5.61it/s]
2023-05-13 16:27:58,066 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2131 ||: 100%|#########9| 1162/1163 [02:49<00:00, 6.28it/s]
2023-05-13 16:27:58,184 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2129 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.81it/s]
2023-05-13 16:27:58,186 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2129 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.84it/s]
2023-05-13 16:27:58,187 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:27:58,189 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:28:08,482 - INFO - tqdm - precision: 0.7222, recall: 0.8563, fscore: 0.7836, batch_loss: 0.9316, loss: 0.7181 ||: 44%|####3 | 7/16 [00:10<00:11, 1.32s/it]
2023-05-13 16:28:20,718 - INFO - tqdm - precision: 0.6683, recall: 0.8121, fscore: 0.7332, batch_loss: 0.0226, loss: 0.7831 ||: 94%|#########3| 15/16 [00:22<00:01, 1.78s/it]
2023-05-13 16:28:21,553 - INFO - tqdm - precision: 0.6767, recall: 0.8128, fscore: 0.7386, batch_loss: 0.7804, loss: 0.7829 ||: 100%|##########| 16/16 [00:23<00:00, 1.49s/it]
2023-05-13 16:28:21,554 - INFO - tqdm - precision: 0.6767, recall: 0.8128, fscore: 0.7386, batch_loss: 0.7804, loss: 0.7829 ||: 100%|##########| 16/16 [00:23<00:00, 1.46s/it]
2023-05-13 16:28:21,554 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:28:21,554 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.739
2023-05-13 16:28:21,554 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.913 | N/A
2023-05-13 16:28:21,555 - INFO - allennlp.training.callbacks.console_logger - loss | 0.213 | 0.783
2023-05-13 16:28:21,555 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.677
2023-05-13 16:28:21,555 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.813
2023-05-13 16:28:21,555 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:28:27,213 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:19.079963
2023-05-13 16:28:27,213 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:21:25
2023-05-13 16:28:27,214 - INFO - allennlp.training.gradient_descent_trainer - Epoch 18/24
2023-05-13 16:28:27,214 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:28:27,214 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:28:27,216 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:28:27,217 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:28:37,357 - INFO - tqdm - batch_loss: 0.0002, loss: 0.1940 ||: 5%|5 | 63/1163 [00:10<02:27, 7.45it/s]
2023-05-13 16:28:47,457 - INFO - tqdm - batch_loss: 0.2387, loss: 0.4408 ||: 11%|#1 | 130/1163 [00:20<03:12, 5.36it/s]
2023-05-13 16:28:57,524 - INFO - tqdm - batch_loss: 0.0000, loss: 0.3331 ||: 17%|#7 | 200/1163 [00:30<02:15, 7.13it/s]
2023-05-13 16:29:07,707 - INFO - tqdm - batch_loss: 1.2151, loss: 0.3191 ||: 23%|##2 | 264/1163 [00:40<02:36, 5.75it/s]
2023-05-13 16:29:17,779 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2897 ||: 29%|##9 | 342/1163 [00:50<01:46, 7.74it/s]
2023-05-13 16:29:27,810 - INFO - tqdm - batch_loss: 0.0003, loss: 0.2666 ||: 34%|###4 | 401/1163 [01:00<01:39, 7.67it/s]
2023-05-13 16:29:37,856 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2433 ||: 40%|#### | 468/1163 [01:10<01:47, 6.48it/s]
2023-05-13 16:29:47,893 - INFO - tqdm - batch_loss: 0.4973, loss: 0.2556 ||: 46%|####6 | 540/1163 [01:20<01:27, 7.09it/s]
2023-05-13 16:29:57,993 - INFO - tqdm - batch_loss: 0.0038, loss: 0.2574 ||: 52%|#####1 | 604/1163 [01:30<02:06, 4.41it/s]
2023-05-13 16:30:07,994 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2546 ||: 58%|#####8 | 678/1163 [01:40<01:04, 7.54it/s]
2023-05-13 16:30:18,055 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2567 ||: 64%|######3 | 743/1163 [01:50<00:45, 9.16it/s]
2023-05-13 16:30:28,167 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2533 ||: 70%|######9 | 814/1163 [02:00<00:48, 7.18it/s]
2023-05-13 16:30:38,397 - INFO - tqdm - batch_loss: 3.3949, loss: 0.2410 ||: 76%|#######6 | 885/1163 [02:11<00:33, 8.29it/s]
2023-05-13 16:30:48,647 - INFO - tqdm - batch_loss: 0.1329, loss: 0.2352 ||: 82%|########1 | 952/1163 [02:21<00:51, 4.13it/s]
2023-05-13 16:30:58,663 - INFO - tqdm - batch_loss: 0.0002, loss: 0.2346 ||: 89%|########8 | 1030/1163 [02:31<00:15, 8.84it/s]
2023-05-13 16:31:08,750 - INFO - tqdm - batch_loss: 0.0046, loss: 0.2318 ||: 94%|#########4| 1098/1163 [02:41<00:10, 5.97it/s]
2023-05-13 16:31:16,788 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2278 ||: 100%|#########9| 1158/1163 [02:49<00:00, 7.78it/s]
2023-05-13 16:31:16,900 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2276 ||: 100%|#########9| 1159/1163 [02:49<00:00, 8.09it/s]
2023-05-13 16:31:17,112 - INFO - tqdm - batch_loss: 0.0001, loss: 0.2274 ||: 100%|#########9| 1160/1163 [02:49<00:00, 6.67it/s]
2023-05-13 16:31:17,386 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2272 ||: 100%|#########9| 1161/1163 [02:50<00:00, 5.35it/s]
2023-05-13 16:31:17,504 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2270 ||: 100%|#########9| 1162/1163 [02:50<00:00, 6.01it/s]
2023-05-13 16:31:17,620 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2268 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.62it/s]
2023-05-13 16:31:17,622 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2268 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.82it/s]
2023-05-13 16:31:17,623 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:31:17,625 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:31:28,867 - INFO - tqdm - precision: 0.7042, recall: 0.6579, fscore: 0.6803, batch_loss: 0.0059, loss: 0.3997 ||: 31%|###1 | 5/16 [00:11<00:22, 2.09s/it]
2023-05-13 16:31:39,193 - INFO - tqdm - precision: 0.6842, recall: 0.7647, fscore: 0.7222, batch_loss: 0.7475, loss: 0.5877 ||: 81%|########1 | 13/16 [00:21<00:03, 1.31s/it]
2023-05-13 16:31:41,733 - INFO - tqdm - precision: 0.7068, recall: 0.7877, fscore: 0.7450, batch_loss: 0.6993, loss: 0.5861 ||: 100%|##########| 16/16 [00:24<00:00, 1.03s/it]
2023-05-13 16:31:41,734 - INFO - tqdm - precision: 0.7068, recall: 0.7877, fscore: 0.7450, batch_loss: 0.6993, loss: 0.5861 ||: 100%|##########| 16/16 [00:24<00:00, 1.51s/it]
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.745
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.913 | N/A
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - loss | 0.227 | 0.586
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.707
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.788
2023-05-13 16:31:41,735 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:31:47,085 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:19.871180
2023-05-13 16:31:47,085 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:18:26
2023-05-13 16:31:47,085 - INFO - allennlp.training.gradient_descent_trainer - Epoch 19/24
2023-05-13 16:31:47,085 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:31:47,086 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:31:47,087 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:31:47,087 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:31:57,245 - INFO - tqdm - batch_loss: 0.0289, loss: 0.1573 ||: 6%|5 | 65/1163 [00:10<02:19, 7.87it/s]
2023-05-13 16:32:07,372 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1129 ||: 12%|#2 | 141/1163 [00:20<03:11, 5.34it/s]
2023-05-13 16:32:17,432 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1089 ||: 18%|#7 | 204/1163 [00:30<02:22, 6.71it/s]
2023-05-13 16:32:27,433 - INFO - tqdm - batch_loss: 0.0003, loss: 0.1350 ||: 24%|##3 | 274/1163 [00:40<02:13, 6.67it/s]
2023-05-13 16:32:37,495 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1400 ||: 29%|##9 | 342/1163 [00:50<02:03, 6.64it/s]
2023-05-13 16:32:47,505 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1414 ||: 35%|###4 | 407/1163 [01:00<01:39, 7.60it/s]
2023-05-13 16:32:57,676 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1528 ||: 42%|####1 | 483/1163 [01:10<01:49, 6.20it/s]
2023-05-13 16:33:07,740 - INFO - tqdm - batch_loss: 0.0003, loss: 0.1510 ||: 47%|####7 | 549/1163 [01:20<01:37, 6.32it/s]
2023-05-13 16:33:17,927 - INFO - tqdm - batch_loss: 0.0004, loss: 0.1581 ||: 53%|#####2 | 616/1163 [01:30<02:14, 4.06it/s]
2023-05-13 16:33:27,980 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1566 ||: 59%|#####9 | 690/1163 [01:40<00:56, 8.34it/s]
2023-05-13 16:33:38,071 - INFO - tqdm - batch_loss: 1.4459, loss: 0.1582 ||: 64%|######4 | 750/1163 [01:50<01:07, 6.16it/s]
2023-05-13 16:33:48,155 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1552 ||: 71%|#######1 | 827/1163 [02:01<00:45, 7.36it/s]
2023-05-13 16:33:58,196 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1532 ||: 77%|#######6 | 895/1163 [02:11<00:35, 7.51it/s]
2023-05-13 16:34:08,227 - INFO - tqdm - batch_loss: 0.0077, loss: 0.1577 ||: 83%|########2 | 963/1163 [02:21<00:39, 5.10it/s]
2023-05-13 16:34:18,313 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1541 ||: 89%|########8 | 1033/1163 [02:31<00:15, 8.39it/s]
2023-05-13 16:34:28,397 - INFO - tqdm - batch_loss: 0.0004, loss: 0.1548 ||: 94%|#########3| 1092/1163 [02:41<00:11, 6.39it/s]
2023-05-13 16:34:36,654 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1589 ||: 100%|#########9| 1158/1163 [02:49<00:00, 8.29it/s]
2023-05-13 16:34:36,980 - INFO - tqdm - batch_loss: 0.0180, loss: 0.1586 ||: 100%|#########9| 1160/1163 [02:49<00:00, 7.35it/s]
2023-05-13 16:34:37,137 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1585 ||: 100%|#########9| 1161/1163 [02:50<00:00, 7.13it/s]
2023-05-13 16:34:37,282 - INFO - tqdm - batch_loss: 0.0002, loss: 0.1584 ||: 100%|#########9| 1162/1163 [02:50<00:00, 7.07it/s]
2023-05-13 16:34:37,386 - INFO - tqdm - batch_loss: 0.0004, loss: 0.1582 ||: 100%|##########| 1163/1163 [02:50<00:00, 7.56it/s]
2023-05-13 16:34:37,388 - INFO - tqdm - batch_loss: 0.0004, loss: 0.1582 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.83it/s]
2023-05-13 16:34:37,388 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:34:37,390 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:34:47,972 - INFO - tqdm - precision: 0.7667, recall: 0.6715, fscore: 0.7160, batch_loss: 0.4701, loss: 0.7855 ||: 38%|###7 | 6/16 [00:10<00:19, 1.91s/it]
2023-05-13 16:34:58,025 - INFO - tqdm - precision: 0.7471, recall: 0.7111, fscore: 0.7287, batch_loss: 0.0009, loss: 0.6365 ||: 81%|########1 | 13/16 [00:20<00:05, 1.88s/it]
2023-05-13 16:35:04,364 - INFO - tqdm - precision: 0.7616, recall: 0.7318, fscore: 0.7464, batch_loss: 0.4661, loss: 0.6555 ||: 100%|##########| 16/16 [00:26<00:00, 1.96s/it]
2023-05-13 16:35:04,364 - INFO - tqdm - precision: 0.7616, recall: 0.7318, fscore: 0.7464, batch_loss: 0.4661, loss: 0.6555 ||: 100%|##########| 16/16 [00:26<00:00, 1.69s/it]
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.746
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.913 | N/A
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - loss | 0.158 | 0.656
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.762
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.732
2023-05-13 16:35:04,365 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:35:09,867 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:22.782191
2023-05-13 16:35:09,868 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:15:26
2023-05-13 16:35:09,868 - INFO - allennlp.training.gradient_descent_trainer - Epoch 20/24
2023-05-13 16:35:09,868 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:35:09,868 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:35:09,870 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:35:09,870 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:35:19,933 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1120 ||: 5%|5 | 60/1163 [00:10<02:36, 7.03it/s]
2023-05-13 16:35:29,953 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1178 ||: 11%|#1 | 129/1163 [00:20<03:52, 4.45it/s]
2023-05-13 16:35:39,976 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1185 ||: 17%|#7 | 199/1163 [00:30<02:22, 6.77it/s]
2023-05-13 16:35:50,252 - INFO - tqdm - batch_loss: 3.5792, loss: 0.1267 ||: 23%|##3 | 268/1163 [00:40<03:35, 4.16it/s]
2023-05-13 16:36:00,446 - INFO - tqdm - batch_loss: 0.6957, loss: 0.1421 ||: 30%|##9 | 347/1163 [00:50<01:47, 7.58it/s]
2023-05-13 16:36:10,474 - INFO - tqdm - batch_loss: 0.0494, loss: 0.1453 ||: 36%|###5 | 416/1163 [01:00<01:25, 8.78it/s]
2023-05-13 16:36:20,742 - INFO - tqdm - batch_loss: 0.0737, loss: 0.1669 ||: 42%|####2 | 492/1163 [01:10<02:07, 5.27it/s]
2023-05-13 16:36:30,855 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1770 ||: 48%|####8 | 561/1163 [01:20<01:06, 9.09it/s]
2023-05-13 16:36:40,961 - INFO - tqdm - batch_loss: 1.3076, loss: 0.1828 ||: 54%|#####4 | 632/1163 [01:31<01:43, 5.14it/s]
2023-05-13 16:36:50,990 - INFO - tqdm - batch_loss: 0.0193, loss: 0.1910 ||: 61%|###### | 706/1163 [01:41<01:13, 6.21it/s]
2023-05-13 16:37:01,131 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1860 ||: 66%|######6 | 772/1163 [01:51<00:54, 7.16it/s]
2023-05-13 16:37:11,264 - INFO - tqdm - batch_loss: 0.0003, loss: 0.1714 ||: 73%|#######2 | 845/1163 [02:01<00:51, 6.22it/s]
2023-05-13 16:37:21,352 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1829 ||: 78%|#######8 | 910/1163 [02:11<00:33, 7.58it/s]
2023-05-13 16:37:31,400 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1776 ||: 84%|########4 | 978/1163 [02:21<00:23, 7.98it/s]
2023-05-13 16:37:41,468 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1879 ||: 90%|######### | 1050/1163 [02:31<00:16, 6.79it/s]
2023-05-13 16:37:51,796 - INFO - tqdm - batch_loss: 0.0097, loss: 0.1888 ||: 96%|#########5| 1113/1163 [02:41<00:08, 6.22it/s]
2023-05-13 16:37:57,644 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1869 ||: 100%|#########9| 1158/1163 [02:47<00:00, 8.33it/s]
2023-05-13 16:37:57,785 - INFO - tqdm - batch_loss: 1.0622, loss: 0.1877 ||: 100%|#########9| 1159/1163 [02:47<00:00, 8.01it/s]
2023-05-13 16:37:57,961 - INFO - tqdm - batch_loss: 0.0028, loss: 0.1875 ||: 100%|#########9| 1160/1163 [02:48<00:00, 7.27it/s]
2023-05-13 16:37:58,150 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1874 ||: 100%|#########9| 1161/1163 [02:48<00:00, 6.63it/s]
2023-05-13 16:37:58,286 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1872 ||: 100%|#########9| 1162/1163 [02:48<00:00, 6.81it/s]
2023-05-13 16:37:58,385 - INFO - tqdm - batch_loss: 0.0031, loss: 0.1870 ||: 100%|##########| 1163/1163 [02:48<00:00, 6.90it/s]
2023-05-13 16:37:58,386 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:37:58,388 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:38:09,200 - INFO - tqdm - precision: 0.7953, recall: 0.7991, fscore: 0.7972, batch_loss: 0.9909, loss: 0.7392 ||: 50%|##### | 8/16 [00:10<00:07, 1.03it/s]
2023-05-13 16:38:19,467 - INFO - tqdm - precision: 0.7492, recall: 0.7354, fscore: 0.7422, batch_loss: 0.9360, loss: 0.7516 ||: 88%|########7 | 14/16 [00:21<00:03, 1.69s/it]
2023-05-13 16:38:23,675 - INFO - tqdm - precision: 0.7521, recall: 0.7542, fscore: 0.7531, batch_loss: 0.1871, loss: 0.6955 ||: 100%|##########| 16/16 [00:25<00:00, 2.00s/it]
2023-05-13 16:38:23,675 - INFO - tqdm - precision: 0.7521, recall: 0.7542, fscore: 0.7531, batch_loss: 0.1871, loss: 0.6955 ||: 100%|##########| 16/16 [00:25<00:00, 1.58s/it]
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.753
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2990.136 | N/A
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - loss | 0.187 | 0.695
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.752
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.754
2023-05-13 16:38:23,676 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:38:29,307 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:19.438911
2023-05-13 16:38:29,307 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:12:24
2023-05-13 16:38:29,307 - INFO - allennlp.training.gradient_descent_trainer - Epoch 21/24
2023-05-13 16:38:29,307 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:38:29,308 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:38:29,309 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:38:29,309 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:38:39,476 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1125 ||: 6%|5 | 68/1163 [00:10<03:59, 4.57it/s]
2023-05-13 16:38:49,612 - INFO - tqdm - batch_loss: 0.0001, loss: 0.0982 ||: 12%|#2 | 143/1163 [00:20<02:17, 7.44it/s]
2023-05-13 16:38:59,695 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1190 ||: 17%|#7 | 203/1163 [00:30<01:51, 8.61it/s]
2023-05-13 16:39:09,852 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1361 ||: 24%|##3 | 277/1163 [00:40<02:31, 5.85it/s]
2023-05-13 16:39:19,908 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1319 ||: 30%|##9 | 344/1163 [00:50<02:21, 5.80it/s]
2023-05-13 16:39:29,993 - INFO - tqdm - batch_loss: 0.0013, loss: 0.1260 ||: 35%|###5 | 411/1163 [01:00<01:53, 6.65it/s]
2023-05-13 16:39:40,058 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1239 ||: 42%|####1 | 488/1163 [01:10<01:33, 7.19it/s]
2023-05-13 16:39:50,177 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1228 ||: 47%|####7 | 549/1163 [01:20<01:19, 7.75it/s]
2023-05-13 16:40:00,428 - INFO - tqdm - batch_loss: 0.1142, loss: 0.1289 ||: 54%|#####3 | 624/1163 [01:31<01:51, 4.84it/s]
2023-05-13 16:40:10,505 - INFO - tqdm - batch_loss: 1.3430, loss: 0.1407 ||: 60%|#####9 | 694/1163 [01:41<01:01, 7.66it/s]
2023-05-13 16:40:20,527 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1420 ||: 66%|######5 | 765/1163 [01:51<00:56, 7.02it/s]
2023-05-13 16:40:30,536 - INFO - tqdm - batch_loss: 0.1544, loss: 0.1469 ||: 72%|#######2 | 840/1163 [02:01<00:42, 7.54it/s]
2023-05-13 16:40:40,654 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1454 ||: 77%|#######7 | 901/1163 [02:11<00:38, 6.76it/s]
2023-05-13 16:40:50,660 - INFO - tqdm - batch_loss: 1.0540, loss: 0.1487 ||: 83%|########3 | 970/1163 [02:21<00:34, 5.64it/s]
2023-05-13 16:41:00,727 - INFO - tqdm - batch_loss: 0.3312, loss: 0.1442 ||: 89%|########9 | 1037/1163 [02:31<00:18, 6.97it/s]
2023-05-13 16:41:10,744 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1447 ||: 95%|#########4| 1100/1163 [02:41<00:12, 4.94it/s]
2023-05-13 16:41:19,546 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1472 ||: 100%|#########9| 1159/1163 [02:50<00:00, 7.94it/s]
2023-05-13 16:41:19,671 - INFO - tqdm - batch_loss: 1.8138, loss: 0.1487 ||: 100%|#########9| 1160/1163 [02:50<00:00, 7.95it/s]
2023-05-13 16:41:19,817 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1485 ||: 100%|#########9| 1161/1163 [02:50<00:00, 7.65it/s]
2023-05-13 16:41:19,973 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1484 ||: 100%|#########9| 1162/1163 [02:50<00:00, 7.28it/s]
2023-05-13 16:41:20,067 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1483 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.81it/s]
2023-05-13 16:41:20,067 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:41:20,068 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:41:30,622 - INFO - tqdm - precision: 0.8176, recall: 0.7429, fscore: 0.7784, batch_loss: 0.9550, loss: 0.7804 ||: 44%|####3 | 7/16 [00:10<00:14, 1.63s/it]
2023-05-13 16:41:43,484 - INFO - tqdm - precision: 0.7468, recall: 0.7233, fscore: 0.7348, batch_loss: 0.5902, loss: 0.7783 ||: 94%|#########3| 15/16 [00:23<00:01, 1.93s/it]
2023-05-13 16:41:44,175 - INFO - tqdm - precision: 0.7549, recall: 0.7486, fscore: 0.7518, batch_loss: 1.0070, loss: 0.7926 ||: 100%|##########| 16/16 [00:24<00:00, 1.56s/it]
2023-05-13 16:41:44,176 - INFO - tqdm - precision: 0.7549, recall: 0.7486, fscore: 0.7518, batch_loss: 1.0070, loss: 0.7926 ||: 100%|##########| 16/16 [00:24<00:00, 1.51s/it]
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.752
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.913 | N/A
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - loss | 0.148 | 0.793
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.755
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.749
2023-05-13 16:41:44,177 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:41:50,537 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:21.229792
2023-05-13 16:41:50,537 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:09:20
2023-05-13 16:41:50,537 - INFO - allennlp.training.gradient_descent_trainer - Epoch 22/24
2023-05-13 16:41:50,537 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:41:50,538 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:41:50,539 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:41:50,540 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:42:00,566 - INFO - tqdm - batch_loss: 0.0002, loss: 0.1005 ||: 6%|5 | 67/1163 [00:10<02:43, 6.69it/s]
2023-05-13 16:42:10,766 - INFO - tqdm - batch_loss: 0.0000, loss: 0.2283 ||: 12%|#1 | 139/1163 [00:20<02:06, 8.08it/s]
2023-05-13 16:42:20,857 - INFO - tqdm - batch_loss: 0.0005, loss: 0.1988 ||: 18%|#7 | 204/1163 [00:30<02:04, 7.71it/s]
2023-05-13 16:42:30,887 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1670 ||: 24%|##4 | 282/1163 [00:40<02:08, 6.86it/s]
2023-05-13 16:42:40,980 - INFO - tqdm - batch_loss: 0.0089, loss: 0.1560 ||: 30%|##9 | 345/1163 [00:50<02:10, 6.27it/s]
2023-05-13 16:42:51,155 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1539 ||: 36%|###5 | 415/1163 [01:00<02:13, 5.60it/s]
2023-05-13 16:43:01,267 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1421 ||: 42%|####2 | 491/1163 [01:10<01:19, 8.41it/s]
2023-05-13 16:43:11,368 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1459 ||: 48%|####7 | 554/1163 [01:20<01:08, 8.84it/s]
2023-05-13 16:43:21,619 - INFO - tqdm - batch_loss: 0.0537, loss: 0.1392 ||: 54%|#####4 | 631/1163 [01:31<01:28, 5.98it/s]
2023-05-13 16:43:31,846 - INFO - tqdm - batch_loss: 0.0002, loss: 0.1301 ||: 60%|###### | 698/1163 [01:41<01:07, 6.87it/s]
2023-05-13 16:43:42,039 - INFO - tqdm - batch_loss: 0.0002, loss: 0.1293 ||: 66%|######6 | 771/1163 [01:51<01:16, 5.11it/s]
2023-05-13 16:43:52,077 - INFO - tqdm - batch_loss: 0.0004, loss: 0.1314 ||: 73%|#######2 | 844/1163 [02:01<00:42, 7.54it/s]
2023-05-13 16:44:02,237 - INFO - tqdm - batch_loss: 1.4601, loss: 0.1286 ||: 78%|#######8 | 909/1163 [02:11<00:35, 7.25it/s]
2023-05-13 16:44:12,386 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1238 ||: 85%|########4 | 986/1163 [02:21<00:29, 6.06it/s]
2023-05-13 16:44:22,487 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1281 ||: 91%|######### | 1054/1163 [02:31<00:14, 7.30it/s]
2023-05-13 16:44:32,553 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1312 ||: 96%|#########6| 1119/1163 [02:42<00:07, 6.11it/s]
2023-05-13 16:44:38,618 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1313 ||: 100%|#########9| 1159/1163 [02:48<00:00, 7.84it/s]
2023-05-13 16:44:38,747 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1312 ||: 100%|#########9| 1160/1163 [02:48<00:00, 7.81it/s]
2023-05-13 16:44:38,935 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1309 ||: 100%|#########9| 1162/1163 [02:48<00:00, 8.73it/s]
2023-05-13 16:44:39,079 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1308 ||: 100%|##########| 1163/1163 [02:48<00:00, 8.26it/s]
2023-05-13 16:44:39,080 - INFO - tqdm - batch_loss: 0.0001, loss: 0.1308 ||: 100%|##########| 1163/1163 [02:48<00:00, 6.90it/s]
2023-05-13 16:44:39,081 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:44:39,083 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:44:50,051 - INFO - tqdm - precision: 0.7266, recall: 0.6643, fscore: 0.6940, batch_loss: 0.7820, loss: 0.8420 ||: 50%|##### | 8/16 [00:10<00:13, 1.74s/it]
2023-05-13 16:45:00,385 - INFO - tqdm - precision: 0.7645, recall: 0.7346, fscore: 0.7493, batch_loss: 1.0186, loss: 0.8778 ||: 100%|##########| 16/16 [00:21<00:00, 1.40s/it]
2023-05-13 16:45:00,385 - INFO - tqdm - precision: 0.7645, recall: 0.7346, fscore: 0.7493, batch_loss: 1.0186, loss: 0.8778 ||: 100%|##########| 16/16 [00:21<00:00, 1.33s/it]
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.749
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2935.913 | N/A
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - loss | 0.131 | 0.878
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.765
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.735
2023-05-13 16:45:00,386 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:45:13,578 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:23.040485
2023-05-13 16:45:13,578 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:06:14
2023-05-13 16:45:13,578 - INFO - allennlp.training.gradient_descent_trainer - Epoch 23/24
2023-05-13 16:45:13,578 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:45:13,579 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:45:13,580 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:45:13,580 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:45:23,707 - INFO - tqdm - batch_loss: 0.0211, loss: 0.1289 ||: 5%|5 | 63/1163 [00:10<02:42, 6.78it/s]
2023-05-13 16:45:33,764 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1039 ||: 12%|#2 | 141/1163 [00:20<02:29, 6.82it/s]
2023-05-13 16:45:43,936 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1223 ||: 18%|#7 | 205/1163 [00:30<03:58, 4.01it/s]
2023-05-13 16:45:54,041 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1223 ||: 24%|##3 | 277/1163 [00:40<02:21, 6.27it/s]
2023-05-13 16:46:04,041 - INFO - tqdm - batch_loss: 0.8121, loss: 0.1231 ||: 30%|##9 | 345/1163 [00:50<02:09, 6.33it/s]
2023-05-13 16:46:14,226 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1271 ||: 35%|###5 | 408/1163 [01:00<02:25, 5.20it/s]
2023-05-13 16:46:24,242 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1349 ||: 42%|####1 | 485/1163 [01:10<01:39, 6.81it/s]
2023-05-13 16:46:34,330 - INFO - tqdm - batch_loss: 0.0006, loss: 0.1366 ||: 47%|####6 | 544/1163 [01:20<01:33, 6.63it/s]
2023-05-13 16:46:44,386 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1316 ||: 53%|#####2 | 614/1163 [01:30<01:25, 6.43it/s]
2023-05-13 16:46:54,469 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1280 ||: 59%|#####9 | 687/1163 [01:40<00:59, 8.04it/s]
2023-05-13 16:47:04,643 - INFO - tqdm - batch_loss: 0.0008, loss: 0.1318 ||: 64%|######4 | 749/1163 [01:51<01:23, 4.95it/s]
2023-05-13 16:47:14,679 - INFO - tqdm - batch_loss: 0.7808, loss: 0.1279 ||: 71%|#######1 | 828/1163 [02:01<00:45, 7.33it/s]
2023-05-13 16:47:24,740 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1232 ||: 77%|#######7 | 897/1163 [02:11<00:30, 8.81it/s]
2023-05-13 16:47:34,774 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1213 ||: 83%|########3 | 969/1163 [02:21<00:34, 5.59it/s]
2023-05-13 16:47:44,800 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1264 ||: 89%|########9 | 1037/1163 [02:31<00:17, 7.26it/s]
2023-05-13 16:47:55,021 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1266 ||: 94%|#########4| 1099/1163 [02:41<00:12, 5.13it/s]
2023-05-13 16:48:03,161 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1220 ||: 100%|#########9| 1159/1163 [02:49<00:00, 9.03it/s]
2023-05-13 16:48:03,263 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1219 ||: 100%|#########9| 1160/1163 [02:49<00:00, 9.19it/s]
2023-05-13 16:48:03,460 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1218 ||: 100%|#########9| 1161/1163 [02:49<00:00, 7.66it/s]
2023-05-13 16:48:03,645 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1217 ||: 100%|#########9| 1162/1163 [02:50<00:00, 6.92it/s]
2023-05-13 16:48:03,847 - INFO - tqdm - batch_loss: 0.1178, loss: 0.1217 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.25it/s]
2023-05-13 16:48:03,848 - INFO - tqdm - batch_loss: 0.1178, loss: 0.1217 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.83it/s]
2023-05-13 16:48:03,849 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:48:03,851 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:48:14,756 - INFO - tqdm - precision: 0.7903, recall: 0.7903, fscore: 0.7903, batch_loss: 1.1783, loss: 1.0303 ||: 44%|####3 | 7/16 [00:10<00:15, 1.68s/it]
2023-05-13 16:48:25,994 - INFO - tqdm - precision: 0.7340, recall: 0.7201, fscore: 0.7270, batch_loss: 1.3082, loss: 0.9971 ||: 94%|#########3| 15/16 [00:22<00:02, 2.03s/it]
2023-05-13 16:48:26,683 - INFO - tqdm - precision: 0.7458, recall: 0.7458, fscore: 0.7458, batch_loss: 1.0895, loss: 1.0029 ||: 100%|##########| 16/16 [00:22<00:00, 1.63s/it]
2023-05-13 16:48:26,684 - INFO - tqdm - precision: 0.7458, recall: 0.7458, fscore: 0.7458, batch_loss: 1.0895, loss: 1.0029 ||: 100%|##########| 16/16 [00:22<00:00, 1.43s/it]
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.746
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2962.435 | N/A
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - loss | 0.122 | 1.003
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.746
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.746
2023-05-13 16:48:26,685 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:48:32,233 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:18.655121
2023-05-13 16:48:32,234 - INFO - allennlp.training.gradient_descent_trainer - Estimated training time remaining: 0:03:07
2023-05-13 16:48:32,234 - INFO - allennlp.training.gradient_descent_trainer - Epoch 24/24
2023-05-13 16:48:32,234 - INFO - allennlp.training.gradient_descent_trainer - Worker 0 memory usage: 4.4G
2023-05-13 16:48:32,234 - INFO - allennlp.training.gradient_descent_trainer - GPU 0 memory usage: 2.9G
2023-05-13 16:48:32,236 - INFO - allennlp.training.gradient_descent_trainer - Training
2023-05-13 16:48:32,236 - INFO - tqdm - 0%| | 0/1163 [00:00<?, ?it/s]
2023-05-13 16:48:42,336 - INFO - tqdm - batch_loss: 0.0000, loss: 0.0796 ||: 6%|6 | 73/1163 [00:10<03:26, 5.29it/s]
2023-05-13 16:48:52,468 - INFO - tqdm - batch_loss: 0.2337, loss: 0.1343 ||: 12%|#2 | 145/1163 [00:20<02:47, 6.08it/s]
2023-05-13 16:49:02,480 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1347 ||: 18%|#7 | 209/1163 [00:30<02:35, 6.14it/s]
2023-05-13 16:49:12,569 - INFO - tqdm - batch_loss: 0.0055, loss: 0.1376 ||: 24%|##4 | 283/1163 [00:40<02:03, 7.12it/s]
2023-05-13 16:49:22,697 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1438 ||: 30%|### | 349/1163 [00:50<02:18, 5.86it/s]
2023-05-13 16:49:32,762 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1289 ||: 36%|###6 | 422/1163 [01:00<02:21, 5.23it/s]
2023-05-13 16:49:42,796 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1175 ||: 42%|####2 | 491/1163 [01:10<01:21, 8.25it/s]
2023-05-13 16:49:52,823 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1266 ||: 48%|####7 | 554/1163 [01:20<01:55, 5.27it/s]
2023-05-13 16:50:03,026 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1213 ||: 54%|#####3 | 628/1163 [01:30<01:09, 7.75it/s]
2023-05-13 16:50:13,178 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1168 ||: 60%|#####9 | 694/1163 [01:40<00:58, 7.96it/s]
2023-05-13 16:50:23,380 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1144 ||: 66%|######5 | 762/1163 [01:51<01:11, 5.63it/s]
2023-05-13 16:50:33,481 - INFO - tqdm - batch_loss: 0.6088, loss: 0.1099 ||: 71%|#######1 | 826/1163 [02:01<00:39, 8.53it/s]
2023-05-13 16:50:43,551 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1137 ||: 77%|#######6 | 890/1163 [02:11<00:55, 4.88it/s]
2023-05-13 16:50:53,685 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1173 ||: 83%|########2 | 965/1163 [02:21<00:28, 6.89it/s]
2023-05-13 16:51:03,786 - INFO - tqdm - batch_loss: 1.8091, loss: 0.1147 ||: 88%|########8 | 1028/1163 [02:31<00:18, 7.35it/s]
2023-05-13 16:51:13,878 - INFO - tqdm - batch_loss: 0.0007, loss: 0.1183 ||: 95%|#########4| 1103/1163 [02:41<00:09, 6.13it/s]
2023-05-13 16:51:22,074 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1188 ||: 100%|#########9| 1158/1163 [02:49<00:00, 6.69it/s]
2023-05-13 16:51:22,205 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1187 ||: 100%|#########9| 1159/1163 [02:49<00:00, 6.89it/s]
2023-05-13 16:51:22,435 - INFO - tqdm - batch_loss: 0.2387, loss: 0.1187 ||: 100%|#########9| 1161/1163 [02:50<00:00, 7.52it/s]
2023-05-13 16:51:22,626 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1185 ||: 100%|##########| 1163/1163 [02:50<00:00, 8.39it/s]
2023-05-13 16:51:22,628 - INFO - tqdm - batch_loss: 0.0000, loss: 0.1185 ||: 100%|##########| 1163/1163 [02:50<00:00, 6.83it/s]
2023-05-13 16:51:22,628 - INFO - allennlp.training.gradient_descent_trainer - Validating
2023-05-13 16:51:22,630 - INFO - tqdm - 0%| | 0/16 [00:00<?, ?it/s]
2023-05-13 16:51:34,044 - INFO - tqdm - precision: 0.7500, recall: 0.7576, fscore: 0.7538, batch_loss: 0.7929, loss: 0.7100 ||: 38%|###7 | 6/16 [00:11<00:17, 1.75s/it]
2023-05-13 16:51:44,133 - INFO - tqdm - precision: 0.7451, recall: 0.7430, fscore: 0.7441, batch_loss: 1.6418, loss: 0.8955 ||: 100%|##########| 16/16 [00:21<00:00, 1.16it/s]
2023-05-13 16:51:44,133 - INFO - tqdm - precision: 0.7451, recall: 0.7430, fscore: 0.7441, batch_loss: 1.6418, loss: 0.8955 ||: 100%|##########| 16/16 [00:21<00:00, 1.34s/it]
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - Training | Validation
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - fscore | N/A | 0.744
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - gpu_0_memory_MB | 2960.673 | N/A
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - loss | 0.118 | 0.896
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - precision | N/A | 0.745
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - recall | N/A | 0.743
2023-05-13 16:51:44,134 - INFO - allennlp.training.callbacks.console_logger - worker_0_memory_MB | 4521.004 | N/A
2023-05-13 16:51:50,666 - INFO - allennlp.training.gradient_descent_trainer - Epoch duration: 0:03:18.432223
2023-05-13 16:51:53,069 - INFO - allennlp.common.util - Metrics: {
"best_epoch": 20,
"peak_worker_0_memory_MB": 4521.00390625,
"peak_gpu_0_memory_MB": 3001.03662109375,
"training_duration": "1:18:27.472121",
"epoch": 24,
"training_loss": 0.11846704688033759,
"training_worker_0_memory_MB": 4521.00390625,
"training_gpu_0_memory_MB": 2960.6728515625,
"validation_precision": 0.7450980544090271,
"validation_recall": 0.74301677942276,
"validation_fscore": 0.7440559267997742,
"validation_loss": 0.895537956195767,
"best_validation_precision": 0.7520891427993774,
"best_validation_recall": 0.7541899681091309,
"best_validation_fscore": 0.7531380653381348,
"best_validation_loss": 0.6954866239684634
}
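As a quick sanity check on the summary above (editorial note, not emitted by the trainer): the reported fscore is the harmonic mean (F1) of precision and recall, so for the best epoch

    F_1 = \frac{2PR}{P + R} = \frac{2 \cdot 0.7521 \cdot 0.7542}{0.7521 + 0.7542} \approx 0.7531

which matches best_validation_fscore.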
2023-05-13 16:51:53,069 - INFO - allennlp.models.archival - archiving weights and vocabulary to output/model.tar.gz
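A minimal sketch of loading the archived model for inference, assuming AllenNLP 2.x is installed and using the output/model.tar.gz path from the final log line; this snippet is illustrative and was not part of the run:

    # Sketch: load the archived seq2rel model for inference (assumes AllenNLP 2.x).
    from allennlp.models.archival import load_archive

    # cuda_device=-1 loads on CPU; pass a GPU index (e.g. 0) to mirror training.
    archive = load_archive("output/model.tar.gz", cuda_device=-1)
    model = archive.model.eval()  # trained model plus its vocabulary, in eval mode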