Model save

Browse files

Files changed (7) hide show

README.md +66 -71
config.json +184 -183
generation_config.json +5 -5
metrics.txt +2 -0
model.safetensors +1 -1
preprocessor_config.json +36 -22
training_args.bin +2 -2

README.md CHANGED Viewed

@@ -1,71 +1,66 @@
----
-tags:
-- image-to-text
-- image-captioning
-license: apache-2.0
-metrics:
-- rouge
-datasets:
-- nlphuji/flickr30k
-widget:
-- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/savanna.jpg
-  example_title: Savanna
-- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/football-match.jpg
-  example_title: Football Match
-- src: https://huggingface.co/datasets/mishig/sample_images/resolve/main/airport.jpg
-  example_title: Airport
-base_model:
-- google/vit-base-patch16-224-in21k
-model-index:
-- name: mozilla/distilvit
-  results:
-  - task:
-      type: image-to-text
-      name: Image To Text
-    dataset:
-      name: nlphuji/flickr30k
-      type: nlphuji/flickr30k
-    metrics:
-    - name: ROUGE-1
-      type: rouge
-      value: 43.006
-      verified: true
-    - name: ROUGE-2
-      type: rouge
-      value: 16.9939
-      verified: true
-    - name: ROUGE-L
-      type: rouge
-      value: 38.8923
-      verified: true
-    - name: ROUGE-LSUM
-      type: rouge
-      value: 38.8877
-      verified: true
-    - name: loss
-      type: loss
-      value: 0.19939416646957397
-    - name: gen_len
-      type: gen_len
-      value: 11.327256736227712
-      verified: true
----
-This model is a work in progress.
-Fine-tuned version of those base models:
-- a VIT model for the image encoder:  https://huggingface.co/google/vit-base-patch16-224-in21k
-- a Distilled GPT-2 model for the text decoder: https://huggingface.co/distilbert/distilgpt2
-This model was trained on:
-- Flickr30k : https://huggingface.co/datasets/nlphuji/flickr30k
-- COCO 2017: https://cocodataset.org
-You can find the code used to create the model here: https://github.com/mozilla/distilvit

+---
+license: apache-2.0
+base_model: mozilla/distilvit
+tags:
+- generated_from_trainer
+metrics:
+- rouge
+model-index:
+- name: distilvit
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+# distilvit
+This model is a fine-tuned version of [mozilla/distilvit](https://huggingface.co/mozilla/distilvit) on an unknown dataset.
+It achieves the following results on the evaluation set:
+- Gen Len: 10.6487
+- Loss: 0.1739
+- Meteor: 0.4120
+- Rouge1: 50.0916
+- Rouge2: 24.7223
+- Rougel: 46.9416
+- Rougelsum: 46.9372
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 100
+- eval_batch_size: 100
+- seed: 42
+- optimizer: Adam with betas=(0.9,0.999) and epsilon=1e-08
+- lr_scheduler_type: linear
+- num_epochs: 1
+### Training results
+| Training Loss | Epoch  | Step | Gen Len | Validation Loss | Meteor | Rouge1  | Rouge2  | Rougel  | Rougelsum |
+|:-------------:|:------:|:----:|:-------:|:---------------:|:------:|:-------:|:-------:|:-------:|:---------:|
+| No log        | 0.3891 | 100  | 10.4163 | 0.1764          | 0.4117 | 50.0198 | 24.6331 | 46.9071 | 46.8907   |
+| No log        | 0.7782 | 200  | 10.6487 | 0.1739          | 0.4120 | 50.0916 | 24.7223 | 46.9416 | 46.9372   |
+### Framework versions
+- Transformers 4.40.2
+- Pytorch 2.3.0+cu121
+- Datasets 2.19.1
+- Tokenizers 0.19.1

config.json CHANGED Viewed

@@ -1,183 +1,184 @@
-{
-  "_name_or_path": "distilvit-flickr",
-  "architectures": [
-    "VisionEncoderDecoderModel"
-  ],
-  "decoder": {
-    "_name_or_path": "distilbert/distilgpt2",
-    "_num_labels": 1,
-    "activation_function": "gelu_new",
-    "add_cross_attention": true,
-    "architectures": [
-      "GPT2LMHeadModel"
-    ],
-    "attn_pdrop": 0.1,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": 50256,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "embd_pdrop": 0.1,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 50256,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "id2label": {
-      "0": "LABEL_0"
-    },
-    "initializer_range": 0.02,
-    "is_decoder": true,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0
-    },
-    "layer_norm_epsilon": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "gpt2",
-    "n_ctx": 1024,
-    "n_embd": 768,
-    "n_head": 12,
-    "n_inner": null,
-    "n_layer": 6,
-    "n_positions": 1024,
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "reorder_and_upcast_attn": false,
-    "repetition_penalty": 1.0,
-    "resid_pdrop": 0.1,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "scale_attn_by_inverse_layer_idx": false,
-    "scale_attn_weights": true,
-    "sep_token_id": null,
-    "summary_activation": null,
-    "summary_first_dropout": 0.1,
-    "summary_proj_to_labels": true,
-    "summary_type": "cls_index",
-    "summary_use_proj": true,
-    "suppress_tokens": null,
-    "task_specific_params": {
-      "text-generation": {
-        "do_sample": true,
-        "max_length": 50
-      }
-    },
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false,
-    "use_cache": true,
-    "vocab_size": 50257
-  },
-  "decoder_start_token_id": 50256,
-  "encoder": {
-    "_name_or_path": "google/vit-base-patch16-224-in21k",
-    "add_cross_attention": false,
-    "architectures": [
-      "ViTModel"
-    ],
-    "attention_probs_dropout_prob": 0.0,
-    "bad_words_ids": null,
-    "begin_suppress_tokens": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "cross_attention_hidden_size": null,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "encoder_stride": 16,
-    "eos_token_id": null,
-    "exponential_decay_length_penalty": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_dropout_prob": 0.0,
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-12,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "vit",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 16,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "qkv_bias": true,
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "suppress_tokens": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tf_legacy_loss": false,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "typical_p": 1.0,
-    "use_bfloat16": false
-  },
-  "eos_token_id": 50256,
-  "is_encoder_decoder": true,
-  "model_type": "vision-encoder-decoder",
-  "pad_token_id": 50256,
-  "tie_word_embeddings": false,
-  "transformers_version": "4.36.2"
-}

+{
+  "_name_or_path": "mozilla/distilvit",
+  "architectures": [
+    "VisionEncoderDecoderModel"
+  ],
+  "decoder": {
+    "_name_or_path": "distilbert/distilgpt2",
+    "_num_labels": 1,
+    "activation_function": "gelu_new",
+    "add_cross_attention": true,
+    "architectures": [
+      "GPT2LMHeadModel"
+    ],
+    "attn_pdrop": 0.1,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": 50256,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "embd_pdrop": 0.1,
+    "encoder_no_repeat_ngram_size": 0,
+    "eos_token_id": 50256,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "id2label": {
+      "0": "LABEL_0"
+    },
+    "initializer_range": 0.02,
+    "is_decoder": true,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0
+    },
+    "layer_norm_epsilon": 1e-05,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "gpt2",
+    "n_ctx": 1024,
+    "n_embd": 768,
+    "n_head": 12,
+    "n_inner": null,
+    "n_layer": 6,
+    "n_positions": 1024,
+    "no_repeat_ngram_size": 0,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "remove_invalid_values": false,
+    "reorder_and_upcast_attn": false,
+    "repetition_penalty": 1.0,
+    "resid_pdrop": 0.1,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "scale_attn_by_inverse_layer_idx": false,
+    "scale_attn_weights": true,
+    "sep_token_id": null,
+    "summary_activation": null,
+    "summary_first_dropout": 0.1,
+    "summary_proj_to_labels": true,
+    "summary_type": "cls_index",
+    "summary_use_proj": true,
+    "suppress_tokens": null,
+    "task_specific_params": {
+      "text-generation": {
+        "do_sample": true,
+        "max_length": 50
+      }
+    },
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false,
+    "use_cache": true,
+    "vocab_size": 50257
+  },
+  "decoder_start_token_id": 50256,
+  "encoder": {
+    "_name_or_path": "google/vit-base-patch16-224-in21k",
+    "add_cross_attention": false,
+    "architectures": [
+      "ViTModel"
+    ],
+    "attention_probs_dropout_prob": 0.0,
+    "bad_words_ids": null,
+    "begin_suppress_tokens": null,
+    "bos_token_id": null,
+    "chunk_size_feed_forward": 0,
+    "cross_attention_hidden_size": null,
+    "decoder_start_token_id": null,
+    "diversity_penalty": 0.0,
+    "do_sample": false,
+    "early_stopping": false,
+    "encoder_no_repeat_ngram_size": 0,
+    "encoder_stride": 16,
+    "eos_token_id": null,
+    "exponential_decay_length_penalty": null,
+    "finetuning_task": null,
+    "forced_bos_token_id": null,
+    "forced_eos_token_id": null,
+    "hidden_act": "gelu",
+    "hidden_dropout_prob": 0.0,
+    "hidden_size": 768,
+    "id2label": {
+      "0": "LABEL_0",
+      "1": "LABEL_1"
+    },
+    "image_size": 224,
+    "initializer_range": 0.02,
+    "intermediate_size": 3072,
+    "is_decoder": false,
+    "is_encoder_decoder": false,
+    "label2id": {
+      "LABEL_0": 0,
+      "LABEL_1": 1
+    },
+    "layer_norm_eps": 1e-12,
+    "length_penalty": 1.0,
+    "max_length": 20,
+    "min_length": 0,
+    "model_type": "vit",
+    "no_repeat_ngram_size": 0,
+    "num_attention_heads": 12,
+    "num_beam_groups": 1,
+    "num_beams": 1,
+    "num_channels": 3,
+    "num_hidden_layers": 12,
+    "num_return_sequences": 1,
+    "output_attentions": false,
+    "output_hidden_states": false,
+    "output_scores": false,
+    "pad_token_id": null,
+    "patch_size": 16,
+    "prefix": null,
+    "problem_type": null,
+    "pruned_heads": {},
+    "qkv_bias": true,
+    "remove_invalid_values": false,
+    "repetition_penalty": 1.0,
+    "return_dict": true,
+    "return_dict_in_generate": false,
+    "sep_token_id": null,
+    "suppress_tokens": null,
+    "task_specific_params": null,
+    "temperature": 1.0,
+    "tf_legacy_loss": false,
+    "tie_encoder_decoder": false,
+    "tie_word_embeddings": true,
+    "tokenizer_class": null,
+    "top_k": 50,
+    "top_p": 1.0,
+    "torch_dtype": null,
+    "torchscript": false,
+    "typical_p": 1.0,
+    "use_bfloat16": false
+  },
+  "eos_token_id": 50256,
+  "is_encoder_decoder": true,
+  "model_type": "vision-encoder-decoder",
+  "pad_token_id": 50256,
+  "tie_word_embeddings": false,
+  "torch_dtype": "float32",
+  "transformers_version": "4.40.2"
+}

generation_config.json CHANGED Viewed

@@ -1,5 +1,5 @@
-{
-  "bos_token_id": 50256,
-  "eos_token_id": 50256,
-  "transformers_version": "4.36.2"
-}

+{
+  "bos_token_id": 50256,
+  "eos_token_id": 50256,
+  "transformers_version": "4.40.2"
+}

metrics.txt ADDED Viewed

@@ -0,0 +1,2 @@
+{'eval_loss': 0.17637203633785248, 'eval_rouge1': 50.0198, 'eval_rouge2': 24.6331, 'eval_rougeL': 46.9071, 'eval_rougeLsum': 46.8907, 'eval_meteor': 0.4116637061220643, 'eval_gen_len': 10.41630231105559, 'eval_runtime': 233.9623, 'eval_samples_per_second': 13.686, 'eval_steps_per_second': 0.141, 'epoch': 0.38910505836575876}
+{'eval_loss': 0.17389048635959625, 'eval_rouge1': 50.0916, 'eval_rouge2': 24.7223, 'eval_rougeL': 46.9416, 'eval_rougeLsum': 46.9372, 'eval_meteor': 0.41204454830693554, 'eval_gen_len': 10.648657089319176, 'eval_runtime': 232.7742, 'eval_samples_per_second': 13.756, 'eval_steps_per_second': 0.142, 'epoch': 0.7782101167315175}

model.safetensors CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:dfbf6a06c87f2ecf5debe4ca7a967db958ef5ef91b9e95d0968db292b29bd044
 size 729979160

 version https://git-lfs.github.com/spec/v1
+oid sha256:eb10c7e6caaf59d39c22b873daa806f504ea670c4757dbd67a8b5e3be97be6b3
 size 729979160

preprocessor_config.json CHANGED Viewed

@@ -1,22 +1,36 @@
-{
-  "do_normalize": true,
-  "do_rescale": true,
-  "do_resize": true,
-  "image_mean": [
-    0.5,
-    0.5,
-    0.5
-  ],
-  "image_processor_type": "ViTFeatureExtractor",
-  "image_std": [
-    0.5,
-    0.5,
-    0.5
-  ],
-  "resample": 2,
-  "rescale_factor": 0.00392156862745098,
-  "size": {
-    "height": 224,
-    "width": 224
-  }
-}

+{
+  "_valid_processor_keys": [
+    "images",
+    "do_resize",
+    "size",
+    "resample",
+    "do_rescale",
+    "rescale_factor",
+    "do_normalize",
+    "image_mean",
+    "image_std",
+    "return_tensors",
+    "data_format",
+    "input_data_format"
+  ],
+  "do_normalize": true,
+  "do_rescale": true,
+  "do_resize": true,
+  "image_mean": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "image_processor_type": "ViTFeatureExtractor",
+  "image_std": [
+    0.5,
+    0.5,
+    0.5
+  ],
+  "resample": 2,
+  "rescale_factor": 0.00392156862745098,
+  "size": {
+    "height": 224,
+    "width": 224
+  }
+}

training_args.bin CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:fd3daca9a2e8e9dd775d2cd9f9a0e33ee454cf4e9d1bb12d250711d49ddadcfd
-size 5112

 version https://git-lfs.github.com/spec/v1
+oid sha256:5d6dbda62eac800cee527380ece0a33f7df91a801dc0529c3c35871cb1123276
+size 5176