ydshieh committed on
Commit
830aeea
1 parent: 9b4bdf2

remove unused files

app.py DELETED
@@ -1,46 +0,0 @@
-import streamlit as st
-from PIL import Image
-import numpy as np
-
-
-# Designing the interface
-st.title("WIT: Image -> Caption App")
-# For newline
-st.write('\n')
-
-image = Image.open('images/image.png')
-show = st.image(image, use_column_width=True)
-
-from model import *
-
-st.sidebar.title("Upload Image")
-
-# Disabling warning
-st.set_option('deprecation.showfileUploaderEncoding', False)
-# Choose your own image
-uploaded_file = st.sidebar.file_uploader(" ", type=['png', 'jpg', 'jpeg'])
-
-if uploaded_file is not None:
-
-    image = Image.open(uploaded_file)
-    show.image(image, 'Uploaded Image', use_column_width=True)
-
-
-# For newline
-st.sidebar.write('\n')
-
-if st.sidebar.button("Click here to get image caption"):
-
-    if uploaded_file is None:
-
-        st.sidebar.write("Please upload an Image to Classify")
-
-    else:
-
-        with st.spinner('Generating image caption ...'):
-
-            caption = 'dummy caption'
-            st.success(f'caption: {caption}')
-
-        st.sidebar.header("ViT-GPT2 predicts:")
-        st.sidebar.write(f"caption: {caption}", '\n')
checkpoints/ckpt_2/config.json DELETED
@@ -1,163 +0,0 @@
-{
-  "architectures": [
-    "ViTGPT2LMForConditionalGeneration"
-  ],
-  "bos_token_id": 0,
-  "decoder_start_token_id": 0,
-  "eos_token_id": 2,
-  "gpt2_config": {
-    "_name_or_path": "",
-    "activation_function": "gelu_new",
-    "add_cross_attention": true,
-    "architectures": null,
-    "attn_pdrop": 0.1,
-    "bad_words_ids": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "embd_pdrop": 0.1,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "gradient_checkpointing": false,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_range": 0.02,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_epsilon": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "gpt2",
-    "n_ctx": 1024,
-    "n_embd": 768,
-    "n_head": 12,
-    "n_inner": null,
-    "n_layer": 12,
-    "n_positions": 1024,
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "resid_pdrop": 0.1,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "scale_attn_weights": true,
-    "sep_token_id": null,
-    "summary_activation": null,
-    "summary_first_dropout": 0.1,
-    "summary_proj_to_labels": true,
-    "summary_type": "cls_index",
-    "summary_use_proj": true,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.9.0.dev0",
-    "use_bfloat16": false,
-    "use_cache": true,
-    "vocab_size": 50000
-  },
-  "is_encoder_decoder": true,
-  "model_type": "vit-gpt2",
-  "pad_token_id": 1,
-  "transformers_version": null,
-  "vit_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": [
-      "ViTModel"
-    ],
-    "attention_probs_dropout_prob": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_dropout_prob": 0.0,
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-12,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "vit",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 16,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.9.0.dev0",
-    "use_bfloat16": false
-  }
-}
checkpoints/ckpt_2/flax_model.msgpack DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f91dda0691002393e4712170cb03a3e609b8d51b45de841db2f560cfb9549f05
-size 1012706583
checkpoints/ckpt_3/config.json DELETED
@@ -1,163 +0,0 @@
-{
-  "architectures": [
-    "ViTGPT2LMForConditionalGeneration"
-  ],
-  "bos_token_id": 0,
-  "decoder_start_token_id": 0,
-  "eos_token_id": 2,
-  "gpt2_config": {
-    "_name_or_path": "",
-    "activation_function": "gelu_new",
-    "add_cross_attention": true,
-    "architectures": null,
-    "attn_pdrop": 0.1,
-    "bad_words_ids": null,
-    "bos_token_id": 0,
-    "chunk_size_feed_forward": 0,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "embd_pdrop": 0.1,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": 2,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "gradient_checkpointing": false,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "initializer_range": 0.02,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_epsilon": 1e-05,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "gpt2",
-    "n_ctx": 1024,
-    "n_embd": 768,
-    "n_head": 12,
-    "n_inner": null,
-    "n_layer": 12,
-    "n_positions": 1024,
-    "no_repeat_ngram_size": 0,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": 1,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "resid_pdrop": 0.1,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "scale_attn_weights": true,
-    "sep_token_id": null,
-    "summary_activation": null,
-    "summary_first_dropout": 0.1,
-    "summary_proj_to_labels": true,
-    "summary_type": "cls_index",
-    "summary_use_proj": true,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.9.0.dev0",
-    "use_bfloat16": false,
-    "use_cache": true,
-    "vocab_size": 50000
-  },
-  "is_encoder_decoder": true,
-  "model_type": "vit-gpt2",
-  "pad_token_id": 1,
-  "transformers_version": null,
-  "vit_config": {
-    "_name_or_path": "",
-    "add_cross_attention": false,
-    "architectures": [
-      "ViTModel"
-    ],
-    "attention_probs_dropout_prob": 0.0,
-    "bad_words_ids": null,
-    "bos_token_id": null,
-    "chunk_size_feed_forward": 0,
-    "decoder_start_token_id": null,
-    "diversity_penalty": 0.0,
-    "do_sample": false,
-    "early_stopping": false,
-    "encoder_no_repeat_ngram_size": 0,
-    "eos_token_id": null,
-    "finetuning_task": null,
-    "forced_bos_token_id": null,
-    "forced_eos_token_id": null,
-    "hidden_act": "gelu",
-    "hidden_dropout_prob": 0.0,
-    "hidden_size": 768,
-    "id2label": {
-      "0": "LABEL_0",
-      "1": "LABEL_1"
-    },
-    "image_size": 224,
-    "initializer_range": 0.02,
-    "intermediate_size": 3072,
-    "is_decoder": false,
-    "is_encoder_decoder": false,
-    "label2id": {
-      "LABEL_0": 0,
-      "LABEL_1": 1
-    },
-    "layer_norm_eps": 1e-12,
-    "length_penalty": 1.0,
-    "max_length": 20,
-    "min_length": 0,
-    "model_type": "vit",
-    "no_repeat_ngram_size": 0,
-    "num_attention_heads": 12,
-    "num_beam_groups": 1,
-    "num_beams": 1,
-    "num_channels": 3,
-    "num_hidden_layers": 12,
-    "num_return_sequences": 1,
-    "output_attentions": false,
-    "output_hidden_states": false,
-    "output_scores": false,
-    "pad_token_id": null,
-    "patch_size": 16,
-    "prefix": null,
-    "problem_type": null,
-    "pruned_heads": {},
-    "remove_invalid_values": false,
-    "repetition_penalty": 1.0,
-    "return_dict": true,
-    "return_dict_in_generate": false,
-    "sep_token_id": null,
-    "task_specific_params": null,
-    "temperature": 1.0,
-    "tie_encoder_decoder": false,
-    "tie_word_embeddings": true,
-    "tokenizer_class": null,
-    "top_k": 50,
-    "top_p": 1.0,
-    "torch_dtype": null,
-    "torchscript": false,
-    "transformers_version": "4.9.0.dev0",
-    "use_bfloat16": false
-  }
-}
checkpoints/ckpt_3/flax_model.msgpack DELETED
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:1a5152a207ba30a963b32775047d0276b9cec79d9c7343eb01db4e8dab14bac2
-size 1012706583
model.py DELETED
@@ -1,89 +0,0 @@
-import sys, os
-
-current_path = os.path.dirname(os.path.abspath(__file__))
-sys.path.append(current_path)
-
-# jax
-import jax
-
-# Main model - ViTGPT2LM
-from vit_gpt2.modeling_flax_vit_gpt2_lm import FlaxViTGPT2LMForConditionalGeneration
-
-# Vit - as encoder
-from transformers import ViTFeatureExtractor
-from PIL import Image
-import requests
-import numpy as np
-
-# GPT2 / GPT2LM - as decoder
-from transformers import ViTFeatureExtractor, GPT2Tokenizer
-
-model_name_or_path = './outputs/ckpt_2/'
-flax_vit_gpt2_lm = FlaxViTGPT2LMForConditionalGeneration.from_pretrained(model_name_or_path)
-
-vit_model_name = 'google/vit-base-patch16-224-in21k'
-feature_extractor = ViTFeatureExtractor.from_pretrained(vit_model_name)
-
-gpt2_model_name = 'asi/gpt-fr-cased-small'
-tokenizer = GPT2Tokenizer.from_pretrained(gpt2_model_name)
-
-max_length = 64
-num_beams = 16
-gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
-
-
-@jax.jit
-def predict_fn(pixel_values):
-
-    return flax_vit_gpt2_lm.generate(pixel_values, **gen_kwargs)
-
-def predict(image, pxs=None):
-
-    # batch dim is added automatically
-    encoder_inputs = feature_extractor(images=image, return_tensors="jax")
-    pixel_values = encoder_inputs.pixel_values
-
-    if pxs is not None:
-        pixel_values = pxs
-
-    # generation
-    generation = predict_fn(pixel_values)
-
-    token_ids = np.array(generation.sequences)[0]
-    caption = tokenizer.decode(token_ids)
-
-    return caption, token_ids
-
-
-if __name__ == '__main__':
-
-    from datetime import datetime
-
-    idx = 11
-    url = f'./wit_data_dir/train/images/{idx}.jpg'
-    image = Image.open(url)
-
-    encoder_inputs = feature_extractor(images=image, return_tensors="np")
-    pv1 = encoder_inputs.pixel_values
-    pv2 = np.load(f'./wit_data_dir/train/numpy/{idx}.npy')
-    print(np.sum(np.abs(pv1 - pv2)))
-
-    s = datetime.now()
-    caption, token_ids = predict(image, pxs=pv2)
-    e = datetime.now()
-    e = (e - s).total_seconds()
-    print(e)
-
-    print(f'token_ids: {token_ids}')
-    print(f'caption: {caption}')
-
-    for _ in range(1):
-        s = datetime.now()
-        caption, token_ids = predict(image, pxs=None)
-        e = datetime.now()
-        e = (e - s).total_seconds()
-        print(e)
-        print('-' * 20)
-
-    print(f'token_ids: {token_ids}')
-    print(f'caption: {caption}')