{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "woods", "1": "down", "2": "table", "3": "backpack", "4": "wine", "5": "sky", "6": "suv", "7": "crossing", "8": "girl", "9": "shrimp", "10": "plastic", "11": "forest", "12": "double", "13": "blonde", "14": "0", "15": "white and blue", "16": "woman", "17": "leather", "18": "yes", "19": "shadows", "20": "3", "21": "6", "22": "many", "23": "cat", "24": "right", "25": "7", "26": "hat", "27": "picnic table", "28": "king", "29": "green", "30": "chair", "31": "donut", "32": "red and blue", "33": "wedding", "34": "white", "35": "wine tasting", "36": "1", "37": "gray", "38": "they aren't", "39": "8", "40": "shadow", "41": "black", "42": "fashion", "43": "dog", "44": "out", "45": "white and black", "46": "train", "47": "ice cream", "48": "bus", "49": "birthday", "50": "queen", "51": "10", "52": "cup", "53": "little girl", "54": "air", "55": "no", "56": "beige", "57": "bedroom", "58": "jeep", "59": "not sure", "60": "full", "61": "boy", "62": "at table", "63": "watching", "64": "5", "65": "wall", "66": "walking", "67": "brown", "68": "human", "69": "car", "70": "tired", "71": "chopsticks", "72": "park", "73": "4", "74": "red and yellow", "75": "blue and white", "76": "outside", "77": "pink", "78": "doughnut", "79": "red", "80": "hawaii", "81": "2", "82": "resting", "83": "crown", "84": "sun", "85": "yellow", "86": "style", "87": "skateboard" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 14, "1": 36, "10": 51, "2": 81, "3": 20, "4": 73, "5": 64, "6": 21, "7": 25, "8": 39, "air": 54, "at table": 62, "backpack": 3, "bedroom": 57, "beige": 56, "birthday": 49, "black": 41, "blonde": 13, "blue and white": 75, "boy": 61, "brown": 67, "bus": 48, "car": 69, "cat": 23, "chair": 30, "chopsticks": 71, "crossing": 7, "crown": 83, "cup": 52, "dog": 43, "donut": 31, "double": 12, "doughnut": 78, "down": 1, "fashion": 42, "forest": 11, "full": 60, "girl": 8, "gray": 37, "green": 29, "hat": 26, "hawaii": 80, "human": 68, "ice cream": 47, "jeep": 58, "king": 28, "leather": 17, "little girl": 53, "many": 22, "no": 55, "not sure": 59, "out": 44, "outside": 76, "park": 72, "picnic table": 27, "pink": 77, "plastic": 10, "queen": 50, "red": 79, "red and blue": 32, "red and yellow": 74, "resting": 82, "right": 24, "shadow": 40, "shadows": 19, "shrimp": 9, "skateboard": 87, "sky": 5, "style": 86, "sun": 84, "suv": 6, "table": 2, "they aren't": 38, "tired": 70, "train": 46, "walking": 66, "wall": 65, "watching": 63, "wedding": 33, "white": 34, "white and black": 45, "white and blue": 15, "wine": 4, "wine tasting": 35, "woman": 16, "woods": 0, "yellow": 85, "yes": 18 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.31.0", "type_vocab_size": 2, "vocab_size": 30522 }