{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "out", "1": "girl", "2": "white and black", "3": "shadows", "4": "double", "5": "wine tasting", "6": "queen", "7": "not sure", "8": "10", "9": "many", "10": "tired", "11": "cat", "12": "woman", "13": "park", "14": "suv", "15": "wine", "16": "jeep", "17": "hat", "18": "red and yellow", "19": "red and blue", "20": "1", "21": "at table", "22": "6", "23": "no", "24": "sky", "25": "brown", "26": "hawaii", "27": "table", "28": "plastic", "29": "cup", "30": "birthday", "31": "donut", "32": "crossing", "33": "down", "34": "skateboard", "35": "3", "36": "full", "37": "gray", "38": "style", "39": "bedroom", "40": "8", "41": "little girl", "42": "pink", "43": "2", "44": "sun", "45": "boy", "46": "red", "47": "shadow", "48": "wedding", "49": "shrimp", "50": "dog", "51": "crown", "52": "king", "53": "bus", "54": "0", "55": "blue and white", "56": "wall", "57": "4", "58": "5", "59": "7", "60": "black", "61": "chair", "62": "yellow", "63": "car", "64": "train", "65": "fashion", "66": "resting", "67": "walking", "68": "woods", "69": "they aren't", "70": "backpack", "71": "chopsticks", "72": "forest", "73": "leather", "74": "white and blue", "75": "picnic table", "76": "ice cream", "77": "blonde", "78": "human", "79": "beige", "80": "green", "81": "air", "82": "doughnut", "83": "yes", "84": "right", "85": "watching", "86": "outside", "87": "white" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 54, "1": 20, "10": 8, "2": 43, "3": 35, "4": 57, "5": 58, "6": 22, "7": 59, "8": 40, "air": 81, "at table": 21, "backpack": 70, "bedroom": 39, "beige": 79, "birthday": 30, "black": 60, "blonde": 77, "blue and white": 55, "boy": 45, "brown": 25, "bus": 53, "car": 63, "cat": 11, "chair": 61, "chopsticks": 71, "crossing": 32, "crown": 51, "cup": 29, "dog": 50, "donut": 31, "double": 4, "doughnut": 82, "down": 33, "fashion": 65, "forest": 72, "full": 36, "girl": 1, "gray": 37, "green": 80, "hat": 17, "hawaii": 26, "human": 78, "ice cream": 76, "jeep": 16, "king": 52, "leather": 73, "little girl": 41, "many": 9, "no": 23, "not sure": 7, "out": 0, "outside": 86, "park": 13, "picnic table": 75, "pink": 42, "plastic": 28, "queen": 6, "red": 46, "red and blue": 19, "red and yellow": 18, "resting": 66, "right": 84, "shadow": 47, "shadows": 3, "shrimp": 49, "skateboard": 34, "sky": 24, "style": 38, "sun": 44, "suv": 14, "table": 27, "they aren't": 69, "tired": 10, "train": 64, "walking": 67, "wall": 56, "watching": 85, "wedding": 48, "white": 87, "white and black": 2, "white and blue": 74, "wine": 15, "wine tasting": 5, "woman": 12, "woods": 68, "yellow": 62, "yes": 83 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.37.2", "type_vocab_size": 2, "vocab_size": 30522 }