{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "skiing", "1": "security", "2": "bikes", "3": "3", "4": "stand", "5": "church", "6": "in car", "7": "snowboarder", "8": "women", "9": "clock", "10": "blue and white", "11": "little girl", "12": "0", "13": "soccer", "14": "tabby", "15": "not sure", "16": "sleeping", "17": "on road", "18": "chopsticks", "19": "desert", "20": "cup", "21": "clear", "22": "dirt", "23": "monitor", "24": "bricks", "25": "protection", "26": "hawaii", "27": "walking", "28": "at table", "29": "bicycles", "30": "resting", "31": "sidewalk", "32": "wedding", "33": "unknown", "34": "2010", "35": "8:35", "36": "jeep", "37": "lg", "38": "roof", "39": "solid", "40": "lady", "41": "door", "42": "neon", "43": "suv", "44": "crown", "45": "wine tasting", "46": "hair", "47": "king", "48": "sun", "49": "7:35", "50": "human", "51": "4", "52": "green", "53": "10", "54": "many", "55": "red and blue", "56": "africa", "57": "giraffes", "58": "purple", "59": "beagle", "60": "skateboarding", "61": "blue", "62": "tan", "63": "5", "64": "watching", "65": "right", "66": "nothing", "67": "hat", "68": "6", "69": "arrow", "70": "9:35", "71": "stripes", "72": "natural", "73": "they aren't", "74": "8", "75": "screen", "76": "down", "77": "window", "78": "don't know", "79": "big ben", "80": "happy", "81": "bedroom", "82": "2013", "83": "park", "84": "shadows", "85": "cross", "86": "out", "87": "cat", "88": "plain", "89": "photographer", "90": "queen", "91": "chair", "92": "brick", "93": "sky", "94": "2", "95": "net", "96": "yellow", "97": "dog", "98": "person", "99": "forest", "100": "trees", "101": "plate", "102": "picnic table", "103": "street", "104": "snow", "105": "table", "106": "cage", "107": "platform", "108": "cloudy", "109": "ball", "110": "double", "111": "zoo", "112": "curtain", "113": "doughnut", "114": "giraffe", "115": "white and blue", "116": "brown", "117": "tower", "118": "boy", "119": "black and white", "120": "skier", "121": "tired", "122": "7:45", "123": "bicycle", "124": "outside", "125": "shelter", "126": "plastic", "127": "soccer ball", "128": "talking on phone", "129": "yes", "130": "lying down", "131": "bike rack", "132": "bus", "133": "talking", "134": "snowboard", "135": "snowboarding", "136": "leather", "137": "shade", "138": "blonde", "139": "girl", "140": "clock tower", "141": "on street", "142": "name tag", "143": "french", "144": "beige", "145": "camera", "146": "backpack", "147": "air", "148": "small", "149": "style", "150": "can't tell", "151": "crossing", "152": "canopy", "153": "train", "154": "rack", "155": "7", "156": "laying down", "157": "smiling", "158": "shadow", "159": "fence", "160": "lanyard", "161": "white and black", "162": "wall", "163": "ice cream", "164": "red", "165": "wine", "166": "2000", "167": "1", "168": "red and yellow", "169": "pink", "170": "large", "171": "birthday", "172": "low", "173": "tv", "174": "car", "175": "white", "176": "ground", "177": "woman", "178": "calico", "179": "donut", "180": "fashion", "181": "no", "182": "smile", "183": "full", "184": "skateboard", "185": "woods", "186": "exit", "187": "windows", "188": "necklace", "189": "gray and black", "190": "man", "191": "curtains", "192": "black", "193": "orange", "194": "not there", "195": "station", "196": "gray", "197": "shrimp", "198": "tent" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 12, "1": 167, "10": 53, "2": 94, "2000": 166, "2010": 34, "2013": 82, "3": 3, "4": 51, "5": 63, "6": 68, "7": 155, "7:35": 49, "7:45": 122, "8": 74, "8:35": 35, "9:35": 70, "africa": 56, "air": 147, "arrow": 69, "at table": 28, "backpack": 146, "ball": 109, "beagle": 59, "bedroom": 81, "beige": 144, "bicycle": 123, "bicycles": 29, "big ben": 79, "bike rack": 131, "bikes": 2, "birthday": 171, "black": 192, "black and white": 119, "blonde": 138, "blue": 61, "blue and white": 10, "boy": 118, "brick": 92, "bricks": 24, "brown": 116, "bus": 132, "cage": 106, "calico": 178, "camera": 145, "can't tell": 150, "canopy": 152, "car": 174, "cat": 87, "chair": 91, "chopsticks": 18, "church": 5, "clear": 21, "clock": 9, "clock tower": 140, "cloudy": 108, "cross": 85, "crossing": 151, "crown": 44, "cup": 20, "curtain": 112, "curtains": 191, "desert": 19, "dirt": 22, "dog": 97, "don't know": 78, "donut": 179, "door": 41, "double": 110, "doughnut": 113, "down": 76, "exit": 186, "fashion": 180, "fence": 159, "forest": 99, "french": 143, "full": 183, "giraffe": 114, "giraffes": 57, "girl": 139, "gray": 196, "gray and black": 189, "green": 52, "ground": 176, "hair": 46, "happy": 80, "hat": 67, "hawaii": 26, "human": 50, "ice cream": 163, "in car": 6, "jeep": 36, "king": 47, "lady": 40, "lanyard": 160, "large": 170, "laying down": 156, "leather": 136, "lg": 37, "little girl": 11, "low": 172, "lying down": 130, "man": 190, "many": 54, "monitor": 23, "name tag": 142, "natural": 72, "necklace": 188, "neon": 42, "net": 95, "no": 181, "not sure": 15, "not there": 194, "nothing": 66, "on road": 17, "on street": 141, "orange": 193, "out": 86, "outside": 124, "park": 83, "person": 98, "photographer": 89, "picnic table": 102, "pink": 169, "plain": 88, "plastic": 126, "plate": 101, "platform": 107, "protection": 25, "purple": 58, "queen": 90, "rack": 154, "red": 164, "red and blue": 55, "red and yellow": 168, "resting": 30, "right": 65, "roof": 38, "screen": 75, "security": 1, "shade": 137, "shadow": 158, "shadows": 84, "shelter": 125, "shrimp": 197, "sidewalk": 31, "skateboard": 184, "skateboarding": 60, "skier": 120, "skiing": 0, "sky": 93, "sleeping": 16, "small": 148, "smile": 182, "smiling": 157, "snow": 104, "snowboard": 134, "snowboarder": 7, "snowboarding": 135, "soccer": 13, "soccer ball": 127, "solid": 39, "stand": 4, "station": 195, "street": 103, "stripes": 71, "style": 149, "sun": 48, "suv": 43, "tabby": 14, "table": 105, "talking": 133, "talking on phone": 128, "tan": 62, "tent": 198, "they aren't": 73, "tired": 121, "tower": 117, "train": 153, "trees": 100, "tv": 173, "unknown": 33, "walking": 27, "wall": 162, "watching": 64, "wedding": 32, "white": 175, "white and black": 161, "white and blue": 115, "window": 77, "windows": 187, "wine": 165, "wine tasting": 45, "woman": 177, "women": 8, "woods": 185, "yellow": 96, "yes": 129, "zoo": 111 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.41.2", "type_vocab_size": 2, "vocab_size": 30522 }