{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "street", "1": "suv", "2": "purple", "3": "snow", "4": "large", "5": "happy", "6": "necklace", "7": "red", "8": "trees", "9": "ground", "10": "black", "11": "white", "12": "1", "13": "style", "14": "sun", "15": "7:45", "16": "bicycle", "17": "soccer ball", "18": "dog", "19": "table", "20": "boy", "21": "sidewalk", "22": "rack", "23": "nothing", "24": "window", "25": "air", "26": "many", "27": "stripes", "28": "women", "29": "shrimp", "30": "2000", "31": "big ben", "32": "snowboard", "33": "skier", "34": "shadow", "35": "plastic", "36": "curtain", "37": "security", "38": "lanyard", "39": "low", "40": "lady", "41": "church", "42": "tan", "43": "name tag", "44": "small", "45": "10", "46": "yes", "47": "on street", "48": "sky", "49": "tower", "50": "8:35", "51": "roof", "52": "bedroom", "53": "laying down", "54": "protection", "55": "3", "56": "out", "57": "cloudy", "58": "french", "59": "skateboard", "60": "9:35", "61": "neon", "62": "plate", "63": "cup", "64": "unknown", "65": "exit", "66": "calico", "67": "human", "68": "white and blue", "69": "station", "70": "shade", "71": "arrow", "72": "talking on phone", "73": "bikes", "74": "picnic table", "75": "wedding", "76": "train", "77": "0", "78": "girl", "79": "backpack", "80": "bricks", "81": "birthday", "82": "natural", "83": "brick", "84": "ball", "85": "curtains", "86": "park", "87": "tabby", "88": "hat", "89": "right", "90": "net", "91": "crossing", "92": "6", "93": "5", "94": "plain", "95": "lg", "96": "clock tower", "97": "hawaii", "98": "gray and black", "99": "little girl", "100": "blue and white", "101": "forest", "102": "ice cream", "103": "yellow", "104": "door", "105": "soccer", "106": "queen", "107": "beige", "108": "screen", "109": "red and yellow", "110": "watching", "111": "africa", "112": "no", "113": "2013", "114": "talking", "115": "fence", "116": "clear", "117": "stand", "118": "hair", "119": "woods", "120": "snowboarding", "121": "2", "122": "not sure", "123": "dirt", "124": "tent", "125": "green", "126": "platform", "127": "person", "128": "orange", "129": "leather", "130": "wine tasting", "131": "windows", "132": "beagle", "133": "king", "134": "can't tell", "135": "giraffes", "136": "crown", "137": "8", "138": "bicycles", "139": "zoo", "140": "double", "141": "walking", "142": "chair", "143": "desert", "144": "red and blue", "145": "fashion", "146": "blonde", "147": "car", "148": "shelter", "149": "4", "150": "jeep", "151": "wall", "152": "in car", "153": "they aren't", "154": "bus", "155": "bike rack", "156": "down", "157": "don't know", "158": "smiling", "159": "at table", "160": "shadows", "161": "on road", "162": "man", "163": "7:35", "164": "full", "165": "tired", "166": "canopy", "167": "cat", "168": "tv", "169": "skiing", "170": "photographer", "171": "not there", "172": "skateboarding", "173": "doughnut", "174": "resting", "175": "brown", "176": "black and white", "177": "solid", "178": "cross", "179": "clock", "180": "woman", "181": "monitor", "182": "2010", "183": "giraffe", "184": "smile", "185": "chopsticks", "186": "white and black", "187": "wine", "188": "lying down", "189": "gray", "190": "cage", "191": "pink", "192": "7", "193": "donut", "194": "camera", "195": "outside", "196": "blue", "197": "snowboarder", "198": "sleeping" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 77, "1": 12, "10": 45, "2": 121, "2000": 30, "2010": 182, "2013": 113, "3": 55, "4": 149, "5": 93, "6": 92, "7": 192, "7:35": 163, "7:45": 15, "8": 137, "8:35": 50, "9:35": 60, "africa": 111, "air": 25, "arrow": 71, "at table": 159, "backpack": 79, "ball": 84, "beagle": 132, "bedroom": 52, "beige": 107, "bicycle": 16, "bicycles": 138, "big ben": 31, "bike rack": 155, "bikes": 73, "birthday": 81, "black": 10, "black and white": 176, "blonde": 146, "blue": 196, "blue and white": 100, "boy": 20, "brick": 83, "bricks": 80, "brown": 175, "bus": 154, "cage": 190, "calico": 66, "camera": 194, "can't tell": 134, "canopy": 166, "car": 147, "cat": 167, "chair": 142, "chopsticks": 185, "church": 41, "clear": 116, "clock": 179, "clock tower": 96, "cloudy": 57, "cross": 178, "crossing": 91, "crown": 136, "cup": 63, "curtain": 36, "curtains": 85, "desert": 143, "dirt": 123, "dog": 18, "don't know": 157, "donut": 193, "door": 104, "double": 140, "doughnut": 173, "down": 156, "exit": 65, "fashion": 145, "fence": 115, "forest": 101, "french": 58, "full": 164, "giraffe": 183, "giraffes": 135, "girl": 78, "gray": 189, "gray and black": 98, "green": 125, "ground": 9, "hair": 118, "happy": 5, "hat": 88, "hawaii": 97, "human": 67, "ice cream": 102, "in car": 152, "jeep": 150, "king": 133, "lady": 40, "lanyard": 38, "large": 4, "laying down": 53, "leather": 129, "lg": 95, "little girl": 99, "low": 39, "lying down": 188, "man": 162, "many": 26, "monitor": 181, "name tag": 43, "natural": 82, "necklace": 6, "neon": 61, "net": 90, "no": 112, "not sure": 122, "not there": 171, "nothing": 23, "on road": 161, "on street": 47, "orange": 128, "out": 56, "outside": 195, "park": 86, "person": 127, "photographer": 170, "picnic table": 74, "pink": 191, "plain": 94, "plastic": 35, "plate": 62, "platform": 126, "protection": 54, "purple": 2, "queen": 106, "rack": 22, "red": 7, "red and blue": 144, "red and yellow": 109, "resting": 174, "right": 89, "roof": 51, "screen": 108, "security": 37, "shade": 70, "shadow": 34, "shadows": 160, "shelter": 148, "shrimp": 29, "sidewalk": 21, "skateboard": 59, "skateboarding": 172, "skier": 33, "skiing": 169, "sky": 48, "sleeping": 198, "small": 44, "smile": 184, "smiling": 158, "snow": 3, "snowboard": 32, "snowboarder": 197, "snowboarding": 120, "soccer": 105, "soccer ball": 17, "solid": 177, "stand": 117, "station": 69, "street": 0, "stripes": 27, "style": 13, "sun": 14, "suv": 1, "tabby": 87, "table": 19, "talking": 114, "talking on phone": 72, "tan": 42, "tent": 124, "they aren't": 153, "tired": 165, "tower": 49, "train": 76, "trees": 8, "tv": 168, "unknown": 64, "walking": 141, "wall": 151, "watching": 110, "wedding": 75, "white": 11, "white and black": 186, "white and blue": 68, "window": 24, "windows": 131, "wine": 187, "wine tasting": 130, "woman": 180, "women": 28, "woods": 119, "yellow": 103, "yes": 46, "zoo": 139 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.42.3", "type_vocab_size": 2, "vocab_size": 30522 }