{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "small", "1": "they aren't", "2": "out", "3": "wine", "4": "red", "5": "clock", "6": "black", "7": "gray and black", "8": "train", "9": "curtain", "10": "cage", "11": "man", "12": "snowboarding", "13": "security", "14": "women", "15": "wall", "16": "7:45", "17": "forest", "18": "brick", "19": "8", "20": "plain", "21": "platform", "22": "2010", "23": "bedroom", "24": "zoo", "25": "necklace", "26": "skateboard", "27": "tv", "28": "watching", "29": "doughnut", "30": "door", "31": "cloudy", "32": "1", "33": "green", "34": "calico", "35": "full", "36": "not sure", "37": "exit", "38": "at table", "39": "on road", "40": "solid", "41": "red and yellow", "42": "bus", "43": "fashion", "44": "person", "45": "white and blue", "46": "street", "47": "crossing", "48": "9:35", "49": "5", "50": "name tag", "51": "6", "52": "bikes", "53": "cat", "54": "wine tasting", "55": "monitor", "56": "skiing", "57": "shelter", "58": "giraffe", "59": "human", "60": "sky", "61": "skier", "62": "blue and white", "63": "white and black", "64": "screen", "65": "snow", "66": "0", "67": "roof", "68": "outside", "69": "can't tell", "70": "king", "71": "trees", "72": "tower", "73": "laying down", "74": "soccer ball", "75": "desert", "76": "leather", "77": "beige", "78": "tabby", "79": "cup", "80": "2013", "81": "suv", "82": "chopsticks", "83": "snowboard", "84": "low", "85": "photographer", "86": "lady", "87": "rack", "88": "station", "89": "bike rack", "90": "yellow", "91": "backpack", "92": "red and blue", "93": "purple", "94": "girl", "95": "protection", "96": "bicycles", "97": "plate", "98": "2", "99": "talking on phone", "100": "donut", "101": "unknown", "102": "bricks", "103": "net", "104": "10", "105": "4", "106": "in car", "107": "cross", "108": "blue", "109": "fence", "110": "africa", "111": "8:35", "112": "wedding", "113": "talking", "114": "beagle", "115": "shadow", "116": "canopy", "117": "shrimp", "118": "on street", "119": "no", "120": "shade", "121": "camera", "122": "jeep", "123": "picnic table", "124": "tent", "125": "church", "126": "walking", "127": "woman", "128": "dirt", "129": "dog", "130": "brown", "131": "smile", "132": "pink", "133": "right", "134": "orange", "135": "double", "136": "sleeping", "137": "ball", "138": "happy", "139": "sun", "140": "sidewalk", "141": "don't know", "142": "white", "143": "hawaii", "144": "blonde", "145": "many", "146": "ground", "147": "boy", "148": "nothing", "149": "style", "150": "little girl", "151": "arrow", "152": "queen", "153": "soccer", "154": "birthday", "155": "black and white", "156": "park", "157": "giraffes", "158": "air", "159": "skateboarding", "160": "smiling", "161": "down", "162": "clock tower", "163": "large", "164": "not there", "165": "table", "166": "windows", "167": "shadows", "168": "lanyard", "169": "tan", "170": "gray", "171": "yes", "172": "lg", "173": "lying down", "174": "resting", "175": "7", "176": "hat", "177": "ice cream", "178": "stripes", "179": "window", "180": "chair", "181": "3", "182": "snowboarder", "183": "clear", "184": "neon", "185": "crown", "186": "woods", "187": "7:35", "188": "car", "189": "natural", "190": "french", "191": "stand", "192": "tired", "193": "curtains", "194": "hair", "195": "plastic", "196": "2000", "197": "bicycle", "198": "big ben" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 66, "1": 32, "10": 104, "2": 98, "2000": 196, "2010": 22, "2013": 80, "3": 181, "4": 105, "5": 49, "6": 51, "7": 175, "7:35": 187, "7:45": 16, "8": 19, "8:35": 111, "9:35": 48, "africa": 110, "air": 158, "arrow": 151, "at table": 38, "backpack": 91, "ball": 137, "beagle": 114, "bedroom": 23, "beige": 77, "bicycle": 197, "bicycles": 96, "big ben": 198, "bike rack": 89, "bikes": 52, "birthday": 154, "black": 6, "black and white": 155, "blonde": 144, "blue": 108, "blue and white": 62, "boy": 147, "brick": 18, "bricks": 102, "brown": 130, "bus": 42, "cage": 10, "calico": 34, "camera": 121, "can't tell": 69, "canopy": 116, "car": 188, "cat": 53, "chair": 180, "chopsticks": 82, "church": 125, "clear": 183, "clock": 5, "clock tower": 162, "cloudy": 31, "cross": 107, "crossing": 47, "crown": 185, "cup": 79, "curtain": 9, "curtains": 193, "desert": 75, "dirt": 128, "dog": 129, "don't know": 141, "donut": 100, "door": 30, "double": 135, "doughnut": 29, "down": 161, "exit": 37, "fashion": 43, "fence": 109, "forest": 17, "french": 190, "full": 35, "giraffe": 58, "giraffes": 157, "girl": 94, "gray": 170, "gray and black": 7, "green": 33, "ground": 146, "hair": 194, "happy": 138, "hat": 176, "hawaii": 143, "human": 59, "ice cream": 177, "in car": 106, "jeep": 122, "king": 70, "lady": 86, "lanyard": 168, "large": 163, "laying down": 73, "leather": 76, "lg": 172, "little girl": 150, "low": 84, "lying down": 173, "man": 11, "many": 145, "monitor": 55, "name tag": 50, "natural": 189, "necklace": 25, "neon": 184, "net": 103, "no": 119, "not sure": 36, "not there": 164, "nothing": 148, "on road": 39, "on street": 118, "orange": 134, "out": 2, "outside": 68, "park": 156, "person": 44, "photographer": 85, "picnic table": 123, "pink": 132, "plain": 20, "plastic": 195, "plate": 97, "platform": 21, "protection": 95, "purple": 93, "queen": 152, "rack": 87, "red": 4, "red and blue": 92, "red and yellow": 41, "resting": 174, "right": 133, "roof": 67, "screen": 64, "security": 13, "shade": 120, "shadow": 115, "shadows": 167, "shelter": 57, "shrimp": 117, "sidewalk": 140, "skateboard": 26, "skateboarding": 159, "skier": 61, "skiing": 56, "sky": 60, "sleeping": 136, "small": 0, "smile": 131, "smiling": 160, "snow": 65, "snowboard": 83, "snowboarder": 182, "snowboarding": 12, "soccer": 153, "soccer ball": 74, "solid": 40, "stand": 191, "station": 88, "street": 46, "stripes": 178, "style": 149, "sun": 139, "suv": 81, "tabby": 78, "table": 165, "talking": 113, "talking on phone": 99, "tan": 169, "tent": 124, "they aren't": 1, "tired": 192, "tower": 72, "train": 8, "trees": 71, "tv": 27, "unknown": 101, "walking": 126, "wall": 15, "watching": 28, "wedding": 112, "white": 142, "white and black": 63, "white and blue": 45, "window": 179, "windows": 166, "wine": 3, "wine tasting": 54, "woman": 127, "women": 14, "woods": 186, "yellow": 90, "yes": 171, "zoo": 24 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.35.2", "type_vocab_size": 2, "vocab_size": 30522 }