{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "window", "1": "park", "2": "giraffes", "3": "in car", "4": "human", "5": "cup", "6": "church", "7": "they aren't", "8": "wine tasting", "9": "snowboarding", "10": "2", "11": "woods", "12": "wine", "13": "sun", "14": "sidewalk", "15": "white and blue", "16": "white and black", "17": "blue", "18": "snow", "19": "5", "20": "purple", "21": "air", "22": "birthday", "23": "lanyard", "24": "ice cream", "25": "tower", "26": "4", "27": "desert", "28": "cross", "29": "camera", "30": "security", "31": "jeep", "32": "bike rack", "33": "talking", "34": "woman", "35": "wedding", "36": "forest", "37": "bus", "38": "smiling", "39": "hat", "40": "photographer", "41": "green", "42": "bedroom", "43": "full", "44": "crossing", "45": "skateboarding", "46": "soccer", "47": "blue and white", "48": "hair", "49": "hawaii", "50": "curtain", "51": "calico", "52": "big ben", "53": "laying down", "54": "giraffe", "55": "exit", "56": "clock", "57": "1", "58": "skier", "59": "small", "60": "can't tell", "61": "brick", "62": "stand", "63": "queen", "64": "7", "65": "resting", "66": "ball", "67": "many", "68": "picnic table", "69": "unknown", "70": "name tag", "71": "red and blue", "72": "stripes", "73": "zoo", "74": "clock tower", "75": "table", "76": "women", "77": "not sure", "78": "monitor", "79": "girl", "80": "black and white", "81": "double", "82": "ground", "83": "sky", "84": "on street", "85": "door", "86": "tired", "87": "natural", "88": "0", "89": "bricks", "90": "windows", "91": "africa", "92": "train", "93": "3", "94": "crown", "95": "red and yellow", "96": "8", "97": "backpack", "98": "french", "99": "happy", "100": "dirt", "101": "lg", "102": "tan", "103": "screen", "104": "fashion", "105": "necklace", "106": "cloudy", "107": "lady", "108": "tv", "109": "outside", "110": "10", "111": "station", "112": "talking on phone", "113": "yes", "114": "2010", "115": "don't know", "116": "dog", "117": "shadow", "118": "canopy", "119": "net", "120": "wall", "121": "white", "122": "nothing", "123": "cage", "124": "beagle", "125": "shelter", "126": "on road", "127": "snowboarder", "128": "7:45", "129": "little girl", "130": "soccer ball", "131": "8:35", "132": "street", "133": "tent", "134": "car", "135": "smile", "136": "gray", "137": "beige", "138": "suv", "139": "pink", "140": "tabby", "141": "2000", "142": "2013", "143": "protection", "144": "blonde", "145": "plate", "146": "cat", "147": "chopsticks", "148": "skiing", "149": "large", "150": "bicycles", "151": "leather", "152": "black", "153": "right", "154": "clear", "155": "6", "156": "arrow", "157": "at table", "158": "boy", "159": "plastic", "160": "7:35", "161": "no", "162": "skateboard", "163": "man", "164": "9:35", "165": "trees", "166": "platform", "167": "shade", "168": "yellow", "169": "chair", "170": "shadows", "171": "donut", "172": "neon", "173": "down", "174": "watching", "175": "shrimp", "176": "plain", "177": "king", "178": "walking", "179": "not there", "180": "snowboard", "181": "brown", "182": "out", "183": "person", "184": "bikes", "185": "style", "186": "curtains", "187": "doughnut", "188": "orange", "189": "red", "190": "low", "191": "roof", "192": "lying down", "193": "solid", "194": "rack", "195": "fence", "196": "sleeping", "197": "bicycle", "198": "gray and black" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 88, "1": 57, "10": 110, "2": 10, "2000": 141, "2010": 114, "2013": 142, "3": 93, "4": 26, "5": 19, "6": 155, "7": 64, "7:35": 160, "7:45": 128, "8": 96, "8:35": 131, "9:35": 164, "africa": 91, "air": 21, "arrow": 156, "at table": 157, "backpack": 97, "ball": 66, "beagle": 124, "bedroom": 42, "beige": 137, "bicycle": 197, "bicycles": 150, "big ben": 52, "bike rack": 32, "bikes": 184, "birthday": 22, "black": 152, "black and white": 80, "blonde": 144, "blue": 17, "blue and white": 47, "boy": 158, "brick": 61, "bricks": 89, "brown": 181, "bus": 37, "cage": 123, "calico": 51, "camera": 29, "can't tell": 60, "canopy": 118, "car": 134, "cat": 146, "chair": 169, "chopsticks": 147, "church": 6, "clear": 154, "clock": 56, "clock tower": 74, "cloudy": 106, "cross": 28, "crossing": 44, "crown": 94, "cup": 5, "curtain": 50, "curtains": 186, "desert": 27, "dirt": 100, "dog": 116, "don't know": 115, "donut": 171, "door": 85, "double": 81, "doughnut": 187, "down": 173, "exit": 55, "fashion": 104, "fence": 195, "forest": 36, "french": 98, "full": 43, "giraffe": 54, "giraffes": 2, "girl": 79, "gray": 136, "gray and black": 198, "green": 41, "ground": 82, "hair": 48, "happy": 99, "hat": 39, "hawaii": 49, "human": 4, "ice cream": 24, "in car": 3, "jeep": 31, "king": 177, "lady": 107, "lanyard": 23, "large": 149, "laying down": 53, "leather": 151, "lg": 101, "little girl": 129, "low": 190, "lying down": 192, "man": 163, "many": 67, "monitor": 78, "name tag": 70, "natural": 87, "necklace": 105, "neon": 172, "net": 119, "no": 161, "not sure": 77, "not there": 179, "nothing": 122, "on road": 126, "on street": 84, "orange": 188, "out": 182, "outside": 109, "park": 1, "person": 183, "photographer": 40, "picnic table": 68, "pink": 139, "plain": 176, "plastic": 159, "plate": 145, "platform": 166, "protection": 143, "purple": 20, "queen": 63, "rack": 194, "red": 189, "red and blue": 71, "red and yellow": 95, "resting": 65, "right": 153, "roof": 191, "screen": 103, "security": 30, "shade": 167, "shadow": 117, "shadows": 170, "shelter": 125, "shrimp": 175, "sidewalk": 14, "skateboard": 162, "skateboarding": 45, "skier": 58, "skiing": 148, "sky": 83, "sleeping": 196, "small": 59, "smile": 135, "smiling": 38, "snow": 18, "snowboard": 180, "snowboarder": 127, "snowboarding": 9, "soccer": 46, "soccer ball": 130, "solid": 193, "stand": 62, "station": 111, "street": 132, "stripes": 72, "style": 185, "sun": 13, "suv": 138, "tabby": 140, "table": 75, "talking": 33, "talking on phone": 112, "tan": 102, "tent": 133, "they aren't": 7, "tired": 86, "tower": 25, "train": 92, "trees": 165, "tv": 108, "unknown": 69, "walking": 178, "wall": 120, "watching": 174, "wedding": 35, "white": 121, "white and black": 16, "white and blue": 15, "window": 0, "windows": 90, "wine": 12, "wine tasting": 8, "woman": 34, "women": 76, "woods": 11, "yellow": 168, "yes": 113, "zoo": 73 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.39.3", "type_vocab_size": 2, "vocab_size": 30522 }