{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "skiing", "1": "women", "2": "hawaii", "3": "clock", "4": "solid", "5": "exit", "6": "jeep", "7": "girl", "8": "2010", "9": "plate", "10": "no", "11": "beige", "12": "hat", "13": "shelter", "14": "talking on phone", "15": "snowboard", "16": "train", "17": "cat", "18": "little girl", "19": "ground", "20": "security", "21": "shadow", "22": "smiling", "23": "cup", "24": "2", "25": "low", "26": "sleeping", "27": "1", "28": "soccer ball", "29": "cloudy", "30": "africa", "31": "double", "32": "roof", "33": "tower", "34": "white and black", "35": "gray", "36": "lady", "37": "curtains", "38": "not there", "39": "fashion", "40": "8:35", "41": "snowboarding", "42": "crossing", "43": "tan", "44": "canopy", "45": "right", "46": "on road", "47": "skier", "48": "car", "49": "they aren't", "50": "sky", "51": "neon", "52": "hair", "53": "talking", "54": "white", "55": "dirt", "56": "blue", "57": "boy", "58": "photographer", "59": "door", "60": "not sure", "61": "2013", "62": "green", "63": "lg", "64": "wall", "65": "red and blue", "66": "4", "67": "street", "68": "wedding", "69": "woman", "70": "lanyard", "71": "station", "72": "net", "73": "arrow", "74": "clock tower", "75": "ball", "76": "necklace", "77": "monitor", "78": "yellow", "79": "lying down", "80": "many", "81": "zoo", "82": "bricks", "83": "window", "84": "black", "85": "gray and black", "86": "tired", "87": "sidewalk", "88": "desert", "89": "small", "90": "ice cream", "91": "red", "92": "woods", "93": "outside", "94": "brown", "95": "platform", "96": "blue and white", "97": "shade", "98": "church", "99": "person", "100": "doughnut", "101": "birthday", "102": "7:35", "103": "orange", "104": "tv", "105": "plain", "106": "camera", "107": "soccer", "108": "chopsticks", "109": "giraffe", "110": "smile", "111": "cross", "112": "giraffes", "113": "full", "114": "rack", "115": "laying down", "116": "picnic table", "117": "human", "118": "10", "119": "air", "120": "park", "121": "on street", "122": "sun", "123": "style", "124": "6", "125": "natural", "126": "chair", "127": "out", "128": "calico", "129": "stand", "130": "protection", "131": "bicycle", "132": "crown", "133": "tabby", "134": "walking", "135": "king", "136": "blonde", "137": "forest", "138": "windows", "139": "brick", "140": "0", "141": "don't know", "142": "unknown", "143": "skateboarding", "144": "stripes", "145": "black and white", "146": "in car", "147": "bicycles", "148": "cage", "149": "purple", "150": "suv", "151": "large", "152": "nothing", "153": "french", "154": "beagle", "155": "clear", "156": "tent", "157": "9:35", "158": "trees", "159": "dog", "160": "bikes", "161": "snowboarder", "162": "donut", "163": "shadows", "164": "screen", "165": "skateboard", "166": "can't tell", "167": "resting", "168": "queen", "169": "bike rack", "170": "snow", "171": "bus", "172": "shrimp", "173": "plastic", "174": "big ben", "175": "watching", "176": "2000", "177": "3", "178": "leather", "179": "happy", "180": "down", "181": "red and yellow", "182": "8", "183": "fence", "184": "yes", "185": "backpack", "186": "man", "187": "at table", "188": "7:45", "189": "white and blue", "190": "wine", "191": "name tag", "192": "table", "193": "wine tasting", "194": "7", "195": "5", "196": "pink", "197": "curtain", "198": "bedroom" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 140, "1": 27, "10": 118, "2": 24, "2000": 176, "2010": 8, "2013": 61, "3": 177, "4": 66, "5": 195, "6": 124, "7": 194, "7:35": 102, "7:45": 188, "8": 182, "8:35": 40, "9:35": 157, "africa": 30, "air": 119, "arrow": 73, "at table": 187, "backpack": 185, "ball": 75, "beagle": 154, "bedroom": 198, "beige": 11, "bicycle": 131, "bicycles": 147, "big ben": 174, "bike rack": 169, "bikes": 160, "birthday": 101, "black": 84, "black and white": 145, "blonde": 136, "blue": 56, "blue and white": 96, "boy": 57, "brick": 139, "bricks": 82, "brown": 94, "bus": 171, "cage": 148, "calico": 128, "camera": 106, "can't tell": 166, "canopy": 44, "car": 48, "cat": 17, "chair": 126, "chopsticks": 108, "church": 98, "clear": 155, "clock": 3, "clock tower": 74, "cloudy": 29, "cross": 111, "crossing": 42, "crown": 132, "cup": 23, "curtain": 197, "curtains": 37, "desert": 88, "dirt": 55, "dog": 159, "don't know": 141, "donut": 162, "door": 59, "double": 31, "doughnut": 100, "down": 180, "exit": 5, "fashion": 39, "fence": 183, "forest": 137, "french": 153, "full": 113, "giraffe": 109, "giraffes": 112, "girl": 7, "gray": 35, "gray and black": 85, "green": 62, "ground": 19, "hair": 52, "happy": 179, "hat": 12, "hawaii": 2, "human": 117, "ice cream": 90, "in car": 146, "jeep": 6, "king": 135, "lady": 36, "lanyard": 70, "large": 151, "laying down": 115, "leather": 178, "lg": 63, "little girl": 18, "low": 25, "lying down": 79, "man": 186, "many": 80, "monitor": 77, "name tag": 191, "natural": 125, "necklace": 76, "neon": 51, "net": 72, "no": 10, "not sure": 60, "not there": 38, "nothing": 152, "on road": 46, "on street": 121, "orange": 103, "out": 127, "outside": 93, "park": 120, "person": 99, "photographer": 58, "picnic table": 116, "pink": 196, "plain": 105, "plastic": 173, "plate": 9, "platform": 95, "protection": 130, "purple": 149, "queen": 168, "rack": 114, "red": 91, "red and blue": 65, "red and yellow": 181, "resting": 167, "right": 45, "roof": 32, "screen": 164, "security": 20, "shade": 97, "shadow": 21, "shadows": 163, "shelter": 13, "shrimp": 172, "sidewalk": 87, "skateboard": 165, "skateboarding": 143, "skier": 47, "skiing": 0, "sky": 50, "sleeping": 26, "small": 89, "smile": 110, "smiling": 22, "snow": 170, "snowboard": 15, "snowboarder": 161, "snowboarding": 41, "soccer": 107, "soccer ball": 28, "solid": 4, "stand": 129, "station": 71, "street": 67, "stripes": 144, "style": 123, "sun": 122, "suv": 150, "tabby": 133, "table": 192, "talking": 53, "talking on phone": 14, "tan": 43, "tent": 156, "they aren't": 49, "tired": 86, "tower": 33, "train": 16, "trees": 158, "tv": 104, "unknown": 142, "walking": 134, "wall": 64, "watching": 175, "wedding": 68, "white": 54, "white and black": 34, "white and blue": 189, "window": 83, "windows": 138, "wine": 190, "wine tasting": 193, "woman": 69, "women": 1, "woods": 92, "yellow": 78, "yes": 184, "zoo": 81 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.29.2", "type_vocab_size": 2, "vocab_size": 30522 }