{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "8", "1": "purple", "2": "air", "3": "bedroom", "4": "yellow", "5": "forest", "6": "person", "7": "big ben", "8": "ball", "9": "doughnut", "10": "fence", "11": "5", "12": "3", "13": "1", "14": "snowboarder", "15": "down", "16": "stand", "17": "soccer ball", "18": "exit", "19": "not sure", "20": "curtains", "21": "2013", "22": "security", "23": "chopsticks", "24": "resting", "25": "hawaii", "26": "lying down", "27": "6", "28": "monitor", "29": "tower", "30": "unknown", "31": "bikes", "32": "walking", "33": "beagle", "34": "windows", "35": "calico", "36": "don't know", "37": "large", "38": "shadow", "39": "style", "40": "tabby", "41": "station", "42": "blue", "43": "plate", "44": "7", "45": "small", "46": "4", "47": "wine tasting", "48": "cloudy", "49": "door", "50": "dirt", "51": "bus", "52": "9:35", "53": "car", "54": "cross", "55": "smile", "56": "woman", "57": "happy", "58": "snowboard", "59": "skateboard", "60": "7:45", "61": "wine", "62": "chair", "63": "tv", "64": "solid", "65": "bicycle", "66": "soccer", "67": "beige", "68": "screen", "69": "giraffes", "70": "tired", "71": "natural", "72": "many", "73": "7:35", "74": "canopy", "75": "cage", "76": "clock", "77": "curtain", "78": "lanyard", "79": "church", "80": "shade", "81": "orange", "82": "talking", "83": "wall", "84": "green", "85": "white", "86": "arrow", "87": "trees", "88": "can't tell", "89": "low", "90": "table", "91": "tent", "92": "plain", "93": "boy", "94": "giraffe", "95": "women", "96": "gray and black", "97": "girl", "98": "smiling", "99": "black and white", "100": "sky", "101": "leather", "102": "bricks", "103": "black", "104": "on street", "105": "woods", "106": "2", "107": "park", "108": "they aren't", "109": "out", "110": "roof", "111": "train", "112": "talking on phone", "113": "ice cream", "114": "platform", "115": "zoo", "116": "white and black", "117": "not there", "118": "at table", "119": "street", "120": "pink", "121": "sleeping", "122": "desert", "123": "wedding", "124": "man", "125": "shrimp", "126": "2000", "127": "nothing", "128": "8:35", "129": "brown", "130": "hair", "131": "plastic", "132": "french", "133": "neon", "134": "tan", "135": "rack", "136": "blue and white", "137": "shelter", "138": "right", "139": "sun", "140": "queen", "141": "yes", "142": "king", "143": "no", "144": "window", "145": "gray", "146": "full", "147": "lg", "148": "red and yellow", "149": "cat", "150": "2010", "151": "cup", "152": "name tag", "153": "necklace", "154": "skiing", "155": "blonde", "156": "white and blue", "157": "outside", "158": "photographer", "159": "human", "160": "stripes", "161": "africa", "162": "little girl", "163": "snowboarding", "164": "shadows", "165": "watching", "166": "red", "167": "ground", "168": "jeep", "169": "fashion", "170": "camera", "171": "in car", "172": "hat", "173": "backpack", "174": "protection", "175": "0", "176": "crossing", "177": "donut", "178": "picnic table", "179": "double", "180": "10", "181": "red and blue", "182": "skier", "183": "crown", "184": "lady", "185": "clear", "186": "sidewalk", "187": "net", "188": "brick", "189": "birthday", "190": "clock tower", "191": "on road", "192": "bike rack", "193": "skateboarding", "194": "suv", "195": "dog", "196": "bicycles", "197": "snow", "198": "laying down" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 175, "1": 13, "10": 180, "2": 106, "2000": 126, "2010": 150, "2013": 21, "3": 12, "4": 46, "5": 11, "6": 27, "7": 44, "7:35": 73, "7:45": 60, "8": 0, "8:35": 128, "9:35": 52, "africa": 161, "air": 2, "arrow": 86, "at table": 118, "backpack": 173, "ball": 8, "beagle": 33, "bedroom": 3, "beige": 67, "bicycle": 65, "bicycles": 196, "big ben": 7, "bike rack": 192, "bikes": 31, "birthday": 189, "black": 103, "black and white": 99, "blonde": 155, "blue": 42, "blue and white": 136, "boy": 93, "brick": 188, "bricks": 102, "brown": 129, "bus": 51, "cage": 75, "calico": 35, "camera": 170, "can't tell": 88, "canopy": 74, "car": 53, "cat": 149, "chair": 62, "chopsticks": 23, "church": 79, "clear": 185, "clock": 76, "clock tower": 190, "cloudy": 48, "cross": 54, "crossing": 176, "crown": 183, "cup": 151, "curtain": 77, "curtains": 20, "desert": 122, "dirt": 50, "dog": 195, "don't know": 36, "donut": 177, "door": 49, "double": 179, "doughnut": 9, "down": 15, "exit": 18, "fashion": 169, "fence": 10, "forest": 5, "french": 132, "full": 146, "giraffe": 94, "giraffes": 69, "girl": 97, "gray": 145, "gray and black": 96, "green": 84, "ground": 167, "hair": 130, "happy": 57, "hat": 172, "hawaii": 25, "human": 159, "ice cream": 113, "in car": 171, "jeep": 168, "king": 142, "lady": 184, "lanyard": 78, "large": 37, "laying down": 198, "leather": 101, "lg": 147, "little girl": 162, "low": 89, "lying down": 26, "man": 124, "many": 72, "monitor": 28, "name tag": 152, "natural": 71, "necklace": 153, "neon": 133, "net": 187, "no": 143, "not sure": 19, "not there": 117, "nothing": 127, "on road": 191, "on street": 104, "orange": 81, "out": 109, "outside": 157, "park": 107, "person": 6, "photographer": 158, "picnic table": 178, "pink": 120, "plain": 92, "plastic": 131, "plate": 43, "platform": 114, "protection": 174, "purple": 1, "queen": 140, "rack": 135, "red": 166, "red and blue": 181, "red and yellow": 148, "resting": 24, "right": 138, "roof": 110, "screen": 68, "security": 22, "shade": 80, "shadow": 38, "shadows": 164, "shelter": 137, "shrimp": 125, "sidewalk": 186, "skateboard": 59, "skateboarding": 193, "skier": 182, "skiing": 154, "sky": 100, "sleeping": 121, "small": 45, "smile": 55, "smiling": 98, "snow": 197, "snowboard": 58, "snowboarder": 14, "snowboarding": 163, "soccer": 66, "soccer ball": 17, "solid": 64, "stand": 16, "station": 41, "street": 119, "stripes": 160, "style": 39, "sun": 139, "suv": 194, "tabby": 40, "table": 90, "talking": 82, "talking on phone": 112, "tan": 134, "tent": 91, "they aren't": 108, "tired": 70, "tower": 29, "train": 111, "trees": 87, "tv": 63, "unknown": 30, "walking": 32, "wall": 83, "watching": 165, "wedding": 123, "white": 85, "white and black": 116, "white and blue": 156, "window": 144, "windows": 34, "wine": 61, "wine tasting": 47, "woman": 56, "women": 95, "woods": 105, "yellow": 4, "yes": 141, "zoo": 115 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.38.2", "type_vocab_size": 2, "vocab_size": 30522 }