{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "walking", "1": "sleeping", "2": "black", "3": "car", "4": "black and white", "5": "queen", "6": "10", "7": "french", "8": "crossing", "9": "5", "10": "air", "11": "zoo", "12": "wall", "13": "tower", "14": "stand", "15": "snowboard", "16": "purple", "17": "lady", "18": "church", "19": "sun", "20": "wedding", "21": "little girl", "22": "camera", "23": "station", "24": "yellow", "25": "chair", "26": "smiling", "27": "8:35", "28": "blue", "29": "on road", "30": "leather", "31": "on street", "32": "bicycles", "33": "unknown", "34": "king", "35": "skateboarding", "36": "neon", "37": "no", "38": "lanyard", "39": "cloudy", "40": "trees", "41": "park", "42": "shelter", "43": "in car", "44": "arrow", "45": "bus", "46": "train", "47": "bicycle", "48": "wine", "49": "9:35", "50": "talking on phone", "51": "4", "52": "roof", "53": "woman", "54": "blue and white", "55": "orange", "56": "happy", "57": "bike rack", "58": "red", "59": "woods", "60": "dog", "61": "big ben", "62": "sidewalk", "63": "can't tell", "64": "tv", "65": "windows", "66": "human", "67": "snowboarder", "68": "screen", "69": "calico", "70": "cat", "71": "name tag", "72": "brick", "73": "soccer ball", "74": "backpack", "75": "they aren't", "76": "blonde", "77": "canopy", "78": "6", "79": "girl", "80": "giraffes", "81": "chopsticks", "82": "beige", "83": "nothing", "84": "white and black", "85": "dirt", "86": "plate", "87": "shrimp", "88": "7:45", "89": "brown", "90": "small", "91": "picnic table", "92": "curtain", "93": "stripes", "94": "talking", "95": "right", "96": "skier", "97": "full", "98": "platform", "99": "door", "100": "desert", "101": "double", "102": "bedroom", "103": "cup", "104": "yes", "105": "solid", "106": "snowboarding", "107": "hair", "108": "clear", "109": "tan", "110": "doughnut", "111": "net", "112": "exit", "113": "not there", "114": "green", "115": "forest", "116": "3", "117": "2", "118": "ball", "119": "soccer", "120": "fashion", "121": "bricks", "122": "person", "123": "many", "124": "photographer", "125": "don't know", "126": "tent", "127": "lg", "128": "natural", "129": "rack", "130": "outside", "131": "gray and black", "132": "africa", "133": "gray", "134": "suv", "135": "bikes", "136": "clock tower", "137": "large", "138": "shadows", "139": "hat", "140": "snow", "141": "not sure", "142": "white and blue", "143": "hawaii", "144": "style", "145": "shadow", "146": "tabby", "147": "low", "148": "cage", "149": "tired", "150": "clock", "151": "2010", "152": "cross", "153": "8", "154": "window", "155": "ice cream", "156": "crown", "157": "pink", "158": "skateboard", "159": "7", "160": "shade", "161": "giraffe", "162": "sky", "163": "lying down", "164": "smile", "165": "jeep", "166": "red and yellow", "167": "resting", "168": "7:35", "169": "white", "170": "wine tasting", "171": "fence", "172": "skiing", "173": "street", "174": "ground", "175": "table", "176": "protection", "177": "boy", "178": "red and blue", "179": "beagle", "180": "0", "181": "2000", "182": "necklace", "183": "watching", "184": "at table", "185": "out", "186": "curtains", "187": "donut", "188": "security", "189": "1", "190": "plastic", "191": "birthday", "192": "plain", "193": "laying down", "194": "monitor", "195": "women", "196": "man", "197": "2013", "198": "down" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 180, "1": 189, "10": 6, "2": 117, "2000": 181, "2010": 151, "2013": 197, "3": 116, "4": 51, "5": 9, "6": 78, "7": 159, "7:35": 168, "7:45": 88, "8": 153, "8:35": 27, "9:35": 49, "africa": 132, "air": 10, "arrow": 44, "at table": 184, "backpack": 74, "ball": 118, "beagle": 179, "bedroom": 102, "beige": 82, "bicycle": 47, "bicycles": 32, "big ben": 61, "bike rack": 57, "bikes": 135, "birthday": 191, "black": 2, "black and white": 4, "blonde": 76, "blue": 28, "blue and white": 54, "boy": 177, "brick": 72, "bricks": 121, "brown": 89, "bus": 45, "cage": 148, "calico": 69, "camera": 22, "can't tell": 63, "canopy": 77, "car": 3, "cat": 70, "chair": 25, "chopsticks": 81, "church": 18, "clear": 108, "clock": 150, "clock tower": 136, "cloudy": 39, "cross": 152, "crossing": 8, "crown": 156, "cup": 103, "curtain": 92, "curtains": 186, "desert": 100, "dirt": 85, "dog": 60, "don't know": 125, "donut": 187, "door": 99, "double": 101, "doughnut": 110, "down": 198, "exit": 112, "fashion": 120, "fence": 171, "forest": 115, "french": 7, "full": 97, "giraffe": 161, "giraffes": 80, "girl": 79, "gray": 133, "gray and black": 131, "green": 114, "ground": 174, "hair": 107, "happy": 56, "hat": 139, "hawaii": 143, "human": 66, "ice cream": 155, "in car": 43, "jeep": 165, "king": 34, "lady": 17, "lanyard": 38, "large": 137, "laying down": 193, "leather": 30, "lg": 127, "little girl": 21, "low": 147, "lying down": 163, "man": 196, "many": 123, "monitor": 194, "name tag": 71, "natural": 128, "necklace": 182, "neon": 36, "net": 111, "no": 37, "not sure": 141, "not there": 113, "nothing": 83, "on road": 29, "on street": 31, "orange": 55, "out": 185, "outside": 130, "park": 41, "person": 122, "photographer": 124, "picnic table": 91, "pink": 157, "plain": 192, "plastic": 190, "plate": 86, "platform": 98, "protection": 176, "purple": 16, "queen": 5, "rack": 129, "red": 58, "red and blue": 178, "red and yellow": 166, "resting": 167, "right": 95, "roof": 52, "screen": 68, "security": 188, "shade": 160, "shadow": 145, "shadows": 138, "shelter": 42, "shrimp": 87, "sidewalk": 62, "skateboard": 158, "skateboarding": 35, "skier": 96, "skiing": 172, "sky": 162, "sleeping": 1, "small": 90, "smile": 164, "smiling": 26, "snow": 140, "snowboard": 15, "snowboarder": 67, "snowboarding": 106, "soccer": 119, "soccer ball": 73, "solid": 105, "stand": 14, "station": 23, "street": 173, "stripes": 93, "style": 144, "sun": 19, "suv": 134, "tabby": 146, "table": 175, "talking": 94, "talking on phone": 50, "tan": 109, "tent": 126, "they aren't": 75, "tired": 149, "tower": 13, "train": 46, "trees": 40, "tv": 64, "unknown": 33, "walking": 0, "wall": 12, "watching": 183, "wedding": 20, "white": 169, "white and black": 84, "white and blue": 142, "window": 154, "windows": 65, "wine": 48, "wine tasting": 170, "woman": 53, "women": 195, "woods": 59, "yellow": 24, "yes": 104, "zoo": 11 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.40.0", "type_vocab_size": 2, "vocab_size": 30522 }