{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "blue and white", "1": "screen", "2": "brick", "3": "lanyard", "4": "trees", "5": "clock", "6": "blonde", "7": "canopy", "8": "windows", "9": "no", "10": "beagle", "11": "bicycle", "12": "they aren't", "13": "arrow", "14": "white and black", "15": "big ben", "16": "plate", "17": "bike rack", "18": "bedroom", "19": "donut", "20": "tv", "21": "yes", "22": "nothing", "23": "sidewalk", "24": "wine tasting", "25": "skateboard", "26": "ice cream", "27": "yellow", "28": "1", "29": "jeep", "30": "snowboard", "31": "unknown", "32": "photographer", "33": "door", "34": "hair", "35": "at table", "36": "shelter", "37": "out", "38": "cage", "39": "red and blue", "40": "picnic table", "41": "woods", "42": "wedding", "43": "church", "44": "station", "45": "7", "46": "lying down", "47": "net", "48": "snowboarding", "49": "person", "50": "suv", "51": "full", "52": "cup", "53": "2", "54": "zoo", "55": "skiing", "56": "window", "57": "dog", "58": "table", "59": "monitor", "60": "train", "61": "boy", "62": "french", "63": "ground", "64": "wine", "65": "ball", "66": "platform", "67": "gray", "68": "shrimp", "69": "natural", "70": "woman", "71": "shadows", "72": "9:35", "73": "white and blue", "74": "plastic", "75": "bikes", "76": "necklace", "77": "7:35", "78": "style", "79": "clock tower", "80": "giraffes", "81": "orange", "82": "down", "83": "dirt", "84": "8:35", "85": "3", "86": "bicycles", "87": "purple", "88": "10", "89": "walking", "90": "not there", "91": "little girl", "92": "skateboarding", "93": "on road", "94": "on street", "95": "gray and black", "96": "7:45", "97": "fence", "98": "queen", "99": "tired", "100": "lg", "101": "backpack", "102": "hat", "103": "king", "104": "in car", "105": "smiling", "106": "hawaii", "107": "camera", "108": "crossing", "109": "plain", "110": "cloudy", "111": "solid", "112": "small", "113": "man", "114": "clear", "115": "blue", "116": "green", "117": "africa", "118": "black and white", "119": "red", "120": "white", "121": "cat", "122": "5", "123": "low", "124": "birthday", "125": "skier", "126": "2000", "127": "security", "128": "calico", "129": "shadow", "130": "stripes", "131": "many", "132": "2013", "133": "sky", "134": "lady", "135": "protection", "136": "not sure", "137": "giraffe", "138": "brown", "139": "chopsticks", "140": "curtains", "141": "6", "142": "don't know", "143": "can't tell", "144": "large", "145": "resting", "146": "double", "147": "park", "148": "stand", "149": "pink", "150": "air", "151": "shade", "152": "forest", "153": "2010", "154": "watching", "155": "beige", "156": "soccer ball", "157": "street", "158": "curtain", "159": "talking", "160": "tent", "161": "happy", "162": "name tag", "163": "girl", "164": "tabby", "165": "car", "166": "leather", "167": "wall", "168": "talking on phone", "169": "0", "170": "snow", "171": "sleeping", "172": "exit", "173": "outside", "174": "8", "175": "right", "176": "tan", "177": "human", "178": "tower", "179": "neon", "180": "soccer", "181": "roof", "182": "cross", "183": "smile", "184": "4", "185": "crown", "186": "bus", "187": "rack", "188": "women", "189": "fashion", "190": "doughnut", "191": "snowboarder", "192": "laying down", "193": "bricks", "194": "sun", "195": "desert", "196": "black", "197": "chair", "198": "red and yellow" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 169, "1": 28, "10": 88, "2": 53, "2000": 126, "2010": 153, "2013": 132, "3": 85, "4": 184, "5": 122, "6": 141, "7": 45, "7:35": 77, "7:45": 96, "8": 174, "8:35": 84, "9:35": 72, "africa": 117, "air": 150, "arrow": 13, "at table": 35, "backpack": 101, "ball": 65, "beagle": 10, "bedroom": 18, "beige": 155, "bicycle": 11, "bicycles": 86, "big ben": 15, "bike rack": 17, "bikes": 75, "birthday": 124, "black": 196, "black and white": 118, "blonde": 6, "blue": 115, "blue and white": 0, "boy": 61, "brick": 2, "bricks": 193, "brown": 138, "bus": 186, "cage": 38, "calico": 128, "camera": 107, "can't tell": 143, "canopy": 7, "car": 165, "cat": 121, "chair": 197, "chopsticks": 139, "church": 43, "clear": 114, "clock": 5, "clock tower": 79, "cloudy": 110, "cross": 182, "crossing": 108, "crown": 185, "cup": 52, "curtain": 158, "curtains": 140, "desert": 195, "dirt": 83, "dog": 57, "don't know": 142, "donut": 19, "door": 33, "double": 146, "doughnut": 190, "down": 82, "exit": 172, "fashion": 189, "fence": 97, "forest": 152, "french": 62, "full": 51, "giraffe": 137, "giraffes": 80, "girl": 163, "gray": 67, "gray and black": 95, "green": 116, "ground": 63, "hair": 34, "happy": 161, "hat": 102, "hawaii": 106, "human": 177, "ice cream": 26, "in car": 104, "jeep": 29, "king": 103, "lady": 134, "lanyard": 3, "large": 144, "laying down": 192, "leather": 166, "lg": 100, "little girl": 91, "low": 123, "lying down": 46, "man": 113, "many": 131, "monitor": 59, "name tag": 162, "natural": 69, "necklace": 76, "neon": 179, "net": 47, "no": 9, "not sure": 136, "not there": 90, "nothing": 22, "on road": 93, "on street": 94, "orange": 81, "out": 37, "outside": 173, "park": 147, "person": 49, "photographer": 32, "picnic table": 40, "pink": 149, "plain": 109, "plastic": 74, "plate": 16, "platform": 66, "protection": 135, "purple": 87, "queen": 98, "rack": 187, "red": 119, "red and blue": 39, "red and yellow": 198, "resting": 145, "right": 175, "roof": 181, "screen": 1, "security": 127, "shade": 151, "shadow": 129, "shadows": 71, "shelter": 36, "shrimp": 68, "sidewalk": 23, "skateboard": 25, "skateboarding": 92, "skier": 125, "skiing": 55, "sky": 133, "sleeping": 171, "small": 112, "smile": 183, "smiling": 105, "snow": 170, "snowboard": 30, "snowboarder": 191, "snowboarding": 48, "soccer": 180, "soccer ball": 156, "solid": 111, "stand": 148, "station": 44, "street": 157, "stripes": 130, "style": 78, "sun": 194, "suv": 50, "tabby": 164, "table": 58, "talking": 159, "talking on phone": 168, "tan": 176, "tent": 160, "they aren't": 12, "tired": 99, "tower": 178, "train": 60, "trees": 4, "tv": 20, "unknown": 31, "walking": 89, "wall": 167, "watching": 154, "wedding": 42, "white": 120, "white and black": 14, "white and blue": 73, "window": 56, "windows": 8, "wine": 64, "wine tasting": 24, "woman": 70, "women": 188, "woods": 41, "yellow": 27, "yes": 21, "zoo": 54 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.34.0", "type_vocab_size": 2, "vocab_size": 30522 }