{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "yes", "1": "black and white", "2": "sidewalk", "3": "full", "4": "lanyard", "5": "at table", "6": "security", "7": "canopy", "8": "plate", "9": "4", "10": "red", "11": "cloudy", "12": "down", "13": "large", "14": "jeep", "15": "dirt", "16": "monitor", "17": "crown", "18": "low", "19": "smile", "20": "don't know", "21": "park", "22": "necklace", "23": "8:35", "24": "2000", "25": "style", "26": "2013", "27": "8", "28": "clock", "29": "wine", "30": "air", "31": "desert", "32": "skateboarding", "33": "skier", "34": "window", "35": "hat", "36": "white", "37": "donut", "38": "photographer", "39": "dog", "40": "shelter", "41": "not sure", "42": "red and blue", "43": "resting", "44": "not there", "45": "big ben", "46": "green", "47": "platform", "48": "sleeping", "49": "backpack", "50": "protection", "51": "bike rack", "52": "nothing", "53": "tent", "54": "plain", "55": "solid", "56": "bikes", "57": "2", "58": "smiling", "59": "white and black", "60": "brick", "61": "gray", "62": "snow", "63": "door", "64": "blue and white", "65": "picnic table", "66": "doughnut", "67": "outside", "68": "5", "69": "fence", "70": "arrow", "71": "camera", "72": "9:35", "73": "man", "74": "3", "75": "giraffes", "76": "1", "77": "birthday", "78": "ball", "79": "net", "80": "tv", "81": "trees", "82": "walking", "83": "7:45", "84": "blue", "85": "small", "86": "exit", "87": "bus", "88": "7:35", "89": "church", "90": "suv", "91": "human", "92": "beige", "93": "they aren't", "94": "natural", "95": "lady", "96": "roof", "97": "right", "98": "2010", "99": "happy", "100": "curtains", "101": "tan", "102": "hair", "103": "lying down", "104": "table", "105": "wall", "106": "tabby", "107": "on street", "108": "bicycles", "109": "shade", "110": "6", "111": "stripes", "112": "soccer ball", "113": "king", "114": "white and blue", "115": "0", "116": "many", "117": "giraffe", "118": "ice cream", "119": "laying down", "120": "gray and black", "121": "skateboard", "122": "bricks", "123": "shrimp", "124": "cup", "125": "watching", "126": "forest", "127": "woods", "128": "beagle", "129": "yellow", "130": "street", "131": "stand", "132": "on road", "133": "french", "134": "shadow", "135": "snowboarding", "136": "queen", "137": "bedroom", "138": "talking on phone", "139": "black", "140": "car", "141": "plastic", "142": "cat", "143": "sky", "144": "shadows", "145": "clear", "146": "orange", "147": "talking", "148": "hawaii", "149": "women", "150": "unknown", "151": "no", "152": "crossing", "153": "person", "154": "skiing", "155": "brown", "156": "tired", "157": "name tag", "158": "out", "159": "girl", "160": "lg", "161": "can't tell", "162": "boy", "163": "7", "164": "chair", "165": "10", "166": "train", "167": "soccer", "168": "chopsticks", "169": "purple", "170": "wine tasting", "171": "zoo", "172": "clock tower", "173": "africa", "174": "snowboarder", "175": "tower", "176": "cage", "177": "pink", "178": "rack", "179": "double", "180": "screen", "181": "station", "182": "blonde", "183": "bicycle", "184": "sun", "185": "red and yellow", "186": "calico", "187": "curtain", "188": "wedding", "189": "in car", "190": "leather", "191": "windows", "192": "ground", "193": "woman", "194": "fashion", "195": "snowboard", "196": "cross", "197": "little girl", "198": "neon" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 115, "1": 76, "10": 165, "2": 57, "2000": 24, "2010": 98, "2013": 26, "3": 74, "4": 9, "5": 68, "6": 110, "7": 163, "7:35": 88, "7:45": 83, "8": 27, "8:35": 23, "9:35": 72, "africa": 173, "air": 30, "arrow": 70, "at table": 5, "backpack": 49, "ball": 78, "beagle": 128, "bedroom": 137, "beige": 92, "bicycle": 183, "bicycles": 108, "big ben": 45, "bike rack": 51, "bikes": 56, "birthday": 77, "black": 139, "black and white": 1, "blonde": 182, "blue": 84, "blue and white": 64, "boy": 162, "brick": 60, "bricks": 122, "brown": 155, "bus": 87, "cage": 176, "calico": 186, "camera": 71, "can't tell": 161, "canopy": 7, "car": 140, "cat": 142, "chair": 164, "chopsticks": 168, "church": 89, "clear": 145, "clock": 28, "clock tower": 172, "cloudy": 11, "cross": 196, "crossing": 152, "crown": 17, "cup": 124, "curtain": 187, "curtains": 100, "desert": 31, "dirt": 15, "dog": 39, "don't know": 20, "donut": 37, "door": 63, "double": 179, "doughnut": 66, "down": 12, "exit": 86, "fashion": 194, "fence": 69, "forest": 126, "french": 133, "full": 3, "giraffe": 117, "giraffes": 75, "girl": 159, "gray": 61, "gray and black": 120, "green": 46, "ground": 192, "hair": 102, "happy": 99, "hat": 35, "hawaii": 148, "human": 91, "ice cream": 118, "in car": 189, "jeep": 14, "king": 113, "lady": 95, "lanyard": 4, "large": 13, "laying down": 119, "leather": 190, "lg": 160, "little girl": 197, "low": 18, "lying down": 103, "man": 73, "many": 116, "monitor": 16, "name tag": 157, "natural": 94, "necklace": 22, "neon": 198, "net": 79, "no": 151, "not sure": 41, "not there": 44, "nothing": 52, "on road": 132, "on street": 107, "orange": 146, "out": 158, "outside": 67, "park": 21, "person": 153, "photographer": 38, "picnic table": 65, "pink": 177, "plain": 54, "plastic": 141, "plate": 8, "platform": 47, "protection": 50, "purple": 169, "queen": 136, "rack": 178, "red": 10, "red and blue": 42, "red and yellow": 185, "resting": 43, "right": 97, "roof": 96, "screen": 180, "security": 6, "shade": 109, "shadow": 134, "shadows": 144, "shelter": 40, "shrimp": 123, "sidewalk": 2, "skateboard": 121, "skateboarding": 32, "skier": 33, "skiing": 154, "sky": 143, "sleeping": 48, "small": 85, "smile": 19, "smiling": 58, "snow": 62, "snowboard": 195, "snowboarder": 174, "snowboarding": 135, "soccer": 167, "soccer ball": 112, "solid": 55, "stand": 131, "station": 181, "street": 130, "stripes": 111, "style": 25, "sun": 184, "suv": 90, "tabby": 106, "table": 104, "talking": 147, "talking on phone": 138, "tan": 101, "tent": 53, "they aren't": 93, "tired": 156, "tower": 175, "train": 166, "trees": 81, "tv": 80, "unknown": 150, "walking": 82, "wall": 105, "watching": 125, "wedding": 188, "white": 36, "white and black": 59, "white and blue": 114, "window": 34, "windows": 191, "wine": 29, "wine tasting": 170, "woman": 193, "women": 149, "woods": 127, "yellow": 129, "yes": 0, "zoo": 171 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.37.2", "type_vocab_size": 2, "vocab_size": 30522 }