{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "not there", "1": "not sure", "2": "snowboarding", "3": "photographer", "4": "africa", "5": "arrow", "6": "6", "7": "skateboarding", "8": "gray and black", "9": "skiing", "10": "doughnut", "11": "crown", "12": "park", "13": "bus", "14": "clock tower", "15": "natural", "16": "ice cream", "17": "net", "18": "nothing", "19": "donut", "20": "birthday", "21": "brick", "22": "security", "23": "smiling", "24": "1", "25": "hawaii", "26": "curtains", "27": "man", "28": "trees", "29": "leather", "30": "street", "31": "smile", "32": "talking", "33": "protection", "34": "0", "35": "woods", "36": "black", "37": "white", "38": "jeep", "39": "on road", "40": "clock", "41": "canopy", "42": "bricks", "43": "table", "44": "crossing", "45": "cat", "46": "car", "47": "chair", "48": "lanyard", "49": "person", "50": "no", "51": "low", "52": "camera", "53": "sky", "54": "small", "55": "pink", "56": "exit", "57": "blue", "58": "plastic", "59": "stand", "60": "bicycles", "61": "dirt", "62": "snowboard", "63": "air", "64": "8", "65": "zoo", "66": "bedroom", "67": "large", "68": "sleeping", "69": "tent", "70": "laying down", "71": "at table", "72": "boy", "73": "blonde", "74": "skateboard", "75": "cup", "76": "yes", "77": "door", "78": "out", "79": "7:35", "80": "ball", "81": "in car", "82": "tired", "83": "happy", "84": "lady", "85": "monitor", "86": "unknown", "87": "skier", "88": "down", "89": "windows", "90": "gray", "91": "ground", "92": "backpack", "93": "outside", "94": "green", "95": "wedding", "96": "wall", "97": "shade", "98": "9:35", "99": "yellow", "100": "2000", "101": "forest", "102": "they aren't", "103": "queen", "104": "tv", "105": "red and yellow", "106": "brown", "107": "desert", "108": "tan", "109": "sidewalk", "110": "chopsticks", "111": "8:35", "112": "dog", "113": "black and white", "114": "many", "115": "shadows", "116": "french", "117": "2013", "118": "on street", "119": "hair", "120": "red", "121": "name tag", "122": "necklace", "123": "little girl", "124": "calico", "125": "snow", "126": "platform", "127": "watching", "128": "can't tell", "129": "curtain", "130": "2", "131": "right", "132": "beagle", "133": "resting", "134": "soccer", "135": "roof", "136": "style", "137": "lying down", "138": "full", "139": "bikes", "140": "7:45", "141": "snowboarder", "142": "big ben", "143": "5", "144": "train", "145": "orange", "146": "4", "147": "clear", "148": "fence", "149": "double", "150": "stripes", "151": "king", "152": "suv", "153": "plain", "154": "station", "155": "wine tasting", "156": "fashion", "157": "human", "158": "shadow", "159": "giraffe", "160": "sun", "161": "church", "162": "cage", "163": "bicycle", "164": "cross", "165": "shrimp", "166": "plate", "167": "giraffes", "168": "shelter", "169": "lg", "170": "women", "171": "bike rack", "172": "window", "173": "red and blue", "174": "7", "175": "soccer ball", "176": "rack", "177": "white and blue", "178": "2010", "179": "beige", "180": "screen", "181": "talking on phone", "182": "walking", "183": "girl", "184": "3", "185": "tabby", "186": "10", "187": "neon", "188": "cloudy", "189": "woman", "190": "tower", "191": "purple", "192": "picnic table", "193": "hat", "194": "white and black", "195": "solid", "196": "wine", "197": "don't know", "198": "blue and white" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 34, "1": 24, "10": 186, "2": 130, "2000": 100, "2010": 178, "2013": 117, "3": 184, "4": 146, "5": 143, "6": 6, "7": 174, "7:35": 79, "7:45": 140, "8": 64, "8:35": 111, "9:35": 98, "africa": 4, "air": 63, "arrow": 5, "at table": 71, "backpack": 92, "ball": 80, "beagle": 132, "bedroom": 66, "beige": 179, "bicycle": 163, "bicycles": 60, "big ben": 142, "bike rack": 171, "bikes": 139, "birthday": 20, "black": 36, "black and white": 113, "blonde": 73, "blue": 57, "blue and white": 198, "boy": 72, "brick": 21, "bricks": 42, "brown": 106, "bus": 13, "cage": 162, "calico": 124, "camera": 52, "can't tell": 128, "canopy": 41, "car": 46, "cat": 45, "chair": 47, "chopsticks": 110, "church": 161, "clear": 147, "clock": 40, "clock tower": 14, "cloudy": 188, "cross": 164, "crossing": 44, "crown": 11, "cup": 75, "curtain": 129, "curtains": 26, "desert": 107, "dirt": 61, "dog": 112, "don't know": 197, "donut": 19, "door": 77, "double": 149, "doughnut": 10, "down": 88, "exit": 56, "fashion": 156, "fence": 148, "forest": 101, "french": 116, "full": 138, "giraffe": 159, "giraffes": 167, "girl": 183, "gray": 90, "gray and black": 8, "green": 94, "ground": 91, "hair": 119, "happy": 83, "hat": 193, "hawaii": 25, "human": 157, "ice cream": 16, "in car": 81, "jeep": 38, "king": 151, "lady": 84, "lanyard": 48, "large": 67, "laying down": 70, "leather": 29, "lg": 169, "little girl": 123, "low": 51, "lying down": 137, "man": 27, "many": 114, "monitor": 85, "name tag": 121, "natural": 15, "necklace": 122, "neon": 187, "net": 17, "no": 50, "not sure": 1, "not there": 0, "nothing": 18, "on road": 39, "on street": 118, "orange": 145, "out": 78, "outside": 93, "park": 12, "person": 49, "photographer": 3, "picnic table": 192, "pink": 55, "plain": 153, "plastic": 58, "plate": 166, "platform": 126, "protection": 33, "purple": 191, "queen": 103, "rack": 176, "red": 120, "red and blue": 173, "red and yellow": 105, "resting": 133, "right": 131, "roof": 135, "screen": 180, "security": 22, "shade": 97, "shadow": 158, "shadows": 115, "shelter": 168, "shrimp": 165, "sidewalk": 109, "skateboard": 74, "skateboarding": 7, "skier": 87, "skiing": 9, "sky": 53, "sleeping": 68, "small": 54, "smile": 31, "smiling": 23, "snow": 125, "snowboard": 62, "snowboarder": 141, "snowboarding": 2, "soccer": 134, "soccer ball": 175, "solid": 195, "stand": 59, "station": 154, "street": 30, "stripes": 150, "style": 136, "sun": 160, "suv": 152, "tabby": 185, "table": 43, "talking": 32, "talking on phone": 181, "tan": 108, "tent": 69, "they aren't": 102, "tired": 82, "tower": 190, "train": 144, "trees": 28, "tv": 104, "unknown": 86, "walking": 182, "wall": 96, "watching": 127, "wedding": 95, "white": 37, "white and black": 194, "white and blue": 177, "window": 172, "windows": 89, "wine": 196, "wine tasting": 155, "woman": 189, "women": 170, "woods": 35, "yellow": 99, "yes": 76, "zoo": 65 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.34.0", "type_vocab_size": 2, "vocab_size": 30522 }