{ "_name_or_path": "dandelin/vilt-b32-mlm", "architectures": [ "ViltForQuestionAnswering" ], "attention_probs_dropout_prob": 0.0, "hidden_act": "gelu", "hidden_dropout_prob": 0.0, "hidden_size": 768, "id2label": { "0": "queen", "1": "park", "2": "7:35", "3": "soccer ball", "4": "bricks", "5": "king", "6": "clock tower", "7": "yes", "8": "screen", "9": "wall", "10": "clock", "11": "don't know", "12": "protection", "13": "stand", "14": "lady", "15": "at table", "16": "doughnut", "17": "green", "18": "dirt", "19": "hawaii", "20": "girl", "21": "gray", "22": "crown", "23": "monitor", "24": "nothing", "25": "low", "26": "smiling", "27": "women", "28": "tired", "29": "lg", "30": "neon", "31": "church", "32": "not there", "33": "snow", "34": "yellow", "35": "calico", "36": "soccer", "37": "tan", "38": "ice cream", "39": "zoo", "40": "blue and white", "41": "train", "42": "trees", "43": "style", "44": "black", "45": "plain", "46": "windows", "47": "out", "48": "boy", "49": "cross", "50": "laying down", "51": "little girl", "52": "human", "53": "tv", "54": "sidewalk", "55": "5", "56": "giraffe", "57": "talking", "58": "right", "59": "resting", "60": "table", "61": "6", "62": "giraffes", "63": "arrow", "64": "1", "65": "door", "66": "exit", "67": "9:35", "68": "curtain", "69": "bedroom", "70": "africa", "71": "3", "72": "air", "73": "many", "74": "tent", "75": "solid", "76": "2013", "77": "wine tasting", "78": "lying down", "79": "bicycles", "80": "skiing", "81": "curtains", "82": "skateboard", "83": "not sure", "84": "sun", "85": "chair", "86": "large", "87": "beige", "88": "brick", "89": "shadow", "90": "unknown", "91": "red and yellow", "92": "clear", "93": "talking on phone", "94": "camera", "95": "forest", "96": "birthday", "97": "ball", "98": "8:35", "99": "desert", "100": "on street", "101": "fence", "102": "down", "103": "0", "104": "roof", "105": "dog", "106": "white and blue", "107": "canopy", "108": "shrimp", "109": "rack", "110": "cup", "111": "in car", "112": "shadows", "113": "cloudy", "114": "stripes", "115": "small", "116": "skier", "117": "fashion", "118": "station", "119": "wedding", "120": "snowboard", "121": "plastic", "122": "skateboarding", "123": "on road", "124": "snowboarder", "125": "plate", "126": "they aren't", "127": "sleeping", "128": "2000", "129": "bus", "130": "cat", "131": "8", "132": "bikes", "133": "4", "134": "no", "135": "blue", "136": "big ben", "137": "crossing", "138": "man", "139": "cage", "140": "security", "141": "lanyard", "142": "platform", "143": "picnic table", "144": "person", "145": "jeep", "146": "outside", "147": "ground", "148": "hat", "149": "blonde", "150": "car", "151": "backpack", "152": "red and blue", "153": "watching", "154": "full", "155": "red", "156": "sky", "157": "orange", "158": "tower", "159": "donut", "160": "woman", "161": "suv", "162": "wine", "163": "shelter", "164": "smile", "165": "net", "166": "white and black", "167": "shade", "168": "necklace", "169": "street", "170": "bicycle", "171": "natural", "172": "brown", "173": "2", "174": "7", "175": "chopsticks", "176": "tabby", "177": "name tag", "178": "beagle", "179": "french", "180": "snowboarding", "181": "purple", "182": "hair", "183": "pink", "184": "happy", "185": "white", "186": "photographer", "187": "gray and black", "188": "7:45", "189": "can't tell", "190": "black and white", "191": "woods", "192": "leather", "193": "bike rack", "194": "double", "195": "walking", "196": "10", "197": "2010", "198": "window" }, "image_size": 384, "initializer_range": 0.02, "intermediate_size": 3072, "label2id": { "0": 103, "1": 64, "10": 196, "2": 173, "2000": 128, "2010": 197, "2013": 76, "3": 71, "4": 133, "5": 55, "6": 61, "7": 174, "7:35": 2, "7:45": 188, "8": 131, "8:35": 98, "9:35": 67, "africa": 70, "air": 72, "arrow": 63, "at table": 15, "backpack": 151, "ball": 97, "beagle": 178, "bedroom": 69, "beige": 87, "bicycle": 170, "bicycles": 79, "big ben": 136, "bike rack": 193, "bikes": 132, "birthday": 96, "black": 44, "black and white": 190, "blonde": 149, "blue": 135, "blue and white": 40, "boy": 48, "brick": 88, "bricks": 4, "brown": 172, "bus": 129, "cage": 139, "calico": 35, "camera": 94, "can't tell": 189, "canopy": 107, "car": 150, "cat": 130, "chair": 85, "chopsticks": 175, "church": 31, "clear": 92, "clock": 10, "clock tower": 6, "cloudy": 113, "cross": 49, "crossing": 137, "crown": 22, "cup": 110, "curtain": 68, "curtains": 81, "desert": 99, "dirt": 18, "dog": 105, "don't know": 11, "donut": 159, "door": 65, "double": 194, "doughnut": 16, "down": 102, "exit": 66, "fashion": 117, "fence": 101, "forest": 95, "french": 179, "full": 154, "giraffe": 56, "giraffes": 62, "girl": 20, "gray": 21, "gray and black": 187, "green": 17, "ground": 147, "hair": 182, "happy": 184, "hat": 148, "hawaii": 19, "human": 52, "ice cream": 38, "in car": 111, "jeep": 145, "king": 5, "lady": 14, "lanyard": 141, "large": 86, "laying down": 50, "leather": 192, "lg": 29, "little girl": 51, "low": 25, "lying down": 78, "man": 138, "many": 73, "monitor": 23, "name tag": 177, "natural": 171, "necklace": 168, "neon": 30, "net": 165, "no": 134, "not sure": 83, "not there": 32, "nothing": 24, "on road": 123, "on street": 100, "orange": 157, "out": 47, "outside": 146, "park": 1, "person": 144, "photographer": 186, "picnic table": 143, "pink": 183, "plain": 45, "plastic": 121, "plate": 125, "platform": 142, "protection": 12, "purple": 181, "queen": 0, "rack": 109, "red": 155, "red and blue": 152, "red and yellow": 91, "resting": 59, "right": 58, "roof": 104, "screen": 8, "security": 140, "shade": 167, "shadow": 89, "shadows": 112, "shelter": 163, "shrimp": 108, "sidewalk": 54, "skateboard": 82, "skateboarding": 122, "skier": 116, "skiing": 80, "sky": 156, "sleeping": 127, "small": 115, "smile": 164, "smiling": 26, "snow": 33, "snowboard": 120, "snowboarder": 124, "snowboarding": 180, "soccer": 36, "soccer ball": 3, "solid": 75, "stand": 13, "station": 118, "street": 169, "stripes": 114, "style": 43, "sun": 84, "suv": 161, "tabby": 176, "table": 60, "talking": 57, "talking on phone": 93, "tan": 37, "tent": 74, "they aren't": 126, "tired": 28, "tower": 158, "train": 41, "trees": 42, "tv": 53, "unknown": 90, "walking": 195, "wall": 9, "watching": 153, "wedding": 119, "white": 185, "white and black": 166, "white and blue": 106, "window": 198, "windows": 46, "wine": 162, "wine tasting": 77, "woman": 160, "women": 27, "woods": 191, "yellow": 34, "yes": 7, "zoo": 39 }, "layer_norm_eps": 1e-12, "max_image_length": -1, "max_position_embeddings": 40, "modality_type_vocab_size": 2, "model_type": "vilt", "num_attention_heads": 12, "num_channels": 3, "num_hidden_layers": 12, "num_images": -1, "patch_size": 32, "qkv_bias": true, "tie_word_embeddings": false, "torch_dtype": "float32", "transformers_version": "4.41.2", "type_vocab_size": 2, "vocab_size": 30522 }