Add source code
- __init__.py +5 -0
- answer_list.json +1 -0
- configs/retrieval.yaml +73 -0
- configs/vqa.yaml +78 -0
- data/__init__.py +5 -0
- data/retrieval_datamodule.py +188 -0
- data/retrieval_dataset.py +149 -0
- data/transforms.py +139 -0
- data/vqa_datamodules.py +206 -0
- data/vqa_dataset.py +115 -0
- finetune_retrieval.py +400 -0
- finetune_vqa.py +204 -0
- images/COCO_val2014_000000026348.jpg +0 -0
- images/COCO_val2014_000000057222.jpg +0 -0
- images/COCO_val2014_000000111207.jpg +0 -0
- images/COCO_val2014_000000159269.jpg +0 -0
- images/COCO_val2014_000000184359.jpg +0 -0
- images/COCO_val2014_000000407072.jpg +0 -0
- images/COCO_val2014_000000473994.jpg +0 -0
- images/COCO_val2014_000000552075.jpg +0 -0
- model.py +666 -0
- requirements.txt +5 -0
- utils.py +127 -0
- vqa_data.json +1 -0
__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
answer_list.json
ADDED
@@ -0,0 +1 @@
["net", "pitcher", "orange", "yes", "white", "skiing", "red", "frisbee", "brushing teeth", "no", "black and white", "skateboard", "1", "blue", "green", "motorcycle", "gray", "2", "purse", "skis", "poles", "surfboard", "dog", "on", "office", "large", "very big", "laptop", "vent", "computer", "black", "bear", "3", "wii", "glasses", "tree", "eating", "log", "5", "raft", "left", "living room", "pink", "right", "railing", "grass", "wire", "10 years", "knife", "cake", "banana", "chef", "vanilla", "4", "outdoor", "mustard", "bun", "clouds", "dock", "brown", "silver", "refrigerator", "square", "teddy", "elm", "stripes", "baseball", "catcher", "beer", "bottom", "north", "nike", "yellow and white", "morning", "elephant", "red and white", "propeller", "tan", "wall", "rolex", "clock", "table", "0", "wood", "christmas", "spinach", "thick", "bag", "leaves", "necklace", "6", "bathroom", "shower", "towel", "solid", "referee", "wilson", "8:00", "e", "24", "hat", "grazing", "sheep", "10", "tag", "spanish", "hot dog", "plate", "lunch", "butter", "peppers", "onions", "very", "mayonnaise", "mayo", "sweet potato", "pig", "sweet", "flowers", "floral", "yellow", "window", "7", "pizza", "car", "cargo", "stairs", "abstract", "rug", "baseball cap", "texting", "pole", "crosswalk", "nothing", "urban", "bus", "light", "afternoon", "boat", "cheese", "paper", "real", "sun", "birthday", "words", "inside", "shadows", "tomato", "evergreen", "100 feet", "shingles", "trees", "building", "hay", "ski pole", "patterned", "walking", "ice", "laundry", "pepsi", "good", "1:50", "purple", "13", "africa", "teddy bears", "socks", "giraffe", "soccer", "blue and yellow", "zebras", "cupcake", "broccoli", "soldier", "parking lot", "cows", "herding", "on table", "fish", "nightstand", "50", "overcast", "cross", "toaster oven", "tile", "11:55", "red and yellow", "nowhere", "hair dryer", "truck", "11", "people", "rectangle", "hot dogs", "party", "12:55", "apron", "kitchen", "cooking", "ring", "1 way", "stop", "neither", "many", "female", "brushing", "tie", "tennis racket", "knife and fork", "restaurant", "cat", "bed", "sand", "ocean", "cold", "kites", "cumulus", "standing", "male", "star", "tracks", "chocolate", "round", "fork and knife", "yankees", "pictures", "dots", "bird", "parrot", "red white and blue", "man", "metal", "fence", "snowboarding", "pine", "snow", "shorts", "swim", "wine", "brick", "no parking", "children", "beef", "phone", "english", "cell phone", "pink and yellow", "clear", "watermelon", "bedroom", "fork", "cow", "rackets", "tennis rackets", "8", "collar", "tennis", "1950s", "playing tennis", "skirt", "30", "polka dot", "beach", "horse", "grill", "african american", "down", "street", "in air", "sweater", "yellow and blue", "park", "backyard", "spectators", "parasailing", "31", "river", "55", "shadow", "winter", "chicken", "tea", "evening", "dusk", "ski resort", "helmet", "penne", "bench", "resting", "elephants", "southwest", "usa", "cars", "town", "bananas", "umbrella", "container", "woman", "on counter", "salad", "striped", "motel", "vertical", "oranges", "hot sauce", "bottle", "juice", "eyes", "ground", "backpack", "black and yellow", "forward", "jackets", "1 on right", "green and yellow", "playing baseball", "riding", "sitting", "carrot", "basket", "seagull", "ski poles", "p", "parking", "street light", "mets", "strap", "bike", "riding bike", "poodle", "shoes", "carpet", "lettuce", "food", "1 foot", "roses", "mountains", "scissors", "camera", "beige", "beard", "cutting", "baby", "tape", "watch", "never", "taking picture", 
"eggs", "syrup", "sandwich", "water skiing", "microphone", "back", "bears", "donuts", "w", "sky", "double decker", "england", "surfing", "running", "shirt", "barn", "weather vane", "white and blue", "fishing", "bridge", "los angeles", "open", "red sox", "bat", "plane", "white and green", "transportation", "sunny", "bus stop", "city", "brown and white", "bicycle", "crow", "magazines", "daisy", "14", "old", "curtains", "jumped", "snowboard", "dinosaur", "racing", "asphalt", "court", "plastic", "circle", "red and blue", "zebra", "12", "biplane", "shallow", "brazil", "logo", "2:20", "electric", "night time", "motion", "toothbrushes", "orange and white", "66", "spoon", "toyota", "tennis shoes", "46", "second", "no 1", "iphone", "friend", "apple", "carnation", "15", "tiger", "glove", "airplane", "bow", "air france", "passengers", "tv", "on building", "3:55", "victorian", "steeple", "happy", "skateboarding", "fruit", "cutting board", "cantaloupe", "kiwi", "sliced", "heart", "water", "rainy", "carrots", "giraffes", "eat", "ramp", "lab", "field", "horizontal", "birds", "home", "shrimp", "12 feet", "girl", "modern", "turtle", "dell", "boots", "sunglasses", "black and orange", "yellow and black", "gloves", "hp", "desk", "both", "sign", "on street", "2000", "cirrus", "to dry", "ceiling", "fluorescent", "up", "9", "boys", "playing soccer", "american", "passenger", "turn", "palm", "no train", "wedding", "branch", "parrots", "air force", "on tracks", "small", "tank", "dirty", "france", "honda", "2.00", "whale", "vase", "flying", "professional", "driving", "tissue", "protest", "corona", "for balance", "twin", "clothes", "t shirt", "window sill", "wild", "noon", "caution", "spring", "raining", "cane", "school", "windsurfing", "parachute", "black and red", "25", "background", "toaster", "planes", "yellow and red", "spatula", "10:10", "ivory", "train", "welcome", "highway", "off", "on track", "electricity", "italy", "dinner", "sink", "squares", "5 ft", "parked", "store", "dress", "signs", "meow", "football", "rugby", "stainless steel", "la", "dirt", "blue and white", "klm", "house", "unknown", "ford", "reading", "chair", "mountain", "alive", "water skis", "picture", "parade", "slippers", "trailer", "boating", "holding it", "shade", "cloth", "6:20", "candle", "hose", "hand", "3:25", "on sidewalk", "poster", "downhill", "68", "reflection", "summer", "pickles", "halloween", "bats", "london", "zoo", "surfer", "racket", "flickr", "cutting hair", "strawberries", "mushroom", "teddy bear", "big", "suitcase", "veggie", "pepper", "houses", "70", "toshiba", "triangle", "boxes", "photograph", "smoke", "engine", "camel", "sidewalk", "left 1", "red and green", "4:35", "on couch", "candy", "minnie mouse", "homemade", "mouse", "box", "movie", "45", "strawberry", "fridge", "full", "vegetables", "bright", "play", "remote", "pond", "savannah", "celery", "concrete", "semi", "dump", "scania", "safety", "posing", "fabric", "laying", "couch", "blueberries", "handle", "pipe", "stick", "parmesan", "steak", "chain link", "catch", "barbed wire", "mozzarella", "soda", "fire hydrant", "cat food", "pepperoni", "lot", "licking", "red and black", "clay", "tennis court", "jumping", "potatoes", "toothbrush", "kite", "not at all", "flying kite", "broken", "black and silver", "lap", "outside", "44", "delta", "greyhound", "ring finger", "talking on phone", "bad", "kettle", "35", "motorcycles", "produce", "comfort", "steering wheel", "18", "humans", "coffee", "white and brown", "fall", "bread", "cherry", "4:30", "flag", "night", "lamp", 
"cucumber", "can't see", "porcelain", "oval", "museum", "rain", "sprinkles", "20", "kids", "bracelet", "sneakers", "mask", "mickey mouse", "twins", "very high", "costume", "cabbage", "paint", "lighting", "young", "air conditioner", "wooden", "board", "someone", "beets", "16", "day time", "4 inches", "lights", "ladder", "glass", "ferris wheel", "fries", "steamed", "shepherd", "cotton", "suit", "goatee", "on his head", "print", "happy birthday", "forks", "travel", "maple", "200", "oil", "jeans", "can", "chopsticks", "on wall", "construction", "mack", "36", "chinese", "moped", "festival", "gas", "throwing", "circus", "wires", "not possible", "plates", "sugar", "in", "women's", "door", "no man", "volleyball", "serving", "ponytail", "business", "decoration", "santa", "flat", "barrel", "12:15", "candles", "atv", "free", "hair", "waffle", "ball", "stop sign", "wetsuit", "very deep", "swimsuit", "green and black", "foreground", "stands", "china airlines", "flower", "300", "lobster", "on bench", "plaster", "phones", "sailboat", "apples", "road", "recently", "cones", "cactus", "rice", "vegetarian", "donut", "ketchup", "police", "mirror", "rock", "meat", "blinds", "cell phones", "china", "rust", "7:25", "stone", "vans", "middle", "eagle", "9:30", "ping pong", "microwave", "gmc", "umbrellas", "wrist", "cuddling", "laughing", "boy", "next to toilet", "tabby", "petting", "south", "40", "name tag", "checkered", "name", "slow", "cardboard", "windows", "croissant", "plain", "cookie", "on ground", "low", "water bottle", "goggles", "turkey", "pull", "shut", "kite flying", "bowl", "smile", "in bowl", "bush", "cloudy", "top left", "skateboarder", "coca cola", "pan", "drinking", "short", "floor", "thanksgiving", "radio", "drink", "on toilet", "bike rack", "bleachers", "train tracks", "horses", "far", "top", "toilet", "in water", "private", "nature", "checkers", "commercial", "stroller", "power", "stuffed animals", "uniforms", "japan", "liquor", "faucet", "green and orange", "corn", "sub", "white and yellow", "mercedes", "in sky", "tarp", "indian", "counter", "multicolored", "polar", "go", "now", "no number", "swimming", "bridle", "cowboy", "union station", "salt and pepper", "olives", "pizza cutter", "british airways", "nighttime", "domestic", "trolley", "australia", "tiles", "pug", "wicker", "british", "us airways express", "burton", "christmas tree", "napkin", "writing", "rocks", "hello kitty", "lacoste", "gold", "fan", "skateboards", "day", "on floor", "2008", "dark", "flying kites", "rural", "olympics", "bmw", "34", "factory", "denim", "typing", "for fun", "steel", "watching tv", "chevron", "driver", "baggage claim", "grapes", "f", "angels", "roof", "handlebars", "train station", "public", "oak", "sleeping", "canada", "on runway", "air canada", "on top", "tired", "blonde", "cups", "little", "adidas", "10 feet", "white and gray", "leaf", "fisheye", "forest", "war", "octagon", "raspberry", "helmets", "united states", "29", "noodles", "van", "long", "traveling", "luggage", "airport", "single", "pitching", "dugout", "garbage", "in street", "happiness", "cigarette", "on tower", "antelope", "graffiti", "skating", "on road", "curved", "red light", "washington", "ski lift", "athletics", "brace", "squatting", "catching", "batter", "batting", "game", "towards", "33", "sliding", "makeup", "japanese", "person", "pirates", "plaid", "rose", "daytime", "keyboard", "surfboards", "hummingbird", "ollie", "11:30", "clock tower", "5:55", "san francisco", "stopping", "tags", "samsung", "computers", "cabinets", "talking", 
"cage", "asparagus", "5 years", "hanger", "adult", "rabbit", "empty", "softball", "1st", "playing", "chairs", "farm", "cross country", "dump truck", "women", "snowboarder", "tall", "monkey", "mantle", "fire", "books", "quilt", "cessna", "chandelier", "dunkin donuts", "beans", "relish", "no flag", "parking meter", "spots", "ducks", "sandals", "doughnut", "lighthouse", "yacht", "german shepherd", "in middle", "raw", "chain", "2 feet", "pedestal", "sauerkraut", "bagels", "mutt", "dog and cat", "race", "poor", "cat and dog", "station", "printer", "daisies", "front", "gravel", "rear", "grassy", "pigeons", "dogs", "in car", "life", "wii remotes", "suv", "leather", "bottom right", "peace", "facebook", "blanket", "fountain", "frisbees", "12:30", "am", "scooter", "going", "analog", "america", "pitbull", "relaxing", "paddle boarding", "white and pink", "shampoo", "alps", "ride", "side", "mane", "on desk", "on chair", "2012", "multi", "straight", "big ben", "closed", "frosted", "3 feet", "waves", "buoy", "life vest", "trash can", "medium", "boxer", "very tall", "yamaha", "sunlight", "hit ball", "dry", "coke", "gym", "orange and black", "center", "rope", "flip flops", "4th of july", "siamese", "crafts", "color", "italian", "playing frisbee", "skate park", "orange juice", "windowsill", "corgi", "thumb", "peanut butter", "pie", "toast", "no hat", "benches", "diamond", "blender", "avocado", "television", "speakers", "pony", "baseball field", "pavement", "sydney", "not there", "diamonds", "4 feet", "goalie", "soccer ball", "runway", "video game", "gaming", "casual", "green and white", "toilet brush", "working", "pickup", "girls", "remotes", "pasta", "hood", "braves", "skier", "motorola", "17", "b", "100", "diet coke", "hospital", "wagon", "milk", "ferry", "rainbow", "on bed", "toward", "1:30", "19", "security", "herself", "mercedes benz", "supreme", "thin", "platform", "gray and red", "thai", "storage", "thailand", "swan", "peach", "10:05", "dome", "chiquita", "2:00", "mountain dew", "23", "knives", "street sign", "on beach", "playing wii", "using laptop", "stickers", "yogurt", "on grass", "9:50", "9:45", "sweat", "gatorade", "umpire", "37", "transport", "desktop", "desserts", "main", "boston", "fell", "top right", "case", "asleep", "over", "9:55", "grapefruit", "breakfast", "headphones", "freight", "cup", "sweatband", "nobody", "lamps", "9:25", "scarf", "on fridge", "main st", "moving", "confused", "fresh", "kiting", "blue jay", "flats", "long time", "chihuahua", "ceramic", "mushrooms", "on plate", "human", "power lines", "hotel", "map", "earring", "boarding", "display", "warm", "napkins", "brown and black", "broom", "basketball", "papers", "holding baby", "sad", "kickstand", "60", "shoulder", "sleep", "footprints", "tunnel", "1990", "hats", "6 inches", "ham", "bacon", "church", "53", "pineapple", "at camera", "red bull", "pilot", "tattoo", "work", "polar bear", "taking off", "website", "22", "4:00", "coffee maker", "fast", "fur", "rubber", "tongs", "german", "germany", "3 inches", "toy", "3:20", "calm", "pots", "balloons", "fruits", "9:20", "drawer", "oven", "soup", "stove", "heels", "wind", "island", "blood", "leg", "theater", "tennis racquet", "21", "gothic", "2:35", "wii remote", "turning", "20 feet", "pink and black", "ears", "fun", "wreath", "to right", "child", "fly", "head", "drywall", "shorter", "pier", "feeding giraffe", "in vase", "burger", "easter", "onion", "uniform", "remote control", "guitar", "time", "verizon", "tomatoes", "ship", "tulips", "glaze", "on suitcase", "tent", "1:45", 
"market", "bnsf", "bandana", "still", "don't know", "piano", "mouth", "run", "sparrow", "throw", "lines", "vest", "1950", "jet", "sepia", "2015", "busy", "lighter", "dessert", "bending", "75", "finch", "pastries", "outdoors", "bakery", "clean", "ipod", "tablecloth", "cigarettes", "looking at phone", "in front", "food truck", "face", "swinging", "safari", "500", "volkswagen", "2010", "shape", "shelves", "riding horses", "2016", "behind bus", "towels", "lemon", "straw", "bamboo", "5 feet", "hardwood", "oregon", "schnauzer", "organic", "h", "kid", "meter", "61", "charging", "bald", "caucasian", "man on left", "stand", "27", "dining room", "sandwiches", "32", "apartment", "tower", "virgin", "out", "white and red", "2:05", "i don't know", "chains", "legs", "age", "goats", "s", "congratulations", "dresser", "camper", "half", "silverware", "decorative", "hawaiian", "petting horse", "wheel", "florida", "reds", "washington dc", "moon", "conference", "screen", "controller", "robin", "men", "protection", "roll", "harley davidson", "coal", "mustache", "smiling", "pedestrians", "88", "me", "tray", "males", "monitor", "bell", "landscape", "club", "toothpick", "seagulls", "bowtie", "lake", "steam", "surf", "baseball glove", "blinders", "woods", "stuffed", "sunbathing", "shearing", "dad", "mixer", "pot", "blending", "identification", "owl", "wine glass", "on bike", "billabong", "new york", "yarn", "tube", "tennis ball", "2:55", "ice cream", "chevrolet", "shirt and tie", "taking selfie", "blue and green", "he isn't", "cutting cake", "east", "setting", "brewers", "riding bikes", "7 eleven", "stars", "jockey", "jacket", "standing still", "book", "gray and white", "pen", "red white blue", "above", "alaska", "tongue", "feathers", "k", "camping", "pasture", "corner", "away", "ski", "texas", "fire truck", "sailboats", "jump", "walk", "spray paint", "loading", "united", "1000", "brushing his teeth", "roman numerals", "garlic", "surprise", "3rd", "first", "side of road", "dodgers", "airplanes", "unsure", "russian", "wet", "skyscraper", "5 star", "brushing her teeth", "blankets", "natural", "across street", "smartphone", "duck", "sausage", "paris", "newspaper", "pants", "spices", "pillow", "to left", "snowboards", "colgate", "on elephant", "string", "horns", "2:40", "men's", "cobblestone", "regular", "staring", "28", "barber shop", "linoleum", "grind", "cut", "x", "above sink", "above stove", "dishes", "dalmatian", "watching", "glazed", "5:25", "j", "messy", "wallet", "tuna", "toasted", "grilled", "french", "green and blue", "sunflowers", "to catch frisbee", "wool", "sprint", "no grass", "cabinet", "shell", "foil", "bottles", "bar", "king", "paper towels", "friends", "beagle", "school bus", "laptops", "snowing", "cement", "pc", "accident", "stuffed animal", "wakeboard", "balance", "in suitcase", "white and black", "nikon", "cleats", "on sink", "pool", "mom", "downtown", "asian", "heater", "bathing", "193", "against wall", "canopy", "jungle", "berries", "military", "pickle", "clams", "seafood", "in box", "boats", "tables", "lizard", "lemonade", "m", "soft", "illinois", "country", "for sale", "arm", "listening", "curly", "play tennis", "hands", "cereal", "blue and red", "robe", "around neck", "red and silver", "soap", "trains", "throwing frisbee", "smoking", "india", "headband", "not very", "westin", "serve", "bicycles", "can't tell", "to catch ball", "visibility", "ana", "reins", "rodeo", "boot", "on horse", "12:35", "riding motorcycle", "mexico", "mother", "african", "left and right", "button", "earrings", 
"blackberry", "cell", "10:00", "harness", "pillows", "vegetable", "tablet", "fern", "cats", "golden retriever", "goat", "tractor", "valentine's day", "hearts", "khaki", "man on right", "mcdonald's", "player", "arriving", "husky", "on skateboard", "vases", "coat", "beanie", "coming", "granite", "shopping cart", "it's raining", "sports", "leash", "balls", "blurry", "baseball bat", "team", "mango", "mug", "eiffel tower", "worms", "trash", "robot", "show", "terrier", "painting", "rooster", "42", "jones", "state farm", "balloon", "trunk", "coach", "t", "playing game", "fireplace", "behind clouds", "uphill", "motocross", "sony", "magazine", "kitesurfing", "catching frisbee", "catch frisbee", "bud light", "drive", "fighting", "1 on left", "very old", "hallway", "lexus", "wii controller", "9:15", "fast food", "5:45", "catholic", "muffin", "traffic light", "band", "button up", "grocery", "shelf", "2:25", "honey", "plants", "oars", "foggy", "nathan's", "cord", "yard", "48", "donut shop", "chimney", "calico", "suits", "sideways", "animals", "black and blue", "bikini", "photographer", "700", "queen", "1:00", "12:05", "horseback riding", "awake", "bunny", "12:00", "continental", "flamingo", "rye", "family", "lots", "owner", "stew", "palm tree", "cruise ship", "56", "design", "ny", "far right", "tire", "younger", "biking", "at&t", "giants", "marshmallows", "caramel", "polo", "emirates", "salon", "focus", "on motorcycle", "magnets", "mat", "ivy", "cakes", "chrome", "bob", "asia", "graduation", "cauliflower", "in snow", "c", "rough", "vacation", "air", "windy", "victoria", "4:45", "trick", "coconut", "labrador", "on left", "yellow and green", "butterfly", "fake", "on napkin", "bricks", "wine glasses", "detroit", "man's", "parsley", "art", "subway", "wave", "placemat", "hydrant", "sofa", "pigeon", "riding elephant", "all", "branches", "plant", "to eat", "zucchini", "feta", "neon", "mouse pad", "cloud", "toilet paper", "pumpkin", "rowing", "toronto", "handicap", "seeds", "fly kite", "chicago", "marble", "frame", "150", "rocky", "give way", "sauce", "it's not", "control", "high chair", "playstation", "xbox", "not likely", "roman", "land", "1:35", "lifeguard", "on pizza", "size", "bull", "dandelions", "equestrian", "goose", "8 feet", "recessed", "statue", "index", "phillies", "strike", "mirrors", "pointing", "farmer", "collie", "motorbike", "lanes", "bikes", "biker", "arrows", "gas station", "logs", "smaller", "desert", "yield", "flags", "stool", "kitten", "doll", "daffodils", "letters", "dishwasher", "first base", "nuts", "2013", "persian", "swim trunks", "deep", "o", "doubles", "toothpicks", "in field", "wristband", "wheels", "baking", "4:15", "11:00", "ear", "2007", "51", "chevy", "using computer", "frog", "storm", "boogie board", "hungry", "by window", "ambulance", "pigtails", "audi", "microsoft", "on man", "cannot tell", "stained glass", "hugging", "laying down", "3:00", "taxi", "pedestrian", "landing", "numbers", "38", "stones", "on tree", "clocks", "new", "picnic", "fog", "buffalo", "under armour", "cocker spaniel", "orioles", "no sign", "telling time", "bags", "golden gate", "cover", "castle", "canoe", "selfie", "cream", "floating", "indoor", "antique", "aluminum", "silver and black", "cast iron", "peas", "sun hat", "on right", "swiss", "flour", "under sink", "fashion", "fedora", "shells", "1 hour", "puppy", "in stands", "not here", "motor", "thousands", "120", "sail", "butt", "mexican", "dead end", "paddle", "bathing suit", "shop", "onion rings", "boxing", "birthday cake", "chalk", "scenery", 
"style", "nissan", "sticker", "on rack", "1 4", "woman's", "surprised", "north face", "squash", "not sure", "email", "spotted", "seat", "himself", "circles", "san diego", "kia", "mattress", "obama", "lamb", "american flag", "climbing", "skull and crossbones", "roast beef", "visor", "herd", "double", "52", "high", "stagecoach", "cart", "feeding", "eaten", "cone", "11:15", "smoothie", "golf", "colorado", "electronics", "5:15", "bowling", "players", "ketchup and mustard", "styrofoam", "6 feet", "hawk", "cheddar", "12:28", "arabic", "12:25", "12:10", "shower curtain", "army", "salmon", "10:40", "hanging", "whole", "behind fence", "bars", "moss", "no dog", "traffic", "10:25", "r", "countryside", "machine", "directions", "cooked", "aa", "6:45", "4 way", "stripe", "brand", "baseball player", "bunk", "coleslaw", "fishing boat", "at table", "europe", "dead", "arch", "scrambled", "clothing", "closet", "egg", "suitcases", "indoors", "coffee pot", "tires", "lilies", "cafe", "9:35", "teal", "toothpaste", "in background", "tarmac", "painted", "sunset", "orange and yellow", "oar", "peaches", "zebra and giraffe", "ladybug", "20 ft", "sesame seeds", "hills", "2:30", "stucco", "tail", "couple", "kawasaki", "smooth", "powdered sugar", "pedestrian crossing", "french fries", "picnic table", "teeth", "ribbon", "saddle", "15 feet", "earbuds", "on train", "39", "curb", "tow", "shark", "white and orange", "6:25", "gravy", "fork and spoon", "pooping", "curtain", "lime", "skull", "crossing", "speed limit", "peacock", "boredom", "neck", "hit", "dragon", "tissues", "basil", "waving", "blue team", "rectangles", "helicopter", "mud", "us", "balcony", "red and gray", "firefighter", "sunflower", "wallpaper", "best buy", "11:20", "public market center", "seattle", "bookshelf", "looking", "1 inch", "harley", "urinal", "cartoon", "t shirt and jeans", "navy", "fedex", "rays", "deck", "coaster", "1:20", "50 feet", "4:20", "us open", "looking at camera", "600", "national express", "white house", "5:00", "jp morgan", "palm trees", "tub", "pens", "soldiers", "2 people", "animal", "speaker", "hamburger", "spaghetti", "green beans", "it isn't", "10:20", "buildings", "on shelf", "baseball uniform", "tiled", "orange and blue", "90", "north america", "arrow", "news", "tropicana", "formal", "in grass", "thumbs up", "clip", "gate", "tennis player", "lilac", "pastry", "nose", "pacifier", "11:35", "different teams", "cardinals", "exhaust", "hauling", "on tray", "bagel", "huge", "out of focus", "cook", "wheat", "photo", "ghost", "sedan", "qatar", "zig zag", "lanyard", "pink and white", "sesame", "space", "no clock", "warning", "snowy", "tater tots", "tropical", "grandfather", "mac", "magnet", "photoshop", "pajamas", "350", "casserole", "4:55", "pelican", "2009", "clydesdale", "tow truck", "belt", "west", "omelet", "heavy", "crown", "in corner", "hexagon", "mound", "iris", "g", "12:45", "2:15", "3:10", "drawing", "only", "little girl", "washing", "nokia", "windsor", "2 men", "parmesan cheese", "on woman", "freezer", "icing", "venice", "dairy", "several", "concentration", "3:15", "no smoking", "kayak", "frosting", "jetblue", "thoroughbred", "parakeet", "shoe", "skeleton", "britain", "ties", "in sink", "patio", "bank", "camouflage", "privacy", "bib", "blue and gray", "looking out window", "falling", "bucket", "cupcakes", "throw ball", "garden", "almonds", "ducati", "ireland", "plastic wrap", "starbucks", "all way", "bark", "home plate", "base", "dog food", "toys", "blue and orange", "1 in front", "foot", "dc", "california", "towing", 
"cheesecake", "bushes", "bow tie", "millions", "down street", "2011", "police officer", "windmill", "taking pictures", "street name", "cleaning", "on pole", "russia", "main street", "catch ball", "mario", "pirate", "track", "garage", "7:10", "they aren't", "mother and child", "tents", "fancy", "tattoos", "alcohol", "2:45", "wheelchair", "money", "top hat", "willow", "cd", "brushing hair", "pancake", "80", "listening to music", "green and red", "barrier", "vests", "hiking", "tank top", "lufthansa", "student", "menu", "forehand", "wii controllers", "acer", "wall st", "hundreds", "water ski", "furniture", "paisley", "pizza hut", "baseball game", "hill", "prom", "1 world", "tiara", "students", "information", "hazy", "nasa", "canon", "bird feeder", "crane", "dr pepper", "logitech", "2:10", "all of them", "utensils", "telephone", "converse", "bone", "jeep", "nursing", "krispy kreme", "cameraman", "pee", "ranch", "polka dots", "railroad crossing", "shirts", "feeder", "above toilet", "unclear", "below", "43", "spoons", "calendar", "vaio", "fox", "mint", "after", "spiderman", "lg", "concert", "on rock", "fluffy", "gray and black", "coats", "lady", "dodge", "easyjet", "pearl", "bunt", "flat screen", "10:30", "music", "polar bears", "riding horse", "lift", "angry", "cookies", "3:45", "buttons", "hot", "cute", "behind", "dole", "in motion", "26", "pans", "love", "winnie pooh", "pear", "copyright", "2 hours", "snowsuit", "kissing", "backhand", "to get to other side", "metro", "swans", "very fast", "can't see it", "nintendo", "direction", "waiting", "mohawk", "st patrick's day", "rail", "hoodie", "feet", "swirls", "muffins", "4:05", "106", "10:55", "coins", "mitt", "game controller", "room", "adults", "urinals", "cameras", "marker", "upright", "brass", "sled", "teacher", "conductor", "farmers market", "toiletries", "blue and black", "soccer field", "banana peel", "sprite", "doughnuts", "bank of america", "on his face", "heat", "emergency", "ski slope", "hard", "41", "6:00", "in his hand", "cluttered", "dog show", "on boat", "grizzly", "drums", "not", "in hand", "easy", "400", "under table", "d", "hitting ball", "photography", "intersection", "backwards", "crocs", "marina", "chips", "bible", "harry potter", "hawaii", "fanta", "half full", "carriage", "curious", "12:50", "black white", "geese", "pork", "mailbox", "l", "sidecar", "poop", "wings", "penguin", "to see", "pocket", "steps", "cubs", "junk", "deer", "ottoman", "salt", "condiments", "1:55", "post", "bulldog", "notebook", "no cat", "champagne", "jets", "knee pads", "throw frisbee", "drinks", "leopard", "taller", "cooler", "bundt", "monday", "grape", "wine tasting", "under", "baskets", "santa hat", "chest", "sewing", "on car", "sony ericsson", "peeing", "for photo", "tour", "few", "singapore", "fireman", "fire extinguisher", "wildebeest", "lemons", "peanuts", "babies", "wiimote", "guitar hero", "slide", "stopped", "library", "multi colored", "blue and pink", "choppy", "sailing", "brush", "grinding", "jelly", "dairy queen", "shaking hands", "ge", "tigers", "tokyo", "philadelphia", "ski boots", "buses", "11:45", "collage", "pink and blue", "jesus", "singles", "iron", "coffee table", "2 years", "don't walk", "classroom", "on water", "potato salad", "posts", "harbor", "residential", "joshua", "uk", "burgers", "deli", "kicking", "lace", "overalls", "vehicles", "ram", "dancing", "47", "shed", "lid", "he's not", "fans", "amtrak", "space shuttle", "ostrich", "bathtub", "kneeling", "2:50", "mall", "yellow and orange", "gazebo", "wax", "slow down", "lays", 
"hammer time", "octopus", "crib", "banana split", "broadway", "pottery", "wavy", "farmers", "holding phone", "on phone", "squirrel", "wax paper", "tusks", "dining", "packing", "kangaroo", "dawn", "defense", "powdered", "thomas", "budweiser", "back left", "stir fry", "beijing", "11:10", "tripod", "wide", "slope", "black and gray", "planter", "chili", "siblings", "kayaking", "captivity", "opaque", "rack", "panda", "doorway", "wheelie", "pelicans", "genetics", "not in service", "volvo", "dachshund", "v", "on laptop", "western", "gone", "birthday party", "parking garage", "tying tie", "blueberry", "scale", "notes", "train car", "man made", "stability", "lily", "lying down", "pacific", "high heels", "pare", "checkerboard", "partly cloudy", "cool", "n", "toilets", "tree branch", "copper", "cycling", "5:50", "870", "shopping", "7:05", "zipper", "holding umbrella", "batman", "lotion", "1:25", "black and brown", "playing video game", "girl on right", "legos", "drinking water", "burrito", "plow", "jet ski", "spiral", "ibm", "tools", "flashlight", "cherries", "maple leaf", "mountainous", "under tree", "vines", "sushi", "baker", "snake", "globe", "target", "john", "pomeranian", "tuxedo", "hockey", "sleeve", "leaning", "wireless", "11:05", "compaq", "do not enter", "radish", "1:05", "dim", "advertisement", "movement", "model", "hammock", "swing", "sheet", "google", "boardwalk", "right 1", "haircut", "ankle", "3:30", "exit", "csx", "tim hortons", "lego", "cucumbers", "angel", "12:20", "racquet", "behind woman", "potato", "egg salad", "controllers", "recliner", "upside down", "mosaic", "before", "antenna", "3:50", "10:15", "lion", "camo", "fighter", "silver and red", "dirt bike", "playing video games", "used", "crates", "horizontally", "plunger", "refrigerators", "radiator", "stork", "in basket", "cap", "living", "married", "briefcase", "bottom left", "30 mph", "ascending", "flip phone", "101", "11:50", "gun", "arizona", "foam", "serious", "y", "close up", "pancakes", "heineken", "paw", "cnn", "comforter", "sheets", "8:35", "driveway", "fair", "cleaner", "1 year", "delivery", "commuter", "apple and banana", "chase", "72", "safe", "trucks", "trunks", "spider", "64", "slacks", "meeting", "7:00", "skiers", "shaved", "carrot cake", "holding", "surfers", "giraffe and zebra", "7:45", "mississippi", "seaweed", "black and pink", "horse racing", "orchid", "rv", "tourist", "above door", "leaving", "pitch", "crest", "miami", "asics", "flood", "bus station", "take off", "amazon", "practice", "entering", "diesel", "pm", "wetsuits", "remodeling", "porch", "7:35", "tie dye", "baked", "life jacket", "cylinder", "grilled cheese", "meatballs", "paddling", "banana bread", "monster", "smiley face", "not high", "keys", "dreadlocks", "kitchenaid", "straight ahead", "badminton", "long sleeve", "sheepdog", "5:18", "end", "on shore", "scratching", "oriental", "5:05", "alligator", "city bus", "purple and white", "10:50", "each other", "weeds", "tinkerbell", "rottweiler", "apartments", "snowflakes", "stop light", "sweatshirt", "shore", "bidet", "switzerland", "stretching", "tv stand", "boundaries", "65", "bronze", "jar", "middle 1", "54", "skate", "easton", "turn right", "raspberries", "singing", "on bus", "carnations", "descending", "classic", "suspenders", "not long", "8:50", "father", "anniversary", "hsbc", "very long", "space needle", "skatepark", "fruit salad", "kenmore", "no water", "8:05", "db", "baby's breath", "shelter", "1980", "no left turn", "washington monument", "ham and cheese", "10 inches", "8:55", "savory", 
"6:35", "indians", "9:05", "fires", "pipes", "donkey", "cds", "mitsubishi", "tell time", "outfield", "christian", "puma", "parking meters", "cranes", "flip", "wine bottle", "stadium", "mouthwash", "heinz", "distance", "macaroni", "on plane", "triumph", "more", "4:50", "single engine", "disney", "on stove", "shih tzu", "fried", "to hit ball", "in her hand", "sunrise", "2nd", "elmo", "kite string", "suzuki", "traffic lights", "blt", "i", "hitting", "htc", "healthy", "current", "star alliance", "stomach", "watch tv", "tulip", "5:10", "right side", "4:40", "ginger", "on sign", "cushion", "5:30", "learning", "pencil", "maroon", "food processor", "5:40", "dog bed", "michigan", "close", "license plate", "crows", "right hand", "normal", "green and brown", "1.00", "000", "1:40", "wing", "american airlines", "kodak", "mural", "sniffing", "1:15", "behind bench", "cardinal", "no light", "warmth", "paved", "skyscrapers", "swinging bat", "watermark", "in cup", "pizza box", "dough", "hiding", "goal", "no plate", "shower head", "ripe", "1:10", "1 in back", "older", "nest", "multiple", "cinnamon", "bin", "new orleans", "colored", "enclosure", "bride", "on dresser", "star wars", "in back", "triangles", "over easy", "cilantro", "statues", "sticks", "formica", "roundabout", "bowls", "ahead", "years", "drain", "veggies", "no shirt", "taking photo", "tugboat", "broke", "59", "cadillac", "prince", "left side", "1 in middle", "10:45", "drying", "11:25", "silk", "conference room", "buoys", "pockets", "daffodil", "6:40", "walgreens", "4 ft", "6:05", "virgin atlantic", "12:40", "digital", "ups", "westjet", "bikers", "us air force", "limes", "comcast", "dip", "7:55", "man in middle", "bus driver", "soon", "futon", "selling", "braid", "mariners", "wisconsin", "99", "citizen", "broccoli and carrots", "grocery store", "us airways", "49", "bored", "red velvet", "hotel room", "qantas", "tam", "korean air", "10:35", "whirlpool", "coffee cup", "hilly", "9:12", "whipped cream", "video", "finger", "competition", "hollywood", "sas", "backward", "beads", "cosmo", "10:08", "jal", "6:30", "100 year party ct", "hispanic", "in cabbage town", "opponent", "woodpecker", "visilab", "mt airy", "crosstown", "freightliner"]
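answer_list.json is a single JSON array of candidate answer strings; configs/vqa.yaml points datamodule_args.answer_list at a copy of this file. A minimal sketch of inspecting it (treating the array as the VQA candidate-answer set is an inference from the config, not from code shown in this diff):

import json

with open("answer_list.json") as f:  # path as added in this commit
    answer_list = json.load(f)

print(len(answer_list))   # several thousand candidate answers
print(answer_list[:3])    # ['net', 'pitcher', 'orange']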
configs/retrieval.yaml
ADDED
@@ -0,0 +1,73 @@
hidden_size: &hidden_size 768
vocab_size: &vocab_size 30522
type_vocab_size: &type_vocab_size 2
max_position_embeddings: &max_position_embeddings 512
pad_token_id: &pad_token_id 0
embed_size: &embed_size 256

seed: 42
world_size: 1
device: "cuda"
dist_url: "env://"
output_path: "./examples/albef/outputs/retrieval_output.pt"

datamodule_args:
  train_files: ["./examples/albef/data_files/coco_train.json"]
  test_files: ["./examples/albef/data_files/coco_test.json"]
  image_root: "./examples/albef/data_files/coco"
  batch_size: 32
  num_workers: 8

vision_encoder_args:
  hidden_size: *hidden_size
  image_size: 384
  patch_size: 16
  num_hidden_layers: 12
  num_attention_heads: 12
  mlp_dim: 3072
  dropout: 0.0
  attention_dropout: 0.0
  layer_norm_eps: 1e-6

text_encoder_args:
  vocab_size: *vocab_size
  hidden_size: *hidden_size
  type_vocab_size: *type_vocab_size
  max_position_embeddings: *max_position_embeddings
  pad_token_id: *pad_token_id
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12
  dropout: 0.0

multimodal_encoder_args:
  hidden_size: *hidden_size
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12

projection_args:
  in_features: *hidden_size
  out_features: *embed_size

similarity_args:
  embed_size: *embed_size
  queue_size: 65536
  temp: 0.07

training_args:
  log_every_n_steps: 100
  alpha: 0.4
  weight_decay: 0.02
  lr: 1e-5
  min_lr: 1e-6
  max_epochs: 5
  step_size: 100
  warmup_steps: 1
  checkpoint_root: "./examples/albef/checkpoints"

eval_args:
  log_every_n_steps: 100
  k_test: 256
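The config shares dimensions across sections with YAML anchors (&hidden_size) and aliases (*hidden_size). A minimal sketch of loading it with PyYAML, assuming the path is relative to this repo's root (the commit's own loading code, presumably in finetune_retrieval.py, is not shown in this section):

import yaml  # assumes PyYAML is installed

with open("configs/retrieval.yaml") as f:
    config = yaml.safe_load(f)

# Aliases resolve at parse time, so every section sees the shared value.
assert config["vision_encoder_args"]["hidden_size"] == config["hidden_size"] == 768
print(config["datamodule_args"]["batch_size"])  # 32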
configs/vqa.yaml
ADDED
@@ -0,0 +1,78 @@
hidden_size: &hidden_size 768
vocab_size: &vocab_size 30522
type_vocab_size: &type_vocab_size 2
max_position_embeddings: &max_position_embeddings 512
pad_token_id: &pad_token_id 0

seed: 42
world_size: 1
device: "cuda"
dist_url: "env://"
output_root: "./examples/albef/outputs"

datamodule_args:
  train_files: ["./examples/albef/data_files/vqa_train.json", "./examples/albef/data_files/vg_qa.json", "./examples/albef/data_files/vqa_val.json"]
  test_files: ["./examples/albef/data_files/vqa_test.json"]
  answer_list: "./examples/albef/data_files/answer_list.json"
  vqa_root: "./examples/albef/data_files/coco"
  vg_root: "./examples/albef/data_files/visual_genome"
  batch_size: 32
  num_workers: 8

vision_encoder_args:
  hidden_size: *hidden_size
  image_size: 384
  patch_size: 16
  num_hidden_layers: 12
  num_attention_heads: 12
  mlp_dim: 3072
  dropout: 0.0
  attention_dropout: 0.0
  layer_norm_eps: 1e-6

text_encoder_args:
  vocab_size: *vocab_size
  hidden_size: *hidden_size
  type_vocab_size: *type_vocab_size
  max_position_embeddings: *max_position_embeddings
  pad_token_id: *pad_token_id
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12
  dropout: 0.0

multimodal_encoder_args:
  hidden_size: *hidden_size
  num_hidden_layers: 6
  num_attention_heads: 12
  intermediate_size: 3072
  layer_norm_eps: 1e-12

text_embeddings_args:
  hidden_size: *hidden_size
  vocab_size: *vocab_size
  pad_token_id: *pad_token_id
  max_position_embeddings: *max_position_embeddings
  type_vocab_size: *type_vocab_size
  layer_norm_eps: 1e-12

prediction_head_args:
  hidden_size: *hidden_size
  vocab_size: *vocab_size
  layer_norm_eps: 1e-12

training_args:
  log_every_n_steps: 100
  alpha: 0.4
  weight_decay: 0.02
  lr: 2e-5
  min_lr: 1e-6
  max_epochs: 8
  step_size: 100
  warmup_steps: 4
  checkpoint_root: "./examples/albef/checkpoints"

eval_args:
  log_every_n_steps: 100
  k_test: 128
data/__init__.py
ADDED
@@ -0,0 +1,5 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.
data/retrieval_datamodule.py
ADDED
@@ -0,0 +1,188 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Optional, Tuple

import torch
from data.retrieval_dataset import (
    ImageToTextRetrievalDataset,
    RetrievalTrainingDataset,
    TextToImageRetrievalDataset,
)
from data.transforms import (
    ALBEFTextTransform,
    testing_image_transform,
    training_image_transform,
)
from pytorch_lightning import LightningDataModule
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset, DistributedSampler


class RetrievalDataModule(LightningDataModule):
    """
    The Data Module for the Retrieval task.

    Args:
        train_files (List[str]): The paths to training json files.
        test_files (List[str]): The paths to testing json files.
        image_root (str): The path to the image data directory.
        batch_size (int): The sampling batch size.
        num_workers (int): The number of workers for the distributed mode.
    """

    def __init__(
        self,
        train_files: List[str],
        test_files: List[str],
        image_root: str,
        batch_size: int,
        num_workers: int,
    ) -> None:
        super().__init__()
        self.train_dataset = RetrievalTrainingDataset(
            train_files,
            image_root,
            training_image_transform(),
            ALBEFTextTransform(truncate=True, max_seq_len=30, add_end_token=False),
        )

        self.image_dataset = ImageToTextRetrievalDataset(
            test_files,
            image_root,
            testing_image_transform(),
        )

        self.text_dataset = TextToImageRetrievalDataset(
            test_files,
            ALBEFTextTransform(
                truncate=True,
                pad_to_max_seq_len=True,
                max_seq_len=30,
                add_end_token=False,
            ),
        )

        self.batch_size = batch_size
        self.num_workers = num_workers

    def _get_sampler(
        self,
        dataset: Dataset,
        shuffle: bool,
        is_distributed: bool,
        num_tasks: int,
        global_rank: int,
    ) -> Optional[DistributedSampler]:
        # do not return a sampler if not in distributed mode;
        # a default RandomSampler is used in this case
        if not is_distributed:
            return None

        return DistributedSampler(
            dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle
        )

    def train_dataloader(
        self,
        is_distributed: bool = False,
        num_tasks: int = 0,
        global_rank: int = 0,
        drop_last: bool = True,
    ) -> DataLoader:
        """
        DataLoader Outputs:
            images (Tensor): Tensor of shape (B, C, W, H) of image inputs.
            text (Tensor): Tensor of shape (B, L) of text inputs.
            text_atts (Tensor): Tensor of shape (B, L) of text attention mask.
            idx (Tensor): Tensor of shape (B) of image identifiers.
        """
        sampler = self._get_sampler(
            dataset=self.train_dataset,
            shuffle=True,
            is_distributed=is_distributed,
            num_tasks=num_tasks,
            global_rank=global_rank,
        )
        shuffle = sampler is None
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            sampler=sampler,
            shuffle=shuffle,
            collate_fn=retrieval_train_collate_fn,
            drop_last=drop_last,
        )

    def image_dataloader(
        self,
        drop_last: bool = False,
    ) -> DataLoader:
        """
        DataLoader Outputs:
            images (Tensor): Tensor of shape (B, C, W, H) of image inputs.
        """
        return DataLoader(
            self.image_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            sampler=None,
            shuffle=False,
            collate_fn=None,
            drop_last=drop_last,
        )

    def text_dataloader(
        self,
        drop_last: bool = False,
    ) -> DataLoader:
        """
        DataLoader Outputs:
            text (Tensor): Tensor of shape (B, L) of text inputs.
            text_atts (Tensor): Tensor of shape (B, L) of text attention mask.
        """
        return DataLoader(
            self.text_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            sampler=None,
            shuffle=False,
            collate_fn=text_collate_fn,
            drop_last=drop_last,
        )


def retrieval_train_collate_fn(
    batch: List[Tuple[Tensor, Tensor, int]]
) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
    image_list = []
    text_list = []
    idx_list = []
    for image, text, idx in batch:
        image_list.append(image)
        text_list.append(text)
        idx_list.append(idx)
    images = torch.stack(image_list, dim=0)
    text = pad_sequence(text_list, batch_first=True)
    text_atts = (text != 0).type(torch.long)
    idx = Tensor(idx_list).type(torch.long)
    return (
        images,
        text,
        text_atts,
        idx,
    )


def text_collate_fn(batch: List[Tensor]) -> Tuple[Tensor, Tensor]:
    text = pad_sequence(batch, batch_first=True)
    text_atts = (text != 0).type(torch.long)
    return text, text_atts
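A minimal usage sketch of the datamodule above, wired with the datamodule_args values from configs/retrieval.yaml; the data files are assumed to exist locally, and the commit's actual entry point (finetune_retrieval.py, listed in the file summary) is not shown in this section:

from data.retrieval_datamodule import RetrievalDataModule

datamodule = RetrievalDataModule(
    train_files=["./examples/albef/data_files/coco_train.json"],
    test_files=["./examples/albef/data_files/coco_test.json"],
    image_root="./examples/albef/data_files/coco",
    batch_size=32,
    num_workers=8,
)
# Non-distributed case: _get_sampler returns None, so the loader shuffles internally.
images, text, text_atts, idx = next(iter(datamodule.train_dataloader()))
# images: (32, 3, 384, 384); text and text_atts: (32, L); idx: (32,)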
data/retrieval_dataset.py
ADDED
@@ -0,0 +1,149 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from typing import Callable, List, Tuple, Union

from PIL import Image
from torch import Tensor
from torch.utils.data import Dataset


class RetrievalTrainingDataset(Dataset):
    """
    Create the training dataset for the Retrieval task.

    Args:
        ann_file (List[str]): The paths to training annotation json files.
        image_root (str): The path to the image data directory.
        image_transform (Callable[[Image.Image], Tensor]): Image data transform.
        text_transform (Callable[[Union[List[str], str]], Tensor]): Text data transform.

    Dataset Outputs:
        image (Tensor): Transformed image input tensor of shape (C, H, W).
        caption (Tensor): Transformed text token input ids.
        idx (int): The unique identifier for the image.
    """

    def __init__(
        self,
        ann_file: List[str],
        image_root: str,
        image_transform: Callable[[Image.Image], Tensor],
        text_transform: Callable[[Union[List[str], str]], Tensor],
    ) -> None:
        self.ann = []
        for f in ann_file:
            self.ann += json.load(open(f, "r"))

        self.image_root = image_root
        self.image_transform = image_transform
        self.text_transform = text_transform

        self.idx = {}  # map str image_id from dataset to int ids
        i = 0
        for ann in self.ann:
            image_id = ann["image_id"]
            if image_id not in self.idx.keys():
                self.idx[image_id] = i
                i += 1

    def __len__(self) -> int:
        return len(self.ann)

    def __getitem__(self, index: int) -> Tuple[Tensor, Tensor, int]:
        ann = self.ann[index]
        image_path = os.path.join(self.image_root, ann["image"])
        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)
        caption = self.text_transform(ann["caption"])
        return image, caption, self.idx[ann["image_id"]]


class ImageToTextRetrievalDataset(Dataset):
    """
    Create the dataset for the Image-to-Text Retrieval task.

    Args:
        ann_file (List[str]): The paths to annotation json files.
        image_root (str): The path to the image data directory.
        image_transform (Callable[[Image.Image], Tensor]): Image data transform.

    Dataset Outputs:
        image (Tensor): Transformed image input tensor of shape (C, H, W).
    """

    def __init__(
        self,
        ann_file: List[str],
        image_root: str,
        image_transform: Callable[[Image.Image], Tensor],
    ) -> None:
        self.image_root = image_root
        self.image_transform = image_transform

        self.ann = []
        self.images = []  # paths to all images in the dataset
        self.image_to_text = {}  # map image ids to text ids for evaluation
        for f in ann_file:
            self.ann += json.load(open(f, "r"))

        text_id = 0
        for image_id, ann in enumerate(self.ann):
            self.images.append(ann["image"])
            num_text = len(ann["caption"])
            self.image_to_text[image_id] = list(range(text_id, text_id + num_text))
            text_id += num_text

    def __len__(self) -> int:
        return len(self.images)

    def __getitem__(self, index: int) -> Tensor:
        image_path = os.path.join(self.image_root, self.images[index])
        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)
        return image


class TextToImageRetrievalDataset(Dataset):
    """
    Create the dataset for the Text-to-Image Retrieval task.

    Args:
        ann_file (List[str]): The paths to annotation json files.
        text_transform (Callable[[Union[List[str], str]], Tensor]): Text data transform.

    Dataset Outputs:
        text (Tensor): Transformed text token input ids.
    """

    def __init__(
        self,
        ann_file: List[str],
        text_transform: Callable[[Union[List[str], str]], Tensor],
    ) -> None:
        self.text_transform = text_transform

        self.ann = []
        self.text = []  # all text strings in the dataset
        self.text_to_image = {}  # map text ids to image ids for evaluation
        for f in ann_file:
            self.ann += json.load(open(f, "r"))

        text_id = 0
        for image_id, ann in enumerate(self.ann):
            for caption in ann["caption"]:
                self.text.append(caption)
                self.text_to_image[text_id] = image_id
                text_id += 1

    def __len__(self) -> int:
        return len(self.text)

    def __getitem__(self, index: int) -> Tensor:
        text = self.text_transform(self.text[index])
        return text
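The three datasets above read their records from the annotation json files, which are not part of this commit. Judging from the fields they access, a training record carries one caption string plus an image_id, while a test record carries a list of captions per image; the values below are purely illustrative:

# Hypothetical records, shaped after the fields the datasets read
# (ann["image"], ann["caption"], ann["image_id"]).
train_record = {
    "image": "COCO_val2014_000000184359.jpg",   # joined with image_root
    "caption": "a dog leaping to catch a frisbee",
    "image_id": "184359",
}
test_record = {
    "image": "COCO_val2014_000000184359.jpg",
    "caption": [
        "a dog leaping to catch a frisbee",
        "a dog plays with a frisbee in the grass",
    ],
}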
data/transforms.py
ADDED
@@ -0,0 +1,139 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import re
from typing import List, Tuple, Union

import torch

from torchtext.transforms import PadTransform, Sequential, ToTensor, Truncate
from torchvision import transforms
from transformers.models.bert.tokenization_bert import BertTokenizer

# mean and standard deviation from the ALBEF repo:
# https://github.com/salesforce/ALBEF/blob/main/dataset/__init__.py#L16
MEAN = (0.48145466, 0.4578275, 0.40821073)
STD_DEV = (0.26862954, 0.26130258, 0.27577711)


class ALBEFTextTransform:
    """
    Remove punctuations and trailing spaces in input text and transform it into
    a Tensor of token ids using BERTTokenizer.

    Args:
        pretrained_tokenizer (str): Pretrained tokenizer to use.
            Default: "bert-base-uncased"
        do_pre_process (bool): Whether to pre-process input text.
            Defaults to True.
        truncate (bool): Whether to truncate input text to max_seq_length.
            Defaults to False.
        pad_to_max_seq_len (bool): Whether to pad the sequence to max_seq_length.
        add_end_token (bool): Whether to add the end-of-sentence token.
            Defaults to True.
        max_seq_len (int): The max sequence length after truncating or padding.
            Defaults to 25.
        cls_token_id (int): Value to represent the start of each text.
            Defaults to 101, Hugging Face's BERT cls token id.
        sep_token_id (int): Value to represent the end of each text.
            Defaults to 102, Hugging Face's BERT sep token id.
        pad_token_id (int): Value with which to pad each text so that all texts are the same length.
            Defaults to 0, Hugging Face's BERT pad token id.

    Inputs:
        text (Union[List[str], str]): Input text to transform.
    """

    def __init__(
        self,
        pretrained_tokenizer: str = "bert-base-uncased",
        do_pre_process: bool = True,
        truncate: bool = False,
        pad_to_max_seq_len: bool = False,
        add_end_token: bool = True,
        max_seq_len: int = 25,
        cls_token_id: int = 101,
        sep_token_id: int = 102,
        pad_token_id: int = 0,
    ):
        self.do_pre_process = do_pre_process
        self.cls_token_id = cls_token_id
        self.sep_token_id = sep_token_id
        self.pad_token_id = pad_token_id
        self.add_end_token = add_end_token

        self.tokenizer = BertTokenizer.from_pretrained(pretrained_tokenizer)
        self.transform = Sequential(
            Truncate(max_seq_len=max_seq_len) if truncate else torch.nn.Identity(),
            ToTensor(padding_value=self.pad_token_id),
            PadTransform(max_length=max_seq_len, pad_value=self.pad_token_id)
            if pad_to_max_seq_len
            else torch.nn.Identity(),
        )

    def pre_process(self, text: str) -> str:
        text = (
            re.sub(
                r"([,.'!?\"()*#:;~])",
                "",
                text,
            )
            .replace("-", " ")
            .replace("/", " ")
        )
        text = text.rstrip(" ")

        return text

    def __call__(self, text: Union[List[str], str]) -> torch.Tensor:
        if self.do_pre_process:
            if isinstance(text, str):
                text = self.pre_process(text)
            else:
                text = [self.pre_process(t) for t in text]
        tokens = self.tokenizer(text)["input_ids"]
        if not self.add_end_token and tokens[-1] == self.sep_token_id:
            tokens = tokens[:-1]
        input_ids = self.transform(tokens)

        return input_ids


def training_image_transform(
    image_size: int = 384,
    scale: Tuple[float, float] = (0.5, 1.0),
    image_interpolation=transforms.InterpolationMode.BICUBIC,
    mean: Tuple[float, float, float] = MEAN,
    std_dev: Tuple[float, float, float] = STD_DEV,
) -> transforms.Compose:
    return transforms.Compose(
        [
            transforms.RandomResizedCrop(
                image_size, scale=scale, interpolation=image_interpolation
            ),
            transforms.RandomHorizontalFlip(),
            transforms.RandAugment(2, 7),
            transforms.ToTensor(),
            transforms.Normalize(mean, std_dev),
        ]
    )


def testing_image_transform(
    image_size: int = 384,
    image_interpolation=transforms.InterpolationMode.BICUBIC,
    mean: Tuple[float, float, float] = MEAN,
    std_dev: Tuple[float, float, float] = STD_DEV,
) -> transforms.Compose:
    return transforms.Compose(
        [
            transforms.Resize(
                (image_size, image_size), interpolation=image_interpolation
            ),
            transforms.ToTensor(),
            transforms.Normalize(mean, std_dev),
        ]
    )
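A minimal usage sketch (not part of the commit) showing how the text and image transforms above are applied to a caption and one of the demo images shipped in this Space; the printed shapes are examples, not guarantees.

```python
# Illustrative usage sketch only.
from PIL import Image

from data.transforms import ALBEFTextTransform, testing_image_transform

text_transform = ALBEFTextTransform(truncate=True, max_seq_len=25)
image_transform = testing_image_transform(image_size=384)

# 1D tensor of BERT token ids (punctuation stripped, lowercased by the tokenizer)
tokens = text_transform("Is the dog riding a skateboard?")
image = image_transform(
    Image.open("images/COCO_val2014_000000026348.jpg").convert("RGB")
)
print(tokens.shape, image.shape)  # e.g. torch.Size([9]) torch.Size([3, 384, 384])
```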
data/vqa_datamodules.py
ADDED
@@ -0,0 +1,206 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from typing import List, Optional, Tuple

import torch
from data.transforms import (
    ALBEFTextTransform,
    testing_image_transform,
    training_image_transform,
)
from data.vqa_dataset import VQADataset
from pytorch_lightning import LightningDataModule
from torch import Tensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, DistributedSampler


class VQADataModule(LightningDataModule):
    """
    The Data Module for Visual Question Answering task.

    Args:
        train_files (List[str]): The paths to training json files.
        test_files (List[str]): The paths to testing json files.
        answer_list (str): The path to the answers list.
        vqa_root (str): The path to vqa data directory.
        vg_root (str): The path to vg data directory.
        batch_size (int): The sampling batch size.
        num_workers (int): The number of workers for the distributed mode.
    """

    def __init__(
        self,
        train_files: List[str],
        test_files: List[str],
        answer_list: str,
        vqa_root: str,
        vg_root: str,
        batch_size: int,
        num_workers: int,
    ) -> None:
        super().__init__()
        self.train_dataset = VQADataset(
            train_files,
            vqa_root,
            vg_root,
            image_transform=training_image_transform(),
            question_transform=ALBEFTextTransform(
                truncate=True, max_seq_len=25, add_end_token=False
            ),
            answer_transform=ALBEFTextTransform(do_pre_process=False),
            split="train",
        )

        self.test_dataset = VQADataset(
            test_files,
            vqa_root,
            vg_root,
            image_transform=testing_image_transform(),
            question_transform=ALBEFTextTransform(add_end_token=False),
            answer_transform=ALBEFTextTransform(do_pre_process=False),
            split="test",
            answer_list=answer_list,
        )

        self.batch_size = batch_size
        self.num_workers = num_workers

    def _get_sampler(
        self,
        dataset: VQADataset,
        shuffle: bool,
        is_distributed: bool,
        num_tasks: int,
        global_rank: int,
    ) -> Optional[DistributedSampler]:
        if not is_distributed:
            return None

        return DistributedSampler(
            dataset, num_replicas=num_tasks, rank=global_rank, shuffle=shuffle
        )

    def train_dataloader(
        self,
        is_distributed: bool = False,
        num_tasks: int = 0,
        global_rank: int = 0,
        drop_last: bool = True,
    ) -> DataLoader:
        """
        DataLoader Outputs:
            images (Tensor): Tensor of shape (B, C, W, H) of image inputs.
            questions (Tensor): Tensor of shape (B, L) of question inputs.
            question_atts (Tensor): Tensor of shape (B, L) of question attention mask.
            answers (Tensor): Tensor of shape (N, M) of answer inputs.
                N >= B because a vqa sample can have multiple answers.
            answer_atts (Tensor): Tensor of shape (N, M) of answer attention mask.
            weights (Tensor): Tensor of shape (N) of answer weights.
            ans_lengths (List[int]): List of length B and sum N where
                ans_lengths[i] = number of answers for images[i] and questions[i].
        """
        sampler = self._get_sampler(
            dataset=self.train_dataset,
            shuffle=True,
            is_distributed=is_distributed,
            num_tasks=num_tasks,
            global_rank=global_rank,
        )
        shuffle = sampler is None
        return DataLoader(
            self.train_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            sampler=sampler,
            shuffle=shuffle,
            collate_fn=vqa_train_collate_fn,
            drop_last=drop_last,
        )

    def test_dataloader(
        self,
        is_distributed: bool = False,
        num_tasks: int = 0,
        global_rank: int = 0,
        drop_last=False,
    ) -> DataLoader:
        """
        DataLoader Outputs:
            images (Tensor): Tensor of shape (B, C, W, H) of image inputs.
            questions (Tensor): Tensor of shape (B, L) of question inputs.
            question_atts (Tensor): Tensor of shape (B, L) of question attention mask.
            question_ids (List): List of length B of question ids.
        """
        sampler = self._get_sampler(
            dataset=self.test_dataset,
            shuffle=False,
            is_distributed=is_distributed,
            num_tasks=num_tasks,
            global_rank=global_rank,
        )
        return DataLoader(
            self.test_dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            pin_memory=True,
            sampler=sampler,
            shuffle=False,
            collate_fn=vqa_test_collate_fn,
            drop_last=drop_last,
        )


def vqa_train_collate_fn(
    batch: List[Tuple[Tensor, Tensor, List[Tensor], List[float]]]
) -> Tuple[Tensor, Tensor, Tensor, Tensor, Tensor, Tensor, List[int]]:
    image_list = []
    question_list = []
    answer_list = []
    weight_list = []
    ans_lengths = []
    for image, question, answer, weights in batch:
        image_list.append(image)
        question_list.append(question)
        answer_list += answer
        weight_list += weights
        ans_lengths.append(len(answer))
    images = torch.stack(image_list, dim=0)
    questions = pad_sequence(question_list, batch_first=True)
    question_atts = (questions != 0).type(torch.long)
    answers = pad_sequence(answer_list, batch_first=True)
    answer_atts = (answers != 0).type(torch.long)
    weights = torch.Tensor(weight_list)
    return (
        images,
        questions,
        question_atts,
        answers,
        answer_atts,
        weights,
        ans_lengths,
    )


def vqa_test_collate_fn(
    batch: List[Tuple[Tensor, Tensor, int]]
) -> Tuple[Tensor, Tensor, Tensor, List[int]]:
    image_list, question_list, question_ids = [], [], []
    for image, question, question_id in batch:
        image_list.append(image)
        question_list.append(question)
        question_ids.append(question_id)
    images = torch.stack(image_list, dim=0)
    questions = pad_sequence(question_list, batch_first=True)
    question_atts = (questions != 0).type(torch.long)
    return (
        images,
        questions,
        question_atts,
        question_ids,
    )
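A usage sketch (not part of the commit) of the data module above; all paths are placeholders, and the comment about N versus B restates the collate behavior documented in train_dataloader.

```python
# Illustrative usage sketch only; file paths are hypothetical.
from data.vqa_datamodules import VQADataModule

datamodule = VQADataModule(
    train_files=["vqa_train.json"],
    test_files=["vqa_test.json"],
    answer_list="answer_list.json",
    vqa_root="/path/to/coco_images",
    vg_root="/path/to/vg_images",
    batch_size=8,
    num_workers=4,
)

# single-process mode: sampler is None, so the DataLoader shuffles itself
loader = datamodule.train_dataloader()
images, questions, question_atts, answers, answer_atts, weights, ans_lengths = next(iter(loader))
# answers.size(0) == sum(ans_lengths) and may exceed the batch size,
# because each VQA question can keep several distinct ground-truth answers.
```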
data/vqa_dataset.py
ADDED
@@ -0,0 +1,115 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from typing import Callable, List, Tuple, Union

import torch

from PIL import Image
from torch import Tensor
from torch.utils.data import Dataset


class VQADataset(Dataset):
    """
    Create the dataset for VQA task.

    Args:
        ann_file (List[str]): The paths to annotation json files.
        vqa_root (str): The path to vqa data directory.
        vg_root (str): The path to vg data directory.
        image_transform (Callable[[Image.Image], Tensor]): image data transform.
        question_transform (Callable[[Union[List[str], str]], Tensor]): text data transform for questions.
        answer_transform (Callable[[Union[List[str], str]], Tensor]): text data transform for answers.
        split (str): Indicates train or test. Default is train.
        answer_list (str): The path to the answers list. Required for test split.

    Dataset Outputs:
        if split is train:
            image (Tensor): Transformed image input tensor of shape (C, W, H).
            question (Tensor): Transformed question token input ids.
            answers (List[Tensor]): List of transformed answers token input ids.
            answer_weights (List[float]): List of answer weights.
                answer_weights[i] is proportional to the number of occurrences of answers[i]
        if split is test:
            image (Tensor): Transformed image input tensor of shape (C, W, H).
            question (Tensor): Transformed text token input ids.
            question_id (int): The question sample id.
    """

    def __init__(
        self,
        ann_file: List[str],
        vqa_root: str,
        vg_root: str,
        image_transform: Callable[[Image.Image], Tensor],
        question_transform: Callable[[Union[List[str], str]], Tensor],
        answer_transform: Callable[[Union[List[str], str]], Tensor],
        split: str = "train",
        answer_list: str = None,
    ) -> None:
        self.ann = []
        for f in ann_file:
            self.ann += json.load(open(f, "r"))

        self.vqa_root = vqa_root
        self.vg_root = vg_root
        self.image_transform = image_transform
        self.question_transform = question_transform
        self.answer_transform = answer_transform
        self.split = split

        if split == "test":
            self.answer_list = json.load(open(answer_list, "r"))
            self.answer_input_ids = self.answer_transform(self.answer_list)
            self.answer_attention_mask = (self.answer_input_ids != 0).type(torch.long)

    def __len__(self) -> int:
        return len(self.ann)

    def __getitem__(
        self, index: int
    ) -> Union[
        Tuple[Tensor, Tensor, int], Tuple[Tensor, Tensor, List[Tensor], List[float]]
    ]:
        ann = self.ann[index]

        image_root = self.vqa_root if ann["dataset"] == "vqa" else self.vg_root
        image_path = os.path.join(image_root, ann["image"])
        image = Image.open(image_path).convert("RGB")
        image = self.image_transform(image)
        question = self.question_transform(ann["question"])

        if self.split == "test":
            return image, question, ann["question_id"]

        elif self.split == "train":
            if ann["dataset"] == "vqa":
                # Each VQA sample question has a list of answers (with potential repeats)
                # answer_weight[answer] = count(answer) / len(answers for the question)
                answer_weights = {}
                for answer in ann["answer"]:
                    if answer in answer_weights.keys():
                        answer_weights[answer] += 1 / len(ann["answer"])
                    else:
                        answer_weights[answer] = 1 / len(ann["answer"])

                answers = list(answer_weights.keys())
                answer_weights = list(answer_weights.values())

            elif ann["dataset"] == "vg":
                # A VG sample question has one answer so assign it a constant weight (0.5)
                answers = [ann["answer"]]
                answer_weights = [0.5]

            answers = list(self.answer_transform(answers))

            return image, question, answers, answer_weights

        else:
            raise ValueError("dataset split should be train or test")
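For reference, a sketch (not part of the commit) of the annotation fields that __getitem__ above reads; the field names come from the code, while the concrete values and file names are invented for illustration.

```python
# Illustrative sketch only; values are hypothetical.
vqa_train_entry = {
    "dataset": "vqa",
    "image": "val2014/COCO_val2014_000000026348.jpg",
    "question": "What is the dog riding?",
    "answer": ["skateboard", "skateboard", "surfboard"],  # repeats raise that answer's weight
}
vg_train_entry = {
    "dataset": "vg",
    "image": "VG_100K/12345.jpg",
    "question": "What color is the wall?",
    "answer": "white",  # a single answer, given the constant weight 0.5
}
vqa_test_entry = {
    "dataset": "vqa",
    "image": "test2015/COCO_test2015_000000000001.jpg",
    "question": "How many zebras are there?",
    "question_id": 1,
}
```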
finetune_retrieval.py
ADDED
@@ -0,0 +1,400 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import datetime
import os
import random
import time

import ruamel.yaml as yaml
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from data.retrieval_datamodule import RetrievalDataModule
from model import albef_model_for_retrieval
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts
from utils import (
    add_weight_decay,
    get_rank,
    get_world_size,
    init_distributed_mode,
    is_dist_avail_and_initialized,
    is_main_process,
)


def train(model, datamodule, args, device):
    model.train()

    model_without_ddp = model.module if is_dist_avail_and_initialized() else model

    optimizer_params = add_weight_decay(model, args["weight_decay"])
    optimizer = AdamW(optimizer_params, lr=args["lr"])
    scheduler = CosineAnnealingWarmRestarts(
        optimizer, T_0=args["max_epochs"], eta_min=args["min_lr"]
    )

    step_size = args["step_size"]
    warmup_steps = args["warmup_steps"]
    warmup_iterations = warmup_steps * step_size

    data_loader = datamodule.train_dataloader(
        is_distributed=is_dist_avail_and_initialized(),
        num_tasks=get_world_size(),
        global_rank=get_rank(),
    )

    start_time = time.time()

    for epoch in range(args["max_epochs"]):
        if epoch > 0:
            scheduler.step(epoch + warmup_steps)

        for batch, (image, text, text_atts, idx) in enumerate(data_loader):
            if epoch > 0:
                alpha = args["alpha"]
            else:
                alpha = args["alpha"] * min(1, batch / len(data_loader))

            image = image.to(device, non_blocking=True)
            text = text.to(device)
            text_atts = text_atts.to(device)
            idx = idx.to(device, non_blocking=True)
            loss = model(image, text, text_atts, idx, alpha, is_train=True)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if epoch == 0 and batch % step_size == 0 and batch <= warmup_iterations:
                scheduler.step(batch // step_size)

            if batch % args["log_every_n_steps"] == 0:
                total_time = time.time() - start_time
                time_str = "time {},".format(
                    datetime.timedelta(seconds=int(total_time))
                )
                epoch_str = "epoch {}/{},".format(epoch, args["max_epochs"])
                batch_str = "batch {}/{},".format(batch, len(data_loader))
                loss_str = "loss {}".format(loss.item())
                print(time_str, epoch_str, batch_str, loss_str)

        if is_main_process():
            save_obj = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "lr_scheduler": scheduler.state_dict(),
                "epoch": epoch,
            }
            torch.save(
                save_obj,
                os.path.join(
                    args["checkpoint_root"], "retrieval_checkpoint_%02d.pt" % epoch
                ),
            )

        if is_dist_avail_and_initialized():
            dist.barrier()
            torch.cuda.empty_cache()


@torch.no_grad()
def encode_text(model, text_dataloader, device):
    text_embeds = []
    text_feats = []
    text_atts = []
    for text, text_att in text_dataloader:
        text = text.to(device)
        text_att = text_att.to(device)
        text_embed, text_feat = model(
            text=text, text_atts=text_att, input_type="text", is_train=False
        )
        text_embeds.append(text_embed)
        text_feats.append(text_feat)
        text_atts.append(text_att)
    text_embeds = torch.cat(text_embeds, dim=0)
    text_feats = torch.cat(text_feats, dim=0)
    text_atts = torch.cat(text_atts, dim=0)
    return text_embeds, text_feats, text_atts


@torch.no_grad()
def encode_image(model, image_dataloader, device):
    image_embeds = []
    image_feats = []
    for image in image_dataloader:
        image = image.to(device)
        image_embed, image_feat = model(image=image, input_type="image", is_train=False)
        image_embeds.append(image_embed)
        image_feats.append(image_feat)
    image_embeds = torch.cat(image_embeds, dim=0)
    image_feats = torch.cat(image_feats, dim=0)
    return image_embeds, image_feats


@torch.no_grad()
def image_to_text(
    model,
    image_embeds,
    text_embeds,
    text_atts,
    sims_matrix,
    num_images,
    num_text,
    device,
    args,
):
    start_time = time.time()
    world_size = get_world_size()
    rank = get_rank()
    step = sims_matrix.size(0) // world_size + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)
    k = args["k_test"]

    image_to_text_scores = torch.full((num_images, num_text), -100.0).to(device)
    for i, sims in enumerate(sims_matrix[start:end]):
        _, topk_idx = sims.topk(k, dim=0)

        score = model(
            image=image_embeds[start + i].repeat(k, 1, 1),
            text=text_embeds[topk_idx],
            text_atts=text_atts[topk_idx],
            input_type="multimodal",
            is_train=False,
        )
        image_to_text_scores[start + i, topk_idx] = score

        if i % args["log_every_n_steps"] == 0:
            total_time = time.time() - start_time
            time_str = "time {},".format(datetime.timedelta(seconds=int(total_time)))
            batch_str = "batch {}/{},".format(i, len(sims_matrix[start:end]))
            print("image to text retrieval", time_str, batch_str)
    return image_to_text_scores


@torch.no_grad()
def text_to_image(
    model,
    image_embeds,
    text_embeds,
    text_atts,
    sims_matrix,
    num_images,
    num_text,
    device,
    args,
):
    start_time = time.time()
    world_size = get_world_size()
    rank = get_rank()
    step = sims_matrix.size(0) // world_size + 1
    start = rank * step
    end = min(sims_matrix.size(0), start + step)
    k = args["k_test"]

    text_to_image_scores = torch.full((num_text, num_images), -100.0).to(device)
    for i, sims in enumerate(sims_matrix[start:end]):
        _, topk_idx = sims.topk(k, dim=0)
        score = model(
            image=image_embeds[topk_idx],
            text=text_embeds[start + i].repeat(k, 1, 1),
            text_atts=text_atts[start + i].repeat(k, 1, 1),
            input_type="multimodal",
            is_train=False,
        )
        text_to_image_scores[start + i, topk_idx] = score

        if i % args["log_every_n_steps"] == 0:
            total_time = time.time() - start_time
            time_str = "time {},".format(datetime.timedelta(seconds=int(total_time)))
            batch_str = "batch {}/{},".format(i, len(sims_matrix[start:end]))
            print("text to image retrieval", time_str, batch_str)
    return text_to_image_scores


@torch.no_grad()
def evaluation(model, datamodule, args, device):
    model.eval()

    text_loader = datamodule.text_dataloader()
    image_loader = datamodule.image_dataloader()
    num_images = len(datamodule.image_dataset)
    num_text = len(datamodule.text_dataset)

    text_embeds, text_feats, text_atts = encode_text(model, text_loader, device)
    image_embeds, image_feats = encode_image(model, image_loader, device)

    sims_matrix = image_feats @ text_feats.t()
    image_to_text_scores = image_to_text(
        model,
        image_embeds,
        text_embeds,
        text_atts,
        sims_matrix,
        num_images,
        num_text,
        device,
        args,
    )

    sims_matrix = sims_matrix.t()
    text_to_image_scores = text_to_image(
        model,
        image_embeds,
        text_embeds,
        text_atts,
        sims_matrix,
        num_images,
        num_text,
        device,
        args,
    )

    if is_dist_avail_and_initialized():
        dist.barrier()
        torch.distributed.all_reduce(
            image_to_text_scores, op=torch.distributed.ReduceOp.SUM
        )
        torch.distributed.all_reduce(
            text_to_image_scores, op=torch.distributed.ReduceOp.SUM
        )

    return image_to_text_scores.cpu(), text_to_image_scores.cpu()


@torch.no_grad()
def itm_eval(
    image_to_text_scores,
    text_to_image_scores,
    image_to_text_mapping,
    text_to_image_mapping,
):
    # Images to Text
    ranks = torch.zeros(image_to_text_scores.size(0))
    for index, score in enumerate(image_to_text_scores):
        inds = torch.flip(torch.argsort(score), dims=[0])
        rank = 1e10
        # each image has multiple text mappings
        # check retrieved inds with each ground truth mapping i
        for i in image_to_text_mapping[index]:
            tmp = torch.where(inds == i)[0][0]
            if tmp < rank:
                rank = tmp
        ranks[index] = rank

    # Compute metrics
    tr1 = 100.0 * len(torch.where(ranks < 1)[0]) / len(ranks)
    tr5 = 100.0 * len(torch.where(ranks < 5)[0]) / len(ranks)
    tr10 = 100.0 * len(torch.where(ranks < 10)[0]) / len(ranks)

    # Text to Images
    ranks = torch.zeros(text_to_image_scores.size(0))
    for index, score in enumerate(text_to_image_scores):
        inds = torch.flip(torch.argsort(score), dims=[0])
        ranks[index] = torch.where(inds == text_to_image_mapping[index])[0][0]

    # Compute metrics
    ir1 = 100.0 * len(torch.where(ranks < 1)[0]) / len(ranks)
    ir5 = 100.0 * len(torch.where(ranks < 5)[0]) / len(ranks)
    ir10 = 100.0 * len(torch.where(ranks < 10)[0]) / len(ranks)

    tr_mean = (tr1 + tr5 + tr10) / 3
    ir_mean = (ir1 + ir5 + ir10) / 3
    r_mean = (tr_mean + ir_mean) / 2

    eval_result = {
        "txt_r1": tr1,
        "txt_r5": tr5,
        "txt_r10": tr10,
        "txt_r_mean": tr_mean,
        "img_r1": ir1,
        "img_r5": ir5,
        "img_r10": ir10,
        "img_r_mean": ir_mean,
        "r_mean": r_mean,
    }
    return eval_result


@torch.no_grad()
def format_output(
    image_to_text_scores,
    text_to_image_scores,
    image_dataset,
    text_dataset,
):
    image_to_text_output = {}
    for index, score in enumerate(image_to_text_scores):
        image = image_dataset.images[index]
        top10_ids = torch.flip(torch.argsort(score), dims=[0])[:10]
        top10_text = [text_dataset.text[i] for i in top10_ids]
        image_to_text_output[index] = {
            "image": image,
            "output": top10_text,
        }
    text_to_image_output = {}
    for index, score in enumerate(text_to_image_scores):
        text = text_dataset.text[index]
        top10_ids = torch.flip(torch.argsort(score), dims=[0])[:10]
        top10_images = [image_dataset.images[i] for i in top10_ids]
        text_to_image_output[index] = {
            "text": text,
            "output": top10_images,
        }
    return image_to_text_output, text_to_image_output


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="./examples/albef/configs/retrieval.yaml")
    args = parser.parse_args()
    config = yaml.load(open(args.config, "r"), Loader=yaml.Loader)

    init_distributed_mode(config)
    device = torch.device(config["device"])

    seed = config["seed"] + get_rank()
    torch.manual_seed(seed)
    random.seed(seed)
    cudnn.benchmark = True

    datamodule = RetrievalDataModule(**config["datamodule_args"])
    model = albef_model_for_retrieval(config, pretrained=True)
    model = model.to(device)
    if is_dist_avail_and_initialized():
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config["gpu"]]
        )

    train(model, datamodule, config["training_args"], device)
    image_to_text_scores, text_to_image_scores = evaluation(
        model, datamodule, config["eval_args"], device
    )
    val_result = itm_eval(
        image_to_text_scores,
        text_to_image_scores,
        datamodule.image_dataset.image_to_text,
        datamodule.text_dataset.text_to_image,
    )
    image_to_text_output, text_to_image_output = format_output(
        image_to_text_scores,
        text_to_image_scores,
        datamodule.image_dataset,
        datamodule.text_dataset,
    )
    result = {
        "image_to_text_output": image_to_text_output,
        "text_to_image_output": text_to_image_output,
        **val_result,
    }
    torch.save(result, config["output_path"])


if __name__ == "__main__":
    main()
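A toy sketch (not part of the commit) of how itm_eval above turns score matrices into recall@k metrics; the mappings and scores are made up, and importing finetune_retrieval pulls in its heavier dependencies, so this is only meant to show the expected shapes and the perfect-ranking case.

```python
# Illustrative sketch only: itm_eval on toy score matrices.
import torch

from finetune_retrieval import itm_eval

# 2 images, 4 captions; image 0 owns captions 0-1, image 1 owns captions 2-3.
image_to_text_scores = torch.tensor([[0.9, 0.1, 0.2, 0.0],
                                     [0.0, 0.3, 0.8, 0.7]])
text_to_image_scores = torch.tensor([[0.9, 0.1],
                                     [0.6, 0.4],
                                     [0.2, 0.8],
                                     [0.1, 0.9]])
image_to_text_mapping = {0: [0, 1], 1: [2, 3]}   # image index -> caption indices
text_to_image_mapping = {0: 0, 1: 0, 2: 1, 3: 1}  # caption index -> image index

metrics = itm_eval(
    image_to_text_scores, text_to_image_scores,
    image_to_text_mapping, text_to_image_mapping,
)
# every ground-truth item is ranked first here, so all recalls are 100
print(metrics["txt_r1"], metrics["img_r1"], metrics["r_mean"])  # 100.0 100.0 100.0
```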
finetune_vqa.py
ADDED
@@ -0,0 +1,204 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import argparse
import datetime
import os
import random
import time

import ruamel.yaml as yaml
import torch
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from data.vqa_datamodules import VQADataModule
from model import albef_model_for_vqa
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

from utils import (
    add_weight_decay,
    get_rank,
    get_world_size,
    init_distributed_mode,
    is_dist_avail_and_initialized,
    is_main_process,
    save_result,
)


def train(model, datamodule, args, device):
    model_without_ddp = model.module if is_dist_avail_and_initialized() else model
    model.train()

    optimizer_params = add_weight_decay(model, args["weight_decay"])
    optimizer = AdamW(optimizer_params, lr=args["lr"])
    scheduler = CosineAnnealingWarmRestarts(
        optimizer, T_0=args["max_epochs"], eta_min=args["min_lr"]
    )

    step_size = args["step_size"]
    warmup_steps = args["warmup_steps"]
    warmup_iterations = warmup_steps * step_size

    data_loader = datamodule.train_dataloader(
        is_distributed=is_dist_avail_and_initialized(),
        num_tasks=get_world_size(),
        global_rank=get_rank(),
    )

    start_time = time.time()

    for epoch in range(args["max_epochs"]):
        if is_dist_avail_and_initialized():
            data_loader.sampler.set_epoch(epoch)

        if epoch > 0:
            scheduler.step(epoch + warmup_steps)

        for batch, (
            images,
            questions,
            questions_atts,
            answers,
            answers_atts,
            ans_weights,
            ans_lengths,
        ) in enumerate(data_loader):
            if epoch > 0:
                alpha = args["alpha"]
            else:
                alpha = args["alpha"] * min(1, batch / len(data_loader))

            images = images.to(device, non_blocking=True)
            questions = questions.to(device)
            questions_atts = questions_atts.to(device)
            answers = answers.to(device)
            answers_atts = answers_atts.to(device)
            ans_weights = ans_weights.to(device)

            loss = model(
                images,
                questions,
                questions_atts,
                answers,
                answers_atts,
                ans_weights=ans_weights,
                ans_lengths=ans_lengths,
                alpha=alpha,
                is_train=True,
            )

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if epoch == 0 and batch % step_size == 0 and batch <= warmup_iterations:
                scheduler.step(batch // step_size)

            if batch % args["log_every_n_steps"] == 0:
                total_time = time.time() - start_time
                time_str = "time {},".format(
                    datetime.timedelta(seconds=int(total_time))
                )
                epoch_str = "epoch {}/{},".format(epoch, args["max_epochs"])
                batch_str = "batch {}/{},".format(batch, len(data_loader))
                loss_str = "loss {}".format(loss.item())
                print(time_str, epoch_str, batch_str, loss_str)

        if is_main_process():
            save_obj = {
                "model": model_without_ddp.state_dict(),
                "optimizer": optimizer.state_dict(),
                "scheduler": scheduler.state_dict(),
                "epoch": epoch,
            }
            torch.save(
                save_obj,
                os.path.join(args["checkpoint_root"], "vqa_checkpoint_%02d.pt" % epoch),
            )

        if is_dist_avail_and_initialized():
            dist.barrier()


@torch.no_grad()
def evaluation(model, datamodule, args, device):
    model.eval()

    result = []

    answer_list = datamodule.test_dataset.answer_list
    answer_input_ids = datamodule.test_dataset.answer_input_ids.to(device)
    answer_atts = datamodule.test_dataset.answer_attention_mask.to(device)
    data_loader = datamodule.test_dataloader(
        is_distributed=is_dist_avail_and_initialized(),
        num_tasks=get_world_size(),
        global_rank=get_rank(),
    )

    start_time = time.time()

    for batch, (img, ques, ques_atts, ques_ids) in enumerate(data_loader):
        img = img.to(device, non_blocking=True)
        ques = ques.to(device)
        ques_atts = ques_atts.to(device)

        topk_ids, topk_probs = model(
            img,
            ques,
            ques_atts,
            answer_input_ids,
            answer_atts,
            k=args["k_test"],
            is_train=False,
        )

        for ques_id, topk_id, topk_prob in zip(ques_ids, topk_ids, topk_probs):
            _, pred = topk_prob.max(dim=0)
            result.append(
                {"question_id": ques_id, "answer": answer_list[topk_id[pred]]}
            )

        if batch % args["log_every_n_steps"] == 0:
            total_time = time.time() - start_time
            total_time_str = str(datetime.timedelta(seconds=int(total_time)))
            print(
                "time {}, batch {}/{}".format(total_time_str, batch, len(data_loader))
            )

    return result


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="./examples/albef/configs/vqa.yaml")
    args = parser.parse_args()
    config = yaml.load(open(args.config, "r"), Loader=yaml.Loader)

    init_distributed_mode(config)
    device = torch.device(config["device"])

    seed = config["seed"] + get_rank()
    torch.manual_seed(seed)
    random.seed(seed)
    cudnn.benchmark = True

    datamodule = VQADataModule(**config["datamodule_args"])
    model = albef_model_for_vqa(config, pretrained=True)
    model = model.to(device)
    if is_dist_avail_and_initialized():
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[config["gpu"]]
        )

    train(model, datamodule, config["training_args"], device)
    result = evaluation(model, datamodule, config["eval_args"], device)
    save_result(result, config["output_root"], "vqa_output")


if __name__ == "__main__":
    main()
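Both fine-tuning scripts share the same learning-rate schedule: per-step cosine warm-restart steps during epoch 0 (every step_size batches), then one step per epoch offset by warmup_steps. A standalone sketch of that pattern (not part of the commit, with made-up hyperparameters and a dummy parameter) is below.

```python
# Illustrative sketch only: the LR stepping pattern used by the train() loops.
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingWarmRestarts

max_epochs, warmup_steps, step_size, batches_per_epoch = 8, 4, 100, 1000
params = [torch.nn.Parameter(torch.zeros(1))]  # dummy parameter
optimizer = AdamW(params, lr=2e-5)
scheduler = CosineAnnealingWarmRestarts(optimizer, T_0=max_epochs, eta_min=1e-6)

for epoch in range(max_epochs):
    if epoch > 0:
        # after epoch 0, advance the schedule once per epoch, offset by the warmup period
        scheduler.step(epoch + warmup_steps)
    for batch in range(batches_per_epoch):
        # forward/backward/optimizer.step() would happen here
        if epoch == 0 and batch % step_size == 0 and batch <= warmup_steps * step_size:
            # during epoch 0, advance the schedule every `step_size` batches
            scheduler.step(batch // step_size)
    print("epoch", epoch, "lr", optimizer.param_groups[0]["lr"])
```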
images/COCO_val2014_000000026348.jpg
ADDED
images/COCO_val2014_000000057222.jpg
ADDED
images/COCO_val2014_000000111207.jpg
ADDED
images/COCO_val2014_000000159269.jpg
ADDED
images/COCO_val2014_000000184359.jpg
ADDED
images/COCO_val2014_000000407072.jpg
ADDED
images/COCO_val2014_000000473994.jpg
ADDED
images/COCO_val2014_000000552075.jpg
ADDED
model.py
ADDED
@@ -0,0 +1,666 @@
1 |
+
# Copyright (c) Meta Platforms, Inc. and affiliates.
|
2 |
+
# All rights reserved.
|
3 |
+
#
|
4 |
+
# This source code is licensed under the BSD-style license found in the
|
5 |
+
# LICENSE file in the root directory of this source tree.
|
6 |
+
|
7 |
+
import copy
|
8 |
+
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
9 |
+
|
10 |
+
import torch
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from torch import nn, Tensor
|
13 |
+
from torchmultimodal.models.albef.image_encoder import ALBEFVisionEncoder
|
14 |
+
from torchmultimodal.models.albef.model import ALBEFModel, ALBEFModelWithSimilarity
|
15 |
+
from torchmultimodal.models.albef.multimodal_encoder import ALBEFMultimodalEncoder
|
16 |
+
from torchmultimodal.modules.encoders.bert_text_encoder import bert_text_encoder
|
17 |
+
from torchmultimodal.modules.layers.text_embedding import BERTTextEmbeddings
|
18 |
+
from torchmultimodal.modules.losses.albef import (
|
19 |
+
CausalLanguageModelingLoss,
|
20 |
+
ImageTextContrastiveLoss,
|
21 |
+
)
|
22 |
+
from torchmultimodal.utils.attention import get_causal_attention_mask
|
23 |
+
from torchmultimodal.utils.common import momentum_update, remove_grad
|
24 |
+
|
25 |
+
|
26 |
+
_ALBEF_PRETRAINED_URLS = {
|
27 |
+
"vqa": "https://download.pytorch.org/models/multimodal/albef/pretrained_vqa_checkpoint.pt",
|
28 |
+
"retrieval": "https://download.pytorch.org/models/multimodal/albef/pretrained_retrieval_checkpoint.pt",
|
29 |
+
}
|
30 |
+
|
31 |
+
|
32 |
+
class PredictionHead(nn.Module):
|
33 |
+
"""
|
34 |
+
Predict the following token autoregressively.
|
35 |
+
|
36 |
+
Args:
|
37 |
+
vocab_size (int): The number of different tokens the prediction_head can predict.
|
38 |
+
hidden_size (int): The hidden size of the prediction_head.
|
39 |
+
layer_norm_eps (float): The epsilon used by the prediction_head normalization layer.
|
40 |
+
transform_act_fn (Callable[[Tensor], Tensor]): The activation function in the prediction_head.
|
41 |
+
|
42 |
+
Inputs:
|
43 |
+
hidden_states (Tensor): The hidden states of preceding tokens.
|
44 |
+
|
45 |
+
Returns:
|
46 |
+
Tensor: Prediction scores for the following token.
|
47 |
+
"""
|
48 |
+
|
49 |
+
def __init__(
|
50 |
+
self,
|
51 |
+
vocab_size: int = 30522,
|
52 |
+
hidden_size: int = 768,
|
53 |
+
layer_norm_eps: float = 1e-12,
|
54 |
+
transform_act_fn: Callable[[Tensor], Tensor] = nn.functional.gelu,
|
55 |
+
) -> None:
|
56 |
+
super().__init__()
|
57 |
+
self.dense = nn.Linear(hidden_size, hidden_size)
|
58 |
+
self.transform_act_fn = transform_act_fn
|
59 |
+
self.layer_norm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
|
60 |
+
self.decoder = nn.Linear(hidden_size, vocab_size)
|
61 |
+
|
62 |
+
def forward(self, hidden_states: Tensor) -> Tensor:
|
63 |
+
hidden_states = self.dense(hidden_states)
|
64 |
+
hidden_states = self.transform_act_fn(hidden_states)
|
65 |
+
hidden_states = self.layer_norm(hidden_states)
|
66 |
+
hidden_states = self.decoder(hidden_states)
|
67 |
+
return hidden_states
|
68 |
+
|
69 |
+
|
70 |
+
class ALBEFDecoder(nn.Module):
|
71 |
+
"""
|
72 |
+
Generate the prediction scores for answers from image and question hidden states.
|
73 |
+
|
74 |
+
Args:
|
75 |
+
text_embeddings (ALBEFTextEmbeddings): Instantiated ALBEFTextEmbeddings.
|
76 |
+
multimodal_encoder (ALBEFMultimodalEncoder): Instantiated ALBEFMultimodalEncoder.
|
77 |
+
prediction_head (PredictionHead): Instantiated PredictionHead.
|
78 |
+
|
79 |
+
Inputs:
|
80 |
+
input_ids (Tensor of shape (batch_size, seq_len)):
|
81 |
+
Input ids for input text tokens.
|
82 |
+
attention_mask (Tensor of shape (batch_size, seq_len)):
|
83 |
+
Input attention mask to avoid performing attention on padding token indices.
|
84 |
+
encoder_hidden_states (Tensor of shape (batch_size, encoder_seq_len, hidden_size)):
|
85 |
+
The encoder hidden states.
|
86 |
+
encoder_attention_mask (Tensor of shape (batch_size, encoder_seq_len)):
|
87 |
+
The attention mask for encoder hidden states.
|
88 |
+
|
89 |
+
Returns:
|
90 |
+
Tensor: Prediction scores for answers.
|
91 |
+
"""
|
92 |
+
|
93 |
+
def __init__(
|
94 |
+
self,
|
95 |
+
text_embeddings: BERTTextEmbeddings,
|
96 |
+
multimodal_encoder: ALBEFMultimodalEncoder,
|
97 |
+
prediction_head: PredictionHead,
|
98 |
+
) -> None:
|
99 |
+
super().__init__()
|
100 |
+
self.text_embeddings = text_embeddings
|
101 |
+
self.multimodal_encoder = multimodal_encoder
|
102 |
+
self.prediction_head = prediction_head
|
103 |
+
|
104 |
+
def get_extended_attention_mask_for_decoder(self, attention_mask: Tensor) -> Tensor:
|
105 |
+
"""
|
106 |
+
Apply a causal mask in addition to the padding mask and make the mask broadcastable,
|
107 |
+
such that future and masked tokens are ignored.
|
108 |
+
|
109 |
+
Args:
|
110 |
+
attention_mask (Tensor):
|
111 |
+
Padding mask with ones indicating tokens to attend to, zeros for tokens to ignore.
|
112 |
+
|
113 |
+
Returns:
|
114 |
+
extended_attention_mask (Tensor):
|
115 |
+
The broadcastable attention mask, with the same dtype as ``attention_mask.dtype``.
|
116 |
+
"""
|
117 |
+
device = attention_mask.device
|
118 |
+
batch_size, seq_length = attention_mask.shape
|
119 |
+
causal_mask = get_causal_attention_mask(seq_length).to(device)
|
120 |
+
causal_mask = causal_mask.repeat(batch_size, 1).view(
|
121 |
+
batch_size, seq_length, seq_length
|
122 |
+
)
|
123 |
+
extended_attention_mask = (
|
124 |
+
causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
|
125 |
+
)
|
126 |
+
extended_attention_mask = extended_attention_mask.to(dtype=attention_mask.dtype)
|
127 |
+
return extended_attention_mask
|
128 |
+
|
129 |
+
def forward(
|
130 |
+
self,
|
131 |
+
input_ids: Tensor,
|
132 |
+
attention_mask: Tensor,
|
133 |
+
encoder_hidden_states: Tensor,
|
134 |
+
encoder_attention_mask: Tensor,
|
135 |
+
) -> Tensor:
|
136 |
+
hidden_states = self.text_embeddings(input_ids)
|
137 |
+
attention_mask = self.get_extended_attention_mask_for_decoder(attention_mask)
|
138 |
+
decoder_output = self.multimodal_encoder(
|
139 |
+
hidden_states=hidden_states,
|
140 |
+
attention_mask=attention_mask,
|
141 |
+
encoder_hidden_states=encoder_hidden_states,
|
142 |
+
encoder_attention_mask=encoder_attention_mask,
|
143 |
+
)
|
144 |
+
prediction_scores = self.prediction_head(decoder_output)
|
145 |
+
return prediction_scores
|
146 |
+
|
147 |
+
|
148 |
+
class ALBEFModelForVQA(nn.Module):
|
149 |
+
"""
|
150 |
+
ALBEF Model for VQA finetuning and inference.
|
151 |
+
|
152 |
+
Args:
|
153 |
+
model (ALBEFModel): Instantiated ALBEFModel.
|
154 |
+
answer_decoder (ALBEFDecoder): Instantiated ALBEFDecoder.
|
155 |
+
loss (CausalLanguageModelingLoss): Instantiated CausalLanguageModelingLoss.
|
156 |
+
|
157 |
+
Inputs:
|
158 |
+
image (Tensor of shape (B, C, H, W)): Image features.
|
159 |
+
question (Tensor of shape (B, L)): Question text features.
|
160 |
+
question_atts (Tensor of shape (B, L)): Question attention mask.
|
161 |
+
answers (Tensor of shape (N, M)): Answer text features.
|
162 |
+
answers_atts (Tensor of shape (N, M)): Answer attention mask.
|
163 |
+
ans_weights (Optional[Tensor] of shape (N)): Weights for each answer.
|
164 |
+
Required if is_train is True.
|
165 |
+
ans_lengths (Optional[List[int]] of length B): Number of answers for each question.
|
166 |
+
ans_lengths should sum to N.
|
167 |
+
Required if is_train is True.
|
168 |
+
alpha (Optional[float]): The interpolation value between clm_loss and loss_distill.
|
169 |
+
Required if is_train is True.
|
170 |
+
k (Optional[int]): The number of answers to return for inference.
|
171 |
+
Required if is_train is False.
|
172 |
+
is_train (Optional[bool]): Whether the model is in training.
|
173 |
+
|
174 |
+
Returns:
|
175 |
+
is_train is True:
|
176 |
+
Tensor: The masked language modeling loss for input.
|
177 |
+
is_train is False:
|
178 |
+
Tuple[Tensor, Tensor]: The ids and probabilities for the top k predicted answers.
|
179 |
+
"""
|
180 |
+
|
181 |
+
def __init__(
|
182 |
+
self,
|
183 |
+
model: ALBEFModel,
|
184 |
+
answer_decoder: ALBEFDecoder,
|
185 |
+
loss: CausalLanguageModelingLoss,
|
186 |
+
) -> None:
|
187 |
+
super().__init__()
|
188 |
+
self.model = model
|
189 |
+
self.answer_decoder = answer_decoder
|
190 |
+
self.loss = loss
|
191 |
+
self.answer_decoder_m = copy.deepcopy(self.answer_decoder)
|
192 |
+
remove_grad(
|
193 |
+
self.answer_decoder_m
|
194 |
+
) # remove gradient for the momentum decoder model
|
195 |
+
|
196 |
+
def _train_forward(
|
197 |
+
self,
|
198 |
+
image: Tensor,
|
199 |
+
question: Tensor,
|
200 |
+
question_atts: Tensor,
|
201 |
+
answers: Tensor,
|
202 |
+
answers_atts: Tensor,
|
203 |
+
ans_weights: Tensor,
|
204 |
+
ans_lengths: List[int],
|
205 |
+
alpha: float,
|
206 |
+
) -> Tensor:
|
207 |
+
"""
|
208 |
+
Forward step for training. Encode the inputs with the ALBEFModel.
|
209 |
+
Generate pseudo-targets using answer_decoder_m (momentum decoder model).
|
210 |
+
Generate answer predictions using answer_decoder.
|
211 |
+
Compute masked language modeling loss of the predictions using answers as labels,
|
212 |
+
pseudo-targets as soft-labels, and alpha as their interpolation value.
|
213 |
+
|
214 |
+
Inputs:
|
215 |
+
image (Tensor of shape (B, C, H, W)): Image features.
|
216 |
+
question (Tensor of shape (B, L)): Question text features.
|
217 |
+
question_atts (Tensor of shape (B, L)): Question attention mask.
|
218 |
+
answers (Tensor of shape (N, M)): Answer text features.
|
219 |
+
answers_atts (Tensor of shape (N, M)): Answer attention mask.
|
220 |
+
ans_weights (Tensor of shape (N)): Weights for each answer.
|
221 |
+
ans_lengths (List[int] of length B): Number of answers for each question.
|
222 |
+
ans_lengths should sum to N.
|
223 |
+
alpha (float): The interpolation value between clm_loss and loss_distill.
|
224 |
+
|
225 |
+
Returns:
|
226 |
+
Tensor: The masked language modeling loss for input.
|
227 |
+
"""
|
228 |
+
# get image-question embeddings from the ALBEFModel and format it to match the ans_lengths
|
229 |
+
encoder_outputs = self.model(image, question, question_atts)
|
230 |
+
(
|
231 |
+
encoder_hidden_states,
|
232 |
+
encoder_hidden_states_m,
|
233 |
+
encoder_attention_mask,
|
234 |
+
) = self._encoder_hidden_states(
|
235 |
+
encoder_outputs.multimodal_embeddings,
|
236 |
+
encoder_outputs.multimodal_embeddings_m,
|
237 |
+
question_atts,
|
238 |
+
ans_lengths,
|
239 |
+
)
|
240 |
+
|
241 |
+
# use the momentum model to generate pseudo-targets
|
242 |
+
with torch.no_grad():
|
243 |
+
momentum_update(
|
244 |
+
self.answer_decoder, self.answer_decoder_m, self.model.momentum
|
245 |
+
)
|
246 |
+
prediction_scores_m = self.answer_decoder_m(
|
247 |
+
input_ids=answers,
|
248 |
+
attention_mask=answers_atts,
|
249 |
+
encoder_hidden_states=encoder_hidden_states_m,
|
250 |
+
encoder_attention_mask=encoder_attention_mask,
|
251 |
+
)
|
252 |
+
|
253 |
+
# generate answer predictions
|
254 |
+
prediction_scores = self.answer_decoder(
|
255 |
+
input_ids=answers,
|
256 |
+
attention_mask=answers_atts,
|
257 |
+
encoder_hidden_states=encoder_hidden_states,
|
258 |
+
encoder_attention_mask=encoder_attention_mask,
|
259 |
+
)
|
260 |
+
|
261 |
+
# compute masked language modeling loss from the prediction scores
|
262 |
+
labels = answers.masked_fill(answers == 0, self.loss.mask_token_id)
|
263 |
+
loss = self.loss(labels, prediction_scores, prediction_scores_m, alpha)
|
264 |
+
loss = ans_weights * loss
|
265 |
+
loss = loss.sum() / image.size(0)
|
266 |
+
return loss
|

    def _eval_forward(
        self,
        image: Tensor,
        question: Tensor,
        question_atts: Tensor,
        answers: Tensor,
        answer_atts: Tensor,
        k: int = 128,
    ) -> Tuple[Tensor, Tensor]:
        """
        Forward step for evaluation. Encode the inputs with the ALBEFModel.
        Generate answers autoregressively using the decoder, starting with the [CLS] token.
        Compute the answer ids and their respective probabilities for the top k predictions.

        Inputs:
            image (Tensor of shape (B, C, H, W)): Image features.
            question (Tensor of shape (B, L)): Question text features.
            question_atts (Tensor of shape (B, L)): Question attention mask.
            answers (Tensor of shape (N, M)): Answer text features.
            answer_atts (Tensor of shape (N, M)): Answer attention mask.
            k (int): The number of answers to return for inference.

        Returns:
            Tuple[Tensor, Tensor]: The ids and probabilities for the top k predicted answers.
        """
        # get multimodal embeddings from the ALBEFModel and
        # feed them to the decoder as cross attention
        encoder_outputs = self.model(image, question, question_atts)

        # use cls token as the decoder's initial input token
        num_ques = question.size(0)
        start_ids = answers[0, 0].repeat(num_ques, 1)
        atts = torch.ones(start_ids.shape).to(image.device)

        # autoregressively generate the answer
        prediction_scores = self.answer_decoder(
            input_ids=start_ids,
            attention_mask=atts,
            encoder_hidden_states=encoder_outputs.multimodal_embeddings,
            encoder_attention_mask=question_atts,
        )

        logits = prediction_scores[:, 0, :]
        answer_first_token = answers[:, 1]
        prob_first_token = F.softmax(logits, dim=1).index_select(
            dim=1, index=answer_first_token
        )
        topk_probs, topk_ids = prob_first_token.topk(k, dim=1)

        input_ids = []
        input_atts = []
        for topk_id in topk_ids:
            input_ids.append(answers.index_select(dim=0, index=topk_id))
            input_atts.append(answer_atts.index_select(dim=0, index=topk_id))
        input_ids = torch.cat(input_ids)
        input_atts = torch.cat(input_atts)
        targets_ids = input_ids.masked_fill(input_ids == 0, self.loss.mask_token_id)

        question_states = encoder_outputs.multimodal_embeddings.repeat_interleave(
            k, dim=0
        )
        question_atts = question_atts.repeat_interleave(k, dim=0)

        prediction_scores = self.answer_decoder(
            input_ids=input_ids,
            attention_mask=input_atts,
            encoder_hidden_states=question_states,
            encoder_attention_mask=question_atts,
        )

        answer_loss = self.loss(targets_ids, prediction_scores)
        answer_loss = answer_loss.view(input_ids.size(0), -1)

        # topk_prob: first token probability
        topk_probs = topk_probs.view(-1, 1)
        log_probs = torch.cat([topk_probs.log(), -answer_loss], dim=1)

        # re-calculate log probabilities for the answer sequences using chain rule
        log_probs_sum = log_probs.sum(1)
        log_probs_sum = log_probs_sum.view(num_ques, k)

        topk_probs = F.softmax(log_probs_sum, dim=-1)

        # get top-k after re-ranking
        topk_probs, rerank_id = topk_probs.topk(k, dim=1)
        topk_ids = torch.gather(topk_ids, 1, rerank_id)

        return topk_ids, topk_probs

    def _encoder_hidden_states(
        self,
        multimodal_embeds: Tensor,
        multimodal_embeds_m: Tensor,
        question_atts: Tensor,
        ans_lengths: List[int],
    ) -> Tuple[Tensor, Tensor, Tensor]:
        """
        Repeat each image-question input's embedding and attention mask to match the number of answers it has.

        Args:
            multimodal_embeds (Tensor): Image-question embeddings.
            multimodal_embeds_m (Tensor): Image-question embeddings from the momentum model.
            question_atts (Tensor): Question attention mask.
            ans_lengths (List[int]): The number of answers each image-question input has.

        Returns:
            encoder_hidden_states (Tensor): Image-question embeddings after the repetition.
            encoder_hidden_states_m (Tensor): Image-question embeddings from the momentum model after the repetition.
            encoder_attention_mask (Tensor): Question attention mask after the repetition.
        """
        encoder_hidden_states = []
        encoder_attention_mask = []
        for b, n in enumerate(ans_lengths):
            encoder_hidden_states += [multimodal_embeds[b]] * n
            encoder_attention_mask += [question_atts[b]] * n
        encoder_hidden_states = torch.stack(encoder_hidden_states)
        encoder_attention_mask = torch.stack(encoder_attention_mask)

        with torch.no_grad():
            encoder_hidden_states_m = []
            for b, n in enumerate(ans_lengths):
                encoder_hidden_states_m += [multimodal_embeds_m[b]] * n
            encoder_hidden_states_m = torch.stack(encoder_hidden_states_m)

        return encoder_hidden_states, encoder_hidden_states_m, encoder_attention_mask

    def forward(
        self,
        image: Tensor,
        question: Tensor,
        question_atts: Tensor,
        answers: Tensor,
        answers_atts: Tensor,
        ans_weights: Optional[Tensor] = None,
        ans_lengths: Optional[List[int]] = None,
        alpha: Optional[float] = 0.0,
        k: Optional[int] = 128,
        is_train: Optional[bool] = True,
    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
        if is_train:
            return self._train_forward(
                image,
                question,
                question_atts,
                answers,
                answers_atts,
                ans_weights,
                ans_lengths,
                alpha,
            )
        else:
            return self._eval_forward(
                image,
                question,
                question_atts,
                answers,
                answers_atts,
                k,
            )

class ALBEFModelForRetrieval(nn.Module):
    """
    ALBEF Model for Retrieval finetuning and inference.
    In training mode, the forward step computes image-text contrastive loss and
    image-text matching loss.
    In evaluation mode, the forward step takes 3 types of input:
        image: encode image input, project and normalize the embeddings.
        text: encode text input, project and normalize the embeddings.
        multimodal: create multimodal embeddings from image and text
            embeddings, and compute image-text matching scores.

    Args:
        model_with_similarity (ALBEFModelWithSimilarity): Instantiated ALBEFModelWithSimilarity.
        itc_loss (ImageTextContrastiveLoss): Instantiated ImageTextContrastiveLoss.
        hidden_size (int): Dimensionality of encoder outputs.

    Inputs:
        image (Optional[Tensor] of shape (B, C, H, W)): Image features.
            Required if is_train is True.
            Required if input_type is "image" or "multimodal".
        text (Optional[Tensor] of shape (B, L)): Text features.
            Required if is_train is True.
            Required if input_type is "text" or "multimodal".
        text_atts (Tensor of shape (B, L)): Text attention mask.
            Required if is_train is True.
            Required if input_type is "text" or "multimodal".
        idx (Tensor of shape (B)): Identifier for each image sample.
            Required if is_train is True.
        alpha (Optional[float]): The interpolation value for momentum distillation in the contrastive loss.
            Default is 0.
        input_type (Optional[str]): "image", "text", or "multimodal" indicating the encoding type.
            Required if is_train is False.
        is_train (Optional[bool]): Whether the model is in training.
            Default is True.

    Returns:
        is_train is True:
            Tensor: The sum of itc loss and itm loss.
        is_train is False:
            input_type is "image":
                Tuple[Tensor, Tensor]: Image embeddings and projected image features.
            input_type is "text":
                Tuple[Tensor, Tensor]: Text embeddings and projected text features.
            input_type is "multimodal":
                Tensor: Scores for the retrieval task.
    """

    def __init__(
        self,
        model_with_similarity: ALBEFModelWithSimilarity,
        itc_loss: ImageTextContrastiveLoss,
        hidden_size: int,
    ) -> None:
        super().__init__()
        self.model_with_similarity = model_with_similarity
        self.itc_loss = itc_loss
        self.itm_head = nn.Linear(hidden_size, 2)

    def _train_forward(
        self,
        image: Tensor,
        text: Tensor,
        text_atts: Tensor,
        idx: Tensor,
        alpha: float,
    ) -> Tensor:
        encoder_output = self.model_with_similarity(image, text, text_atts, idx)

        # compute image-text contrastive loss
        similarity_outputs = encoder_output.similarity
        similarity_targets = encoder_output.sim_targets
        itc_loss = self.itc_loss(
            similarity_outputs.sim_i2t,
            similarity_outputs.sim_t2i,
            similarity_outputs.sim_i2t_m,
            similarity_outputs.sim_t2i_m,
            similarity_targets,
            alpha,
        )

        # compute image-text matching loss
        pos_embeddings = encoder_output.multimodal_embeddings[:, 0, :]
        neg_embeddings = encoder_output.multimodal_embeddings_neg[:, 0, :]
        vl_embeddings = torch.cat([pos_embeddings, neg_embeddings], dim=0)
        vl_output = self.itm_head(vl_embeddings)
        itm_labels = torch.cat(
            [
                torch.ones(pos_embeddings.size(0), dtype=torch.long),
                torch.zeros(neg_embeddings.size(0), dtype=torch.long),
            ],
            dim=0,
        ).to(vl_embeddings.device)
        itm_loss = F.cross_entropy(vl_output, itm_labels)

        loss = itc_loss + itm_loss
        return loss

    def _encode_image(
        self,
        image: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        image_embed = self.model_with_similarity.albef_model.vision_encoder(image)
        image_feat = F.normalize(
            self.model_with_similarity.vision_proj(image_embed[:, 0, :]), dim=-1
        )
        return image_embed, image_feat

    def _encode_text(
        self,
        text: Tensor,
        text_atts: Tensor,
    ) -> Tuple[Tensor, Tensor]:
        text_embed = self.model_with_similarity.albef_model.text_encoder(
            text, text_atts
        ).last_hidden_state
        text_feat = F.normalize(
            self.model_with_similarity.text_proj(text_embed[:, 0, :]), dim=-1
        )
        return text_embed, text_feat

    def _image_text_matching_score(
        self,
        image: Tensor,
        text: Tensor,
        text_atts: Tensor,
    ) -> Tensor:
        multimodal_embeds = self.model_with_similarity.albef_model.multimodal_encoder(
            text,
            text_atts,
            image,
        )
        score = self.itm_head(multimodal_embeds[:, 0, :])[:, 1]
        return score

    def _eval_forward(
        self,
        input_type: str,
        image: Optional[Tensor],
        text: Optional[Tensor],
        text_atts: Optional[Tensor],
    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
        if input_type == "image":
            assert image is not None, "image input tensor cannot be None"
            return self._encode_image(image)

        elif input_type == "text":
            assert (
                text is not None and text_atts is not None
            ), "text and text attention mask cannot be None"
            return self._encode_text(text, text_atts)

        elif input_type == "multimodal":
            assert (
                image is not None and text is not None and text_atts is not None
            ), "image embeddings, text embeddings, and text attention mask cannot be None"
            return self._image_text_matching_score(image, text, text_atts)

        else:
            raise ValueError("input_type must be image, text, or multimodal")

    def forward(
        self,
        image: Optional[Tensor] = None,
        text: Optional[Tensor] = None,
        text_atts: Optional[Tensor] = None,
        idx: Optional[Tensor] = None,
        alpha: Optional[float] = 0.0,
        input_type: Optional[str] = None,
        is_train: Optional[bool] = True,
    ) -> Union[Tensor, Tuple[Tensor, Tensor]]:
        if is_train:
            return self._train_forward(
                image,
                text,
                text_atts,
                idx,
                alpha,
            )
        else:
            return self._eval_forward(
                input_type,
                image,
                text,
                text_atts,
            )

def albef_model_for_vqa(
    config: Dict[str, Any], pretrained: bool = False
) -> ALBEFModelForVQA:
    vision_encoder = ALBEFVisionEncoder(**config["vision_encoder_args"])
    text_encoder = bert_text_encoder(**config["text_encoder_args"])
    question_multimodal_encoder = ALBEFMultimodalEncoder(
        **config["multimodal_encoder_args"]
    )
    text_embeddings = BERTTextEmbeddings(**config["text_embeddings_args"])
    answer_multimodal_encoder = ALBEFMultimodalEncoder(
        **config["multimodal_encoder_args"]
    )
    prediction_head = PredictionHead(**config["prediction_head_args"])
    albef_model = ALBEFModel(vision_encoder, text_encoder, question_multimodal_encoder)
    decoder = ALBEFDecoder(text_embeddings, answer_multimodal_encoder, prediction_head)
    loss = CausalLanguageModelingLoss()
    model = ALBEFModelForVQA(albef_model, decoder, loss)

    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            _ALBEF_PRETRAINED_URLS["vqa"], map_location="cpu"
        )
        model.load_state_dict(checkpoint)
    return model


def albef_model_for_retrieval(
    config: Dict[str, Any], pretrained: bool = False
) -> ALBEFModelForRetrieval:
    vision_encoder = ALBEFVisionEncoder(**config["vision_encoder_args"])
    text_encoder = bert_text_encoder(**config["text_encoder_args"])
    multimodal_encoder = ALBEFMultimodalEncoder(**config["multimodal_encoder_args"])
    vision_proj = nn.Linear(**config["projection_args"])
    text_proj = nn.Linear(**config["projection_args"])

    albef_model = ALBEFModel(vision_encoder, text_encoder, multimodal_encoder)
    albef_model_with_sim = ALBEFModelWithSimilarity(
        albef_model, vision_proj, text_proj, **config["similarity_args"]
    )
    itc_loss = ImageTextContrastiveLoss()

    model = ALBEFModelForRetrieval(
        albef_model_with_sim, itc_loss, config["hidden_size"]
    )

    if pretrained:
        checkpoint = torch.hub.load_state_dict_from_url(
            _ALBEF_PRETRAINED_URLS["retrieval"], map_location="cpu"
        )
        model.load_state_dict(checkpoint)
    return model
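A minimal usage sketch for the two builder functions above, assuming a config dict that supplies the argument groups they unpack (vision_encoder_args, text_encoder_args, multimodal_encoder_args, and so on). The config path and the choice of ruamel.yaml as the loader are illustrative assumptions, not something model.py fixes.

# Minimal usage sketch (illustrative, not part of the committed files).
# The path "configs/vqa.yaml" and the ruamel.yaml loader are assumptions.
from ruamel.yaml import YAML

from model import albef_model_for_vqa

with open("configs/vqa.yaml") as f:  # hypothetical config path
    vqa_config = YAML(typ="safe").load(f)

vqa_model = albef_model_for_vqa(vqa_config, pretrained=True)
vqa_model.eval()
# With is_train=False, forward() dispatches to _eval_forward and returns the ids
# and probabilities of the top-k candidate answers for each question.
# albef_model_for_retrieval(config, pretrained=True) works the same way with the
# retrieval config and checkpoint.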
requirements.txt
ADDED
@@ -0,0 +1,5 @@
opencv-python==4.6.0.66
pytorch-lightning==1.6.0
Pillow==9.0.1
ruamel_yaml==0.17.21
transformers==4.24.0
utils.py
ADDED
@@ -0,0 +1,127 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import json
import os
from typing import Any, Dict, List

import torch
import torch.distributed as dist
from torch import nn


def setup_for_distributed(is_master):
    """
    This function disables printing when not in the master process
    """
    import builtins as __builtin__

    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop("force", False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def is_dist_avail_and_initialized():
    if not dist.is_available():
        return False
    if not dist.is_initialized():
        return False
    return True


def get_world_size():
    if not is_dist_avail_and_initialized():
        return 1
    return dist.get_world_size()


def get_rank():
    if not is_dist_avail_and_initialized():
        return 0
    return dist.get_rank()


def is_main_process():
    return get_rank() == 0


def init_distributed_mode(args):
    if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
        args["rank"] = int(os.environ["RANK"])
        args["world_size"] = int(os.environ["WORLD_SIZE"])
        args["gpu"] = int(os.environ["LOCAL_RANK"])
    elif "SLURM_PROCID" in os.environ:
        args["rank"] = int(os.environ["SLURM_PROCID"])
        args["gpu"] = args["rank"] % torch.cuda.device_count()
    else:
        print("Not using distributed mode")
        args["distributed"] = False
        return

    args["distributed"] = True

    torch.cuda.set_device(args["gpu"])
    args["dist_backend"] = "nccl"
    print(
        "| distributed init (rank {}): {}".format(args["rank"], args["dist_url"]),
        flush=True,
    )
    torch.distributed.init_process_group(
        backend=args["dist_backend"],
        init_method=args["dist_url"],
        world_size=args["world_size"],
        rank=args["rank"],
    )
    torch.distributed.barrier()
    setup_for_distributed(args["rank"] == 0)


def save_result(result, directory, file_name):
    rank_path = os.path.join(directory, "{}_rank_{}.json".format(file_name, get_rank()))
    main_path = os.path.join(directory, "{}.json".format(file_name))
    json.dump(result, open(rank_path, "w"))

    if is_dist_avail_and_initialized():
        dist.barrier()

    if is_main_process():
        result = []
        for rank in range(get_world_size()):
            rank_path = os.path.join(
                directory, "{}_rank_{}.json".format(file_name, rank)
            )
            rank_res = json.load(open(rank_path, "r"))
            result += rank_res
        json.dump(result, open(main_path, "w"))

    if is_dist_avail_and_initialized():
        dist.barrier()


def add_weight_decay(model: nn.Module, weight_decay: float) -> List[Dict[str, Any]]:
    decay = []
    no_decay = []
    for name, param in model.named_parameters():
        if not param.requires_grad:
            continue  # skip weight_decay for momentum models
        if len(param.shape) == 1 or name.endswith(".bias"):
            no_decay.append(param)
        else:
            decay.append(param)
    return [
        {"params": no_decay, "weight_decay": 0.0},
        {"params": decay, "weight_decay": weight_decay},
    ]
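The helpers in utils.py are consumed by the finetuning scripts; in particular, add_weight_decay returns optimizer parameter groups. A minimal sketch of its intended use, with a toy model and illustrative hyperparameters:

# Minimal sketch of add_weight_decay usage (toy model and hyperparameters are
# illustrative): biases and other 1-d parameters go into a group with no weight
# decay, everything else gets the requested decay, and the groups are passed
# straight to the optimizer.
import torch
from torch import nn

from utils import add_weight_decay

toy_model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 2))
param_groups = add_weight_decay(toy_model, weight_decay=0.02)
optimizer = torch.optim.AdamW(param_groups, lr=2e-5)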
vqa_data.json
ADDED
@@ -0,0 +1 @@
[{"image": "images/COCO_val2014_000000184359.jpg", "question": "Is this a train station?", "answers": ["no", "no", "no", "no", "no", "no", "no", "no", "no", "no"]}, {"image": "images/COCO_val2014_000000407072.jpg", "question": "Was this photo taken at night?", "answers": ["yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes", "yes"]}, {"image": "images/COCO_val2014_000000111207.jpg", "question": "How many photos in one?", "answers": ["2", "2", "2", "2", "2", "2", "2", "2", "2", "2"]}, {"image": "images/COCO_val2014_000000057222.jpg", "question": "How many bears are there?", "answers": ["2", "3", "3", "4", "2", "2", "3", "3", "2", "3"]}, {"image": "images/COCO_val2014_000000159269.jpg", "question": "What time of the day it is?", "answers": ["evening", "evening", "dusk", "sunset", "sunset", "dusk", "morning", "dusk", "evening", "4 pm"]}, {"image": "images/COCO_val2014_000000026348.jpg", "question": "What color is the refrigerator handle?", "answers": ["white", "white", "white", "white", "white", "white", "white", "white", "white", "white"]}, {"image": "images/COCO_val2014_000000473994.jpg", "question": "What does this animal eat?", "answers": ["meat", "dog food", "dog food", "dog food", "dog food", "dog food", "frisbee", "dog food", "frisbee", "dog food"]}, {"image": "images/COCO_val2014_000000552075.jpg", "question": "Who is wearing a hat?", "answers": ["no one", "woman", "no one", "nobody", "no one", "nobody", "no", "nobody", "nobody", "man"]}]
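vqa_data.json holds the demo annotations: each record pairs an image path with a question and the ten human answers collected for it. A minimal sketch for reading it (using the most common answer as the reference is just one simple choice):

# Minimal sketch for reading the demo annotations above; picking the most
# common of the ten human answers is one way to get a reference answer.
import json
from collections import Counter

with open("vqa_data.json") as f:
    samples = json.load(f)

for sample in samples:
    majority_answer, _ = Counter(sample["answers"]).most_common(1)[0]
    print(sample["image"], "|", sample["question"], "->", majority_answer)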