yfan07 committed
Commit a47e733 · verified · Parent: f1106d1

Add files using upload-large-folder tool

Files changed (35)
  1. ChatUniVi/eval/questions/video_qa/msvd_qa.json +0 -0
  2. ChatUniVi/eval/questions/video_qa/temporal_qa.json +0 -0
  3. ChatUniVi/eval/questions/video_qa/tgif_a_list.json +1309 -0
  4. ChatUniVi/eval/questions/video_qa/tgif_qa.json +0 -0
  5. ChatUniVi/eval/table/caps_boxes_coco2014_val_80.jsonl +80 -0
  6. ChatUniVi/eval/table/model.jsonl +5 -0
  7. ChatUniVi/eval/table/question.jsonl +80 -0
  8. ChatUniVi/eval/table/reviewer.jsonl +4 -0
  9. ChatUniVi/eval/table/rule.json +11 -0
  10. ChatUniVi/model/__init__.py +1 -0
  11. ChatUniVi/model/apply_delta.py +44 -0
  12. ChatUniVi/model/arch.py +652 -0
  13. ChatUniVi/model/builder.py +118 -0
  14. ChatUniVi/model/cluster.py +287 -0
  15. ChatUniVi/model/consolidate.py +29 -0
  16. ChatUniVi/model/dataloader.py +67 -0
  17. ChatUniVi/model/language_model/language_model/configuration_phi.py +62 -0
  18. ChatUniVi/model/language_model/language_model/modeling_phi.py +984 -0
  19. ChatUniVi/model/language_model/llama.py +136 -0
  20. ChatUniVi/model/language_model/phi.py +142 -0
  21. ChatUniVi/model/make_delta.py +52 -0
  22. ChatUniVi/model/multimodal_encoder/builder.py +14 -0
  23. ChatUniVi/model/multimodal_encoder/clip_encoder.py +83 -0
  24. ChatUniVi/model/multimodal_encoder/eva_encoder.py +81 -0
  25. ChatUniVi/model/multimodal_encoder/eva_vit.py +448 -0
  26. ChatUniVi/model/multimodal_encoder/processor.py +68 -0
  27. ChatUniVi/model/multimodal_encoder/utils.py +137 -0
  28. ChatUniVi/model/multimodal_projector/builder.py +52 -0
  29. ChatUniVi/train/llama_flash_attn_monkey_patch.py +124 -0
  30. ChatUniVi/train/train.py +1232 -0
  31. ChatUniVi/train/train_mem.py +13 -0
  32. ChatUniVi/train/trainer.py +53 -0
  33. configs/__init__.py +1 -0
  34. configs/config.py +84 -0
  35. data/metadata.csv +0 -0
ChatUniVi/eval/questions/video_qa/msvd_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/temporal_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/questions/video_qa/tgif_a_list.json ADDED
@@ -0,0 +1,1309 @@
1
+ [
2
+ "cookie",
3
+ "? machine",
4
+ "two",
5
+ "glasses",
6
+ "black",
7
+ "tail",
8
+ "red",
9
+ "flowers",
10
+ "laptop",
11
+ "three",
12
+ "white",
13
+ "green",
14
+ "? boat",
15
+ "blue",
16
+ "? room",
17
+ "brown",
18
+ "cat",
19
+ "picture",
20
+ "drink",
21
+ "cigarette",
22
+ "clock",
23
+ "car",
24
+ "monkey",
25
+ "guitar",
26
+ "purple",
27
+ "? kitchen",
28
+ "? mirror",
29
+ "meal",
30
+ "four",
31
+ "? tank",
32
+ "? classroom",
33
+ "dog",
34
+ "pipe",
35
+ "leaf",
36
+ "shirt",
37
+ "champagne",
38
+ "string",
39
+ "sweater",
40
+ "? studio",
41
+ "tortoise",
42
+ "and one of them is holding ? dog",
43
+ "rings",
44
+ "vehicles",
45
+ "lollipop",
46
+ "candy",
47
+ "bottle",
48
+ "then a man is shown sitting . ? locker",
49
+ "parakeets",
50
+ "hole",
51
+ "tie",
52
+ "boat",
53
+ "ball",
54
+ "cash",
55
+ "chicken",
56
+ "? street",
57
+ "bird",
58
+ "six",
59
+ "? pool",
60
+ "window",
61
+ "round",
62
+ "instrument",
63
+ "puppy",
64
+ "doorway",
65
+ "juice",
66
+ "flamethrower",
67
+ "gray",
68
+ "dress",
69
+ "hat",
70
+ "kitten",
71
+ "gun",
72
+ "cars",
73
+ "paws",
74
+ "elephant",
75
+ "beam",
76
+ "? chair",
77
+ "chimp",
78
+ "one",
79
+ "butt",
80
+ "mascara",
81
+ "dogs",
82
+ "puppet",
83
+ "hamster",
84
+ "? bedroom",
85
+ "who pretends to slap him in return ? crack",
86
+ "machine",
87
+ "drops",
88
+ "then he removes and throws it to the ground ? hat",
89
+ "when two of the cyclist crash ? bicycles",
90
+ "cannabis",
91
+ "? trap",
92
+ "helmet",
93
+ "motorcycle",
94
+ "purses",
95
+ "bank",
96
+ "orange",
97
+ "guitars",
98
+ "? crib",
99
+ "hedgehog",
100
+ "? hallway",
101
+ "? car",
102
+ "steps",
103
+ "horse",
104
+ "? bath",
105
+ "drawer",
106
+ "cats",
107
+ "duck",
108
+ "wearing , reads a piece of paper on a desk and then raises his head ? glasses",
109
+ "phone",
110
+ "pillow",
111
+ "cup",
112
+ "he has food in front of him . ? chair",
113
+ "surfboard",
114
+ "before one of them climbs from the ring ? two",
115
+ "dancing , and clapping ? four",
116
+ "pool",
117
+ "motorcycles",
118
+ "pictures",
119
+ "? star",
120
+ "clipboard",
121
+ "paw",
122
+ "kiss ? two",
123
+ "turtle",
124
+ "when one touches the other on the shoulder ? two",
125
+ "? house",
126
+ "five",
127
+ "locker",
128
+ "tree",
129
+ "bat",
130
+ "popcorn",
131
+ "broom",
132
+ "guns",
133
+ "paint",
134
+ "seat",
135
+ "and then they run away ? heels",
136
+ "flags",
137
+ "dice",
138
+ "? library",
139
+ "yellow",
140
+ "chair",
141
+ "door",
142
+ "? warehouse",
143
+ "kick it and fall over ? tire",
144
+ "jacket",
145
+ "wire",
146
+ "crow",
147
+ "motions",
148
+ "bubbles",
149
+ "vehicle",
150
+ "wearing and speaking ? necklace",
151
+ "one is dressed funny , look at each other ? two",
152
+ "mice",
153
+ "clothing",
154
+ "bread",
155
+ "fireworks",
156
+ "microphone",
157
+ "mascot",
158
+ "? booth",
159
+ "wolf",
160
+ "? foyer",
161
+ "driver",
162
+ "cylinder",
163
+ "on top of his food bowl ? dog",
164
+ "rabbit",
165
+ "? office",
166
+ "treadmill",
167
+ "cap",
168
+ "tire",
169
+ "stick",
170
+ "is laying and opening her eyes . ? bed",
171
+ "stairs",
172
+ "drums",
173
+ "bar",
174
+ "? bed",
175
+ "spoons",
176
+ "? lab",
177
+ "headphones",
178
+ "one is . ? basket",
179
+ "makeup",
180
+ "frogs",
181
+ "wine",
182
+ "two men sit on a sofa and a man dances along a red carpet ? rectangle",
183
+ "sauce",
184
+ "airplane",
185
+ "and he is playing ? guitar",
186
+ "fox",
187
+ "costume",
188
+ "slide",
189
+ "stamp",
190
+ "butts",
191
+ "? window",
192
+ "rope",
193
+ "receiver",
194
+ "then the dog turns around crazy ? butt",
195
+ "and one talks to someone . ? room",
196
+ "? aisle",
197
+ "headset",
198
+ "horses",
199
+ "handgun",
200
+ "bear",
201
+ "napkin",
202
+ "? bottle",
203
+ "frog",
204
+ "wearing , animal print pants and pink shoes is dancing on a sidewalk ? shirt",
205
+ "bicycle",
206
+ "button",
207
+ "panda",
208
+ "turtles",
209
+ "but keeps flying ? airplane",
210
+ "? headset",
211
+ "lobby",
212
+ "pelican",
213
+ "dive",
214
+ "? cage",
215
+ "dishes",
216
+ "wagon",
217
+ "seven",
218
+ "? bag",
219
+ "butterfly",
220
+ "flask",
221
+ "banana",
222
+ "flasks",
223
+ "bus",
224
+ "device",
225
+ "is riding through the house ? bicycle",
226
+ "bright lightning ? sky",
227
+ "umbrellas",
228
+ "yawns then puts out its paw and pushes a jar off onto the floor ? cat",
229
+ "skateboard",
230
+ "cupcakes",
231
+ "shoe",
232
+ "cloak",
233
+ "apple",
234
+ "wall",
235
+ "horns",
236
+ "trick",
237
+ "date",
238
+ "he is talking to a woman ? beer",
239
+ "hill",
240
+ "? bar",
241
+ "pieces",
242
+ "stars",
243
+ "and the bowl disappears ? dog",
244
+ "bridge",
245
+ "box",
246
+ "with one of them embracing the other from behind ? two",
247
+ "piano",
248
+ "? hall",
249
+ "coffee",
250
+ "peel",
251
+ "cutter",
252
+ "circle",
253
+ "sunglasses",
254
+ "star",
255
+ "? pen",
256
+ "they move slowly ? stairs",
257
+ "kitty",
258
+ "pen",
259
+ "owl",
260
+ "puppies",
261
+ "fish",
262
+ "keyboard",
263
+ "underwear",
264
+ "? gym",
265
+ "pigeon",
266
+ "retriever",
267
+ "masks",
268
+ "kangaroo",
269
+ "close",
270
+ "shorts",
271
+ "band",
272
+ "swimming",
273
+ "? plate",
274
+ "then another man reaches for it ? gun",
275
+ "face",
276
+ "ferret",
277
+ "drug",
278
+ "clothes",
279
+ "spoon",
280
+ "hurdle",
281
+ "grass",
282
+ "? paint",
283
+ "airplanes",
284
+ "talks",
285
+ "whose lights flash on ? flower",
286
+ "with one drumming ? instruments",
287
+ "? bowl",
288
+ "burger",
289
+ "llama",
290
+ "it licks its lips ? horse",
291
+ "? holder",
292
+ "camel",
293
+ "dancing",
294
+ "umbrella",
295
+ "pants",
296
+ "ducklings",
297
+ "mug",
298
+ "necklace",
299
+ "track",
300
+ "smoking and turning her head ? cigarette",
301
+ "ladder",
302
+ "cliff",
303
+ "shirts",
304
+ "shark",
305
+ "is playing ? ukulele",
306
+ "turns",
307
+ "? ball",
308
+ "scooter",
309
+ "? box",
310
+ "? road",
311
+ "cover",
312
+ ". ? cage",
313
+ "backhoe",
314
+ "bed",
315
+ "and she is holding up ? puppet",
316
+ "? two",
317
+ "goblet",
318
+ "is using and smoking a cigarette ? phone",
319
+ "wearing coats , is hugging . ? hallway",
320
+ "but he misses ? ball",
321
+ "diver",
322
+ "? nightclub",
323
+ "they both smile ? round",
324
+ "medic",
325
+ "? stick",
326
+ "train",
327
+ "? microphone",
328
+ "cigar",
329
+ "wearing , comes through a door held open by another man ? suit",
330
+ "wheel",
331
+ "lions",
332
+ "tights",
333
+ "racetrack",
334
+ "one picks up the other and carries him ? two",
335
+ "sun",
336
+ "? floor",
337
+ "beer",
338
+ "berries",
339
+ "mask",
340
+ "heels",
341
+ "decorator",
342
+ "cub",
343
+ "breakfast",
344
+ ". ? chair",
345
+ "then looks away ? monkey",
346
+ "? bucket",
347
+ "snack",
348
+ "girl",
349
+ "suspenders",
350
+ "toy",
351
+ "elephants",
352
+ "boar",
353
+ "bubble",
354
+ "falls off and he grabs it ? hat",
355
+ "trunk",
356
+ "and one of them climbs from one to the other ? frogs",
357
+ "floor",
358
+ "belt",
359
+ "octopus",
360
+ "? dish",
361
+ "truck",
362
+ "snowmobile",
363
+ "standing in the dark , wears ? dress",
364
+ "? bathtub",
365
+ "trees",
366
+ "? mall",
367
+ "bow",
368
+ "beat to the rhythm ? sticks",
369
+ "? store",
370
+ "but stops him ? rope",
371
+ "pug",
372
+ "headgear",
373
+ "tubes",
374
+ "dance",
375
+ "pandas",
376
+ "iguana",
377
+ "concert",
378
+ "dandelion",
379
+ "? garden",
380
+ "queen",
381
+ "instruments",
382
+ "tricycle",
383
+ "racing",
384
+ "? garage",
385
+ "horn",
386
+ "entrance",
387
+ "can",
388
+ "chimpanzee",
389
+ "but the bear cub does ? bear",
390
+ "glass",
391
+ "birds",
392
+ "screaming and pointing ? two",
393
+ "robot",
394
+ "sky",
395
+ "egg",
396
+ "moth",
397
+ "backpack",
398
+ "beverages",
399
+ "bouquet",
400
+ "trumpet",
401
+ "carpet",
402
+ "? apartment",
403
+ "pony",
404
+ "goat",
405
+ "headdress",
406
+ "and he is removing ? hat",
407
+ "house",
408
+ "suit",
409
+ "gum",
410
+ "curb",
411
+ "and then leaves it ? car",
412
+ "snake",
413
+ "he looks at his passenger who is sleeping ? car",
414
+ "? bow-tie",
415
+ "wig",
416
+ "raising a cloud of dust ? car",
417
+ "freezer",
418
+ "delivering , and signing ? flowers",
419
+ "skis",
420
+ "road",
421
+ "deal",
422
+ "ship",
423
+ "? bathroom",
424
+ "bills",
425
+ "piece",
426
+ "items fall out and she makes a face ? door",
427
+ "drinks",
428
+ "dives , . ? cafeteria",
429
+ "goggles",
430
+ "? wagon",
431
+ "man",
432
+ "cups",
433
+ "dolphin",
434
+ "card",
435
+ "building",
436
+ "trunks",
437
+ "liquor",
438
+ "scarf",
439
+ "squash",
440
+ "cheese",
441
+ "then the snake kisses her ? snake",
442
+ "dances seductively ? dress",
443
+ "sword",
444
+ "kiss",
445
+ "possum",
446
+ "stockings",
447
+ "? tray",
448
+ "the one man yells ? two",
449
+ "and she is playing ? guitar",
450
+ "? alley",
451
+ "also wearing ? helmet",
452
+ "beverage",
453
+ "weapon",
454
+ "rodent",
455
+ "beach",
456
+ "? cereals",
457
+ "bench",
458
+ "with two holding glass bottles with colored liquid ? five",
459
+ "holding , jumps in the air and then moves to the back of stage ? guitar",
460
+ "transportation",
461
+ "shampoo",
462
+ "caps",
463
+ "hook",
464
+ "squirrel",
465
+ "scenery",
466
+ "playing",
467
+ "? wheelchair",
468
+ "performer",
469
+ "cake",
470
+ "dancing and playing ? instruments",
471
+ "boxes",
472
+ "leash",
473
+ "? bouquet",
474
+ "but only one arm is . ? sleeve",
475
+ "rifles",
476
+ "lenses",
477
+ "the girl watches him . ? building",
478
+ "almonds",
479
+ "tank",
480
+ "pot",
481
+ "bracelet",
482
+ "knife",
483
+ "mouse",
484
+ "who then catches it ? bottle",
485
+ "exercise",
486
+ "and he is turning around ? wand",
487
+ "purse",
488
+ "stones",
489
+ "show",
490
+ "bag",
491
+ "stocking",
492
+ "balloon",
493
+ "stops , and its tongue remains stuck out ? cat",
494
+ "scythe",
495
+ "creature",
496
+ "cello",
497
+ "and ends up on its back ? bird",
498
+ "pup",
499
+ "? container",
500
+ "and one blows a kiss ? two",
501
+ "animal",
502
+ "trampoline",
503
+ "before they turn and walk away ? two",
504
+ "cloaks",
505
+ "blackjack",
506
+ "as they hit fist to fist ? two",
507
+ "bicycles",
508
+ "watch",
509
+ "corgi",
510
+ "spider",
511
+ "earring",
512
+ "bull",
513
+ "? wheel",
514
+ "? stadium",
515
+ "looking at each other ? two",
516
+ "foxes",
517
+ "mammal",
518
+ "sheep",
519
+ "chases",
520
+ "? armchair",
521
+ ". ? room",
522
+ "dancing , and playing ? instruments",
523
+ "which then falls backwards ? cat",
524
+ "dancer",
525
+ "boots",
526
+ "rotors",
527
+ "? ranch",
528
+ "? shower",
529
+ "paper , scissors as they stand by the door ? two",
530
+ "laying and crying on her pillow . ? bed",
531
+ "pencil",
532
+ "when one side scores a goal ? two",
533
+ "food",
534
+ "one with an arm on the other ? two",
535
+ "sheets",
536
+ "rabbits",
537
+ "pizza",
538
+ "? glove",
539
+ "table",
540
+ "scratched",
541
+ "syrup",
542
+ "cone",
543
+ "while the larger man breaks up the fight ? two",
544
+ "drives",
545
+ "luggage",
546
+ "? vehicle",
547
+ "lift",
548
+ "frame",
549
+ "shoes",
550
+ "opens the door , and the cat and four dogs enter through the door ? building",
551
+ "blinks",
552
+ "crotch",
553
+ "dishwasher",
554
+ "skills",
555
+ "sleeves",
556
+ "model",
557
+ "ties",
558
+ "modeling",
559
+ "bath",
560
+ "jet",
561
+ "tortillas",
562
+ "teapot",
563
+ "barbel",
564
+ "cartwheel",
565
+ "musician",
566
+ "rhino",
567
+ "exits",
568
+ "pole",
569
+ "ski",
570
+ "pajama",
571
+ "woodchucks",
572
+ "lanes",
573
+ "candle",
574
+ "tag",
575
+ "gloves",
576
+ "dinosaur",
577
+ "surface",
578
+ "? tub",
579
+ "snowboard",
580
+ "wearing , hops around her couch while pointing at her face ? glasses",
581
+ "donut",
582
+ "mustard",
583
+ "? tunnel",
584
+ "? theater",
585
+ "wheels",
586
+ "rat",
587
+ "and one talks to someone ? two",
588
+ "bungee",
589
+ "but then suddenly takes off again ? jet",
590
+ "? rink",
591
+ "face shown . ? mirror",
592
+ "shell",
593
+ "costumes",
594
+ "? shield",
595
+ "confetti",
596
+ "flower",
597
+ "gesture",
598
+ "portfolio",
599
+ "and moves from under him ? ball",
600
+ "violin",
601
+ "photographs",
602
+ "uniforms",
603
+ "money",
604
+ "bomb",
605
+ "? rv",
606
+ "claws",
607
+ "lands",
608
+ "turnstile",
609
+ "bot",
610
+ "hose",
611
+ "suitcase",
612
+ "sitting on a table , reaches out and pushes a glass off the table ? paw",
613
+ "mountain",
614
+ "tools",
615
+ "headsets",
616
+ "the streets crumble below it ? airplane",
617
+ "t-shirt",
618
+ "doors",
619
+ "wearing , hugs another person and smiles ? glasses",
620
+ "one of them is shaking his head . ? car",
621
+ "octopuses",
622
+ "performs",
623
+ "cases",
624
+ "deer",
625
+ "? wall",
626
+ "and holding a lighter underneath , it explodes in flames ? balloon",
627
+ "blanket",
628
+ "coat",
629
+ "knives",
630
+ "? frame",
631
+ "trolley",
632
+ "noodles",
633
+ "one cries and holds a handkerchief to his nose , the other tries to comfort him ? two",
634
+ "wrap",
635
+ "? cart",
636
+ "inside of the car get scared ? two",
637
+ "animals",
638
+ "tails",
639
+ "? drawer",
640
+ "? cigarette",
641
+ "? barbel",
642
+ "room",
643
+ "? building",
644
+ "using as a weapon , hits a zombie in the head ? bat",
645
+ "trucks",
646
+ "boxers",
647
+ "drum",
648
+ "challenge",
649
+ "? toilet",
650
+ "llamas",
651
+ "then watches the smoke rise ? cat",
652
+ "mouths from across a room ? two",
653
+ "and it is pushed by a cat ? box",
654
+ "but the bear cub does ? bird",
655
+ "? skateboard",
656
+ "lifts up to her mouth , ? microphone",
657
+ "wearing , talks and bends his head forward ? cap",
658
+ "? doorway",
659
+ "which causes that cat to attack another cat ? cat",
660
+ "giraffe",
661
+ "cam",
662
+ "microphones",
663
+ "losing balance as it tries to walk forward ? cat",
664
+ "groove",
665
+ "tricks",
666
+ "spins , and lands on another ramp ? car",
667
+ "dumbbell",
668
+ "with their arms out , while laughing ? three",
669
+ "sea",
670
+ "carrot",
671
+ "chips",
672
+ "gift",
673
+ "ropes",
674
+ "singer",
675
+ "rocket",
676
+ "? net",
677
+ "blows",
678
+ "? zipper",
679
+ "sticks",
680
+ "tambourine",
681
+ "and he is laughing at a puppet talking ? cookie",
682
+ "? train",
683
+ "boats",
684
+ "across a road , and into the path of a car before being hit ? bicycle",
685
+ "penguins",
686
+ "song",
687
+ "antlers",
688
+ "feather",
689
+ "handcuffs",
690
+ "insect",
691
+ "gratings",
692
+ "milk",
693
+ "blackbird",
694
+ "scaffolding",
695
+ "sheet",
696
+ "seal",
697
+ "which bursts as the car approaches it ? car",
698
+ "? locker",
699
+ "towels",
700
+ "? highway",
701
+ "? lane",
702
+ "? rope",
703
+ "wearing , is singing with a microphone ? dress",
704
+ "vegetables",
705
+ "rag",
706
+ "? hoop",
707
+ "? hospital",
708
+ "keys",
709
+ "and he is raising his arm ? crotch",
710
+ "otter",
711
+ "? corridor",
712
+ "tires",
713
+ "they see it from looking up ? window",
714
+ "trainer",
715
+ "groundhog",
716
+ "gorilla",
717
+ "is sitting on the steps and eating ? shirt",
718
+ "oar",
719
+ "nugget",
720
+ "? cellphone",
721
+ "hamsters",
722
+ "walls",
723
+ "? cup",
724
+ "and then starts wracking it FRAMEQAeatedly ? wand",
725
+ "concoction",
726
+ "computer",
727
+ "hall",
728
+ "one is licking the other ones ear ? cats",
729
+ "earphone",
730
+ "hallway",
731
+ "trailer",
732
+ "magazine",
733
+ "and pointing at it ? laptop",
734
+ "elevator",
735
+ "river",
736
+ "pig",
737
+ "is also using ? earring",
738
+ "case",
739
+ "cape",
740
+ "? tablet",
741
+ "beanie",
742
+ "penguin",
743
+ "race",
744
+ "? excitedly",
745
+ "groomed each other ? cats",
746
+ "carriage",
747
+ "with long hair , open her mouth . ? room",
748
+ "parakeet",
749
+ "call",
750
+ "? tire",
751
+ "windshield",
752
+ "nose",
753
+ "? capsule",
754
+ "woman",
755
+ "snowball",
756
+ "look at one another , and fall to the ground laughing ? three",
757
+ "wing",
758
+ "bowl",
759
+ "lipstick",
760
+ "who is looking upset ? one",
761
+ "balls",
762
+ "cage",
763
+ "sunroof",
764
+ "? shop",
765
+ "shining and wearing a yellow outfit ? microphone",
766
+ "then two of them wave goodbye ? three",
767
+ "? sunglasses",
768
+ "kittens",
769
+ "? lingerie",
770
+ "colors",
771
+ "crying and eating a sandwich . ? bed",
772
+ "? lapel",
773
+ "corn",
774
+ "twirl",
775
+ "dough",
776
+ "dock",
777
+ "taxi",
778
+ "singing",
779
+ "stares",
780
+ "skate",
781
+ "chick",
782
+ "is visiting another guy . ? hospital",
783
+ "comb",
784
+ "roll",
785
+ "runway",
786
+ "statue",
787
+ "rides a skateboard up and launches himself through the air ? ramp",
788
+ "bleachers",
789
+ "? pot",
790
+ "butter",
791
+ "and it bounces off of a wall onto a table ? cat",
792
+ "? basement",
793
+ "eyeliner",
794
+ "wearing , is waving his hand ? shirt",
795
+ "opens the door , and the cat and four dogs enter the building through the door ? cat",
796
+ "right",
797
+ "flashlights",
798
+ "pet",
799
+ "pastry",
800
+ "but then the trailing car is shown a weapon and the car falls back ? car",
801
+ "tuxedo",
802
+ "begins to flip over and over ? car",
803
+ "curtain",
804
+ "fork",
805
+ "he looks away ? guitar",
806
+ "roof",
807
+ "? restroom",
808
+ "who jumps away . ? box",
809
+ "? rag",
810
+ "wearing , talks and raises on eyebrow ? headband",
811
+ "? cloak",
812
+ "then the rider lands on top ? motorcycle",
813
+ "toys",
814
+ "are talking to each other ? two",
815
+ "rats",
816
+ "telephone",
817
+ "bananas",
818
+ "user",
819
+ "stops and gets in ? taxi",
820
+ "cane",
821
+ "bucket",
822
+ "popsicle",
823
+ "? tent",
824
+ "? oven",
825
+ "and the fired a shot ? flower",
826
+ "? broom",
827
+ "? pan",
828
+ "design",
829
+ "hippopotamus",
830
+ "they move to the left ? sky",
831
+ "trying not to laugh ? two",
832
+ "torch",
833
+ "they look at one another , and the woman exits the car . ? car",
834
+ "his head nods to the left . ? chair",
835
+ "and he had a bandage on his head . ? car",
836
+ "vegetable",
837
+ "and everyone celebrates ? star",
838
+ "balloons",
839
+ "men",
840
+ "circles",
841
+ "graffiti",
842
+ "racer",
843
+ "jump",
844
+ "kissing , and spinning around ? two",
845
+ "works",
846
+ "castle",
847
+ "while they are sitting down ? two",
848
+ "sandwich",
849
+ "earpiece",
850
+ "then lift ? shirt",
851
+ "motors",
852
+ "burrito",
853
+ "? singlet",
854
+ "180",
855
+ "? dryer",
856
+ "torches",
857
+ "? pullover",
858
+ "wearing , slides open a door and dances through while carrying a walking tick and radio ? glasses",
859
+ "straw",
860
+ "wearing , pushes a melting ice cream into his mouth as some drops from his hand ? cap",
861
+ "clown",
862
+ "smiles , and turns away . ? classroom",
863
+ "figure",
864
+ "white doll ? two",
865
+ "signs",
866
+ "? airplane",
867
+ "cannon",
868
+ "cloth",
869
+ "serviette",
870
+ "toast",
871
+ "? kit",
872
+ "bats",
873
+ "bobcat",
874
+ "griddle",
875
+ "leaves",
876
+ "pass",
877
+ "? door",
878
+ "ramp",
879
+ "porpoise",
880
+ "scissors",
881
+ "fighter",
882
+ "bandannas",
883
+ "bases",
884
+ "hug each other ? two",
885
+ "duckling",
886
+ "but grabs on and takes a drink ? monkey",
887
+ "winks",
888
+ "? jeep",
889
+ "twirls",
890
+ "harp",
891
+ "one points and talks and the other laughs ? two",
892
+ "then a redhead grabs ? hat",
893
+ "? zoo",
894
+ "tender",
895
+ "disc",
896
+ "fly",
897
+ "wash",
898
+ "harness",
899
+ "opening",
900
+ "brick",
901
+ "watermelon",
902
+ "plate",
903
+ "they bring it closer to their body ? stick",
904
+ "lake",
905
+ "sledgehammer",
906
+ "leaning backward , and waving their arms back and forth ? two",
907
+ "ocean",
908
+ "while spectators watch ? two",
909
+ "shuttle",
910
+ "loop",
911
+ "balcony",
912
+ "? closet",
913
+ "but falls off a table ? cat",
914
+ "anchor",
915
+ "? plaid",
916
+ "terrapins",
917
+ "pop",
918
+ "tool",
919
+ "hay",
920
+ "panther",
921
+ "smiling and laughing ? three",
922
+ "and it lands on his head ? hat",
923
+ "? fountain",
924
+ "photograph",
925
+ "it has a double yolk ? egg",
926
+ "one is in a basket ? dogs",
927
+ "but does ? cub",
928
+ "strips",
929
+ "jeep",
930
+ "when the toaster pops out toast the cat gets scared and jumps off ? cat",
931
+ "then turns around crazy ? dog",
932
+ "goldfish",
933
+ "? elevator",
934
+ "sedan",
935
+ "? pocket",
936
+ "planet",
937
+ "drill",
938
+ "two of them spinning around ? cars",
939
+ "baboon",
940
+ "mirror",
941
+ "? flowers",
942
+ "chairs",
943
+ "make in the air with a wand ? float",
944
+ "jewelry",
945
+ "fabric",
946
+ "coins",
947
+ "handset",
948
+ "jets",
949
+ "bulldog",
950
+ "black hair wearing and raising their hand up to their mouth ? shirt",
951
+ "sweatshirt",
952
+ "workout",
953
+ "rounds",
954
+ "? bench",
955
+ "? piece",
956
+ "sparklers",
957
+ "waterfall",
958
+ "lettuce",
959
+ "crashes",
960
+ "tomato",
961
+ "cheeseburger",
962
+ "strawberry",
963
+ "and another one appears to be . ? garden",
964
+ "flag",
965
+ "eight",
966
+ "toothpick",
967
+ "and disappears ? bowl",
968
+ "? lipstick",
969
+ "and she is smiling ? cat",
970
+ "? alleyway",
971
+ "shield",
972
+ "tuxedos",
973
+ "talking , smiling and waving his hand . ? chair",
974
+ "cheetah",
975
+ "and one player kicks into the goal ? ball",
976
+ "letters",
977
+ "? basket",
978
+ "pill",
979
+ "which trips another man who does a flip and lands on a recycle bin ? peel",
980
+ "human",
981
+ "fence",
982
+ "? sink",
983
+ "black leather trench coat ? star",
984
+ "divers",
985
+ "couch",
986
+ "buttons",
987
+ "shot",
988
+ "rodents",
989
+ "swords",
990
+ "gown",
991
+ "both speeding down the road ? car",
992
+ "people watch them . ? house",
993
+ "belts",
994
+ "catapult",
995
+ "ammunition",
996
+ "potatoes",
997
+ "lemur",
998
+ "while a third moves forward and dances ? two",
999
+ "then their hand and a slogan appears ? towel",
1000
+ "firecrackers",
1001
+ "ribs",
1002
+ "briefcase",
1003
+ "the man spills milk over his face . ? car",
1004
+ "? workshop",
1005
+ "is sitting down and smoking ? cigarette",
1006
+ "dressed in a suit and carrying ? cane",
1007
+ "and she is dancing in a field . ? mirror",
1008
+ "? ashtray",
1009
+ "looking sad . ? hallway",
1010
+ "noodle",
1011
+ "missiles",
1012
+ "? helicopter",
1013
+ "catfish",
1014
+ "toothbrush",
1015
+ "have taken ? pictures",
1016
+ "pane",
1017
+ "he dances on the stage ? headset",
1018
+ "scooters",
1019
+ "then he does the splits . ? hallway",
1020
+ "and it is pushed by a cat ? mouse",
1021
+ "desks",
1022
+ "hills",
1023
+ "stairway",
1024
+ "whisk",
1025
+ "with",
1026
+ "while one of them sings into a microphone ? two",
1027
+ "bottles",
1028
+ "but grabs her leg ? panda",
1029
+ "sled",
1030
+ "nut",
1031
+ "feathers",
1032
+ "dresses",
1033
+ "sink",
1034
+ "wristband",
1035
+ "then jumps up to celebrate ? pool",
1036
+ "drumsticks",
1037
+ "opens her mouth and smiles ? one",
1038
+ "suits",
1039
+ "sculpture",
1040
+ "are fighting for control of the soccer ball ? two",
1041
+ "and he is throwing ? napkin",
1042
+ "pets",
1043
+ "bin",
1044
+ "jockey",
1045
+ "backwards",
1046
+ "spiky , walk across the pavement ? heels",
1047
+ "chainsaw",
1048
+ "? guitar",
1049
+ "with just head and tail exposed ? cat",
1050
+ "when one pins the other one down for a three count ? two",
1051
+ "shore",
1052
+ "chicks",
1053
+ "dancing and laughing ? two",
1054
+ "looking sideways and singing ? guitar",
1055
+ "? turns",
1056
+ "lamp",
1057
+ "paper , scissors ? two",
1058
+ "chocolate",
1059
+ "bra",
1060
+ "blonde woman wearing a back top and matching ? piece",
1061
+ "holding hands ? two",
1062
+ "while the man next to him talks and moves his hands around ? one",
1063
+ "cubs",
1064
+ "having cake . ? restaurant",
1065
+ "figurine",
1066
+ "hood",
1067
+ "lens",
1068
+ "groomed each other ? two",
1069
+ "sabers",
1070
+ "before jumping in the pool ? dog",
1071
+ "mattress",
1072
+ "sidewalk",
1073
+ "landing",
1074
+ "rocks",
1075
+ "avocado",
1076
+ "? bear",
1077
+ "and a man spills , crouches , and cowers ? coffee",
1078
+ "disks",
1079
+ "mountainside",
1080
+ "lips",
1081
+ "chest",
1082
+ "wan",
1083
+ "glove",
1084
+ "? beer",
1085
+ "tortilla",
1086
+ "? stable",
1087
+ "meteor",
1088
+ "expression",
1089
+ "? kayak",
1090
+ "biscuit",
1091
+ "ukulele",
1092
+ "at something ? two",
1093
+ "convertible",
1094
+ "climber",
1095
+ "is using the pay phone and smoking ? cigarette",
1096
+ "wearing , looks mad ? jacket",
1097
+ "mike",
1098
+ "sleeping and stretching on the person 's stomach ? cat",
1099
+ "denim",
1100
+ "lantern",
1101
+ "breaks the branch its sitting on in the tree , and falls to the ground ? panda",
1102
+ "so that she 's almost laying down . ? car",
1103
+ "smears",
1104
+ "hair",
1105
+ "bones",
1106
+ "blade",
1107
+ "unicycle",
1108
+ "? cone",
1109
+ "wallet",
1110
+ "blouse",
1111
+ "trousers",
1112
+ "buds",
1113
+ "spill",
1114
+ "rib",
1115
+ "porcupine",
1116
+ "tray",
1117
+ "map",
1118
+ "sad ? dog",
1119
+ "socks",
1120
+ "automobile",
1121
+ "parallel",
1122
+ "skyscraper",
+ "classroom",
+ "catwalk",
+ "the bike crashes ? bicycle",
+ "stare , and look shocked ? four",
+ "towel",
+ "whilst another one is sitting down ? guitar",
+ "lion",
+ "cargo",
+ "grabs",
+ "and then starts wracking it FRAMEQAeatedly ? cat",
+ "vest",
+ "spits",
+ "wearing is walking and waving ? dress",
+ "poker",
+ "robe",
+ "bandanna",
+ "little fingers ? two",
+ "person",
+ "doves",
+ "container",
+ "wearing , uses gymnastic rings to lift herself to a seated position then into a handstand ? clothes",
+ "forklift",
+ "buildings",
+ "wearing ? blouse",
+ "making a crack big enough for the rest to get in ? cat",
+ "carrots",
+ "lizard",
+ "beakers",
+ "blower",
+ "and another woman is running in black shorts ? pants",
+ "marks",
+ "spaceship",
+ "when one man lays the other man down ? two",
+ "are dancing on a stage while the crowd cheers ? two",
+ "they start to head bang . ? car",
+ "then one blows confetti into the air ? two",
+ "sitting down , when someone else steps up and spins the chair around . ? chair",
+ "puppets",
+ "garage",
+ "lemon",
+ "wearing , is sitting and doing something with her foot ? clothes",
+ "and two men with lighting swords want to fight with him ? door",
+ "treat",
+ "lamb",
+ "ways",
+ "and one man throws ? hat",
+ "pick",
+ "product",
+ "is throwing around the room ? clothes",
+ "the clothes of the people catch on fire ? horses",
+ "all , have the same type of hair style ? three",
+ "whip",
+ "mop",
+ "pointing his fingers and nodding ? bow",
+ "bags",
+ "machines",
+ "seeds",
+ "symbol",
+ "layer",
+ "opens ? door",
+ "dark sunglasses , and cigar ? two",
+ "the man smashes the head of a zombie ? bat",
+ "extinguisher",
+ "candles",
+ ", looking out ? window",
+ "group",
+ "drop",
+ "is riding , into the swimming pool ? bicycle",
+ "stake",
+ "block",
+ "and he is singing into a microphone ? guitar",
+ "ornament",
+ "spins as he bends over . ? chair",
+ "? shirts",
+ "? colors",
+ "hookah",
+ "? courtyard",
+ "cactus",
+ "are having taken while on stage ? picture",
+ "an orange ? shell",
+ "and he is talking ? sunglasses",
+ "veil",
+ "then rolling around in the mud ? horse",
+ "? pillow",
+ "drugs",
+ "? couch",
+ "bun",
+ "koala",
+ "one wearing brown shoes and the other has no footwear ? two",
+ "and he is falling in the water ? dog",
+ "is smoking ? cigarette",
+ "rooster",
+ "submarine",
+ "wand",
+ "helicopter",
+ "wearing , smiles as her hair blows in the wind ? hat",
+ "and fails , to jump into the window ? cat",
+ "tram",
+ "and then is knocked down when it hits him in the head ? bag",
+ "curve",
+ "handrail",
+ "bulldozer",
+ "stops a taxi . ? street",
+ "speedometer",
+ "? necklace",
+ "curbs",
+ "over multiple vehicles and lands on another ramp ? bicycle",
+ "wolves",
+ "laundry",
+ "holding , laughs into a microphone and then puts her fingers up to her lips ? guitar",
+ "peeking . ? room",
+ "cigarettes",
+ "bells",
+ "sill",
+ "raspberry",
+ "suited",
+ "shawl",
+ "wakes",
+ "applying the brake , and applying the gas as needed . ? car",
+ "poodle",
+ "and he 's ? candles",
+ "then skids on the ground ? motorcycle",
+ "office",
+ "outdoors",
+ "it stops at the edge ? car",
+ "as she puts it all on top of her head ? two",
+ "but his reflection is doing something different . ? mirror",
+ "holding , are walking together ? bear",
+ "hats",
+ "mat",
+ "then the team mate scores a goal ? ball",
+ "one with a guitar are behind him ? one",
+ "? looks",
+ "grenade",
+ "coin",
+ "toasting each other with their liquor bottles ? two",
+ "saxophone",
+ "capes",
+ "lounges",
+ "? scissors",
+ "hoop",
+ "rack",
+ "frisbee",
+ "then jumps in the air and runs away ? cat",
+ "wearing , is hugging in the hallway ? coats",
+ "? lobby",
+ "corridor",
+ "who they push to the ground ? two",
+ "worms",
+ "tablet",
+ "who turns and causes the kitten to raise its paw ? kitten",
+ "chariot",
+ "lock",
+ "tongs",
+ "game",
+ "s head while he is trying to eat ? cat",
+ "pie",
+ "feline",
+ "and then are shown ? pictures",
+ "parasol",
+ "pumpkins",
+ "notebook",
+ "the horse leans its head around her ? horse",
+ "spaghetti",
+ "outside",
+ "? bib",
+ "gold",
+ "cart",
+ "the trees are being passed by , and the clouds are above ? sun",
+ "the other elephant pulls it closer ? elephant",
+ "most of them wearing ? sunglasses",
+ "and are falling down on top of him ? balloons",
+ "nods his head and blinks ? one",
+ "with long brown hair , wink and raises to her face ? two",
+ "uncontrollably",
+ "wearing , raises two fingers to her face ? cap",
+ "swinging its hips from side to side ? turtle",
+ "skates",
+ "they look at one another , and the woman exits ? car",
+ "his friends join in the background . ? chair",
+ "store",
+ "donuts",
+ "then sticks its tongue out ? dog",
+ "and then a massive explosion occurs ? container",
+ "then kisses her ? snake",
+ "brakes"
+ ]
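The block above is the tail of `tgif_a_list.json`, which appears to be the flat JSON array of candidate answer strings used when scoring TGIF-QA predictions. A minimal sketch of how such a list can be loaded and turned into answer/index mappings; the inline sample stands in for the full file, and all variable names are illustrative:

```python
import json

# tgif_a_list.json is a plain JSON array of answer strings.
# An inline sample stands in for the full file here:
raw = '["skyscraper", "classroom", "catwalk", "brakes"]'
answers = json.loads(raw)

# Map answers to class indices and back, a common first step when
# matching open-ended QA predictions against a fixed answer list.
ans2id = {a: i for i, a in enumerate(answers)}
id2ans = {i: a for i, a in enumerate(answers)}
```

With the real file, `raw` would instead come from reading `tgif_a_list.json` from disk.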
ChatUniVi/eval/questions/video_qa/tgif_qa.json ADDED
The diff for this file is too large to render. See raw diff
 
ChatUniVi/eval/table/caps_boxes_coco2014_val_80.jsonl ADDED
@@ -0,0 +1,80 @@
+ {"id": "000000296284", "image": "COCO_val2014_000000296284.jpg", "captions": ["A donut shop is full of different flavors of donuts.", "Fruit flavored donuts lined up in a glass fronted cabinet", "A rack with some doughnuts in a glass case.", "A display case in a bakery filled with donuts.", "An assortment of doughnuts are arranged in a display case."], "instances": [{"category": "donut", "bbox": [0.37, 0.584, 0.504, 0.709]}, {"category": "donut", "bbox": [0.369, 0.22, 0.492, 0.317]}, {"category": "donut", "bbox": [0.471, 0.587, 0.639, 0.706]}, {"category": "donut", "bbox": [0.544, 0.213, 0.679, 0.316]}, {"category": "donut", "bbox": [0.035, 0.22, 0.196, 0.328]}, {"category": "donut", "bbox": [0.054, 0.608, 0.221, 0.711]}, {"category": "donut", "bbox": [0.283, 0.586, 0.429, 0.708]}, {"category": "donut", "bbox": [0.466, 0.226, 0.585, 0.32]}, {"category": "donut", "bbox": [0.28, 0.232, 0.393, 0.322]}, {"category": "donut", "bbox": [0.0, 0.609, 0.097, 0.722]}]}
+ {"id": "000000151358", "image": "COCO_val2014_000000151358.jpg", "captions": ["A newspaper that has sunglasses on top of it sitting in front of books.", "an apple sunglasses books and a teddy bear", "A folded newspaper and sunglasses are on a table with an apple, books, and teddy bear behind.", "An apple sitting on a table next to sunglasses and a news paper.", "There are sunglasses laying on the folded newspaper."], "instances": [{"category": "tie", "bbox": [0.258, 0.074, 0.527, 0.589]}, {"category": "apple", "bbox": [0.621, 0.482, 0.853, 0.645]}, {"category": "book", "bbox": [0.154, 0.107, 0.275, 0.59]}, {"category": "book", "bbox": [0.535, 0.09, 0.735, 0.583]}, {"category": "book", "bbox": [0.051, 0.112, 0.159, 0.6]}, {"category": "teddy bear", "bbox": [0.753, 0.084, 1.0, 0.517]}, {"category": "book", "bbox": [0.681, 0.097, 0.796, 0.483]}, {"category": "book", "bbox": [0.443, 0.099, 0.574, 0.588]}, {"category": "book", "bbox": [0.267, 0.337, 0.386, 0.579]}]}
+ {"id": "000000052312", "image": "COCO_val2014_000000052312.jpg", "captions": ["The old man literally has a toothbrush mustache.", "An old man with a tooth brush head under his nose, mimicking Hitler", "A man wearing a toothbrush for a moustache.", "A man with the head of a toothbrush under his nose like a mustache", "An elderly man wearing the head of a toothbrush as a moustache."], "instances": [{"category": "toothbrush", "bbox": [0.345, 0.59, 0.594, 0.679]}, {"category": "person", "bbox": [0.0, 0.03, 1.0, 0.99]}]}
+ {"id": "000000473210", "image": "COCO_val2014_000000473210.jpg", "captions": ["two people taking apart their wii controllers to replace batteries", "People taking apart video game remote controls on a table", "People handling a couple of remotes taking them apart.", "two sets of hands a wooden table and two controllers", "Two people who are taking apart a video game controller."], "instances": [{"category": "person", "bbox": [0.002, 0.334, 0.453, 0.986]}, {"category": "remote", "bbox": [0.407, 0.207, 0.727, 0.604]}, {"category": "remote", "bbox": [0.088, 0.344, 0.313, 0.547]}, {"category": "laptop", "bbox": [0.001, 0.049, 0.1, 0.197]}, {"category": "person", "bbox": [0.484, 0.254, 0.998, 0.985]}, {"category": "dining table", "bbox": [0.0, 0.003, 1.0, 0.956]}]}
+ {"id": "000000097131", "image": "COCO_val2014_000000097131.jpg", "captions": ["A car parked by a parking meter in front of a building.", "A car is sitting parked at a curb in front of a parking meter.", "A black car on the street next to a parking meter.", "A gray car parked in front of two parking meters.", "A black car parked on the side of the road."], "instances": [{"category": "car", "bbox": [0.227, 0.362, 0.946, 0.761]}, {"category": "car", "bbox": [0.793, 0.322, 0.88, 0.4]}, {"category": "car", "bbox": [0.0, 0.447, 0.028, 0.726]}, {"category": "parking meter", "bbox": [0.156, 0.35, 0.186, 0.453]}, {"category": "truck", "bbox": [0.907, 0.331, 1.0, 0.408]}, {"category": "parking meter", "bbox": [0.188, 0.349, 0.218, 0.448]}]}
+ {"id": "000000543364", "image": "COCO_val2014_000000543364.jpg", "captions": ["There is a table in the middle of the room.", "A room with a couch, table, lamp and a chaise.", "A living room with couch, chaise, track lighting, and a large window.", "A room with large windows, a couch and a table.", "A living room with lots of furniture and a large window."], "instances": [{"category": "dining table", "bbox": [0.388, 0.644, 0.636, 0.879]}, {"category": "couch", "bbox": [0.194, 0.531, 0.552, 0.777]}, {"category": "couch", "bbox": [0.568, 0.488, 0.907, 0.783]}, {"category": "remote", "bbox": [0.524, 0.651, 0.556, 0.675]}, {"category": "chair", "bbox": [0.661, 0.478, 0.802, 0.604]}]}
+ {"id": "000000217181", "image": "COCO_val2014_000000217181.jpg", "captions": ["They are standing next to some stylish motorcycles.", "Three men are standing around looking at sports motorcycles.", "A small group of men are standing around a motorcycle.", "Two men surrounding a blue motorcycle and others", "A few blue motorcycles are parked in a lot."], "instances": [{"category": "car", "bbox": [0.011, 0.177, 0.2, 0.336]}, {"category": "motorcycle", "bbox": [0.032, 0.139, 0.907, 0.982]}, {"category": "motorcycle", "bbox": [0.0, 0.239, 0.148, 0.613]}, {"category": "motorcycle", "bbox": [0.0, 0.301, 0.106, 0.45]}, {"category": "person", "bbox": [0.775, 0.043, 0.93, 0.463]}, {"category": "person", "bbox": [0.717, 0.116, 0.81, 0.509]}, {"category": "person", "bbox": [0.296, 0.008, 0.472, 0.325]}, {"category": "person", "bbox": [0.115, 0.19, 0.164, 0.269]}, {"category": "truck", "bbox": [0.63, 0.227, 0.731, 0.335]}]}
+ {"id": "000000140289", "image": "COCO_val2014_000000140289.jpg", "captions": ["Two born bears walking though a forest surrounded by trees.", "Two full grown brown bears in a habitat.", "Two bears are roaming around in the woods.", "Two bears around logs in front of a large rock.", "Two big bears wandering through the woods together"], "instances": [{"category": "bear", "bbox": [0.131, 0.269, 0.375, 0.65]}, {"category": "bear", "bbox": [0.568, 0.193, 0.809, 0.827]}]}
+ {"id": "000000460149", "image": "COCO_val2014_000000460149.jpg", "captions": ["A clock hosted on a pole on a pavement next to a building", "Street clock on quiet street with trees and bicycles.", "A tall clock stands on an empty sidewalk.", "A pole that has a clock on the top of it.", "a clock on a short tower and potted plants along the sidewalk"], "instances": [{"category": "potted plant", "bbox": [0.14, 0.71, 0.338, 0.856]}, {"category": "bicycle", "bbox": [0.65, 0.671, 0.766, 0.733]}, {"category": "car", "bbox": [0.38, 0.608, 0.488, 0.656]}, {"category": "clock", "bbox": [0.468, 0.048, 0.699, 0.216]}, {"category": "bicycle", "bbox": [0.669, 0.662, 0.719, 0.67]}, {"category": "car", "bbox": [0.786, 0.625, 0.86, 0.668]}, {"category": "potted plant", "bbox": [0.756, 0.637, 0.819, 0.682]}, {"category": "person", "bbox": [0.942, 0.615, 0.954, 0.641]}, {"category": "bicycle", "bbox": [0.648, 0.68, 0.714, 0.747]}, {"category": "car", "bbox": [0.837, 0.619, 0.88, 0.659]}, {"category": "potted plant", "bbox": [0.017, 0.197, 0.443, 0.686]}]}
+ {"id": "000000225738", "image": "COCO_val2014_000000225738.jpg", "captions": ["A group of giraffes standing up in their natural habitat.", "A group of giraffe standing in a grass field.", "A group of four giraffes near the same tree.", "there are four giraffes standing among some dry brush", "A herd of giraffe standing on top of a grass field."], "instances": [{"category": "giraffe", "bbox": [0.648, 0.231, 0.855, 0.915]}, {"category": "giraffe", "bbox": [0.33, 0.136, 0.521, 0.93]}, {"category": "giraffe", "bbox": [0.406, 0.261, 0.515, 1.0]}, {"category": "giraffe", "bbox": [0.347, 0.194, 0.583, 0.922]}]}
+ {"id": "000000109532", "image": "COCO_val2014_000000109532.jpg", "captions": ["An adorable husky dog sleeping in a dog bed next to a fan.", "A dark room with a dog sleeping on a dog bed.", "A dog is sleeping in a dark room.", "a large dog laying in a dog bed in a living room", "A dog sleeping on a dog bed in a room."], "instances": [{"category": "dog", "bbox": [0.426, 0.661, 0.582, 0.925]}, {"category": "potted plant", "bbox": [0.603, 0.261, 0.781, 0.613]}, {"category": "chair", "bbox": [0.67, 0.515, 0.899, 0.801]}, {"category": "potted plant", "bbox": [0.671, 0.439, 0.763, 0.612]}, {"category": "chair", "bbox": [0.852, 0.653, 0.948, 0.818]}]}
+ {"id": "000000118606", "image": "COCO_val2014_000000118606.jpg", "captions": ["A man riding skis on top of a rail.", "a person riding a pair of skis on a rail", "Someone on a pair of skis on a ramp at the ski slope", "Person with skis in the air above the snow.", "A man performing a trick on a rail while skiing."], "instances": [{"category": "person", "bbox": [0.444, 0.361, 0.537, 0.633]}, {"category": "skis", "bbox": [0.413, 0.554, 0.539, 0.664]}, {"category": "person", "bbox": [0.342, 0.585, 0.352, 0.62]}, {"category": "person", "bbox": [0.439, 0.565, 0.446, 0.58]}]}
+ {"id": "000000385873", "image": "COCO_val2014_000000385873.jpg", "captions": ["Three pizzas sitting next to each other in boxes.", "Two smaller pizzas sit beside a large pizza topped with tortilla chips.", "Three pizzas inside their delivery boxes, one with two side orders of sauce.", "One pizza is larger than two other pizzas.", "Three pizza boxes with pizza in them are open."], "instances": [{"category": "bowl", "bbox": [0.634, 0.624, 0.736, 0.752]}, {"category": "pizza", "bbox": [0.3, 0.382, 0.615, 0.733]}, {"category": "pizza", "bbox": [0.0, 0.4, 0.287, 0.745]}, {"category": "pizza", "bbox": [0.624, 0.279, 0.999, 0.753]}, {"category": "bowl", "bbox": [0.94, 0.247, 1.0, 0.352]}]}
+ {"id": "000000092109", "image": "COCO_val2014_000000092109.jpg", "captions": ["A giraffe's head is pictured in this clear, colorful photo.", "A giraffe is standing tall in the middle of several bright green trees", "The face of a giraffe looking to the side.", "the close up head shot of a giraffe", "this is a giraffe chewing on some leaves"], "instances": [{"category": "giraffe", "bbox": [0.236, 0.122, 1.0, 0.987]}]}
+ {"id": "000000163076", "image": "COCO_val2014_000000163076.jpg", "captions": ["There's an outdoor dining area featuring a fountain.", "A table sitting next to a water fountain covered by an umbrella.", "An empty restaurant patio with tables and umbrellas.", "An outdoor restaurant with a fountain at night", "A fountain bubbles in the plaza of an outdoor cafe."], "instances": [{"category": "umbrella", "bbox": [0.064, 0.069, 0.95, 0.844]}, {"category": "chair", "bbox": [0.198, 0.574, 0.355, 0.704]}, {"category": "chair", "bbox": [0.42, 0.571, 0.55, 0.738]}, {"category": "dining table", "bbox": [0.066, 0.741, 0.766, 0.925]}, {"category": "dining table", "bbox": [0.059, 0.584, 0.27, 0.659]}, {"category": "chair", "bbox": [0.432, 0.567, 0.52, 0.624]}, {"category": "chair", "bbox": [0.433, 0.555, 0.504, 0.6]}, {"category": "chair", "bbox": [0.109, 0.673, 0.374, 0.796]}]}
+ {"id": "000000560371", "image": "COCO_val2014_000000560371.jpg", "captions": ["Street signs from the corner of 8th ave. and 22 3/4 st.", "A two way street sign with one sign that changes from one name to another.", "A street sign is pointing towards 8th avenue and the other is pointing towards 22 3/4 street in the middle of the forest.", "A street sign standing in front of some trees.", "Peculiar street sign showing intersection of 23 3/4 St and 8th Ave/CTH D."], "instances": []}
+ {"id": "000000367571", "image": "COCO_val2014_000000367571.jpg", "captions": ["A couple of different doughnuts in a box", "There are four donuts in a box, and some are cake donuts and a doughnut with nuts and coconut on top.", "A box of glazed doughnuts on a table.", "Three donuts with toppings on them sitting inside a box.", "A box that is filled with different kinds of doughnuts."], "instances": [{"category": "donut", "bbox": [0.412, 0.335, 0.711, 0.681]}, {"category": "donut", "bbox": [0.093, 0.493, 0.486, 0.922]}, {"category": "donut", "bbox": [0.713, 0.423, 0.957, 0.874]}, {"category": "donut", "bbox": [0.13, 0.331, 0.397, 0.55]}]}
+ {"id": "000000580197", "image": "COCO_val2014_000000580197.jpg", "captions": ["Two men in bow ties standing next to steel rafter.", "Several men in suits talking together in a room.", "An older man in a tuxedo standing next to a younger man in a tuxedo wearing glasses.", "Two men wearing tuxedos glance at each other.", "Older man in tuxedo sitting next to another younger man in tuxedo."], "instances": [{"category": "tie", "bbox": [0.914, 0.46, 0.984, 0.512]}, {"category": "person", "bbox": [0.297, 0.638, 0.71, 0.989]}, {"category": "person", "bbox": [0.77, 0.177, 1.0, 0.971]}, {"category": "tie", "bbox": [0.281, 0.481, 0.368, 0.519]}, {"category": "person", "bbox": [0.103, 0.204, 0.497, 1.0]}]}
+ {"id": "000000506095", "image": "COCO_val2014_000000506095.jpg", "captions": ["A cat is staring at a laptop computer.", "a cat on a desk with a laptop and a mouse", "A cat that is sitting at a desk next to a laptop.", "A kitten sitting on a laptop computer sitting on top of a wooden desk.", "A kitten sits facing an open black laptop."], "instances": [{"category": "cat", "bbox": [0.658, 0.207, 1.0, 0.754]}, {"category": "laptop", "bbox": [0.108, 0.135, 0.766, 0.69]}, {"category": "book", "bbox": [0.836, 0.239, 0.954, 0.273]}, {"category": "book", "bbox": [0.0, 0.556, 0.128, 0.685]}, {"category": "book", "bbox": [0.039, 0.574, 0.257, 0.691]}, {"category": "book", "bbox": [0.825, 0.214, 0.962, 0.254]}, {"category": "book", "bbox": [0.892, 0.275, 0.958, 0.308]}, {"category": "book", "bbox": [0.922, 0.318, 0.986, 0.353]}, {"category": "book", "bbox": [0.87, 0.267, 0.951, 0.291]}, {"category": "book", "bbox": [0.949, 0.102, 0.976, 0.114]}, {"category": "book", "bbox": [0.936, 0.161, 0.958, 0.168]}]}
+ {"id": "000000024996", "image": "COCO_val2014_000000024996.jpg", "captions": ["A bathroom with a glass door and a sink.", "A blue lined bathroom with an open glass door.", "A nice bathroom with a sink, toilet, and tiled shower.", "A bathroom that is clean and shiny in the day.", "a bathroom with a sink and a mirror and a window"], "instances": [{"category": "toilet", "bbox": [0.842, 0.934, 0.95, 1.0]}, {"category": "sink", "bbox": [0.506, 0.724, 0.683, 0.834]}]}
+ {"id": "000000457882", "image": "COCO_val2014_000000457882.jpg", "captions": ["a girl in a bikini and a brown and white dog and a few other people", "A woman with a swimsuit on sitting with a dog.", "A woman is sitting with a dog on her lap.", "A dog sitting next to a woman in her swimsuit.", "WOMAN SITTING WITH HER DOG, AND OTHER WOMEN ARE AROUND"], "instances": [{"category": "dog", "bbox": [0.202, 0.409, 0.54, 0.81]}, {"category": "dog", "bbox": [0.61, 0.428, 0.729, 0.723]}, {"category": "boat", "bbox": [0.003, 0.705, 0.939, 0.974]}, {"category": "person", "bbox": [0.236, 0.001, 0.558, 0.784]}, {"category": "person", "bbox": [0.681, 0.001, 0.957, 0.798]}, {"category": "person", "bbox": [0.849, 0.478, 1.0, 0.946]}, {"category": "person", "bbox": [0.345, 0.187, 0.634, 0.828]}, {"category": "person", "bbox": [0.033, 0.345, 0.109, 0.434]}]}
+ {"id": "000000081552", "image": "COCO_val2014_000000081552.jpg", "captions": ["A cat sitting and curled up on a red couch", "A cat laying on a red couch sleeping.", "a tan and black cat curled up asleep on a red velvet seat", "A cat is curled up on a red sofa.", "Cat curled up, sleeping on a red plush couch."], "instances": [{"category": "cat", "bbox": [0.412, 0.237, 0.634, 0.482]}, {"category": "couch", "bbox": [0.003, 0.005, 1.0, 0.99]}]}
+ {"id": "000000273450", "image": "COCO_val2014_000000273450.jpg", "captions": ["A person flipping of a parking meter on the side of a road.", "A man holds up his middle finger to a parking meter.", "Person giving the middle finger to a parking meter.", "a black silver white blue red an orange parking meter and a hand flipping it off", "A person is flipping off a parking meter."], "instances": [{"category": "person", "bbox": [0.0, 0.475, 0.565, 0.987]}, {"category": "car", "bbox": [0.0, 0.0, 0.531, 0.734]}, {"category": "parking meter", "bbox": [0.0, 0.0, 1.0, 0.987]}]}
+ {"id": "000000203879", "image": "COCO_val2014_000000203879.jpg", "captions": ["There is a small cellphone displayed between a set of ear buds and two paper weights.", "a cell phone lays next to some diamonds", "a close up of a cell phone on a table near earbuds", "A cell phone sits on a table next to some jewels.", "A cell phone, ear buds, and two jewels laying near each other."], "instances": [{"category": "cell phone", "bbox": [0.322, 0.233, 0.62, 0.79]}]}
+ {"id": "000000346875", "image": "COCO_val2014_000000346875.jpg", "captions": ["two zebras in a field near one another", "A couple of zebra walking across a green field.", "Two zebra are walking near a gravel road.", "two zebras in a green field of grass and some trees", "A zebra follows another zebra through a park."], "instances": [{"category": "zebra", "bbox": [0.591, 0.263, 0.82, 0.466]}, {"category": "zebra", "bbox": [0.293, 0.243, 0.561, 0.45]}]}
+ {"id": "000000525439", "image": "COCO_val2014_000000525439.jpg", "captions": ["a man stands in front of a flipped skate boarder", "A man standing next to a skateboard that is laying on the ground wheels pointed up.", "Skateboard laying upside down on cement with someone standing next to it.", "A boy in camo shorts stands before an overturned skateboard.", "a person with an upside down skate board"], "instances": [{"category": "person", "bbox": [0.307, 0.001, 0.63, 0.739]}, {"category": "skateboard", "bbox": [0.0, 0.592, 0.626, 0.969]}]}
+ {"id": "000000304749", "image": "COCO_val2014_000000304749.jpg", "captions": ["The woman is taking a picture in the bathroom mirror.", "A picture of a woman in a mirror.", "A woman's midsection reflected in a round mirror.", "A circular mirror reflecting a woman's stomach in turquoise shirt.", "A selfie taken of a person from the neck down."], "instances": [{"category": "person", "bbox": [0.092, 0.001, 0.646, 0.496]}]}
+ {"id": "000000323760", "image": "COCO_val2014_000000323760.jpg", "captions": ["A toilet is shown in a bare room.", "A ugly bathroom with a section of the wall missing.", "A toilet in a stripped bathroom with studs, bricks and plaster showing", "A bathroom with no walls and a toilet bowl", "A white toilet next to some torn out walls."], "instances": [{"category": "toilet", "bbox": [0.167, 0.585, 0.714, 1.0]}]}
+ {"id": "000000066144", "image": "COCO_val2014_000000066144.jpg", "captions": ["A woman standing in front of window next to a bug and a stop sign.", "A car parked on the street next to a tree and stop sign.", "A lone Volkswagen is parked by a stop sign.", "A window view of a small car near a street stop sign.", "An old VW Bug standing at a stop sign."], "instances": [{"category": "stop sign", "bbox": [0.501, 0.328, 0.569, 0.428]}, {"category": "car", "bbox": [0.242, 0.488, 0.56, 0.726]}, {"category": "car", "bbox": [0.279, 0.325, 0.33, 0.363]}, {"category": "car", "bbox": [0.153, 0.333, 0.29, 0.405]}, {"category": "car", "bbox": [0.11, 0.339, 0.177, 0.373]}, {"category": "car", "bbox": [0.0, 0.654, 0.082, 0.826]}, {"category": "car", "bbox": [0.0, 0.322, 0.064, 0.364]}, {"category": "car", "bbox": [0.451, 0.333, 0.51, 0.392]}]}
+ {"id": "000000455772", "image": "COCO_val2014_000000455772.jpg", "captions": ["A person in a field jumping to catch a Frisbee.", "A guy jumping to catch a frisbee in mid-air.", "A person that is trying to get a frisbee.", "Nice reach, but the Frisbee flies on, victorious.", "A man playing frisbee in a grassy yard."], "instances": [{"category": "car", "bbox": [0.148, 0.339, 0.201, 0.476]}, {"category": "car", "bbox": [0.376, 0.396, 0.424, 0.476]}, {"category": "person", "bbox": [0.547, 0.122, 0.698, 0.904]}, {"category": "frisbee", "bbox": [0.479, 0.154, 0.555, 0.231]}, {"category": "car", "bbox": [0.001, 0.299, 0.085, 0.394]}]}
+ {"id": "000000511117", "image": "COCO_val2014_000000511117.jpg", "captions": ["A couple of kids standing on top of a grass covered field.", "A little boy wearing a baseball uniform stands by a little girl.", "A young boy in a baseball uniform and a young girl are standing in front of a chain link fence.", "A little boy and girl standing on a baseball field. The boy has a uniform on.", "A young baseball player is standing next to a young girl."], "instances": [{"category": "person", "bbox": [0.514, 0.178, 0.776, 0.774]}, {"category": "baseball glove", "bbox": [0.468, 0.462, 0.593, 0.609]}, {"category": "person", "bbox": [0.174, 0.051, 0.598, 0.839]}, {"category": "bench", "bbox": [0.558, 0.125, 1.0, 0.315]}]}
+ {"id": "000000207151", "image": "COCO_val2014_000000207151.jpg", "captions": ["A vegetarian pizza is half eaten on a pizza holder.", "A couple of pieces of pizza with vegetable slices on them.", "A wooden pan serving tray with a pizza on it.", "A pizza on a cutting board is half gone.", "A Pizza is nearly finished with only three pieces left."], "instances": [{"category": "bottle", "bbox": [0.001, 0.001, 0.121, 0.231]}, {"category": "cup", "bbox": [0.0, 0.002, 0.121, 0.238]}, {"category": "pizza", "bbox": [0.17, 0.472, 0.526, 0.82]}, {"category": "pizza", "bbox": [0.398, 0.106, 0.962, 0.679]}, {"category": "dining table", "bbox": [0.0, 0.001, 1.0, 0.988]}]}
+ {"id": "000000431165", "image": "COCO_val2014_000000431165.jpg", "captions": ["A baby elephant standing in front of a brick building.", "An elephant is standing near a dirt mount in an exhibit.", "Grey elephant standing next to a large sand dune in a pen.", "An elephant standing alone inside of an enclosure.", "The baby elephant is alone in the pen."], "instances": [{"category": "elephant", "bbox": [0.303, 0.399, 0.638, 0.78]}]}
+ {"id": "000000378545", "image": "COCO_val2014_000000378545.jpg", "captions": ["A pole that has a clock on top of it.", "A clock mounted on an outdoor post with Roman numerals.", "a clock on a pole saying it is 12:45", "An ornamental standing clock is at the foreground of a row of houses.", "A black and gold clock on a pole in front of a building."], "instances": [{"category": "clock", "bbox": [0.216, 0.249, 0.749, 0.658]}]}
+ {"id": "000000555904", "image": "COCO_val2014_000000555904.jpg", "captions": ["A man sitting at a bar filled with liquor.", "People sitting a a take near several bottles of wine on shelves.", "Several people are sitting at a table drinking.", "Several people in a bar sitting at a long table.", "People eating in a restaurant near wine bottles."], "instances": [{"category": "dining table", "bbox": [0.123, 0.663, 0.317, 0.811]}, {"category": "person", "bbox": [0.715, 0.239, 1.0, 0.998]}, {"category": "person", "bbox": [0.142, 0.528, 0.281, 0.742]}, {"category": "person", "bbox": [0.529, 0.53, 0.606, 0.69]}, {"category": "person", "bbox": [0.705, 0.518, 0.796, 0.673]}, {"category": "wine glass", "bbox": [0.247, 0.669, 0.27, 0.718]}, {"category": "person", "bbox": [0.281, 0.524, 0.534, 1.0]}, {"category": "bottle", "bbox": [0.168, 0.346, 0.189, 0.425]}, {"category": "bottle", "bbox": [0.379, 0.264, 0.431, 0.433]}, {"category": "bottle", "bbox": [0.252, 0.313, 0.277, 0.429]}, {"category": "bottle", "bbox": [0.294, 0.295, 0.326, 0.43]}, {"category": "bottle", "bbox": [0.589, 0.35, 0.613, 0.444]}, {"category": "bottle", "bbox": [0.433, 0.281, 0.473, 0.437]}, {"category": "bottle", "bbox": [0.478, 0.289, 0.513, 0.44]}, {"category": "wine glass", "bbox": [0.688, 0.615, 0.709, 0.69]}, {"category": "cup", "bbox": [0.589, 0.647, 0.612, 0.693]}, {"category": "person", "bbox": [0.732, 0.356, 0.953, 0.806]}, {"category": "bottle", "bbox": [0.555, 0.337, 0.585, 0.438]}, {"category": "bottle", "bbox": [0.337, 0.29, 0.378, 0.432]}, {"category": "bottle", "bbox": [0.21, 0.333, 0.232, 0.426]}, {"category": "bottle", "bbox": [0.134, 0.36, 0.148, 0.422]}, {"category": "bottle", "bbox": [0.516, 0.312, 0.557, 0.439]}, {"category": "cup", "bbox": [0.231, 0.718, 0.26, 0.763]}, {"category": "chair", "bbox": [0.517, 0.828, 0.65, 0.999]}, {"category": "chair", "bbox": [0.643, 0.804, 0.738, 0.841]}, {"category": "chair", "bbox": [0.347, 0.908, 0.519, 1.0]}, {"category": "chair", "bbox": [0.64, 0.806, 0.74, 0.998]}, {"category": "cup", "bbox": [0.205, 0.692, 0.232, 0.767]}, {"category": "dining table", "bbox": [0.536, 0.676, 0.743, 0.838]}, {"category": "person", "bbox": [0.002, 0.501, 0.263, 0.987]}, {"category": "bottle", "bbox": [0.531, 0.461, 0.542, 0.526]}, {"category": "bottle", "bbox": [0.237, 0.354, 0.702, 0.629]}]}
+ {"id": "000000415393", "image": "COCO_val2014_000000415393.jpg", "captions": ["a man on a skate board looks like he is falling", "A man does a skateboard trick on a skateboard ramp", "Guy falling off a skateboard in a room.", "A man riding a skateboard on top of a table.", "a man skating on part of a ramp with his skateboard"], "instances": [{"category": "person", "bbox": [0.361, 0.016, 0.809, 0.888]}, {"category": "skateboard", "bbox": [0.606, 0.809, 0.889, 0.901]}, {"category": "person", "bbox": [0.479, 0.091, 0.576, 0.386]}, {"category": "person", "bbox": [0.047, 0.441, 0.197, 0.759]}, {"category": "person", "bbox": [0.038, 0.453, 0.076, 0.545]}, {"category": "person", "bbox": [0.249, 0.307, 0.311, 0.591]}]}
+ {"id": "000000161011", "image": "COCO_val2014_000000161011.jpg", "captions": ["Three skiers posing for a picture on the slope.", "Three skiers pause for a photo at the top of a mountain.", "Three people standing on a mountain taking a picture as they ski.", "A woman and two men on skis on a snowy hillside surrounded by trees", "Three skiers have stopped to pose for a picture."], "instances": [{"category": "person", "bbox": [0.36, 0.321, 0.509, 0.82]}, {"category": "person", "bbox": [0.179, 0.281, 0.349, 0.795]}, {"category": "person", "bbox": [0.611, 0.292, 0.751, 0.809]}, {"category": "skis", "bbox": [0.595, 0.743, 0.732, 0.961]}, {"category": "skis", "bbox": [0.341, 0.724, 0.621, 0.907]}, {"category": "skis", "bbox": [0.212, 0.705, 0.398, 0.905]}]}
+ {"id": "000000284296", "image": "COCO_val2014_000000284296.jpg", "captions": ["Three giraffe's leaning over to get a sip of water.", "an image of a herd of giraffes in the water", "three giraffes banding down to drink water with trees in the background", "Three giraffe drinking from a pond with brush in back.", "Giraffes leaning down to drink at a watering hole"], "instances": [{"category": "giraffe", "bbox": [0.624, 0.387, 0.822, 0.635]}, {"category": "giraffe", "bbox": [0.4, 0.326, 0.561, 0.58]}, {"category": "giraffe", "bbox": [0.152, 0.291, 0.343, 0.551]}]}
+ {"id": "000000056013", "image": "COCO_val2014_000000056013.jpg", "captions": ["a number of luggage bags on a cart in a lobby", "Wheeled cart with luggage at lobby of commercial business.", "Trolley used for transporting personal luggage to guests rooms.", "A luggage cart topped with lots of luggage.", "a cart filled with suitcases and bags"], "instances": [{"category": "backpack", "bbox": [0.276, 0.52, 0.456, 0.678]}, {"category": "suitcase", "bbox": [0.41, 0.58, 0.597, 0.827]}, {"category": "suitcase", "bbox": [0.173, 0.645, 0.363, 0.836]}, {"category": "person", "bbox": [0.959, 0.297, 1.0, 0.478]}, {"category": "suitcase", "bbox": [0.526, 0.519, 0.712, 0.706]}, {"category": "person", "bbox": [0.762, 0.253, 0.871, 0.46]}, {"category": "backpack", "bbox": [0.517, 0.514, 0.694, 0.698]}, {"category": "handbag", "bbox": [0.316, 0.181, 0.431, 0.426]}, {"category": "suitcase", "bbox": [0.747, 0.453, 0.858, 0.557]}]}
+ {"id": "000000293505", "image": "COCO_val2014_000000293505.jpg", "captions": ["A person on a motor bike next to a cow.", "A woman riding a motorcycle down a dirt road.", "there is a woman riding a scooter down a dirt road", "A woman on a moped, two men and animals walking down the road.", "A woman on a motorcycle is next to a man walking a dog along with other people going down a dirt road."], "instances": [{"category": "cow", "bbox": [0.602, 0.472, 0.721, 0.816]}, {"category": "motorcycle", "bbox": [0.402, 0.512, 0.516, 0.788]}, {"category": "person", "bbox": [0.408, 0.4, 0.514, 0.639]}, {"category": "person", "bbox": [0.754, 0.301, 1.0, 1.0]}, {"category": "person", "bbox": [0.705, 0.415, 0.789, 0.714]}, {"category": "cow", "bbox": [0.347, 0.44, 0.373, 0.509]}, {"category": "cow", "bbox": [0.361, 0.436, 0.381, 0.501]}]}
+ {"id": "000000305873", "image": "COCO_val2014_000000305873.jpg", "captions": ["A little girl holding a red black dotted umbrella.", "A little girl with rain boots and a rain jacket on and an open umbrella to match her jacket.", "a little girl holding onto a lady bug pattern umbrella", "The child wears a labybug rain coat with a matching umbrella.", "A little girl wearing a ladybug raincoat and green rubber boots holding a ladybug umbrella"], "instances": [{"category": "umbrella", "bbox": [0.246, 0.002, 0.992, 0.415]}, {"category": "person", "bbox": [0.35, 0.132, 0.699, 0.791]}, {"category": "car", "bbox": [0.614, 0.0, 1.0, 0.465]}]}
+ {"id": "000000034096", "image": "COCO_val2014_000000034096.jpg", "captions": ["A house being built with lots of wood.", "A big pile of building material is placed on the floor in the wooden structure.", "A partially-built house with wooden studs and staircase in view.", "A house full of wood getting built at the moment.", "The beginning stages of a home still being made."], "instances": [{"category": "bed", "bbox": [0.505, 0.42, 0.721, 0.59]}, {"category": "tv", "bbox": [0.192, 0.441, 0.335, 0.606]}]}
+ {"id": "000000165257", "image": "COCO_val2014_000000165257.jpg", "captions": ["A large black counter top sitting next to a sink.", "a clean kitchen counter with a clean sink", "A kitchen with a sink, dishwasher and some boxes on the counter.", "A kitchen with a sink, dishwasher and boxes on the counter.", "a black counter on a wood cabinet in a kitchen", "a new kitchen cabinet with a sink being installed"], "instances": [{"category": "sink", "bbox": [0.513, 0.243, 0.718, 0.314]}]}
+ {"id": "000000431026", "image": "COCO_val2014_000000431026.jpg", "captions": ["a street sign on a city street near some tall bushes", "street signs on a metal pole lining a sidewalk lined with shrubbery.", "a large hedge of bushes on a corner near a street sign.", "Two street signs on sidewalk next to bushes and trees.", "Street signs along a well manicured street with large houses."], "instances": []}
+ {"id": "000000524575", "image": "COCO_val2014_000000524575.jpg", "captions": ["Three giraffe and a wildebeest in a field.", "A moose and several giraffes are grazing in the field.", "Zebras in the wild with a wildebeest behind them", "Two giraffe and a ox standing in a field eating grass.", "Giraffes and other safari animals graze in a sunlit field."], "instances": [{"category": "cow", "bbox": [0.46, 0.716, 0.643, 0.999]}, {"category": "giraffe", "bbox": [0.285, 0.5, 0.401, 0.826]}, {"category": "giraffe", "bbox": [0.083, 0.554, 0.179, 0.821]}, {"category": "giraffe", "bbox": [0.887, 0.481, 0.968, 0.715]}]}
+ {"id": "000000326550", "image": "COCO_val2014_000000326550.jpg", "captions": ["Black and white photograph of a person holding a surfboard by water.", "A person with a surfboard standing next to the water.", "A surfer stands on the rocks watching a wave crash.", "A man standing on a beach holding a surfboard.", "a person looking at the waves ready to surf"], "instances": [{"category": "person", "bbox": [0.327, 0.461, 0.492, 0.897]}, {"category": "surfboard", "bbox": [0.282, 0.56, 0.606, 0.741]}, {"category": "person", "bbox": [0.924, 0.352, 0.933, 0.362]}, {"category": "person", "bbox": [0.912, 0.348, 0.919, 0.36]}]}
+ {"id": "000000018476", "image": "COCO_val2014_000000018476.jpg", "captions": ["A tie that is sitting on top of a shirt.", "This photograph appears to be looking truly wonderful.", "a uniform complete with shoes laying on a bed", "Suit laid out with a red tie, white shirt and black shoes.", "a white shirt a red tie and some black shoes"], "instances": [{"category": "tie", "bbox": [0.457, 0.09, 0.853, 0.984]}, {"category": "bed", "bbox": [0.005, 0.005, 1.0, 0.379]}]}
+ {"id": "000000480652", "image": "COCO_val2014_000000480652.jpg", "captions": ["These suitcases are sitting next to a chair.", "An assortment of luggage bags stacked by a kitchen chair.", "A stack of luggage by a chair and table.", "a table and chair with several pieces of luggage nearby", "A pile of luggage sitting on the floor."], "instances": [{"category": "chair", "bbox": [0.483, 0.192, 1.0, 0.769]}, {"category": "backpack", "bbox": [0.433, 0.429, 0.742, 0.856]}, {"category": "suitcase", "bbox": [0.059, 0.414, 0.453, 0.841]}, {"category": "handbag", "bbox": [0.19, 0.184, 0.779, 0.475]}, {"category": "suitcase", "bbox": [0.175, 0.204, 0.583, 0.462]}]}
+ {"id": "000000012748", "image": "COCO_val2014_000000012748.jpg", "captions": ["A man and child next to a horse.", "a little boy touching the nose of a brown horse", "A man holding a baby whose petting a horse.", "a man letting his baby pet a horse", "man holding a baby and petting a horse"], "instances": [{"category": "horse", "bbox": [0.003, 0.079, 0.504, 0.868]}, {"category": "person", "bbox": [0.452, 0.294, 1.0, 0.989]}, {"category": "person", "bbox": [0.46, 0.217, 1.0, 0.988]}]}
+ {"id": "000000247840", "image": "COCO_val2014_000000247840.jpg", "captions": ["Large group of people standing outside a restaurant together.", "A dairy queen has people standing outside waiting", "an image of people standing outside and ice cream store", "Several people are lined up outside of a store.", "The front of a Dairy Queen restaurant with people entering the side."], "instances": [{"category": "fire hydrant", "bbox": [0.774, 0.674, 0.83, 0.807]}, {"category": "person", "bbox": [0.741, 0.465, 0.824, 0.755]}, {"category": "person", "bbox": [0.806, 0.471, 0.839, 0.722]}, {"category": "person", "bbox": [0.831, 0.499, 0.866, 0.726]}, {"category": "bench", "bbox": [0.061, 0.69, 0.219, 0.768]}, {"category": "handbag", "bbox": [0.859, 0.558, 0.877, 0.603]}, {"category": "person", "bbox": [0.719, 0.504, 0.75, 0.626]}, {"category": "potted plant", "bbox": [0.7, 0.648, 0.764, 0.743]}, {"category": "handbag", "bbox": [0.827, 0.548, 0.837, 0.577]}, {"category": "sandwich", "bbox": [0.359, 0.618, 0.417, 0.694]}]}
+ {"id": "000000399452", "image": "COCO_val2014_000000399452.jpg", "captions": ["a sandwhich sitting on a plate next to a glass of tea, bowl of soup", "a sandwich on a white plate a drink on a brown table", "A sandwich and chips sit on a white plate.", "a large plate of food with a glass of soda by it", "A sandwich sitting on top of a white plate next to a cup of coffee."], "instances": [{"category": "sandwich", "bbox": [0.175, 0.326, 0.605, 0.71]}, {"category": "cup", "bbox": [0.504, 0.024, 0.687, 0.419]}, {"category": "knife", "bbox": [0.742, 0.283, 0.857, 0.376]}, {"category": "spoon", "bbox": [0.618, 0.46, 0.797, 0.809]}, {"category": "fork", "bbox": [0.684, 0.254, 0.805, 0.395]}, {"category": "bowl", "bbox": [0.782, 0.366, 1.0, 0.62]}, {"category": "chair", "bbox": [0.202, 0.0, 0.671, 0.148]}, {"category": "dining table", "bbox": [0.002, 0.126, 0.996, 0.987]}]}
+ {"id": "000000515716", "image": "COCO_val2014_000000515716.jpg", "captions": ["A couple of women standing on either side of a man wearing glasses.", "Two women and a man are holding glasses up at a wine tasting.", "Three young adults holding wine glasses while standing at a bar.", "A group of people sit holding glasses and smiling at a table with several bottles.", "A group of people at a celebration having a taste of wine."], "instances": [{"category": "bottle", "bbox": [0.529, 0.604, 0.637, 0.908]}, {"category": "bottle", "bbox": [0.379, 0.398, 0.481, 0.892]}, {"category": "bottle", "bbox": [0.942, 0.464, 0.988, 0.653]}, {"category": "person", "bbox": [0.0, 0.126, 0.136, 0.811]}, {"category": "person", "bbox": [0.05, 0.093, 0.211, 0.471]}, {"category": "person", "bbox": [0.401, 0.031, 0.678, 0.683]}, {"category": "person", "bbox": [0.617, 0.191, 0.94, 0.858]}, {"category": "person", "bbox": [0.723, 0.098, 0.947, 0.564]}, {"category": "wine glass", "bbox": [0.634, 0.434, 0.697, 0.628]}, {"category": "wine glass", "bbox": [0.285, 0.346, 0.372, 0.558]}, {"category": "wine glass", "bbox": [0.522, 0.422, 0.583, 0.544]}, {"category": "handbag", "bbox": [0.704, 0.601, 1.0, 0.916]}, {"category": "person", "bbox": [0.944, 0.319, 0.999, 0.604]}, {"category": "bottle", "bbox": [0.921, 0.46, 0.953, 0.636]}, {"category": "person", "bbox": [0.116, 0.171, 0.41, 0.829]}]}
+ {"id": "000000116173", "image": "COCO_val2014_000000116173.jpg", "captions": ["The boy is on his surfboard in the water riding it.", "a young boy riding a boogie board in the water", "A boy riding surf board in the ocean.", "A young boy is riding a surfboard on a small wave.", "A young boy is surfing in the ocean."], "instances": [{"category": "person", "bbox": [0.485, 0.238, 0.702, 0.821]}, {"category": "person", "bbox": [0.866, 0.223, 0.921, 0.29]}, {"category": "person", "bbox": [0.752, 0.146, 0.775, 0.188]}, {"category": "surfboard", "bbox": [0.239, 0.758, 0.782, 0.846]}, {"category": "surfboard", "bbox": [0.853, 0.277, 0.981, 0.29]}, {"category": "surfboard", "bbox": [0.727, 0.169, 0.801, 0.198]}, {"category": "person", "bbox": [0.637, 0.194, 0.677, 0.261]}]}
+ {"id": "000000186013", "image": "COCO_val2014_000000186013.jpg", "captions": ["A beach scene includes many different kites flying in a cloudy sky.", "Kites being flown at the beach at twilight.", "A beach with flags in the ground and kites overhead in the sky.", "A beach with rows of flags in the sand and kites flying overhead.", "A beach filled with kites and wind sails next to the ocean."], "instances": [{"category": "kite", "bbox": [0.174, 0.4, 0.351, 0.483]}, {"category": "kite", "bbox": [0.144, 0.13, 0.273, 0.17]}, {"category": "kite", "bbox": [0.236, 0.269, 0.268, 0.294]}, {"category": "kite", "bbox": [0.464, 0.204, 0.598, 0.271]}, {"category": "kite", "bbox": [0.61, 0.304, 0.659, 0.342]}, {"category": "kite", "bbox": [0.545, 0.435, 0.565, 0.452]}, {"category": "kite", "bbox": [0.027, 0.558, 0.151, 0.59]}, {"category": "kite", "bbox": [0.93, 0.429, 0.973, 0.536]}, {"category": "kite", "bbox": [0.684, 0.36, 0.697, 0.374]}, {"category": "surfboard", "bbox": [0.393, 0.627, 0.446, 0.934]}, {"category": "person", "bbox": [0.959, 0.685, 0.984, 0.713]}, {"category": "person", "bbox": [0.919, 0.681, 0.94, 0.725]}, {"category": "person", "bbox": [0.8, 0.597, 0.805, 0.61]}, {"category": "person", "bbox": [0.079, 0.928, 0.116, 0.975]}, {"category": "kite", "bbox": [0.743, 0.307, 0.755, 0.319]}, {"category": "kite", "bbox": [0.78, 0.322, 0.795, 0.335]}, {"category": "kite", "bbox": [0.536, 0.526, 0.597, 0.617]}, {"category": "person", "bbox": [0.941, 0.694, 0.961, 0.726]}, {"category": "kite", "bbox": [0.575, 0.446, 0.594, 0.471]}]}
+ {"id": "000000015029", "image": "COCO_val2014_000000015029.jpg", "captions": ["A man holding a white frisbee standing on top of a field.", "A man is playing frisbee next to a tent.", "Guy at the park holding a frisbee with people in the back under a tent", "A man is holding a Frisbee standing in the grass.", "Young adult male holding a frisbee at an event."], "instances": [{"category": "frisbee", "bbox": [0.138, 0.359, 0.215, 0.587]}, {"category": "person", "bbox": [0.16, 0.002, 0.726, 0.995]}, {"category": "person", "bbox": [0.81, 0.73, 0.852, 0.825]}, {"category": "person", "bbox": [0.786, 0.749, 0.833, 0.814]}, {"category": "person", "bbox": [0.847, 0.743, 0.89, 0.804]}, {"category": "person", "bbox": [0.614, 0.749, 0.706, 0.936]}]}
+ {"id": "000000500565", "image": "COCO_val2014_000000500565.jpg", "captions": ["A woman holding a child wrapped in a towel brushing her teeth.", "A woman is holding a baby who is wrapped in a towel and holding a toothbrush", "A woman holding a little boy who is brushing his teeth.", "A baby with a toothbrush in his mouth while being held by a woman", "a close up of an adult holding a child brushing their teeth"], "instances": [{"category": "toothbrush", "bbox": [0.586, 0.66, 0.754, 0.821]}, {"category": "person", "bbox": [0.002, 0.007, 0.637, 0.991]}, {"category": "person", "bbox": [0.357, 0.196, 0.998, 0.984]}]}
+ {"id": "000000297323", "image": "COCO_val2014_000000297323.jpg", "captions": ["Two buses are parked against a curb in front of a building.", "Two automobiles parked on the side of a building.", "two tourist buses parked on street in front of old industrial building", "Two unique city buses stopped at a stop sign.", "Buses parked outside by a building and stop sign."], "instances": [{"category": "bus", "bbox": [0.7, 0.711, 0.92, 0.881]}, {"category": "person", "bbox": [0.936, 0.771, 0.972, 0.833]}, {"category": "stop sign", "bbox": [0.237, 0.666, 0.285, 0.728]}, {"category": "bus", "bbox": [0.334, 0.71, 0.678, 0.935]}, {"category": "truck", "bbox": [0.335, 0.72, 0.683, 0.934]}, {"category": "person", "bbox": [0.34, 0.791, 0.367, 0.834]}]}
+ {"id": "000000441147", "image": "COCO_val2014_000000441147.jpg", "captions": ["Two antique suitcases sit stacked one on top of the other.", "Two suitcases are stacked on each other and one is black while the other is brown and yellow.", "a close up of two luggage suit cases stacked on each other", "A stack of antique luggage is displayed with price tags.", "two suitcases made of leather and stacked on top of each other"], "instances": [{"category": "suitcase", "bbox": [0.167, 0.025, 0.989, 0.445]}, {"category": "suitcase", "bbox": [0.002, 0.31, 0.994, 0.996]}]}
+ {"id": "000000353536", "image": "COCO_val2014_000000353536.jpg", "captions": ["A table topped with plates and glasses with eating utensils..", "a fork is laying on a small white plate", "dirty dishes on a table, and a bottle of something.", "a table top with some dishes on top of it", "A table full of dirty dishes is pictured in this image."], "instances": [{"category": "dining table", "bbox": [0.0, 0.007, 0.998, 0.988]}, {"category": "bottle", "bbox": [0.554, 0.002, 0.768, 0.411]}, {"category": "cup", "bbox": [0.372, 0.011, 0.544, 0.427]}, {"category": "fork", "bbox": [0.442, 0.464, 0.818, 0.572]}, {"category": "fork", "bbox": [0.089, 0.233, 0.272, 0.456]}, {"category": "spoon", "bbox": [0.144, 0.218, 0.326, 0.413]}, {"category": "cup", "bbox": [0.688, 0.056, 0.812, 0.361]}]}
+ {"id": "000000416256", "image": "COCO_val2014_000000416256.jpg", "captions": ["A cat laying on the floor next to a keyboard.", "an orange and white cat is laying next to a keyboard and some wires", "A cat is laying next to a computer keyboard.", "a cat laying on a floor next to a keyboard", "A CAT LAYING ON THE FLOOR AMIDST A COMPUTER,SPEAKERS,CORDS"], "instances": [{"category": "cat", "bbox": [0.235, 0.23, 0.737, 0.639]}, {"category": "keyboard", "bbox": [0.243, 0.562, 0.631, 0.836]}, {"category": "keyboard", "bbox": [0.058, 0.33, 0.277, 0.608]}]}
+ {"id": "000000214367", "image": "COCO_val2014_000000214367.jpg", "captions": ["Wood shading on the side of a window with brick siding.", "A tree filled with lots of red fruit near a building.", "By the window outside is a apple tree, where the apples are ready to be picked.", "Some very nice looking red fruity by a window,", "A shuttered window has a fruit tree outside it."], "instances": [{"category": "apple", "bbox": [0.214, 0.112, 0.408, 0.266]}, {"category": "apple", "bbox": [0.472, 0.166, 0.618, 0.293]}, {"category": "apple", "bbox": [0.055, 0.592, 0.172, 0.686]}, {"category": "apple", "bbox": [0.126, 0.661, 0.236, 0.739]}, {"category": "apple", "bbox": [0.52, 0.09, 0.609, 0.143]}, {"category": "apple", "bbox": [0.226, 0.354, 0.285, 0.409]}, {"category": "apple", "bbox": [0.0, 0.698, 0.096, 0.771]}, {"category": "apple", "bbox": [0.001, 0.646, 0.042, 0.713]}, {"category": "apple", "bbox": [0.258, 0.719, 0.329, 0.778]}]}
+ {"id": "000000210299", "image": "COCO_val2014_000000210299.jpg", "captions": ["A little boy riding his bike and wearing a helmet", "A little boy raveling down a road on a bike, with a yellow helmet on.", "The boy wears a helmet while riding his bicycle.", "a small child wearing a helmet and riding a bike", "A little boy wearing a helmet and riding a bike."], "instances": [{"category": "person", "bbox": [0.198, 0.259, 0.399, 0.679]}, {"category": "bicycle", "bbox": [0.213, 0.383, 0.408, 0.835]}]}
+ {"id": "000000088218", "image": "COCO_val2014_000000088218.jpg", "captions": ["Signs proclaim the famous Haight Ashbury intersection and district.", "a pole with street lights, signs and wires attached to it", "A traffic light at the intersection of Haight and Ashbury", "A traffic sign is shown with traffic signs above it.", "The street signs and traffic signal are below wires attached to the pole."], "instances": [{"category": "traffic light", "bbox": [0.443, 0.435, 0.658, 0.721]}]}
+ {"id": "000000020650", "image": "COCO_val2014_000000020650.jpg", "captions": ["Burger with broccoli, pickle, and fork on orange plate", "On a plate is kept a burger and a bowl of broccoli and a fork.", "There is half a sandwich on an orange plate with a pickle and a bowl of broccoli", "A A bowl and a sandwich on an orange plate on a table.", "A plate has a sandwich, broccoli, and a pickle."], "instances": [{"category": "sandwich", "bbox": [0.436, 0.155, 0.805, 0.859]}, {"category": "sandwich", "bbox": [0.311, 0.006, 0.748, 0.293]}, {"category": "fork", "bbox": [0.0, 0.665, 0.578, 0.876]}, {"category": "bowl", "bbox": [0.002, 0.263, 0.487, 0.744]}, {"category": "bowl", "bbox": [0.708, 0.003, 0.828, 0.03]}, {"category": "broccoli", "bbox": [0.185, 0.288, 0.366, 0.546]}, {"category": "broccoli", "bbox": [0.017, 0.344, 0.384, 0.654]}, {"category": "broccoli", "bbox": [0.31, 0.191, 0.466, 0.463]}, {"category": "broccoli", "bbox": [0.104, 0.107, 0.285, 0.342]}, {"category": "broccoli", "bbox": [0.092, 0.276, 0.242, 0.442]}, {"category": "dining table", "bbox": [0.002, 0.0, 0.999, 0.987]}]}
+ {"id": "000000514915", "image": "COCO_val2014_000000514915.jpg", "captions": ["A large black dog laying on a kitchen floor.", "A dog is laying down on the floor in the home.", "Black dog laying down on the kitchen floor next to it's bowls and toy", "A black dog with a red collar laying on a tiled floor.", "A black dog that is laying on the floor."], "instances": [{"category": "dog", "bbox": [0.087, 0.276, 0.812, 0.792]}, {"category": "bowl", "bbox": [0.437, 0.09, 0.533, 0.213]}, {"category": "bowl", "bbox": [0.537, 0.035, 0.665, 0.141]}]}
+ {"id": "000000205183", "image": "COCO_val2014_000000205183.jpg", "captions": ["A duck walking along a paved road next to a patch of grass.", "A close up of a duck walking on a path.", "a duck walks along a cement patch while looking down", "A white duck out of water, walking on the ground.", "A goose standing in the road, looking at the ground."], "instances": [{"category": "bird", "bbox": [0.291, 0.235, 0.859, 0.889]}]}
67
+ {"id": "000000534270", "image": "COCO_val2014_000000534270.jpg", "captions": ["Man and woman with umbrella hats sitting on top of a bridge.", "A couple equipped with umbrella hats taking a break from walking their dog on a bridge on a rainy day.", "Two people in ridiculous looking umbrella hats.", "two people with umbrella hats near one another", "A couple of people wearing umbrella hats next to the ocean."], "instances": [{"category": "dog", "bbox": [0.456, 0.832, 0.6, 0.983]}, {"category": "person", "bbox": [0.433, 0.464, 0.636, 0.975]}, {"category": "person", "bbox": [0.263, 0.321, 0.459, 0.978]}, {"category": "boat", "bbox": [0.912, 0.4, 0.978, 0.433]}, {"category": "boat", "bbox": [0.211, 0.236, 0.478, 0.304]}, {"category": "boat", "bbox": [0.144, 0.328, 0.189, 0.361]}, {"category": "umbrella", "bbox": [0.443, 0.402, 0.607, 0.473]}, {"category": "umbrella", "bbox": [0.325, 0.311, 0.483, 0.432]}, {"category": "umbrella", "bbox": [0.207, 0.738, 0.284, 0.778]}, {"category": "umbrella", "bbox": [0.489, 0.713, 0.649, 0.83]}]}
+ {"id": "000000408439", "image": "COCO_val2014_000000408439.jpg", "captions": ["Cliffs rise on the edge of a placid lake.", "A scenic view of a river with a train on the edge of it in the distance.", "A large lake surrounded by beautiful tree covered mountains.", "a landscape scene with water, mountains and trees", "A train on a waterfront track surrounded by mountains."], "instances": [{"category": "train", "bbox": [0.008, 0.591, 0.562, 0.644]}]}
+ {"id": "000000474253", "image": "COCO_val2014_000000474253.jpg", "captions": ["A man riding on the back of a horse through a river.", "A person is riding a horse through water.", "Horse and rider crossing waterway during competitive event.", "A woman riding a horse splashes through a large puddle.", "A young man riding a horse through some water."], "instances": [{"category": "horse", "bbox": [0.385, 0.235, 0.651, 0.814]}, {"category": "person", "bbox": [0.396, 0.06, 0.576, 0.675]}, {"category": "person", "bbox": [0.29, 0.148, 0.355, 0.333]}, {"category": "person", "bbox": [0.129, 0.163, 0.212, 0.349]}, {"category": "person", "bbox": [0.005, 0.014, 0.038, 0.165]}, {"category": "person", "bbox": [0.144, 0.011, 0.193, 0.155]}, {"category": "person", "bbox": [0.089, 0.007, 0.133, 0.162]}]}
+ {"id": "000000098029", "image": "COCO_val2014_000000098029.jpg", "captions": ["a table with many plates on it with a bread basket", "A table set for four has many foods and fruits on it.", "Several objects displayed on a kitchen table including bread, oranges and plating.", "Several dishes and food items sit on a table.", "An assortment of foods sitting on a round brown table."], "instances": [{"category": "refrigerator", "bbox": [0.013, 0.004, 0.37, 0.317]}, {"category": "bottle", "bbox": [0.467, 0.517, 0.555, 0.638]}, {"category": "bottle", "bbox": [0.602, 0.536, 0.658, 0.609]}, {"category": "chair", "bbox": [0.747, 0.367, 1.0, 0.592]}, {"category": "chair", "bbox": [0.044, 0.368, 0.358, 0.544]}, {"category": "cup", "bbox": [0.296, 0.465, 0.359, 0.54]}, {"category": "cup", "bbox": [0.709, 0.67, 0.782, 0.736]}, {"category": "cup", "bbox": [0.213, 0.684, 0.294, 0.753]}, {"category": "knife", "bbox": [0.787, 0.699, 0.922, 0.797]}, {"category": "knife", "bbox": [0.161, 0.539, 0.265, 0.584]}, {"category": "spoon", "bbox": [0.813, 0.674, 0.922, 0.759]}, {"category": "spoon", "bbox": [0.156, 0.555, 0.233, 0.587]}, {"category": "spoon", "bbox": [0.596, 0.467, 0.613, 0.509]}, {"category": "bowl", "bbox": [0.241, 0.753, 0.505, 0.935]}, {"category": "banana", "bbox": [0.632, 0.138, 0.718, 0.161]}, {"category": "apple", "bbox": [0.701, 0.152, 0.758, 0.191]}, {"category": "orange", "bbox": [0.607, 0.66, 0.692, 0.716]}, {"category": "orange", "bbox": [0.565, 0.636, 0.611, 0.667]}, {"category": "orange", "bbox": [0.526, 0.624, 0.572, 0.652]}, {"category": "orange", "bbox": [0.61, 0.628, 0.656, 0.657]}, {"category": "orange", "bbox": [0.599, 0.649, 0.643, 0.677]}, {"category": "dining table", "bbox": [0.013, 0.439, 0.964, 0.986]}, {"category": "cup", "bbox": [0.612, 0.489, 0.669, 0.548]}, {"category": "knife", "bbox": [0.605, 0.457, 0.638, 0.53]}, {"category": "apple", "bbox": [0.502, 0.137, 0.537, 0.159]}, {"category": "orange", "bbox": [0.54, 0.135, 0.563, 0.151]}, {"category": "orange", "bbox": [0.527, 0.129, 0.554, 0.142]}, {"category": "orange", "bbox": [0.611, 0.155, 0.641, 0.171]}, {"category": "chair", "bbox": [0.0, 0.843, 0.29, 0.989]}, {"category": "cup", "bbox": [0.353, 0.469, 0.411, 0.511]}, {"category": "cup", "bbox": [0.609, 0.716, 0.682, 0.786]}, {"category": "orange", "bbox": [0.638, 0.158, 0.679, 0.177]}, {"category": "cake", "bbox": [0.38, 0.821, 0.481, 0.895]}, {"category": "chair", "bbox": [0.79, 0.747, 1.0, 1.0]}, {"category": "bottle", "bbox": [0.719, 0.55, 0.769, 0.616]}, {"category": "bottle", "bbox": [0.795, 0.546, 0.873, 0.613]}, {"category": "knife", "bbox": [0.17, 0.799, 0.264, 0.88]}, {"category": "cup", "bbox": [0.317, 0.695, 0.391, 0.752]}]}
+ {"id": "000000294073", "image": "COCO_val2014_000000294073.jpg", "captions": ["A woman and a man standing between two brown horses.", "A COUPLE WEARING YELLOW DRESS STANDING NEAR TWO HORSES.", "An older couple stands between two horses.", "A man and a woman standing with two horses", "A man and a woman stand in between two horses."], "instances": [{"category": "horse", "bbox": [0.0, 0.052, 0.49, 0.989]}, {"category": "horse", "bbox": [0.632, 0.23, 1.0, 0.989]}, {"category": "person", "bbox": [0.425, 0.326, 0.696, 0.987]}, {"category": "person", "bbox": [0.627, 0.203, 0.828, 0.986]}, {"category": "book", "bbox": [0.525, 0.597, 0.644, 0.833]}]}
+ {"id": "000000203629", "image": "COCO_val2014_000000203629.jpg", "captions": ["A man on a cell phone in a public area holding his thumb up.", "A group of people gathered inside of a room.", "A man on his cellphone posing for a picture.", "A man giving a thumbs up while on a cell phone.", "The man is giving a thumbs up while on his phone."], "instances": [{"category": "cell phone", "bbox": [0.43, 0.459, 0.449, 0.503]}, {"category": "cup", "bbox": [0.756, 0.838, 0.865, 0.98]}, {"category": "person", "bbox": [0.232, 0.317, 0.603, 0.98]}, {"category": "person", "bbox": [0.602, 0.405, 1.0, 0.999]}, {"category": "person", "bbox": [0.003, 0.339, 0.313, 0.987]}, {"category": "person", "bbox": [0.164, 0.379, 0.258, 0.733]}, {"category": "person", "bbox": [0.564, 0.36, 0.673, 0.645]}, {"category": "person", "bbox": [0.241, 0.379, 0.336, 0.512]}, {"category": "person", "bbox": [0.682, 0.372, 0.736, 0.502]}, {"category": "person", "bbox": [0.654, 0.428, 0.734, 0.536]}, {"category": "person", "bbox": [0.718, 0.368, 0.787, 0.508]}, {"category": "person", "bbox": [0.148, 0.362, 0.205, 0.529]}, {"category": "person", "bbox": [0.001, 0.431, 0.044, 0.564]}, {"category": "cup", "bbox": [0.901, 0.808, 0.995, 0.982]}]}
+ {"id": "000000119876", "image": "COCO_val2014_000000119876.jpg", "captions": ["A man dressed loudly is using his cell phone.", "A man talking on the phone while he walks down the street.", "A man with pink hair talking on a cell phone.", "A man in a purple shirt and tie and purple hair.", "a man colored his hair in purple walking on the road"], "instances": [{"category": "bicycle", "bbox": [0.525, 0.222, 0.924, 0.608]}, {"category": "bicycle", "bbox": [0.895, 0.249, 1.0, 0.642]}, {"category": "person", "bbox": [0.0, 0.0, 0.738, 1.0]}, {"category": "tie", "bbox": [0.319, 0.255, 0.423, 0.638]}, {"category": "cell phone", "bbox": [0.411, 0.13, 0.426, 0.161]}, {"category": "handbag", "bbox": [0.369, 0.205, 0.575, 0.839]}]}
+ {"id": "000000164255", "image": "COCO_val2014_000000164255.jpg", "captions": ["An umbrella that is standing in the sand.", "An umbrella is stuck in the sand on the beach.", "a colorful striped umbrella on the beach near the ocean", "A colorful umbrella is set up at the beach.", "The colorful umbrella is sitting by the beach,"], "instances": [{"category": "umbrella", "bbox": [0.0, 0.101, 0.567, 0.575]}]}
+ {"id": "000000192817", "image": "COCO_val2014_000000192817.jpg", "captions": ["A view from a window high up in the sky.", "A bunch of mountains seen from a plane window.", "The window from a plane overlooking the ground.", "The view of a mountain area from an airplane window.", "An aerial view of mountains and lakes from an airplane window."], "instances": []}
+ {"id": "000000258285", "image": "COCO_val2014_000000258285.jpg", "captions": ["Two large passenger jets flying over a beach filled with birds.", "A plane is flying over a bird filed lake", "Two airplanes are in the sky over blue water.", "An airplane landing over an airplane on the ground.", "A photo of two plans with water and birds surrounding it , one plane in the air one one the ground."], "instances": [{"category": "bird", "bbox": [0.507, 0.941, 0.536, 0.973]}, {"category": "bird", "bbox": [0.304, 0.933, 0.315, 0.95]}, {"category": "bird", "bbox": [0.129, 0.885, 0.143, 0.912]}, {"category": "bird", "bbox": [0.158, 0.851, 0.165, 0.87]}, {"category": "bird", "bbox": [0.404, 0.839, 0.429, 0.864]}, {"category": "bird", "bbox": [0.498, 0.833, 0.513, 0.861]}, {"category": "airplane", "bbox": [0.276, 0.085, 0.825, 0.316]}, {"category": "airplane", "bbox": [0.478, 0.252, 0.983, 0.495]}, {"category": "bird", "bbox": [0.552, 0.828, 0.564, 0.844]}, {"category": "bird", "bbox": [0.789, 0.812, 0.798, 0.836]}, {"category": "bird", "bbox": [0.927, 0.82, 0.936, 0.838]}, {"category": "bird", "bbox": [0.65, 0.828, 0.664, 0.849]}, {"category": "bird", "bbox": [0.752, 0.81, 0.763, 0.83]}, {"category": "bird", "bbox": [0.841, 0.817, 0.852, 0.828]}, {"category": "bird", "bbox": [0.292, 0.849, 0.311, 0.868]}, {"category": "bird", "bbox": [0.005, 0.727, 0.981, 0.998]}]}
+ {"id": "000000506483", "image": "COCO_val2014_000000506483.jpg", "captions": ["An art installation is placed by a street.", "People sit near a display of large artworks including an oversize bench and painted feline heads.", "Looking down on a giant rocking bench and large animal heads.", "An over sized wooden bench next to two massive animal art sculptures.", "artistic sculptures and images on a city street"], "instances": [{"category": "car", "bbox": [0.656, 0.939, 0.933, 1.0]}, {"category": "person", "bbox": [0.08, 0.664, 0.147, 0.805]}, {"category": "person", "bbox": [0.154, 0.646, 0.217, 0.821]}, {"category": "bench", "bbox": [0.316, 0.124, 0.951, 0.635]}, {"category": "backpack", "bbox": [0.062, 0.701, 0.097, 0.769]}, {"category": "person", "bbox": [0.0, 0.132, 0.031, 0.197]}]}
+ {"id": "000000502168", "image": "COCO_val2014_000000502168.jpg", "captions": ["a fleet of naval ships in the ocean", "A group of men on aircraft carrier with other boats in the distance.", "A large ship floating in the ocean next to other ships.", "Several men on a boat looking over the side.", "The men wear hardhats as they work on the aircraft carrier."], "instances": [{"category": "boat", "bbox": [0.634, 0.292, 1.0, 0.982]}, {"category": "person", "bbox": [0.675, 0.507, 0.736, 0.731]}, {"category": "person", "bbox": [0.684, 0.737, 0.817, 1.0]}, {"category": "person", "bbox": [0.803, 0.691, 0.883, 0.932]}, {"category": "person", "bbox": [0.741, 0.56, 0.798, 0.767]}, {"category": "person", "bbox": [0.924, 0.269, 0.951, 0.367]}, {"category": "boat", "bbox": [0.079, 0.171, 0.172, 0.231]}, {"category": "boat", "bbox": [0.863, 0.131, 0.961, 0.239]}, {"category": "boat", "bbox": [0.435, 0.288, 0.46, 0.313]}, {"category": "boat", "bbox": [0.591, 0.186, 0.605, 0.222]}, {"category": "person", "bbox": [0.451, 0.289, 0.455, 0.296]}, {"category": "person", "bbox": [0.446, 0.29, 0.451, 0.296]}, {"category": "person", "bbox": [0.872, 0.627, 0.957, 0.966]}, {"category": "person", "bbox": [0.44, 0.288, 0.446, 0.3]}]}
+ {"id": "000000319432", "image": "COCO_val2014_000000319432.jpg", "captions": ["Man holding two shirts with luggage and window", "A man holding clothes on a hanger with a suitcase in front of him.", "A man show a red and a white clothing hangers.", "A man holding his garment bags in both hands", "A man holding up some clothes in some hanger bags."], "instances": [{"category": "person", "bbox": [0.0, 0.092, 0.776, 0.852]}, {"category": "suitcase", "bbox": [0.153, 0.798, 0.587, 1.0]}]}
+ {"id": "000000131019", "image": "COCO_val2014_000000131019.jpg", "captions": ["Two zebras and two monkeys walking on the grass.", "Two giraffes and another animal are on green grass.", "A baboon and two zebras grazing on the savannah.", "A baboon and its baby eat by two zebras in the grass", "Monkey standing behind two zebras as they graze."], "instances": [{"category": "zebra", "bbox": [0.367, 0.258, 0.834, 0.646]}, {"category": "zebra", "bbox": [0.161, 0.13, 0.396, 0.375]}, {"category": "bird", "bbox": [0.309, 0.138, 0.34, 0.163]}]}
ChatUniVi/eval/table/model.jsonl ADDED
@@ -0,0 +1,5 @@
+ {"model_id": "vicuna-13b:20230322-clean-lang", "model_name": "vicuna-13b", "model_version": "20230322-clean-lang", "model_metadata": "vicuna-13b-20230322-clean-lang"}
+ {"model_id": "alpaca-13b:v1", "model_name": "alpaca-13b", "model_version": "v1", "model_metadata": "alpaca-13b"}
+ {"model_id": "llama-13b:v1", "model_name": "llama-13b", "model_version": "v1", "model_metadata": "hf-llama-13b"}
+ {"model_id": "bard:20230327", "model_name": "bard", "model_version": "20230327", "model_metadata": "Google Bard 20230327"}
+ {"model_id": "gpt-3.5-turbo:20230327", "model_name": "gpt-3.5-turbo", "model_version": "20230327", "model_metadata": "OpenAI ChatGPT gpt-3.5-turbo Chat Completion"}
ChatUniVi/eval/table/question.jsonl ADDED
@@ -0,0 +1,80 @@
+ {"question_id": 1, "text": "How can I improve my time management skills?", "category": "generic"}
+ {"question_id": 2, "text": "What are the most effective ways to deal with stress?", "category": "generic"}
+ {"question_id": 3, "text": "What are the main differences between Python and JavaScript programming languages?", "category": "generic"}
+ {"question_id": 4, "text": "How can I increase my productivity while working from home?", "category": "generic"}
+ {"question_id": 5, "text": "Can you explain the basics of quantum computing?", "category": "generic"}
+ {"question_id": 6, "text": "What are the differences between plant-based and animal-based protein sources?", "category": "generic"}
+ {"question_id": 7, "text": "How can I develop my critical thinking skills?", "category": "generic"}
+ {"question_id": 8, "text": "What are the major challenges faced by the education sector today?", "category": "generic"}
+ {"question_id": 9, "text": "What are the primary factors that influence consumer behavior?", "category": "generic"}
+ {"question_id": 10, "text": "What are the most effective strategies for conflict resolution in the workplace?", "category": "generic"}
+ {"question_id": 11, "text": "What are some potential implications of using a single-use plastic bottle versus a reusable bottle on both the environment and human health?", "category": "knowledge"}
+ {"question_id": 12, "text": "What factors would you consider when designing an inclusive and accessible public transportation system?", "category": "knowledge"}
+ {"question_id": 13, "text": "How can governments utilize fiscal and monetary policies to combat economic recessions?", "category": "knowledge"}
+ {"question_id": 14, "text": "How do language and cultural barriers affect the way people communicate and form relationships in multicultural societies?", "category": "knowledge"}
+ {"question_id": 15, "text": "Describe a scenario where artificial intelligence could be used to improve the quality and efficiency of healthcare delivery.", "category": "knowledge"}
+ {"question_id": 16, "text": "Explain the process of gene editing using CRISPR-Cas9 technology, and discuss its potential applications and ethical implications.", "category": "knowledge"}
+ {"question_id": 17, "text": "How do vaccinations work to protect individuals and communities from infectious diseases, and what is herd immunity?", "category": "knowledge"}
+ {"question_id": 18, "text": "How do social media platforms influence the way people consume and share news, and what are the potential implications for the spread of misinformation?", "category": "knowledge"}
+ {"question_id": 19, "text": "How do cultural, social, and economic factors influence people's food choices, and how can this knowledge be used to promote healthier diets?", "category": "knowledge"}
+ {"question_id": 20, "text": "Explain the process of natural selection and how it contributes to the evolution and adaptation of species.", "category": "knowledge"}
+ {"question_id": 21, "text": "How would you introduce yourself as a medieval knight at a royal banquet?", "category": "roleplay"}
+ {"question_id": 22, "text": "As a pirate captain, what would you say to your crew to motivate them to search for hidden treasure?", "category": "roleplay"}
+ {"question_id": 23, "text": "If you were a Shakespearean character, how would you declare your love for someone in a soliloquy?", "category": "roleplay"}
+ {"question_id": 24, "text": "As a superhero, how would you explain your origin story to a curious child?", "category": "roleplay"}
+ {"question_id": 25, "text": "Imagine you are a time traveler from the year 3000. What technological advancements would you tell people about?", "category": "roleplay"}
+ {"question_id": 26, "text": "As a sports commentator, describe the winning play in the final seconds of a championship game.", "category": "roleplay"}
+ {"question_id": 27, "text": "Pretend to be a world-famous chef. How would you describe your signature dish to a panel of judges?", "category": "roleplay"}
+ {"question_id": 28, "text": "You are a mountain climber reaching the summit of Mount Everest. Describe your emotions and the view from the top.", "category": "roleplay"}
+ {"question_id": 29, "text": "As a space colonist on Mars, describe your daily life and the challenges you face living on another planet.", "category": "roleplay"}
+ {"question_id": 30, "text": "Pretend to be a character in a post-apocalyptic world. Describe how you survive and the allies you encounter.", "category": "roleplay"}
+ {"question_id": 31, "text": "How can you determine if a restaurant is popular among locals or mainly attracts tourists, and why might this information be useful?", "category": "common-sense"}
+ {"question_id": 32, "text": "What are some subtle clues that suggest someone is pretending to understand a topic or conversation when they are actually confused or uninformed?", "category": "common-sense"}
+ {"question_id": 33, "text": "Why might someone choose to use a paper map or ask for directions instead of relying on a GPS device or smartphone app?", "category": "common-sense"}
+ {"question_id": 34, "text": "How can you determine if a person is genuinely interested in a conversation or simply being polite?", "category": "common-sense"}
+ {"question_id": 35, "text": "Why might someone prefer to shop at a small, locally-owned business instead of a large chain store, even if the prices are higher?", "category": "common-sense"}
+ {"question_id": 36, "text": "How can you assess the credibility of a source of information, such as a news article or blog post, without relying solely on the reputation of the author or publisher?", "category": "common-sense"}
+ {"question_id": 37, "text": "Why do some people enjoy the sensation of being scared, such as by watching horror movies or going on roller coasters, while others avoid these experiences?", "category": "common-sense"}
+ {"question_id": 38, "text": "How can observing the behavior of other people in a social situation provide clues about cultural norms and expectations?", "category": "common-sense"}
+ {"question_id": 39, "text": "Do we have a moral obligation to explore space, or should we focus on solving Earth's problems first?", "category": "common-sense"}
+ {"question_id": 40, "text": "In a world where automation is becoming increasingly prevalent, is it more important to prioritize job creation or technological progress?", "category": "common-sense"}
+ {"question_id": 41, "text": "How many times does the average human blink in a lifetime? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 42, "text": "How many atoms are in a grain of salt? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 43, "text": "How many lightning strikes occur on Earth each day? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 44, "text": "How many balloons would it take to lift a house like in the movie \"Up\"? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 45, "text": "How many text messages are sent globally in a minute? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 46, "text": "How many words are spoken daily on Earth? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 47, "text": "How many snowflakes fall during a typical winter? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 48, "text": "How many pages are in all the books ever written? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 49, "text": "How many times has the Earth orbited the Sun since the beginning of life? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 50, "text": "How many songs have been recorded throughout history? Try to explain your answer. Your explanation should take the reader through your reasoning step-by-step.", "category": "fermi"}
+ {"question_id": 51, "text": "What if the Internet had been invented during the Renaissance period?", "category": "counterfactual"}
+ {"question_id": 52, "text": "What if the Aztecs had successfully repelled the Spanish conquistadors?", "category": "counterfactual"}
+ {"question_id": 53, "text": "What if the Black Death had not occurred in the 14th century?", "category": "counterfactual"}
+ {"question_id": 54, "text": "What if Isaac Newton had focused on biology instead of physics?", "category": "counterfactual"}
+ {"question_id": 55, "text": "What if the Beatles had never formed as a band?", "category": "counterfactual"}
+ {"question_id": 56, "text": "What if Alan Turing had not cracked the Enigma code during World War II?", "category": "counterfactual"}
+ {"question_id": 57, "text": "What if the Suez Canal had never been constructed?", "category": "counterfactual"}
+ {"question_id": 58, "text": "What if the Maya civilization had never mysteriously collapsed?", "category": "counterfactual"}
+ {"question_id": 59, "text": "What if Christopher Columbus had not discovered the Americas?", "category": "counterfactual"}
+ {"question_id": 60, "text": "What if Vincent van Gogh had been a successful artist during his lifetime?", "category": "counterfactual"}
+ {"question_id": 61, "text": "Develop a C++ program that reads a text file line by line and counts the number of occurrences of a specific word in the file.", "category": "coding"}
+ {"question_id": 62, "text": "Implement a Python function to find the longest common subsequence of two input strings using dynamic programming.", "category": "coding"}
+ {"question_id": 63, "text": "Implement a regular expression in Python to validate an email address.", "category": "coding"}
+ {"question_id": 64, "text": "Write a program to find the nth Fibonacci number using dynamic programming.", "category": "coding"}
+ {"question_id": 65, "text": "Implement a binary search algorithm to find a specific element in a sorted array.", "category": "coding"}
+ {"question_id": 66, "text": "Implement a queue data structure using two stacks in Python.", "category": "coding"}
+ {"question_id": 67, "text": "Implement a program to find the common elements in two arrays without using any extra data structures.", "category": "coding"}
+ {"question_id": 68, "text": "Given that f(x) = 5x^3 - 2x + 3, find the value of f(2).", "category": "math"}
+ {"question_id": 69, "text": "Solve for x in the equation 3x + 10 = 5(x - 2).", "category": "math"}
+ {"question_id": 70, "text": "If the endpoints of a line segment are (2, -2) and (10, 4), what is the length of the segment?", "category": "math"}
+ {"question_id": 71, "text": "Can you help me write a formal email to a potential business partner proposing a joint venture?", "category": "writing"}
+ {"question_id": 72, "text": "Can you help me write a resignation letter to my current employer, while leaving on good terms and expressing gratitude for the opportunities provided?", "category": "writing"}
+ {"question_id": 73, "text": "Use an appropriate format to structure a formal letter of recommendation for a student applying to a prestigious graduate program in computer science.", "category": "writing"}
+ {"question_id": 74, "text": "Write a compelling product launch announcement email to inform our customers of our new software solution.", "category": "writing"}
+ {"question_id": 75, "text": "Draft an apology email to a customer who experienced a delay in their order, and provide reassurance that the issue has been resolved.", "category": "writing"}
+ {"question_id": 76, "text": "Write a script for a YouTube video exploring the history and cultural significance of jazz.", "category": "writing"}
+ {"question_id": 77, "text": "Compose an engaging travel blog post about a recent trip to Hawaii, highlighting cultural experiences and must-see attractions.", "category": "writing"}
+ {"question_id": 78, "text": "Write a captivating movie review for a recently released science fiction film, discussing its plot, characters, and special effects.", "category": "writing"}
+ {"question_id": 79, "text": "Structure a podcast script for an episode discussing the influence of streaming platforms on the music industry.", "category": "writing"}
+ {"question_id": 80, "text": "Write a symphony concert review, discussing the orchestra's performance and overall audience experience.", "category": "writing"}
ChatUniVi/eval/table/reviewer.jsonl ADDED
@@ -0,0 +1,4 @@
+ {"reviewer_id": "gpt-4-0328-default", "prompt_id": 1, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for general questions"}
+ {"reviewer_id": "gpt-4-0328-coding", "prompt_id": 2, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for coding questions"}
+ {"reviewer_id": "gpt-4-0328-math", "prompt_id": 3, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for math questions"}
+ {"reviewer_id": "gpt-4-0417-visual", "prompt_id": 4, "metadata": {"temperature": 0.2, "max_tokens": 1024}, "description": "GPT-4 for visual questions"}
ChatUniVi/eval/table/rule.json ADDED
@@ -0,0 +1,11 @@
+ {
+ "coding": {"role": "Assistant", "prompt": "Your task is to evaluate the coding abilities of the above two assistants. They have been asked to implement a program to solve a given problem. Please review their code submissions, paying close attention to their problem-solving approach, code structure, readability, and the inclusion of helpful comments.\n\nPlease ensure that the assistants' submissions:\n\n1. Correctly implement the given problem statement.\n2. Contain accurate and efficient code.\n3. Include clear and concise comments that explain the code's logic and functionality.\n4. Adhere to proper coding standards and best practices.\n\nOnce you have carefully reviewed both submissions, provide detailed feedback on their strengths and weaknesses, along with any suggestions for improvement. You should first output a single line containing two scores on the scale of 1-10 (1: no code/no sense; 10: perfect) for Assistant 1 and 2, respectively. Then give extra comments starting from the next line."},
+ "math": {"role": "Assistant", "prompt": "We would like to request your feedback on the mathematical proficiency of two AI assistants regarding the given user question.\nFirstly, please solve the problem independently, without referring to the answers provided by Assistant 1 and Assistant 2.\nAfterward, please examine the problem-solving process of Assistant 1 and Assistant 2 step-by-step to ensure their correctness, identifying any incorrect steps if present. Your evaluation should take into account not only the answer but also the problem-solving steps.\nFinally, please output a Python tuple containing two numerical scores for Assistant 1 and Assistant 2, ranging from 1 to 10, respectively. If applicable, explain the reasons for any variations in their scores and determine which assistant performed better."},
+ "default": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above.\nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with five descriptive sentences describing the same image and the bounding box coordinates of each object in the scene. These coordinates are in the form of bounding boxes, represented as (x1, y1, x2, y2) with floating numbers ranging from 0 to 1. These values correspond to the top left x, top left y, bottom right x, and bottom right y. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "llava_bench_conv": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "llava_bench_detail": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."},
+ "llava_bench_complex": {"role": "Assistant", "prompt": "We would like to request your feedback on the performance of two AI assistants in response to the user question displayed above. The user asks the question on observing an image. For your reference, the visual content in the image is represented with a few sentences describing the image. \nPlease rate the helpfulness, relevance, accuracy, level of details of their responses. Each assistant receives an overall score on a scale of 1 to 10, where a higher score indicates better overall performance.\nPlease first output a single line containing only two values indicating the scores for Assistant 1 and 2, respectively. The two scores are separated by a space.\nIn the subsequent line, please provide a comprehensive explanation of your evaluation, avoiding any potential bias and ensuring that the order in which the responses were presented does not affect your judgment."}
+ }
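The rule table above maps a question category (matching the `category` field in question.jsonl) to the judge prompt used for that question. A minimal sketch of how such a lookup might work, with a fallback to the `"default"` rule for categories that have no dedicated prompt; the trimmed `rules` dict and the `pick_rule` helper below are illustrative, not part of this repository:

```python
# Trimmed stand-in for the rule.json table above (prompts elided).
rules = {
    "coding": {"role": "Assistant", "prompt": "..."},
    "math": {"role": "Assistant", "prompt": "..."},
    "default": {"role": "Assistant", "prompt": "..."},
}

def pick_rule(category: str) -> dict:
    # Categories such as "fermi" or "writing" fall back to the generic rule.
    return rules.get(category, rules["default"])

print(pick_rule("math")["role"])                 # Assistant
print(pick_rule("writing") is rules["default"])  # True
```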
ChatUniVi/model/__init__.py ADDED
@@ -0,0 +1 @@
+ from .language_model.llama import ChatUniViLlamaForCausalLM, ChatUniViConfig
ChatUniVi/model/apply_delta.py ADDED
@@ -0,0 +1,44 @@
+ import argparse
+
+ import torch
+ from tqdm import tqdm
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from ChatUniVi import ChatUniViLlamaForCausalLM
+
+
+ def apply_delta(base_model_path, target_model_path, delta_path):
+     print("Loading base model")
+     base = AutoModelForCausalLM.from_pretrained(
+         base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+
+     print("Loading delta")
+     delta = ChatUniViLlamaForCausalLM.from_pretrained(delta_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+     delta_tokenizer = AutoTokenizer.from_pretrained(delta_path)
+
+     print("Applying delta")
+     for name, param in tqdm(delta.state_dict().items(), desc="Applying delta"):
+         if name not in base.state_dict():
+             assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
+             continue
+         if param.data.shape == base.state_dict()[name].shape:
+             param.data += base.state_dict()[name]
+         else:
+             assert name in ['model.embed_tokens.weight', 'lm_head.weight'], \
+                 f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
+             bparam = base.state_dict()[name]
+             param.data[:bparam.shape[0], :bparam.shape[1]] += bparam
+
+     print("Saving target model")
+     delta.save_pretrained(target_model_path)
+     delta_tokenizer.save_pretrained(target_model_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--base-model-path", type=str, required=True)
+     parser.add_argument("--target-model-path", type=str, required=True)
+     parser.add_argument("--delta-path", type=str, required=True)
+
+     args = parser.parse_args()
+
+     apply_delta(args.base_model_path, args.target_model_path, args.delta_path)
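The merge rule in apply_delta.py can be illustrated without torch: same-shape parameters are summed elementwise, while parameters that grew relative to the base (e.g. embedding matrices extended with new tokens) add the base weights into their top-left block and keep the extra rows untouched. A sketch of that arithmetic on plain nested lists (the `merge` helper is illustrative, not part of this repository):

```python
def merge(delta, base):
    # Add `base` into the top-left block of `delta`; any rows/columns that
    # exist only in `delta` (new tokens) pass through unchanged.
    rows, cols = len(base), len(base[0])
    merged = [row[:] for row in delta]  # copy so the delta stays intact
    for i in range(rows):
        for j in range(cols):
            merged[i][j] += base[i][j]
    return merged

base = [[1.0, 2.0]]               # 1 x 2 "base" embedding
delta = [[0.5, 0.5], [3.0, 3.0]]  # 2 x 2 "delta" with one extra row
print(merge(delta, base))         # [[1.5, 2.5], [3.0, 3.0]]
```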
ChatUniVi/model/arch.py ADDED
@@ -0,0 +1,652 @@
+ from abc import ABC, abstractmethod
+ import torch
+ import torch.nn as nn
+
+ from .multimodal_encoder.builder import build_vision_tower
+ from ChatUniVi.constants import *
+ from .cluster import CTM, TCBlock
+ from collections import OrderedDict
+ from .multimodal_projector.builder import build_vision_projector
+
+
+ class MetaModel:
+     def __init__(self, config):
+         super(MetaModel, self).__init__(config)
+
+         if hasattr(config, "mm_vision_tower"):
+             self.vision_tower = build_vision_tower(config, delay_load=True)
+             self.mm_projector = nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+         if hasattr(config, "config"):
+             self.use_cluster = config.config["use_cluster"]
+             if self.use_cluster:
+                 self.ctm0 = CTM(sample_ratio=config.config["spatial_cluster_rate0"], embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=5)
+                 self.block0 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
+
+                 self.ctm1 = CTM(sample_ratio=config.config["spatial_cluster_rate1"], embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=3)
+                 self.block1 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
+
+                 self.ctm2 = CTM(sample_ratio=config.config["spatial_cluster_rate2"], embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=3)
+                 self.block2 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
+
+                 self.ctm3 = CTM(sample_ratio=config.config["temporal_cluster_rate"], embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=5)
+                 self.block3 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
+         else:
+             self.use_cluster = False
+
+ def get_vision_tower(self):
39
+ vision_tower = getattr(self, 'vision_tower', None)
40
+ if type(vision_tower) is list:
41
+ vision_tower = vision_tower[0]
42
+ return vision_tower
43
+
44
+ def initialize_vision_modules(self, model_args, fsdp=None):
45
+ vision_tower = model_args.vision_tower
46
+ mm_vision_select_layer = model_args.mm_vision_select_layer
47
+ mm_vision_select_feature = model_args.mm_vision_select_feature
48
+ pretrain_mm_mlp_adapter = model_args.pretrain_mm_mlp_adapter
49
+
50
+ self.config.mm_vision_tower = vision_tower
51
+
52
+ vision_tower = build_vision_tower(model_args)
53
+
54
+ self.config.use_mm_proj = True
55
+ self.config.mm_hidden_size = vision_tower.hidden_size
56
+ self.config.mm_vision_select_layer = mm_vision_select_layer
57
+ self.config.mm_vision_select_feature = mm_vision_select_feature
58
+
59
+ if fsdp is not None and len(fsdp) > 0:
60
+ self.vision_tower = [vision_tower]
61
+ else:
62
+ self.vision_tower = vision_tower
63
+
64
+ if not hasattr(self, 'mm_projector'):
65
+ self.mm_projector = build_vision_projector(self.config)
66
+
67
+ if pretrain_mm_mlp_adapter is not None:
68
+ mm_projector_weights = torch.load(pretrain_mm_mlp_adapter, map_location='cpu')
69
+ def get_w(weights, keyword):
70
+ return {k.split(keyword + '.')[1]: v for k, v in weights.items() if keyword in k}
71
+
72
+ self.mm_projector.load_state_dict(get_w(mm_projector_weights, 'mm_projector'))
73
+
74
+ def initialize_cluster_modules(self, model_args):
75
+ self.use_cluster = model_args.use_cluster
76
+
77
+ if self.use_cluster and not hasattr(self, 'ctm0'):
78
+ self.ctm0 = CTM(sample_ratio=model_args.spatial_cluster_rate0, embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=5)
79
+ self.block0 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
80
+
81
+ self.ctm1 = CTM(sample_ratio=model_args.spatial_cluster_rate1, embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=3)
82
+ self.block1 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
83
+
84
+ self.ctm2 = CTM(sample_ratio=model_args.spatial_cluster_rate2, embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=3)
85
+ self.block2 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
86
+
87
+ self.ctm3 = CTM(sample_ratio=model_args.temporal_cluster_rate, embed_dim=self.config.mm_hidden_size, dim_out=self.config.mm_hidden_size, k=5)
88
+ self.block3 = TCBlock(dim=self.config.mm_hidden_size, num_heads=8)
89
+
90
+
91
+ class ChatUniViMetaForCausalLM(ABC):
92
+ @abstractmethod
93
+ def get_model(self):
94
+ pass
95
+
96
+ def get_vision_tower(self):
97
+ return self.get_model().get_vision_tower()
98
+
99
+ def encode_images(self, images):
100
+ image_features = self.get_model().get_vision_tower()(images, select_feature="patch")
101
+ return image_features
102
+
103
+ def positional_encoding(self, x, num_features=1024, max_len=64):
104
+ p = torch.zeros((1, max_len, num_features))
105
+ _x = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1) / torch.pow(10000,
106
+ torch.arange(0, num_features, 2, dtype=torch.float32) / num_features)
107
+
108
+ p[:, :, 0::2] = torch.sin(_x)
109
+ p[:, :, 1::2] = torch.cos(_x)
110
+ x = x + p[:, :x.shape[1], :].to(x.device).to(x.dtype)
111
+ return x
112
+
113
+ def project(self, image_features, input_type="image"):
114
+ if self.get_model().use_cluster:
115
+ if input_type == "image":
116
+ cluster_image_features = []
117
+ token_dict = {'x': image_features,
118
+ 'token_num': image_features.size(1),
119
+ 'idx_token': torch.arange(image_features.size(1))[None, :].repeat(
120
+ image_features.size(0), 1),
121
+ 'agg_weight': image_features.new_ones(image_features.size(0), image_features.size(1),
122
+ 1),
123
+ 'mask': None}
124
+
125
+ token_dict = self.get_model().block0(self.get_model().ctm0(token_dict))
126
+ cluster_image_features.append(token_dict["x"])
127
+
128
+ token_dict = self.get_model().block1(self.get_model().ctm1(token_dict))
129
+ cluster_image_features.append(token_dict["x"])
130
+
131
+ token_dict = self.get_model().block2(self.get_model().ctm2(token_dict))
132
+ cluster_image_features.append(token_dict["x"])
133
+
134
+ image_features = torch.cat(cluster_image_features, dim=1)
135
+ image_features = image_features.to(self.get_model().mm_projector.weight.dtype)
136
+ else:
137
+ cls_features = torch.mean(image_features, dim=1, keepdim=False).unsqueeze(0).clone()
138
+ token_dict = {'x': cls_features,
139
+ 'token_num': cls_features.size(1),
140
+ 'idx_token': torch.arange(cls_features.size(1))[None, :].repeat(
141
+ cls_features.size(0), 1),
142
+ 'agg_weight': cls_features.new_ones(cls_features.size(0), cls_features.size(1),
143
+ 1),
144
+ 'mask': None}
145
+
146
+ down_dict, token_dict = self.get_model().ctm3(token_dict)
147
+ events = OrderedDict()
148
+
149
+ max_len = 0
150
+ for id, i in enumerate(down_dict["idx_token"][0].tolist()):
151
+ if i not in events:
152
+ events[i] = [id]
153
+ else:
154
+ events[i].append(id)
155
+ max_len = len(events[i]) if max_len < len(events[i]) else max_len
156
+
157
+ cluster_image_features = []
158
+ token_dict = {'x': image_features,
159
+ 'token_num': image_features.size(1),
160
+ 'idx_token': torch.arange(image_features.size(1))[None, :].repeat(
161
+ image_features.size(0), 1),
162
+ 'agg_weight': image_features.new_ones(image_features.size(0), image_features.size(1),
163
+ 1),
164
+ 'mask': None}
165
+
166
+ token_dict0 = self.get_model().block0(self.get_model().ctm0(token_dict))
167
+ token_dict1 = self.get_model().block1(self.get_model().ctm1(token_dict0))
168
+ token_dict2 = self.get_model().block2(self.get_model().ctm2(token_dict1))
169
+
170
+ for id, key in enumerate(events):
171
+ cur_image_features0 = torch.cat([token_dict0["x"][i] for i in events[key]], dim=0).unsqueeze(0)
172
+ token_dict = {'x': cur_image_features0,
173
+ 'token_num': cur_image_features0.size(1),
174
+ 'idx_token': torch.arange(cur_image_features0.size(1))[None, :].repeat(
175
+ cur_image_features0.size(0), 1),
176
+ 'agg_weight': cur_image_features0.new_ones(cur_image_features0.size(0),
177
+ cur_image_features0.size(1),
178
+ 1),
179
+ 'mask': None}
180
+
181
+ cur_token_dict0 = self.get_model().block0(self.get_model().ctm0(token_dict))
182
+ cluster_image_features.append(cur_token_dict0["x"])
183
+
184
+ cur_image_features1 = torch.cat([token_dict1["x"][i] for i in events[key]], dim=0).unsqueeze(0)
185
+ token_dict = {'x': cur_image_features1,
186
+ 'token_num': cur_image_features1.size(1),
187
+ 'idx_token': torch.arange(cur_image_features1.size(1))[None, :].repeat(
188
+ cur_image_features1.size(0), 1),
189
+ 'agg_weight': cur_image_features1.new_ones(cur_image_features1.size(0),
190
+ cur_image_features1.size(1),
191
+ 1),
192
+ 'mask': None}
193
+
194
+ cur_token_dict1 = self.get_model().block1(self.get_model().ctm1(token_dict))
195
+ cluster_image_features.append(cur_token_dict1["x"])
196
+
197
+ cur_image_features2 = torch.cat([token_dict2["x"][i] for i in events[key]], dim=0).unsqueeze(0)
198
+ token_dict = {'x': cur_image_features2,
199
+ 'token_num': cur_image_features2.size(1),
200
+ 'idx_token': torch.arange(cur_image_features2.size(1))[None, :].repeat(
201
+ cur_image_features2.size(0), 1),
202
+ 'agg_weight': cur_image_features2.new_ones(cur_image_features2.size(0),
203
+ cur_image_features2.size(1),
204
+ 1),
205
+ 'mask': None}
206
+
207
+ cur_token_dict2 = self.get_model().block2(self.get_model().ctm2(token_dict))
208
+ cluster_image_features.append(cur_token_dict2["x"])
209
+
210
+ image_features = torch.cat(cluster_image_features, dim=1)
211
+ image_features = image_features.to(self.get_model().mm_projector.weight.dtype)
212
+
213
+ else:
214
+ if input_type == "video":
215
+ image_features, cls_features = torch.mean(image_features, dim=0, keepdim=False).unsqueeze(
216
+ 0), torch.mean(image_features, dim=1, keepdim=False).unsqueeze(0)
217
+ image_features = torch.cat([image_features, cls_features], dim=1)
218
+
219
+ image_features = self.get_model().mm_projector(image_features)
220
+ return image_features # 不同的type形状相同
221
+
222
+    def prepare_inputs_labels_for_multimodal(
+        self, input_ids, attention_mask, past_key_values, labels, images, audio_features=None, target_frame=0, ref_ids=None
+    ):
+        IMAGE_TOKEN_INDEX = -200
+        AUDIO_TOKEN_INDEX = -300
+        # print("\ncalling prepare_inputs_labels_for_multimodal")
+        vision_tower = self.get_vision_tower()
+        # print("got vision_tower")
+        num_frames = images[0].shape[0]  # T
+
+        if vision_tower is None or images is None or input_ids.shape[1] == 1:
+            if past_key_values is not None and vision_tower is not None and images is not None and input_ids.shape[1] == 1:
+                attention_mask = torch.ones((attention_mask.shape[0], past_key_values[-1][-1].shape[-2] + 1), dtype=attention_mask.dtype, device=attention_mask.device)
+            return input_ids, attention_mask, past_key_values, None, labels
+
+        if ref_ids is not None:
+            ref_embeds = []
+            for ref_id in ref_ids:
+                ref_embed = self.get_model().embed_tokens(ref_id)  # [L, 4096]
+                ref_embeds.append(ref_embed)
+            # list[B]: [len_ref, 4096]
+
+        if type(images) is list or images.ndim == 5:
+            # print("concatenate the listed images first")
+            concat_images = torch.cat([image for image in images], dim=0)  # [BT, 3, H, W]
+            org_image_features = self.encode_images(concat_images)  # [BT, 256, 1024]
+
+            # if audio_features is not None and hasattr(self, "audio_adapter"):
+            if True:
+                # image_features = self.audio_adapter(org_image_features, audio_features, ref_embeds_T)
+                # image_features = self.token_compressor(org_image_features, ref_embeds)
+                # print("image_features after compress:", image_features.shape)
+                image_features = org_image_features
+            else:
+                image_features = org_image_features
+            # split_sizes = [image.shape[0] for image in images]
+            split_sizes = 1
+            image_features = torch.split(image_features, split_sizes, dim=0)  # list[BT]: [1, 256, 1024]
+            image_features = [x.flatten(0, 1) for x in image_features]  # list[BT]: [256, 1024]
+
+            org_image_features = torch.split(org_image_features, split_sizes, dim=0)
+            org_image_features = [x.flatten(0, 1) for x in org_image_features]
+
+        else:
+            # print("get image_features directly")
+            image_features = self.encode_images(images)
+            org_image_features = image_features
+
+        new_input_embeds = []
+        new_labels = [] if labels is not None else None
+        cur_image_idx = 0
+        for batch_idx, cur_input_ids in enumerate(input_ids):
+            # cur_image_idx += 1
+
+            # check whether the current input_ids contain any image tokens
+            # print("cur_input_ids shape:", cur_input_ids.shape)
+            # print("cur_input_ids:", cur_input_ids)
+            if (cur_input_ids == IMAGE_TOKEN_INDEX).sum() == 0:
+                # print("no IMAGE token in input_ids")
+                # multimodal LLM, but the current sample is not multimodal
+                # embed input_ids as plain text
+                cur_input_embeds = self.get_model().embed_tokens(cur_input_ids)
+                cur_input_embeds = cur_input_embeds + (0. * self.get_model().mm_projector(vision_tower.dummy_feature)).sum()
+                new_input_embeds.append(cur_input_embeds)
+                if labels is not None:
+                    new_labels.append(labels[batch_idx])
+                cur_image_idx += 1
+                continue
+
+            image_token_indices = torch.where((cur_input_ids == IMAGE_TOKEN_INDEX) | (cur_input_ids == AUDIO_TOKEN_INDEX))[0]
+            audio_token_indices = torch.where(cur_input_ids == AUDIO_TOKEN_INDEX)[0]
+            # print("audio indices:", audio_token_indices)
+            # print("image and audio indices:", image_token_indices)
+
+            cur_new_input_embeds = []
+            if labels is not None:
+                cur_labels = labels[batch_idx]
+                cur_new_labels = []
+                assert cur_labels.shape == cur_input_ids.shape
+
+            # multiple image tokens ---------------------------------------------
+            if len(image_token_indices) > 1:
+                # print("multiple image tokens")
+                # return 0
+
+                temp = []
+
+                cur, pre = image_token_indices[0], image_token_indices[0]
+                # group consecutive <image> positions into one list; a gap starts a new group
+                for i in image_token_indices:
+                    cur = i
+                    # if this <image> immediately follows the previous one
+                    if cur - pre == 1:
+                        temp[-1] = temp[-1] + [cur]
+                    else:
+                        temp.append([cur])
+                    pre = cur
+
+                pre_image_token_end = 0
+                cur_frames = 0
+                for i in temp:
+                    # positions of the first and last <image> in this group
+                    image_token_start = i[0]
+                    image_token_end = i[-1]
+                    cur_image_features = []
+
+                    if len(i) >= 2:  # handle video features built from T frames
+                        for frame_idx in range(num_frames):
+                            cur_image_features.append(org_image_features[batch_idx * num_frames + frame_idx])
+                            # print(batch_idx * num_frames + frame_idx)
+                    elif i[0] not in audio_token_indices:
+                        cur_image_features.append(org_image_features[batch_idx * num_frames + target_frame])
+                        # print(batch_idx * num_frames + target_frame)
+                    else:
+                        cur_image_features.append(audio_features[batch_idx])
+                        # print(f"audio{batch_idx}")
+                    # ------------------------------------------------------------------
+                    # # i holds the indices of one <image> group; fetch that many features from image_features
+                    # for _ in i:
+                    #     # processing an <image> token
+                    #     if _ not in audio_token_indices:
+                    #         # a single image
+                    #         if cur_frames == num_frames:
+                    #             # cur_image_features.append(org_image_features[cur_image_idx-num_frames+target_frame])
+                    #             cur_image_features.append(org_image_features[batch_idx*num_frames+target_frame])
+                    #             # print(cur_image_idx-num_frames+target_frame)
+                    #         # multiple images
+                    #         else:
+                    #             cur_image_features.append(image_features[cur_image_idx])
+                    #             # print(cur_image_idx)
+                    #         cur_image_idx += 1
+                    #         cur_frames += 1
+                    #     # handle <audio>
+                    #     else:
+                    #         # cur_image_features.append(self.audio_feature_layer(audio_features[batch_idx]))
+                    #         cur_image_features.append(audio_features[batch_idx])
+                    #         # print("audio:", batch_idx)
+                    # # cur_image_features list[len(i)]: [256, 1024]
+
+                    # if the current group holds multiple images, it represents a video
+                    if len(i) >= 2:
+                        if not self.compress:
+                            # compress the collected frame features and project them
+                            cur_image_features = torch.stack(cur_image_features, dim=0)  # [len(i), 256, 1024]
+                            cur_image_features = self.project(cur_image_features, input_type="video")
+                            t, l, n = cur_image_features.size()
+                            cur_image_features = cur_image_features.contiguous().view(t * l, n)  # [112, 4096]
+                            # print(f"no compression, cur_image_features{cur_image_features.shape}")
+                        else:
+                            compressed_frames = []
+                            for cur_image_feature in cur_image_features:
+                                cur_image_feature = self.project(cur_image_feature.unsqueeze(0), input_type="image")  # [1, 256, 1024]
+                                t, l, n = cur_image_feature.size()
+                                cur_image_feature = cur_image_feature.contiguous().view(t * l, n)  # [112, 4096]
+
+                                compressed_frames.append(cur_image_feature.mean(dim=0).unsqueeze(0))  # [1, 4096]
+                            compressed_frames = torch.cat(compressed_frames, dim=0)  # [T, 4096]
+
+                            cur_image_features = torch.stack(cur_image_features, dim=0)  # [len(i), 256, 1024]
+                            cur_image_features = self.project(cur_image_features, input_type="video")
+                            t, l, n = cur_image_features.size()
+                            cur_image_features = cur_image_features.contiguous().view(t * l, n)  # [112, 4096]
+
+                            # cur_image_features = torch.cat([cur_image_features, compressed_frames], dim=0)  # [122, 4096]
+                            cur_image_features = torch.cat([compressed_frames, cur_image_features], dim=0)  # [122, 4096]
+
+                    # a single special token: the <image> case
+                    elif i[0] not in audio_token_indices:
+                        cur_image_features = torch.stack(cur_image_features, dim=0)
+                        cur_image_features = self.project(cur_image_features, input_type="image")
+                        t, l, n = cur_image_features.size()
+                        cur_image_features = cur_image_features.contiguous().view(t * l, n)  # [112, 4096]
+                    else:
+                        cur_image_features = cur_image_features[0]  # [10, 4096]
+
+                    if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+                        # embed the text before im_start
+                        cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[pre_image_token_end:image_token_start - 1]).detach())
+                        # embed the im_start token
+                        cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start - 1:image_token_start]))
+                        # image features
+                        cur_new_input_embeds.append(cur_image_features)
+                        # im_end
+                        cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_end + 1:image_token_end + 2]))
+                        if labels is not None:
+                            cur_new_labels.append(cur_labels[pre_image_token_end:image_token_start])
+                            # pad cur_new_labels
+                            cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
+                            cur_new_labels.append(cur_labels[image_token_end:image_token_end + 1])
+
+                        # set cur_labels to the remaining labels
+                        # cur_labels = cur_labels[image_token_end + 2:]
+                    else:
+                        cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[pre_image_token_end:image_token_start]))
+                        cur_new_input_embeds.append(cur_image_features)
+                        if labels is not None:
+                            cur_new_labels.append(cur_labels[pre_image_token_end:image_token_start])
+                            cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
+                        # cur_labels = cur_labels[image_token_end + 1:]
+
+                    pre_image_token_end = image_token_end + 1
+
+                # set cur_input_ids to the remaining ids
+                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+                    cur_input_ids = cur_input_ids[image_token_end + 2:]
+                    cur_labels = cur_labels[image_token_end + 2:]
+                else:
+                    cur_input_ids = cur_input_ids[image_token_end + 1:]
+                    cur_labels = cur_labels[image_token_end + 1:]
+
+            # together with the branch above, this handles exactly one image token
+            elif image_token_indices.numel() > 0:
+                # print("only one image token")
+
+                cur_image_features = []
+                image_token_start = image_token_indices[0]
+                image_token_end = image_token_indices[-1]
+                # print("image_token_start:", image_token_start, " image_token_end:", image_token_end)
+
+                # append one image feature per image token
+                for _ in image_token_indices:
+                    cur_image_features.append(image_features[cur_image_idx])
+                    cur_image_idx += 1
+                # print("cur_image_features length:", len(cur_image_features))
+
+                # stack the image features along a new dimension
+                cur_image_features = torch.stack(cur_image_features, dim=0)
+                # print("cur_image_features_stacked shape:", cur_image_features.shape)
+                cur_image_features = self.project(cur_image_features, input_type="image")
+                # print("cur_image_features_projected shape:", cur_image_features.shape)
+
+                # image feature dimensions: num, len, dim
+                t, l, n = cur_image_features.size()
+                cur_image_features = cur_image_features.contiguous().view(t * l, n)
+                # print("cur_image_features_viewed shape:", cur_image_features.shape)
+
+                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+                    print("tune_mm_mlp_adapter and mm_use_im_start_end")
+                    # these two lines embed the text before the image token
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start - 1]).detach())
+                    # append the image-start token here
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_start - 1:image_token_start]))
+                    print("cur_new_input_embeds length:", len(cur_new_input_embeds))
+                    print("cur_new_input_embeds shape:", cur_new_input_embeds[0].shape)
+                    print("cur_new_input_embeds shape:", cur_new_input_embeds[1].shape)
+
+                    # insert the image features at the image-token position
+                    cur_new_input_embeds.append(cur_image_features)
+                    print("cur_new_input_embeds length:", len(cur_new_input_embeds))
+                    # print("cur_new_input_embeds shape:", cur_new_input_embeds[2].shape)
+
+                    # append the image-end token after the image tokens
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[image_token_end + 1:image_token_end + 2]))
+                    print("cur_new_input_embeds length:", len(cur_new_input_embeds))
+
+                    if labels is not None:
+                        # append the labels before the image token
+                        cur_new_labels.append(cur_labels[:image_token_start])
+                        # append IGNORE_INDEX entries matching the image feature length
+                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
+                        # append the image-end token label
+                        cur_new_labels.append(cur_labels[image_token_end:image_token_end + 1])
+                        # keep the remaining text labels
+                        cur_labels = cur_labels[image_token_end + 2:]
+
+                else:
+                    # print("tune_mm_mlp_adapter / mm_use_im_start_end")
+                    # embed the text tokens before the image token
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids[:image_token_start]))
+                    cur_new_input_embeds.append(cur_image_features)
+                    # print("cur_new_input_embeds length:", len(cur_new_input_embeds))
+
+                    if labels is not None:
+                        # copy the labels before the image
+                        cur_new_labels.append(cur_labels[:image_token_start])
+                        # append shape[0] IGNORE_INDEX entries to match the image features
+                        cur_new_labels.append(torch.full((cur_image_features.shape[0],), IGNORE_INDEX, device=labels.device, dtype=labels.dtype))
+                        # print("cur_new_labels length:", len(cur_new_labels))
+                        # print("cur_new_labels:", cur_new_labels)
+                        # print(cur_new_labels[0].shape, ' ', cur_new_labels[1].shape)
+
+                        # keep cur_labels as the remaining unprocessed labels
+                        cur_labels = cur_labels[image_token_end + 1:]
+                        # print("labels after image:", cur_labels)
+                        # print(len(cur_labels))
+
+                # replace cur_input_ids with the remaining (post-image) ids
+                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+                    cur_input_ids = cur_input_ids[image_token_end + 2:]
+                else:
+                    cur_input_ids = cur_input_ids[image_token_end + 1:]
+                # print("input_ids after image:", cur_input_ids)
+
+            # if there are still text tokens after the image tokens
+            if cur_input_ids.numel() > 0:
+                # print("text tokens remain after the image token")
+                if getattr(self.config, 'tune_mm_mlp_adapter', False) and getattr(self.config, 'mm_use_im_start_end', False):
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids).detach())
+                else:
+                    # embed the remaining input_ids
+                    cur_new_input_embeds.append(self.get_model().embed_tokens(cur_input_ids))
+
+                # print("cur_new_input_embeds length:", len(cur_new_input_embeds))
+                # print("cur_new_input_embeds shape:", cur_new_input_embeds[0].shape, cur_new_input_embeds[1].shape, cur_new_input_embeds[2].shape)
+
+                if labels is not None:
+                    # append the remaining labels
+                    cur_new_labels.append(cur_labels)
+
+            cur_new_input_embeds = [x.to(device='cuda') for x in cur_new_input_embeds]
+            cur_new_input_embeds = torch.cat(cur_new_input_embeds, dim=0)
+
+            new_input_embeds.append(cur_new_input_embeds)
+            if labels is not None:
+                cur_new_labels = torch.cat(cur_new_labels, dim=0)
+                new_labels.append(cur_new_labels)
+
+        # if the embedded inputs within a batch have different lengths
+        if any(x.shape != new_input_embeds[0].shape for x in new_input_embeds):
+            print("lengths differ within the batch")
+            max_len = max(x.shape[0] for x in new_input_embeds)
+
+            new_input_embeds_align = []
+            for cur_new_embed in new_input_embeds:
+                cur_new_embed = torch.cat((cur_new_embed, torch.zeros((max_len - cur_new_embed.shape[0], cur_new_embed.shape[1]), dtype=cur_new_embed.dtype, device=cur_new_embed.device)), dim=0)
+                new_input_embeds_align.append(cur_new_embed)
+            new_input_embeds = torch.stack(new_input_embeds_align, dim=0)
+
+            if labels is not None:
+                new_labels_align = []
+                _new_labels = new_labels
+                for cur_new_label in new_labels:
+                    cur_new_label = torch.cat((cur_new_label, torch.full((max_len - cur_new_label.shape[0],), IGNORE_INDEX, dtype=cur_new_label.dtype, device=cur_new_label.device)), dim=0)
+                    new_labels_align.append(cur_new_label)
+                new_labels = torch.stack(new_labels_align, dim=0)
+
+            if attention_mask is not None:
+                new_attention_mask = []
+                for cur_attention_mask, cur_new_labels, cur_new_labels_align in zip(attention_mask, _new_labels, new_labels):
+                    new_attn_mask_pad_left = torch.full((cur_new_labels.shape[0] - labels.shape[1],), True, dtype=attention_mask.dtype, device=attention_mask.device)
+                    new_attn_mask_pad_right = torch.full((cur_new_labels_align.shape[0] - cur_new_labels.shape[0],), False, dtype=attention_mask.dtype, device=attention_mask.device)
+                    cur_new_attention_mask = torch.cat((new_attn_mask_pad_left, cur_attention_mask, new_attn_mask_pad_right), dim=0)
+                    new_attention_mask.append(cur_new_attention_mask)
+                attention_mask = torch.stack(new_attention_mask, dim=0)
+                assert attention_mask.shape == new_labels.shape
+
+        # lengths match within the batch
+        else:
+            # stack the batch into [B, token_len, dim]
+            new_input_embeds = torch.stack(new_input_embeds, dim=0)
+            if labels is not None:
+                new_labels = torch.stack(new_labels, dim=0)
+
+            if attention_mask is not None:
+                new_attn_mask_pad_left = torch.full((attention_mask.shape[0], new_input_embeds.shape[1] - input_ids.shape[1]), True, dtype=attention_mask.dtype, device=attention_mask.device)
+                attention_mask = torch.cat((new_attn_mask_pad_left, attention_mask), dim=1)
+                assert attention_mask.shape == new_input_embeds.shape[:2]
+
+        return None, attention_mask, past_key_values, new_input_embeds, new_labels
+
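The multi-image branch above collapses runs of consecutive `<image>` positions into sublists, so a run of T frame tokens becomes one "video" group while isolated tokens stay singletons. The grouping step in isolation can be sketched as a plain-Python helper (a hypothetical stand-alone function for illustration, not part of the repo):

```python
def group_consecutive(indices):
    # Same logic as the `temp` grouping loop in
    # prepare_inputs_labels_for_multimodal: consecutive indices
    # share a group; any gap starts a new group.
    groups = []
    pre = None
    for cur in indices:
        if pre is not None and cur - pre == 1:
            groups[-1].append(cur)
        else:
            groups.append([cur])
        pre = cur
    return groups

# group_consecutive([3, 4, 5, 9, 12, 13]) → [[3, 4, 5], [9], [12, 13]]
```

Each resulting sublist is then routed to the video, single-image, or audio handling path depending on its length and whether its first index is an audio token.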
+    def initialize_vision_tokenizer(self, model_args, tokenizer):
+        if model_args.mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+            tokenizer.add_tokens([DEFAULT_VIDEO_PATCH_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+
+        if model_args.mm_use_im_start_end:
+            num_new_tokens = tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN, DEFAULT_VID_START_TOKEN, DEFAULT_VID_END_TOKEN], special_tokens=True)
+            self.resize_token_embeddings(len(tokenizer))
+
+            if num_new_tokens > 0:
+                input_embeddings = self.get_input_embeddings().weight.data
+                output_embeddings = self.get_output_embeddings().weight.data
+
+                input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+                output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
+
+                input_embeddings[-num_new_tokens:] = input_embeddings_avg
+                output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = True
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
+
+            if model_args.pretrain_mm_mlp_adapter:
+                mm_projector_weights = torch.load(model_args.pretrain_mm_mlp_adapter, map_location='cpu')
+                embed_tokens_weight = mm_projector_weights['model.embed_tokens.weight']
+                assert num_new_tokens == 2
+                if input_embeddings.shape == embed_tokens_weight.shape:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight[-num_new_tokens:]
+                elif embed_tokens_weight.shape[0] == num_new_tokens:
+                    input_embeddings[-num_new_tokens:] = embed_tokens_weight
+                else:
+                    raise ValueError(f"Unexpected embed_tokens_weight shape. Pretrained: {embed_tokens_weight.shape}. Current: {input_embeddings.shape}. Number of new tokens: {num_new_tokens}.")
+        elif model_args.mm_use_im_patch_token:
+            if model_args.tune_mm_mlp_adapter:
+                for p in self.get_input_embeddings().parameters():
+                    p.requires_grad = False
+                for p in self.get_output_embeddings().parameters():
+                    p.requires_grad = False
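The `positional_encoding` method above pairs `sin` and `cos` of the same angle at even/odd feature indices, with angle `pos / 10000**(2i / num_features)`. The table it adds to `x` can be sketched in plain Python (a hypothetical stand-alone helper mirroring the torch code, for illustration only):

```python
import math

def positional_encoding_table(max_len=64, num_features=1024):
    # Sketch of the table built in positional_encoding:
    #   p[pos, 2i]   = sin(pos / 10000**(2i / num_features))
    #   p[pos, 2i+1] = cos(pos / 10000**(2i / num_features))
    table = [[0.0] * num_features for _ in range(max_len)]
    for pos in range(max_len):
        for i in range(0, num_features, 2):
            angle = pos / (10000 ** (i / num_features))
            table[pos][i] = math.sin(angle)
            if i + 1 < num_features:
                table[pos][i + 1] = math.cos(angle)
    return table
```

The model adds `table[:seq_len]` element-wise to the token features, so position information is injected without any learned parameters.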
ChatUniVi/model/builder.py ADDED
@@ -0,0 +1,118 @@
+import os
+import shutil
+from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig
+import torch
+from ChatUniVi.model import *
+from ChatUniVi.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN
+from accelerate import init_empty_weights, load_checkpoint_and_dispatch
+from transformers import AutoConfig, AutoModelForCausalLM
+
+
+def load_pretrained_model(model_path, model_base, model_name, load_8bit=False, load_4bit=False, device_map="auto"):
+    kwargs = {"device_map": device_map}
+
+    if load_8bit:
+        kwargs['load_in_8bit'] = True
+    elif load_4bit:
+        kwargs['load_in_4bit'] = True
+        kwargs['quantization_config'] = BitsAndBytesConfig(
+            load_in_4bit=True,
+            bnb_4bit_compute_dtype=torch.float16,
+            bnb_4bit_use_double_quant=True,
+            bnb_4bit_quant_type='nf4'
+        )
+    else:
+        kwargs['torch_dtype'] = torch.float16
+
+    if 'chatunivi' in model_name.lower():
+        # Load ChatUniVi model
+        if 'lora' in model_name.lower() and model_base is not None:
+            lora_cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            print('Loading ChatUniVi from base model...')
+            model = ChatUniViLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=lora_cfg_pretrained, **kwargs)
+            token_num, token_dim = model.lm_head.out_features, model.lm_head.in_features
+            if model.lm_head.weight.shape[0] != token_num:
+                model.lm_head.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
+                model.model.embed_tokens.weight = torch.nn.Parameter(torch.empty(token_num, token_dim, device=model.device, dtype=model.dtype))
+
+            print('Loading additional ChatUniVi weights...')
+            if os.path.exists(os.path.join(model_path, 'non_lora_trainables.bin')):
+                non_lora_trainables = torch.load(os.path.join(model_path, 'non_lora_trainables.bin'), map_location='cpu')
+            else:
+                # this is probably from HF Hub
+                from huggingface_hub import hf_hub_download
+
+                def load_from_hf(repo_id, filename, subfolder=None):
+                    cache_file = hf_hub_download(
+                        repo_id=repo_id,
+                        filename=filename,
+                        subfolder=subfolder)
+                    return torch.load(cache_file, map_location='cpu')
+
+                non_lora_trainables = load_from_hf(model_path, 'non_lora_trainables.bin')
+            non_lora_trainables = {(k[11:] if k.startswith('base_model.') else k): v for k, v in non_lora_trainables.items()}
+            if any(k.startswith('model.model.') for k in non_lora_trainables):
+                non_lora_trainables = {(k[6:] if k.startswith('model.') else k): v for k, v in non_lora_trainables.items()}
+            model.load_state_dict(non_lora_trainables, strict=False)
+
+            from peft import PeftModel
+            print('Loading LoRA weights...')
+            model = PeftModel.from_pretrained(model, model_path)
+            print('Merging LoRA weights...')
+            model = model.merge_and_unload()
+            print('Model is loaded...')
+        elif model_base is not None:
+            # this may be mm projector only
+            print('Loading ChatUniVi from base model...')
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            cfg_pretrained = AutoConfig.from_pretrained(model_path)
+            model = ChatUniViLlamaForCausalLM.from_pretrained(model_base, low_cpu_mem_usage=True, config=cfg_pretrained, **kwargs)
+
+            mm_projector_weights = torch.load(os.path.join(model_path, 'mm_projector.bin'), map_location='cpu')
+            mm_projector_weights = {k: v.to(torch.float16) for k, v in mm_projector_weights.items()}
+            model.load_state_dict(mm_projector_weights, strict=False)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+    else:
+        # Load language model
+        if model_base is not None:
+            # PEFT model
+            from peft import PeftModel
+            tokenizer = AutoTokenizer.from_pretrained(model_base, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_base, torch_dtype=torch.float16, low_cpu_mem_usage=True, device_map="auto")
+            print(f"Loading LoRA weights from {model_path}")
+            model = PeftModel.from_pretrained(model, model_path)
+            print(f"Merging weights")
+            model = model.merge_and_unload()
+            print('Convert to FP16...')
+            model.to(torch.float16)
+        else:
+            tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=False)
+            model = AutoModelForCausalLM.from_pretrained(model_path, low_cpu_mem_usage=True, **kwargs)
+
+    image_processor = None
+
+    if 'chatunivi' in model_name.lower():
+        mm_use_im_start_end = getattr(model.config, "mm_use_im_start_end", False)
+        mm_use_im_patch_token = getattr(model.config, "mm_use_im_patch_token", True)
+        if mm_use_im_patch_token:
+            tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)
+        if mm_use_im_start_end:
+            tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)
+        model.resize_token_embeddings(len(tokenizer))
+
+        vision_tower = model.get_vision_tower()
+        if not vision_tower.is_loaded:
+            vision_tower.load_model()
+        vision_tower.to(device='cuda', dtype=torch.float16)
+
+        image_processor = vision_tower.image_eval_processor
+
+    if hasattr(model.config, "max_sequence_length"):
+        context_len = model.config.max_sequence_length
+    else:
+        context_len = 2048
+
+    return tokenizer, model, image_processor, context_len
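`load_pretrained_model` dispatches on `model_name` and `model_base` into three ChatUniVi paths (LoRA merge, projector-only, full checkpoint) plus two plain-LM paths. The branch selection alone can be sketched as a small helper (a hypothetical function with illustrative labels, not part of the repo):

```python
def select_loading_strategy(model_name, model_base):
    # Mirrors the if/elif ladder in load_pretrained_model above;
    # the returned strings are illustrative labels only.
    name = model_name.lower()
    if "chatunivi" in name:
        if "lora" in name and model_base is not None:
            return "base + non_lora_trainables + merged LoRA"
        if model_base is not None:
            return "base + mm_projector.bin only"
        return "full ChatUniVi checkpoint"
    if model_base is not None:
        return "base + merged PEFT adapter"
    return "plain causal LM"
```

Note that only the ChatUniVi paths go on to load the vision tower and return a non-`None` `image_processor`.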
ChatUniVi/model/cluster.py ADDED
@@ -0,0 +1,287 @@
+ import torch
+ import math
+ import torch.nn as nn
+ import warnings
+
+
+ def _no_grad_trunc_normal_(tensor, mean, std, a, b):
+     # Cut & paste from PyTorch official master until it's in a few official releases - RW
+     # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
+     def norm_cdf(x):
+         # Computes standard normal cumulative distribution function
+         return (1. + math.erf(x / math.sqrt(2.))) / 2.
+
+     if (mean < a - 2 * std) or (mean > b + 2 * std):
+         warnings.warn("mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
+                       "The distribution of values may be incorrect.",
+                       stacklevel=2)
+
+     with torch.no_grad():
+         # Values are generated by using a truncated uniform distribution and
+         # then using the inverse CDF for the normal distribution.
+         # Get upper and lower cdf values
+         l = norm_cdf((a - mean) / std)
+         u = norm_cdf((b - mean) / std)
+
+         # Uniformly fill tensor with values from [l, u], then translate to
+         # [2l-1, 2u-1].
+         tensor.uniform_(2 * l - 1, 2 * u - 1)
+
+         # Use inverse cdf transform for normal distribution to get truncated
+         # standard normal
+         tensor.erfinv_()
+
+         # Transform to proper mean, std
+         tensor.mul_(std * math.sqrt(2.))
+         tensor.add_(mean)
+
+         # Clamp to ensure it's in the proper range
+         tensor.clamp_(min=a, max=b)
+         return tensor
+
+
+ def trunc_normal_(tensor, mean=0., std=1., a=-2., b=2.):
+     # type: (Tensor, float, float, float, float) -> Tensor
+     r"""Fills the input Tensor with values drawn from a truncated
+     normal distribution. The values are effectively drawn from the
+     normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)`
+     with values outside :math:`[a, b]` redrawn until they are within
+     the bounds. The method used for generating the random values works
+     best when :math:`a \leq \text{mean} \leq b`.
+     Args:
+         tensor: an n-dimensional `torch.Tensor`
+         mean: the mean of the normal distribution
+         std: the standard deviation of the normal distribution
+         a: the minimum cutoff value
+         b: the maximum cutoff value
+     Examples:
+         >>> w = torch.empty(3, 5)
+         >>> nn.init.trunc_normal_(w)
+     """
+     try:
+         return _no_grad_trunc_normal_(tensor, mean, std, a, b)
+     except Exception:
+         # Fall back to the untouched tensor if the in-place init fails.
+         return tensor
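The inverse-CDF trick that `_no_grad_trunc_normal_` implements with `erfinv_` can be traced without torch. The sketch below is a hypothetical stdlib-only analogue (not part of this diff) that uses `statistics.NormalDist` for the CDF and inverse CDF: sample uniformly inside the truncated CDF range, then map back through the inverse CDF.

```python
import random
from statistics import NormalDist

def sample_trunc_normal(mean=0.0, std=1.0, a=-2.0, b=2.0, n=1000, seed=0):
    """Draw n samples from N(mean, std^2) truncated to [a, b] via inverse CDF."""
    nd = NormalDist()
    rng = random.Random(seed)
    lo = nd.cdf((a - mean) / std)  # CDF value at the lower bound
    hi = nd.cdf((b - mean) / std)  # CDF value at the upper bound
    out = []
    for _ in range(n):
        u = rng.uniform(lo, hi)                  # uniform in the truncated CDF range
        out.append(mean + std * nd.inv_cdf(u))   # map back through the inverse CDF
    return out
```

Every sample lands in `[a, b]` by construction, which is exactly why the torch version only needs a final `clamp_` for numerical safety.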
+
+
+ def drop_path(x, drop_prob: float = 0., training: bool = False):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+     """
+     if drop_prob == 0. or not training:
+         return x
+     keep_prob = 1 - drop_prob
+     shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+     random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
+     random_tensor.floor_()  # binarize
+     output = x.div(keep_prob) * random_tensor
+     return output
+
+
+ class DropPath(nn.Module):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+     """
+     def __init__(self, drop_prob=None):
+         super(DropPath, self).__init__()
+         self.drop_prob = drop_prob
+
+     def forward(self, x):
+         return drop_path(x, self.drop_prob, self.training)
+
+
+ def index_points(points, idx):
+     """Sample features following the index.
+     Returns:
+         new_points: indexed points data, [B, S, C]
+
+     Args:
+         points: input points data, [B, N, C]
+         idx: sample index data, [B, S]
+     """
+     device = points.device
+     B = points.shape[0]
+     view_shape = list(idx.shape)
+     view_shape[1:] = [1] * (len(view_shape) - 1)
+     repeat_shape = list(idx.shape)
+     repeat_shape[0] = 1
+     batch_indices = torch.arange(B, dtype=torch.long).to(device).view(view_shape).repeat(repeat_shape)
+     new_points = points[batch_indices, idx, :]
+     return new_points
+
+
+ def cluster_dpc_knn(token_dict, cluster_num, k=5, token_mask=None):
+     """Cluster tokens with the DPC-KNN algorithm.
+     Return:
+         idx_cluster (Tensor[B, N]): cluster index of each token.
+         cluster_num (int): actual cluster number. The same as the
+             input cluster number.
+     Args:
+         token_dict (dict): dict for token information
+         cluster_num (int): cluster number
+         k (int): number of the nearest neighbors used for local density.
+         token_mask (Tensor[B, N]): mask indicating whether a token is a
+             padded empty token. A non-zero value means the token is
+             meaningful; zero means it is an empty token. If set to None,
+             all tokens are regarded as meaningful.
+     """
+     with torch.no_grad():
+         x = token_dict["x"]
+         B, N, C = x.shape
+
+         dist_matrix = torch.cdist(x.float(), x.float()) / (C ** 0.5)
+
+         if token_mask is not None:
+             token_mask = token_mask > 0
+             # in order to not affect the local density, the distance between empty tokens
+             # and any other tokens should be the maximal distance.
+             dist_matrix = dist_matrix * token_mask[:, None, :] + \
+                 (dist_matrix.max() + 1) * (~token_mask[:, None, :])
+
+         # get local density
+         dist_nearest, index_nearest = torch.topk(dist_matrix, k=k, dim=-1, largest=False)
+         density = (-(dist_nearest ** 2).mean(dim=-1)).exp()
+         # add a little noise to ensure no tokens have the same density.
+         density = density + torch.rand(
+             density.shape, device=density.device, dtype=density.dtype) * 1e-6
+
+         if token_mask is not None:
+             # the density of an empty token should be 0
+             density = density * token_mask
+
+         # get distance indicator: distance to the nearest token with higher density
+         mask = density[:, None, :] > density[:, :, None]
+         mask = mask.type(x.dtype)
+         dist_max = dist_matrix.flatten(1).max(dim=-1)[0][:, None, None]
+         dist, index_parent = (dist_matrix * mask + dist_max * (1 - mask)).min(dim=-1)
+
+         # select clustering centers according to score
+         score = dist * density
+         _, index_down = torch.topk(score, k=cluster_num, dim=-1)
+
+         # assign tokens to the nearest center
+         dist_matrix = index_points(dist_matrix, index_down)
+
+         idx_cluster = dist_matrix.argmin(dim=1)
+
+         # make sure each cluster center is assigned to itself
+         idx_batch = torch.arange(B, device=x.device)[:, None].expand(B, cluster_num)
+         idx_tmp = torch.arange(cluster_num, device=x.device)[None, :].expand(B, cluster_num)
+         idx_cluster[idx_batch.reshape(-1), index_down.reshape(-1)] = idx_tmp.reshape(-1)
+
+     return idx_cluster, cluster_num
+
+
+ def merge_tokens(token_dict, idx_cluster, cluster_num, token_weight=None):
+     """Merge tokens in the same cluster to a single token.
+     Implemented by torch.index_add(). Flops: B*N*(C+2)
+     Return:
+         out_dict (dict): dict for output token information
+
+     Args:
+         token_dict (dict): dict for input token information
+         idx_cluster (Tensor[B, N]): cluster index of each token.
+         cluster_num (int): cluster number
+         token_weight (Tensor[B, N, 1]): weight for each token.
+     """
+
+     x = token_dict['x']
+     idx_token = token_dict['idx_token']
+     agg_weight = token_dict['agg_weight']
+
+     B, N, C = x.shape
+     if token_weight is None:
+         token_weight = x.new_ones(B, N, 1)
+
+     idx_batch = torch.arange(B, device=x.device)[:, None]
+     idx = idx_cluster + idx_batch * cluster_num
+
+     all_weight = token_weight.new_zeros(B * cluster_num, 1)
+     all_weight.index_add_(dim=0, index=idx.reshape(B * N),
+                           source=token_weight.reshape(B * N, 1))
+     all_weight = all_weight + 1e-6
+     norm_weight = token_weight / all_weight[idx]
+
+     # average token features
+     x_merged = x.new_zeros(B * cluster_num, C)
+     source = x * norm_weight
+
+     x_merged.index_add_(dim=0, index=idx.reshape(B * N),
+                         source=source.reshape(B * N, C).type(x.dtype))
+     x_merged = x_merged.reshape(B, cluster_num, C)
+
+     idx_token_new = index_points(idx_cluster[..., None], idx_token).squeeze(-1)
+     weight_t = index_points(norm_weight, idx_token)
+     agg_weight_new = agg_weight * weight_t
+     agg_weight_new = agg_weight_new / agg_weight_new.max(dim=1, keepdim=True)[0]
+
+     out_dict = {}
+     out_dict['x'] = x_merged
+     out_dict['token_num'] = cluster_num
+     out_dict['idx_token'] = idx_token_new
+     out_dict['agg_weight'] = agg_weight_new
+     out_dict['mask'] = None
+     return out_dict
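The core of `merge_tokens` is a weight-normalised average per cluster. The function below is a hypothetical stdlib-only sketch of that arithmetic for scalar features (the torch version does the same thing with `index_add_` over feature vectors); it is not part of the diff.

```python
def merge_1d(features, assign, cluster_num, weights=None):
    """Weighted average of scalar token features per cluster (plain-Python sketch)."""
    n = len(features)
    weights = weights or [1.0] * n
    total = [1e-6] * cluster_num          # summed weight per cluster (eps avoids /0)
    merged = [0.0] * cluster_num
    for c, w in zip(assign, weights):
        total[c] += w
    for f, c, w in zip(features, assign, weights):
        merged[c] += f * (w / total[c])   # normalised contribution, as in merge_tokens
    return merged
```

With unit weights this reduces to a per-cluster mean, which is why equal-weight clusters simply average their member tokens.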
+
+
+ class CTM(nn.Module):
+     def __init__(self, sample_ratio, embed_dim, dim_out, k=5):
+         super().__init__()
+         self.sample_ratio = sample_ratio
+         self.dim_out = dim_out
+         self.k = k
+
+     def forward(self, token_dict, sample_ratio=None):
+         x = token_dict["x"]
+         B, N, C = x.shape
+
+         token_weight = x.new_ones(B, N)
+
+         if token_dict["mask"] is not None:
+             token_weight.masked_fill_((1 - token_dict["mask"]).to(torch.bool), float("-inf"))
+         token_weight = token_weight.unsqueeze(2)
+         token_dict['x'] = x
+
+         if sample_ratio is not None:
+             cluster_num = max(math.ceil(N * sample_ratio), 1)
+         elif self.sample_ratio > 1:
+             cluster_num = max(math.ceil(self.sample_ratio), 1)
+         else:
+             cluster_num = max(math.ceil(N * self.sample_ratio), 1)
+
+         k = min(3, max(cluster_num // 2, 1)) if self.k > cluster_num else self.k
+         idx_cluster, cluster_num = cluster_dpc_knn(
+             token_dict, cluster_num, k, token_mask=token_dict["mask"])
+
+         down_dict = merge_tokens(token_dict, idx_cluster, cluster_num, token_weight)
+         return down_dict, token_dict
+
+
+ class TCBlock(nn.Module):
+     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=True, qk_scale=None, drop=0., attn_drop=0.,
+                  drop_path=0., act_layer=nn.GELU, norm_layer=nn.LayerNorm, sr_ratio=1, use_sr_layer=False):
+         super().__init__()
+         self.apply(self._init_weights)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+         elif isinstance(m, nn.Conv2d):
+             fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+             fan_out //= m.groups
+             m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+             if m.bias is not None:
+                 m.bias.data.zero_()
+
+     def forward(self, inputs):
+         if isinstance(inputs, (tuple, list)):
+             q_dict, kv_dict = inputs
+         else:
+             q_dict, kv_dict = inputs, None
+
+         x = q_dict['x']
+         return q_dict
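`CTM.forward` overloads `sample_ratio`: values at or below 1 act as a fraction of the current token count, values above 1 act as an absolute cluster count, and a per-call override is always treated as a fraction. A hypothetical restatement of just that branch (names are illustrative, not from the repo):

```python
import math

def pick_cluster_num(n_tokens, sample_ratio, override=None):
    """Mirror of CTM's cluster-count rule: ratios <= 1 are fractions of N,
    values > 1 are absolute cluster counts, and an explicit override wins."""
    if override is not None:
        return max(math.ceil(n_tokens * override), 1)
    if sample_ratio > 1:
        return max(math.ceil(sample_ratio), 1)
    return max(math.ceil(n_tokens * sample_ratio), 1)
```

The `max(..., 1)` guard keeps at least one cluster even for tiny token counts or very small ratios.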
ChatUniVi/model/consolidate.py ADDED
@@ -0,0 +1,29 @@
+ """
+ Usage:
+ python3 -m llava.model.consolidate --src ~/model_weights/llava-7b --dst ~/model_weights/llava-7b_consolidate
+ """
+ import argparse
+
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ from llava.model import *
+ from llava.model.utils import auto_upgrade
+
+
+ def consolidate_ckpt(src_path, dst_path):
+     print("Loading model")
+     auto_upgrade(src_path)
+     src_model = AutoModelForCausalLM.from_pretrained(src_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
+     src_tokenizer = AutoTokenizer.from_pretrained(src_path, use_fast=False)
+     src_model.save_pretrained(dst_path)
+     src_tokenizer.save_pretrained(dst_path)
+
+
+ if __name__ == "__main__":
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--src", type=str, required=True)
+     parser.add_argument("--dst", type=str, required=True)
+
+     args = parser.parse_args()
+
+     consolidate_ckpt(args.src, args.dst)
ChatUniVi/model/dataloader.py ADDED
@@ -0,0 +1,67 @@
+ from PIL import Image
+ import math
+ from decord import VideoReader, cpu
+ import numpy as np
+ import os
+ import torch
+
+
+ def _get_rawvideo_dec(video_path, image_processor, max_frames=64, image_resolution=224, video_framerate=1, s=None, e=None):
+     # speed up video decode via decord.
+     video_mask = np.zeros(max_frames, dtype=np.int64)
+     max_video_length = 0
+
+     # T x 3 x H x W
+     video = np.zeros((max_frames, 3, image_resolution, image_resolution), dtype=np.float64)
+
+     if s is None:
+         start_time, end_time = None, None
+     else:
+         start_time = int(s)
+         end_time = int(e)
+         start_time = start_time if start_time >= 0. else 0.
+         end_time = end_time if end_time >= 0. else 0.
+         if start_time > end_time:
+             start_time, end_time = end_time, start_time
+         elif start_time == end_time:
+             end_time = start_time + 1
+
+     if os.path.exists(video_path):
+         vreader = VideoReader(video_path, ctx=cpu(0))
+     else:
+         print(video_path)
+         raise FileNotFoundError
+
+     fps = vreader.get_avg_fps()
+     f_start = 0 if start_time is None else int(start_time * fps)
+     f_end = int(min(1000000000 if end_time is None else end_time * fps, len(vreader) - 1))
+     num_frames = f_end - f_start + 1
+     if num_frames > 0:
+         # T x 3 x H x W
+         sample_fps = int(video_framerate)
+         t_stride = int(round(float(fps) / sample_fps))
+
+         all_pos = list(range(f_start, f_end + 1, t_stride))
+         if len(all_pos) > max_frames:
+             sample_pos = [all_pos[_] for _ in np.linspace(0, len(all_pos) - 1, num=max_frames, dtype=int)]
+         else:
+             sample_pos = all_pos
+
+         patch_images = [Image.fromarray(f) for f in vreader.get_batch(sample_pos).asnumpy()]
+         patch_images = [image_processor.preprocess(img, return_tensors='pt')['pixel_values'][0] for img in patch_images]
+         slice_len = len(patch_images)
+         return patch_images, slice_len
+         # NOTE: everything below the early return is unreachable dead code,
+         # left over from an earlier fixed-length padding variant of this loader.
+         max_video_length = max_video_length if max_video_length > slice_len else slice_len
+         if slice_len < 1:
+             pass
+         else:
+             while len(patch_images) < max_frames:
+                 patch_images.append(torch.zeros((3, image_resolution, image_resolution)))
+             # video[:slice_len, ...] = patch_images
+     else:
+         print("video path: {} error.".format(video_path))
+
+     video_mask[:max_video_length] = [1] * max_video_length
+
+     return patch_images, video_mask
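The frame-selection arithmetic in `_get_rawvideo_dec` is independent of decord: stride through the clip at roughly one frame per `1/video_framerate` seconds, then thin uniformly if more than `max_frames` positions remain. The helper below is a hypothetical stdlib-only sketch of that plan (it uses `round` where the original uses `np.linspace(..., dtype=int)`, so boundary indices can differ by one).

```python
def plan_frame_positions(n_frames, fps, target_fps=1, max_frames=64):
    """Sketch of _get_rawvideo_dec's index arithmetic: stride by
    round(fps / target_fps), then uniformly thin to at most max_frames."""
    stride = max(int(round(fps / target_fps)), 1)
    positions = list(range(0, n_frames, stride))
    if len(positions) > max_frames:
        # evenly spaced picks over the strided list (integer linspace)
        idx = [round(i * (len(positions) - 1) / (max_frames - 1)) for i in range(max_frames)]
        positions = [positions[i] for i in idx]
    return positions
```

A 10-second 30 fps clip at `target_fps=1` yields ten positions; a much longer clip is thinned down to the `max_frames` budget while keeping the first and last strided frames.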
ChatUniVi/model/language_model/language_model/configuration_phi.py ADDED
@@ -0,0 +1,62 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT license.
+
+ import math
+ from typing import Optional
+
+ from transformers import PretrainedConfig
+
+
+ class PhiConfig(PretrainedConfig):
+     """Phi configuration."""
+
+     model_type = "phi-msft"
+     attribute_map = {
+         "max_position_embeddings": "n_positions",
+         "hidden_size": "n_embd",
+         "num_attention_heads": "n_head",
+         "num_hidden_layers": "n_layer",
+     }
+
+     def __init__(
+         self,
+         vocab_size: int = 50304,
+         n_positions: int = 2048,
+         n_embd: int = 1024,
+         n_layer: int = 20,
+         n_inner: Optional[int] = None,
+         n_head: int = 16,
+         n_head_kv: Optional[int] = None,
+         rotary_dim: Optional[int] = 32,
+         activation_function: Optional[str] = "gelu_new",
+         flash_attn: bool = False,
+         flash_rotary: bool = False,
+         fused_dense: bool = False,
+         attn_pdrop: float = 0.0,
+         embd_pdrop: float = 0.0,
+         resid_pdrop: float = 0.0,
+         layer_norm_epsilon: float = 1e-5,
+         initializer_range: float = 0.02,
+         tie_word_embeddings: bool = False,
+         pad_vocab_size_multiple: int = 64,
+         **kwargs
+     ) -> None:
+         self.vocab_size = int(math.ceil(vocab_size / pad_vocab_size_multiple) * pad_vocab_size_multiple)
+         self.n_positions = n_positions
+         self.n_embd = n_embd
+         self.n_layer = n_layer
+         self.n_inner = n_inner
+         self.n_head = n_head
+         self.n_head_kv = n_head_kv
+         self.rotary_dim = min(rotary_dim, n_embd // n_head)
+         self.activation_function = activation_function
+         self.flash_attn = flash_attn
+         self.flash_rotary = flash_rotary
+         self.fused_dense = fused_dense
+         self.attn_pdrop = attn_pdrop
+         self.embd_pdrop = embd_pdrop
+         self.resid_pdrop = resid_pdrop
+         self.layer_norm_epsilon = layer_norm_epsilon
+         self.initializer_range = initializer_range
+
+         super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
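`PhiConfig.__init__` silently rounds `vocab_size` up to a multiple of `pad_vocab_size_multiple` (padded vocabularies give GEMM-friendly embedding and output-projection shapes). That one line of arithmetic, restated as a hypothetical standalone helper:

```python
import math

def pad_vocab(vocab_size, multiple=64):
    """PhiConfig rounds the vocabulary up to the next multiple of `multiple`."""
    return int(math.ceil(vocab_size / multiple) * multiple)
```

So a GPT-2-style vocabulary of 50257 becomes 50304, and an already-padded value is left unchanged.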
ChatUniVi/model/language_model/language_model/modeling_phi.py ADDED
@@ -0,0 +1,984 @@
+ # Copyright (c) Microsoft Corporation.
+ # Licensed under the MIT license.
+ #
+ # Copyright (c) 2022, Tri Dao, trid@cs.stanford.edu.
+ # Licensed under the BSD 3-Clause License.
+
+ from __future__ import annotations
+
+ import math
+ from dataclasses import dataclass, field
+ from typing import Any, Dict, Optional, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from einops import rearrange, repeat
+ from transformers import PretrainedConfig, PreTrainedModel
+ from transformers.activations import ACT2FN
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ from .configuration_phi import PhiConfig
+
+ try:
+     from flash_attn.bert_padding import pad_input, unpad_input
+     from flash_attn.layers.rotary import RotaryEmbedding as FlashRotaryEmbedding
+     from flash_attn.modules.mha import FlashCrossAttention, FlashSelfAttention
+     # from flash_attn.ops.fused_dense import FusedDense
+ except ImportError:
+     # flash-attn is optional; fall back to the PyTorch attention paths below.
+     pad_input, unpad_input = None, None
+     FlashRotaryEmbedding = None
+     FlashSelfAttention, FlashCrossAttention = None, None
+ FusedDense = None
+
+
+ @dataclass
+ class InferenceParams:
+     """Inference parameters passed to model to efficiently calculate
+     and store context during inference.
+
+     Reference:
+         https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/utils/generation.py.
+
+     Args:
+         max_seqlen: Maximum sequence length.
+         max_batch_size: Maximum batch size.
+         seqlen_offset: Sequence length offset.
+         batch_size_offset: Batch size offset.
+         key_value_memory_dict: Key value memory dictionary.
+         lengths_per_sample: Lengths per sample.
+
+     """
+
+     max_seqlen: int = field(metadata={"help": "Maximum sequence length."})
+
+     max_batch_size: int = field(metadata={"help": "Maximum batch size."})
+
+     seqlen_offset: int = field(default=0, metadata={"help": "Sequence length offset."})
+
+     batch_size_offset: int = field(default=0, metadata={"help": "Batch size offset."})
+
+     key_value_memory_dict: Dict[str, Any] = field(
+         default_factory=dict, metadata={"help": "Key value memory dictionary."}
+     )
+
+     lengths_per_sample: torch.Tensor = field(default=None, metadata={"help": "Lengths per sample."})
+
+
+ class Embedding(nn.Module):
+     """Token embedding with dropout."""
+
+     def __init__(self, config: PretrainedConfig) -> None:
+         super().__init__()
+
+         self.wte = nn.Embedding(config.vocab_size, config.n_embd)
+         self.drop = nn.Dropout(config.embd_pdrop)
+
+     def forward(self, input_ids: torch.LongTensor) -> torch.FloatTensor:
+         input_shape = input_ids.size()
+         input_ids = input_ids.view(-1, input_shape[-1])
+
+         hidden_states = self.wte(input_ids)
+         hidden_states = self.drop(hidden_states)
+
+         return hidden_states
+
+
+ def _apply_rotary_emb(
+     x: torch.FloatTensor,
+     cos: torch.FloatTensor,
+     sin: torch.FloatTensor,
+ ) -> torch.FloatTensor:
+     _, seqlen, _, _ = x.shape
+     _, rotary_dim = cos.shape
+     rotary_dim *= 2
+
+     x_rot = x[:, :, :, :rotary_dim]
+     x_pass = x[:, :, :, rotary_dim:]
+
+     x1, x2 = x_rot.chunk(2, dim=-1)
+     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
+     x1, x2, c, s = [t.to(dtype=torch.float32) for t in [x1, x2, c, s]]
+
+     x_rot = torch.cat([x1 * c - x2 * s, x1 * s + x2 * c], axis=-1).to(x.dtype)
+
+     return torch.cat([x_rot, x_pass], axis=-1)
+
+
+ def _apply_rotary_emb_kv(
+     kv: torch.FloatTensor,
+     cos: torch.FloatTensor,
+     sin: torch.FloatTensor,
+     cos_k: Optional[torch.FloatTensor] = None,
+     sin_k: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
+     _, seqlen, _, _, _ = kv.shape
+     _, rotary_dim = cos.shape
+     rotary_dim *= 2
+
+     k_rot = kv[:, :, 0, :, :rotary_dim]
+     k_pass = kv[:, :, 0, :, rotary_dim:]
+
+     k1, k2 = k_rot.chunk(2, dim=-1)
+     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
+     k1, k2, c, s = [t.to(dtype=torch.float32) for t in [k1, k2, c, s]]
+
+     k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(kv.dtype)
+
+     return torch.cat(
+         [
+             torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
+             kv[:, :, 1:2, :, :],
+         ],
+         axis=2,
+     )
+
+
+ def _apply_rotary_emb_qkv(
+     qkv: torch.FloatTensor,
+     cos: torch.FloatTensor,
+     sin: torch.FloatTensor,
+     cos_k: Optional[torch.FloatTensor] = None,
+     sin_k: Optional[torch.FloatTensor] = None,
+ ) -> torch.FloatTensor:
+     _, seqlen, _, _, _ = qkv.shape
+     _, rotary_dim = cos.shape
+     rotary_dim *= 2
+
+     q_rot = qkv[:, :, 0, :, :rotary_dim]
+     q_pass = qkv[:, :, 0, :, rotary_dim:]
+
+     k_rot = qkv[:, :, 1, :, :rotary_dim]
+     k_pass = qkv[:, :, 1, :, rotary_dim:]
+
+     q1, q2 = q_rot.chunk(2, dim=-1)
+     k1, k2 = k_rot.chunk(2, dim=-1)
+     c, s = rearrange(cos[:seqlen], "s d -> s 1 d"), rearrange(sin[:seqlen], "s d -> s 1 d")
+     q1, q2, k1, k2, c, s = [t.to(dtype=torch.float32) for t in [q1, q2, k1, k2, c, s]]
+
+     q_rot = torch.cat([q1 * c - q2 * s, q1 * s + q2 * c], axis=-1).to(qkv.dtype)
+     k_rot = torch.cat([k1 * c - k2 * s, k1 * s + k2 * c], axis=-1).to(qkv.dtype)
+
+     return torch.cat(
+         [
+             torch.cat([q_rot, q_pass], axis=-1).unsqueeze(2),
+             torch.cat([k_rot, k_pass], axis=-1).unsqueeze(2),
+             qkv[:, :, 2:3, :, :],
+         ],
+         axis=2,
+     )
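Each of the `_apply_rotary_emb*` helpers applies the same primitive to every channel pair: a 2-D rotation by the position-dependent angle. A hypothetical one-pair sketch of that primitive (illustrative, not repository code):

```python
import math

def rotate_pair(x1, x2, theta):
    """The 2-D rotation each (x1, x2) channel pair undergoes in the
    rotary helpers: (x1, x2) -> (x1*cos - x2*sin, x1*sin + x2*cos)."""
    c, s = math.cos(theta), math.sin(theta)
    return x1 * c - x2 * s, x1 * s + x2 * c
```

Because this is a pure rotation it preserves vector norms, which is what lets RoPE encode relative position in the query-key dot product without changing token magnitudes.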
+
+
+ class RotaryEmbedding(nn.Module):
+     """Rotary positional embedding (RoPE).
+
+     Reference:
+         RoFormer: Enhanced Transformer with Rotary Position Embedding.
+         https://arxiv.org/pdf/2104.09864.pdf.
+
+     """
+
+     def __init__(
+         self,
+         dim: int,
+         base: int = 10000,
+         scale_base: Optional[float] = None,
+         pos_idx_in_fp32: bool = True,
+         max_position_embeddings: int = 2048,
+         device: Optional[str] = None,
+         **kwargs,
+     ) -> None:
+         super().__init__()
+
+         if scale_base is not None:
+             raise NotImplementedError
+
+         self.dim = dim
+         self.base = float(base)
+         self.scale_base = scale_base
+         self.pos_idx_in_fp32 = pos_idx_in_fp32
+         self.max_position_embeddings = max_position_embeddings
+         self.device = device
+
+         # Generate and save the inverse frequency buffer (non-trainable)
+         inv_freq = self._compute_inv_freq(device)
+         self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+         # Generate and save the scale buffer (non-trainable)
+         scale = (
+             (torch.arange(0, dim, 2, device=device, dtype=torch.float32) + 0.4 * dim) / (1.4 * dim)
+             if scale_base is not None
+             else None
+         )
+         self.register_buffer("scale", scale, persistent=False)
+
+         # Initialize cached attributes since ONNX can't rely on dynamic initialization
+         self._update_cos_sin_cache(max_position_embeddings, device=device, dtype=torch.float32)
+
+     def _compute_inv_freq(self, device: Optional[str] = None) -> torch.FloatTensor:
+         return 1.0 / (self.base ** (torch.arange(0, self.dim, 2, device=device, dtype=torch.float32) / self.dim))
+
+     def _update_cos_sin_cache(
+         self,
+         seqlen: int,
+         device: Optional[str] = None,
+         dtype: Optional[torch.dtype] = None,
+     ) -> None:
+         self._seq_len_cached = seqlen
+
+         # fp32 is preferred since the output of `torch.arange` can be quite large
+         # and bf16 would lose a lot of precision
+         if self.pos_idx_in_fp32:
+             t = torch.arange(seqlen, device=device, dtype=torch.float32)
+             if self.inv_freq.dtype != torch.float32:
+                 inv_freq = self._compute_inv_freq(device=device)
+             else:
+                 inv_freq = self.inv_freq
+         else:
+             t = torch.arange(seqlen, device=device, dtype=self.inv_freq.dtype)
+             inv_freq = self.inv_freq
+
+         # `torch.outer` is preferred since `torch.einsum` converts from fp32 to fp16 if used with AMP
+         freqs = torch.outer(t, inv_freq)
+         if self.scale is None:
+             self._cos_cached = torch.cos(freqs).to(dtype)
+             self._sin_cached = torch.sin(freqs).to(dtype)
+         else:
+             power = (
+                 torch.arange(seqlen, dtype=self.scale.dtype, device=self.scale.device) - seqlen // 2
+             ) / self.scale_base
+             scale = self.scale.to(device=power.device) ** rearrange(power, "s -> s 1")
+
+             # Force the scale multiplication to happen in fp32
+             self._cos_cached = (torch.cos(freqs) * scale).to(dtype)
+             self._sin_cached = (torch.sin(freqs) * scale).to(dtype)
+             self._cos_k_cached = (torch.cos(freqs) / scale).to(dtype)
+             self._sin_k_cached = (torch.sin(freqs) / scale).to(dtype)
+
+     def forward(
+         self,
+         qkv: torch.Tensor,
+         kv: Optional[torch.Tensor] = None,
+         seqlen_offset: int = 0,
+         **kwargs,
+     ) -> Tuple[torch.Tensor, torch.Tensor]:
+         if (
+             self._seq_len_cached < qkv.shape[1] + seqlen_offset
+             or self._cos_cached.device != qkv.device
+             or self._cos_cached.dtype != qkv.dtype
+             or (self.training and self._cos_cached.is_inference())
+         ):
+             self._update_cos_sin_cache(qkv.shape[1] + seqlen_offset, device=qkv.device, dtype=qkv.dtype)
+
+         if kv is None:
+             return _apply_rotary_emb_qkv(
+                 qkv,
+                 self._cos_cached[seqlen_offset:],
+                 self._sin_cached[seqlen_offset:],
+             )
+         else:
+             q = _apply_rotary_emb(
+                 qkv,
+                 self._cos_cached[seqlen_offset:],
+                 self._sin_cached[seqlen_offset:],
+             )
+             kv = _apply_rotary_emb_kv(
+                 kv,
+                 self._cos_cached[seqlen_offset:],
+                 self._sin_cached[seqlen_offset:],
+             )
+
+             return q, kv
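`_compute_inv_freq` assigns each channel pair a geometric frequency, so low-index pairs rotate quickly (fine-grained positions) and high-index pairs rotate slowly (coarse positions). A hypothetical plain-Python restatement of that buffer:

```python
def rope_inv_freq(dim, base=10000.0):
    """Per-pair rotation frequencies as in RotaryEmbedding._compute_inv_freq:
    1 / base^(i/dim) for even i = 0, 2, ..., dim - 2."""
    return [1.0 / (base ** (i / dim)) for i in range(0, dim, 2)]
```

For `dim=8` this gives four strictly decreasing frequencies starting at 1.0, matching the geometric schedule in the RoFormer paper.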
+
+
+ class MLP(nn.Module):
+     """Multi-Layer Perceptron.
+
+     Reference:
+         Attention Is All You Need.
+         https://arxiv.org/pdf/1706.03762.pdf.
+
+     """
+
+     def __init__(
+         self,
+         config: PretrainedConfig,
+         n_inner: Optional[int] = None,
+         act_fn: Optional[str] = None,
+     ) -> None:
+         super().__init__()
+
+         act_fn = config.activation_function if act_fn is None else act_fn
+
+         n_inner = getattr(config, "n_inner", None) if n_inner is None else n_inner
+         n_inner = n_inner if n_inner is not None else 4 * config.n_embd
+
+         self.fc1 = nn.Linear(config.n_embd, n_inner)
+         self.fc2 = nn.Linear(n_inner, config.n_embd)
+         self.act = ACT2FN[act_fn]
+
+     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+         hidden_states = self.fc1(hidden_states)
+         hidden_states = self.act(hidden_states)
+         hidden_states = self.fc2(hidden_states)
+
+         return hidden_states
+
+
+ class SelfAttention(nn.Module):
+     """Self-attention layer (compatible with PyTorch).
+
+     Reference:
+         https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
+
+     """
+
+     def __init__(
+         self,
+         causal: bool = True,
+         softmax_scale: Optional[float] = None,
+         attention_dropout: float = 0.0,
+     ) -> None:
+         super().__init__()
+
+         self.causal = causal
+         self.softmax_scale = softmax_scale
+         self.drop = nn.Dropout(attention_dropout)
+
+     @torch.autocast("cpu", enabled=False)
+     @torch.autocast("cuda", enabled=False)
+     def forward(
+         self,
+         qkv: torch.FloatTensor,
+         causal: bool = None,
+         key_padding_mask: Optional[torch.BoolTensor] = None,
+         **kwargs,
+     ) -> torch.FloatTensor:
+         batch_size, seqlen = qkv.shape[0], qkv.shape[1]
+         q, k, v = qkv.unbind(dim=2)
+
+         q = q.to(torch.float32)
+         k = k.to(torch.float32)
+
+         causal = self.causal if causal is None else causal
+         softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
+
+         # Autocast is manually disabled to avoid `torch.einsum` performing the operation
+         # using float16, which might lead to overflow
+         scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
+
+         if key_padding_mask is not None:
+             padding_mask = torch.full((batch_size, seqlen), -10000.0, dtype=scores.dtype, device=scores.device)
+             padding_mask.masked_fill_(key_padding_mask, 0.0)
+
+             scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
+
+         if causal:
+             causal_mask = torch.triu(torch.full((seqlen, seqlen), -10000.0, device=scores.device), 1)
+             scores = scores + causal_mask.to(dtype=scores.dtype)
+
+         attention = torch.softmax(scores, dim=-1).to(v.dtype)
+         attention = self.drop(attention)
+
+         output = torch.einsum("bhts,bshd->bthd", attention, v)
+
+         return output
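The einsum-based path above is ordinary scaled dot-product attention with an additive -10000.0 causal mask. A hypothetical single-head, list-of-lists sketch of the same computation (illustrative only, not repository code):

```python
import math

def causal_attention(q, k, v):
    """Single-head scaled dot-product attention with a causal mask,
    mirroring the additive -10000.0 masking used by SelfAttention."""
    n, d = len(q), len(q[0])
    scale = 1.0 / math.sqrt(d)
    out = []
    for t in range(n):
        # future positions (s > t) get a large negative score, as in the torch mask
        scores = [
            sum(q[t][h] * k[s][h] for h in range(d)) * scale if s <= t else -10000.0
            for s in range(n)
        ]
        m = max(scores)                          # subtract max for numerical stability
        exp = [math.exp(x - m) for x in scores]
        z = sum(exp)
        weights = [e / z for e in exp]
        out.append([sum(weights[s] * v[s][h] for s in range(n)) for h in range(d)])
    return out
```

At position 0 the only unmasked key is position 0, so the output is (numerically almost exactly) `v[0]`; later positions mix all earlier values with softmax weights that sum to one.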
387
+
388
+
389
+ class CrossAttention(nn.Module):
390
+ """Cross-attention layer (compatible with PyTorch).
391
+
392
+ Reference:
393
+ https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/modules/mha.py.
394
+
395
+ """
396
+
397
+ def __init__(
398
+ self,
399
+ causal: bool = True,
400
+ softmax_scale: Optional[float] = None,
401
+ attention_dropout: float = 0.0,
402
+ ) -> None:
403
+ super().__init__()
404
+
405
+ self.causal = causal
406
+ self.softmax_scale = softmax_scale
407
+ self.drop = nn.Dropout(attention_dropout)
408
+
409
+ @torch.autocast("cpu", enabled=False)
410
+ @torch.autocast("cuda", enabled=False)
411
+ def forward(
412
+ self,
413
+ q: torch.FloatTensor,
414
+ kv: torch.FloatTensor,
415
+ causal: bool = None,
416
+ key_padding_mask: Optional[torch.BoolTensor] = None,
417
+ **kwargs,
418
+ ) -> torch.FloatTensor:
419
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
420
+ seqlen_k = kv.shape[1]
421
+
422
+ if kv.shape[3] != q.shape[2]:
423
+ kv = repeat(kv, "... hkv d -> ... (hkv g) d", g=q.shape[2] // kv.shape[3])
424
+ k, v = kv.unbind(dim=2)
425
+
426
+ q = q.to(torch.float32)
427
+ k = k.to(torch.float32)
428
+
429
+ causal = self.causal if causal is None else causal
430
+ softmax_scale = self.softmax_scale or 1.0 / math.sqrt(q.shape[-1])
431
+
432
+ # Autocast is manually disabled to avoid `torch.einsum` performing the operation
433
+ # using float16, which might lead to overflow
434
+ scores = torch.einsum("bthd,bshd->bhts", q, k * softmax_scale)
435
+
436
+ if key_padding_mask is not None:
437
+ padding_mask = torch.full(
438
+ (batch_size, seqlen_k),
439
+ -10000.0,
440
+ dtype=scores.dtype,
441
+ device=scores.device,
442
+ )
443
+ padding_mask.masked_fill_(key_padding_mask, 0.0)
444
+
445
+ scores = scores + rearrange(padding_mask, "b s -> b 1 1 s")
446
+
447
+ if causal:
448
+ rows = rearrange(torch.arange(seqlen_q, device=q.device, dtype=torch.long), "s -> s 1")
449
+ cols = torch.arange(seqlen_k, device=k.device, dtype=torch.long)
450
+ causal_mask = cols > rows + seqlen_k - seqlen_q
451
+
452
+ scores = scores.masked_fill(causal_mask, -10000.0)
453
+
454
+ attention = torch.softmax(scores, dim=-1).to(v.dtype)
455
+ attention = self.drop(attention)
456
+
457
+ output = torch.einsum("bhts,bshd->bthd", attention, v)
458
+
459
+ return output
460
+
461
+
462
+ def _find_mha_dims(
463
+ config: PretrainedConfig,
464
+ n_head: Optional[int] = None,
465
+ n_head_kv: Optional[int] = None,
466
+ head_dim: Optional[int] = None,
467
+ ) -> Tuple[int, int, int]:
468
+ if n_head is None and head_dim is None:
469
+ head_dim = config.n_embd // config.n_head
470
+ n_head = config.n_head
471
+ elif n_head is None or head_dim is None:
472
+ raise ValueError("`n_head` and `head_dim` must be both specified or `None`.")
473
+
474
+ if n_head_kv is None:
475
+ n_head_kv = getattr(config, "n_head_kv", None) or n_head
476
+
477
+ return n_head, n_head_kv, head_dim
478
+
479
+
480
+ def _update_kv_cache(kv: torch.FloatTensor, inference_params: InferenceParams, layer_idx: int) -> torch.FloatTensor:
481
+ num_heads, head_dim = kv.shape[-2:]
482
+
483
+ if layer_idx not in inference_params.key_value_memory_dict:
484
+ inference_params.key_value_memory_dict[layer_idx] = torch.empty(
485
+ inference_params.max_batch_size,
486
+ inference_params.max_seqlen,
487
+ 2,
488
+ num_heads,
489
+ head_dim,
490
+ dtype=kv.dtype,
491
+ device=kv.device,
492
+ )
493
+
494
+ batch_start = inference_params.batch_size_offset
495
+ batch_end = batch_start + kv.shape[0]
496
+
497
+ sequence_start = inference_params.seqlen_offset
498
+ sequence_end = sequence_start + kv.shape[1]
499
+
500
+ # When the current sequence length is equal to or larger than the maximum sequence length,
501
+ # we need to concatenate the current `kv` with the cached `kv` to expand its length
502
+ if sequence_end >= inference_params.max_seqlen:
503
+ inference_params.key_value_memory_dict[layer_idx] = torch.concatenate((inference_params.key_value_memory_dict[layer_idx], kv), dim=1)
504
+
505
+ inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, sequence_start:sequence_end, ...] = kv
506
+ kv = inference_params.key_value_memory_dict[layer_idx][batch_start:batch_end, :sequence_end, ...]
507
+
508
+ return kv
509
+
510
+
511
+ class MHA(nn.Module):
512
+ """Multi-head attention layer."""
513
+
514
+ def __init__(
515
+ self,
516
+ config: PretrainedConfig,
517
+ dtype: Optional[torch.dtype] = None,
518
+ device: Optional[str] = None,
519
+ rotary_dim: Optional[int] = None,
520
+ rotary_base: float = 10000.0,
521
+ rotary_scale_base: Optional[float] = None,
522
+ n_head: Optional[int] = None,
523
+ n_head_kv: Optional[int] = None,
524
+ head_dim: Optional[int] = None,
525
+ bias: bool = True,
526
+ causal: bool = True,
527
+ softmax_scale: Optional[float] = None,
528
+ layer_idx: Optional[int] = None,
529
+ return_residual: bool = False,
530
+ checkpointing: bool = False,
531
+ ) -> None:
532
+ super().__init__()
533
+
534
+ # Rotary embedding
535
+ self.rotary_dim = rotary_dim if rotary_dim is not None else getattr(config, "rotary_dim", 0)
536
+ if self.rotary_dim > 0:
537
+ rotary_cls = FlashRotaryEmbedding if config.flash_rotary else RotaryEmbedding
538
+ if rotary_cls is None:
539
+ rotary_cls = RotaryEmbedding
540
+
541
+ rotary_kwargs = {}
542
+ if rotary_cls is RotaryEmbedding:
543
+ rotary_kwargs["max_position_embeddings"] = config.n_positions
544
+
545
+ self.rotary_emb = rotary_cls(
546
+ self.rotary_dim,
547
+ base=rotary_base,
548
+ scale_base=rotary_scale_base,
549
+ device=device,
550
+ **rotary_kwargs,
551
+ )
552
+
553
+ # QKV and output projections
554
+ self.n_head, self.n_head_kv, self.head_dim = _find_mha_dims(
555
+ config, n_head=n_head, n_head_kv=n_head_kv, head_dim=head_dim
556
+ )
557
+ op_size = self.head_dim * (self.n_head + 2 * self.n_head_kv)
558
+ hidden_size = config.n_embd
559
+
560
+ linear_cls = FusedDense if config.fused_dense else nn.Linear
561
+ if linear_cls is None:
562
+ linear_cls = nn.Linear
563
+
564
+ self.Wqkv = linear_cls(hidden_size, op_size, bias=bias, device=device, dtype=dtype)
565
+ self.out_proj = linear_cls(hidden_size, hidden_size, bias=bias, device=device, dtype=dtype)
566
+
567
+ # Attention
568
+ # attn_cls = FlashSelfAttention if config.flash_attn else SelfAttention
569
+ attn_cls = FlashSelfAttention
570
+ if attn_cls is None:
571
+ attn_cls = SelfAttention
572
+
573
+ # cross_attn_cls = FlashCrossAttention if config.flash_attn else CrossAttention
574
+ cross_attn_cls = FlashCrossAttention
575
+ if cross_attn_cls is None:
576
+ cross_attn_cls = CrossAttention
577
+
578
+ self.inner_attn = attn_cls(
579
+ causal=causal,
580
+ softmax_scale=softmax_scale,
581
+ attention_dropout=config.attn_pdrop,
582
+ )
583
+ self.inner_cross_attn = cross_attn_cls(
584
+ causal=causal,
585
+ softmax_scale=softmax_scale,
586
+ attention_dropout=config.attn_pdrop,
587
+ )
588
+
589
+ # self.flash_attn = config.flash_attn and attn_cls is FlashSelfAttention
590
+ self.flash_attn = True
591
+ self.layer_idx = layer_idx
592
+ self.return_residual = return_residual
593
+ self.checkpointing = checkpointing
594
+
595
+ def _forward_self_attn(
596
+ self, x: torch.FloatTensor, key_padding_mask: Optional[torch.BoolTensor]
597
+ ) -> torch.FloatTensor:
598
+ qkv = self.Wqkv(x)
599
+ qkv = rearrange(qkv, "... (three h d) -> ... three h d", three=3, d=self.head_dim)
600
+
601
+ if self.rotary_dim > 0:
602
+ qkv = self.rotary_emb(qkv)
603
+
604
+ if self.flash_attn:
605
+ batch_size, seqlen = qkv.shape[0], qkv.shape[1]
606
+
607
+ cu_seqlens, max_seqlen = None, None
608
+ if key_padding_mask is not None:
609
+ # If `key_padding_mask` is supplied, we need to unpad the input and retrieve
610
+ # the `cu_seqlens` and `max_seqlen` to be used by `flash-attn`
611
+ qkv, indices, cu_seqlens, max_seqlen = unpad_input(qkv, key_padding_mask)
612
+
613
+ if self.checkpointing:
614
+ attn_output = torch.utils.checkpoint.checkpoint(
615
+ self.inner_attn, qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen
616
+ )
617
+ else:
618
+ attn_output = self.inner_attn(qkv, cu_seqlens=cu_seqlens, max_seqlen=max_seqlen).to(qkv.device)
619
+
620
+ # If `key_padding_mask` is supplied, we need to pad the output back to the original shape
621
+ return pad_input(attn_output, indices, batch_size, seqlen) if key_padding_mask is not None else attn_output
622
+
623
+ if self.checkpointing:
624
+ return torch.utils.checkpoint.checkpoint(self.inner_attn, qkv, key_padding_mask=key_padding_mask)
625
+
626
+ return self.inner_attn(qkv, key_padding_mask=key_padding_mask)
627
+
628
+ def _forward_cross_attn(
629
+ self,
630
+ x: torch.FloatTensor,
631
+ past_key_values: Optional[InferenceParams],
632
+ key_padding_mask: Optional[torch.BoolTensor],
633
+ ) -> torch.FloatTensor:
634
+ batch_size = x.shape[0]
635
+
636
+ qkv = self.Wqkv(x)
637
+
638
+ q = qkv[..., : self.n_head * self.head_dim]
639
+ q = rearrange(q, "... (h d) -> ... h d", d=self.head_dim)
640
+
641
+ kv = qkv[..., self.n_head * self.head_dim :]
642
+ kv = rearrange(kv, "... (two hkv d) -> ... two hkv d", two=2, d=self.head_dim)
643
+
644
+ seqlen_offset = past_key_values.seqlen_offset if past_key_values is not None else 0
645
+ causal = None if seqlen_offset == 0 else False
646
+ if self.rotary_dim > 0:
647
+ q, kv = self.rotary_emb(q, kv=kv, seqlen_offset=seqlen_offset)
648
+
649
+ if past_key_values is not None:
650
+ kv = _update_kv_cache(kv, past_key_values, self.layer_idx)
651
+
652
+ if self.flash_attn:
653
+ batch_size, seqlen_q = q.shape[0], q.shape[1]
654
+ seqlen_k = kv.shape[1]
655
+
656
+ cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k = (
657
+ None,
658
+ None,
659
+ None,
660
+ None,
661
+ )
662
+ if key_padding_mask is not None:
663
+ kv, _, cu_seqlens_k, max_seqlen_k = unpad_input(kv, key_padding_mask)
664
+
665
+ if seqlen_q == 1:
666
+ key_padding_mask = torch.ones(batch_size, 1, device=q.device)
667
+ elif seqlen_q != seqlen_k:
668
+ key_padding_mask = key_padding_mask[:, -seqlen_q:]
669
+
670
+ q, indices_q, cu_seqlens_q, max_seqlen_q = unpad_input(q, key_padding_mask)
671
+
672
+ if self.checkpointing:
673
+ attn_output = torch.utils.checkpoint.checkpoint(
674
+ self.inner_cross_attn,
675
+ q,
676
+ kv,
677
+ causal=causal,
678
+ cu_seqlens=cu_seqlens_q,
679
+ max_seqlen=max_seqlen_q,
680
+ cu_seqlens_k=cu_seqlens_k,
681
+ max_seqlen_k=max_seqlen_k,
682
+ )
683
+ else:
684
+ attn_output = self.inner_cross_attn(
685
+ q,
686
+ kv,
687
+ causal=causal,
688
+ cu_seqlens=cu_seqlens_q,
689
+ max_seqlen=max_seqlen_q,
690
+ cu_seqlens_k=cu_seqlens_k,
691
+ max_seqlen_k=max_seqlen_k,
692
+ )
693
+
694
+ return (
695
+ pad_input(attn_output, indices_q, batch_size, max_seqlen_q)
696
+ if key_padding_mask is not None
697
+ else attn_output
698
+ )
699
+
700
+ if self.checkpointing:
701
+ return torch.utils.checkpoint.checkpoint(
702
+ self.inner_cross_attn,
703
+ q,
704
+ kv,
705
+ key_padding_mask=key_padding_mask,
706
+ causal=causal,
707
+ )
708
+
709
+ return self.inner_cross_attn(q, kv, key_padding_mask=key_padding_mask, causal=causal)
710
+
711
+ def forward(
712
+ self,
713
+ x: torch.FloatTensor,
714
+ past_key_values: Optional[InferenceParams] = None,
715
+ attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
716
+ **kwargs,
717
+ ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
718
+ attention_mask = attention_mask.bool() if attention_mask is not None else None
722
+
723
+ # MHA
724
+ if self.n_head == self.n_head_kv:
725
+ if past_key_values is None:
726
+ # If `past_key_values` are not supplied, we run self-attention
727
+ attn_output = self._forward_self_attn(x, attention_mask)
728
+ else:
729
+ # If `past_key_values` are supplied, it means that we might have cached values and
730
+ # could take advantage of cross-attention
731
+ attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
732
+ # MQA / GQA
733
+ else:
734
+ # Regardless of whether `past_key_values` is supplied, this always uses cross-attention
735
+ # because `q` and `kv` lengths might be different
736
+ attn_output = self._forward_cross_attn(x, past_key_values, attention_mask)
737
+
738
+ output = rearrange(attn_output, "... h d -> ... (h d)")
739
+ output = self.out_proj(output)
740
+
741
+ return output if not self.return_residual else (output, x)
742
+
743
+
744
+ class ParallelBlock(nn.Module):
745
+ """Parallel block.
746
+
747
+ This block applies parallel mixer and MLP layers to the input (used in GPT-J and CodeGen).
748
+
749
+ """
750
+
751
+ def __init__(
752
+ self,
753
+ config: PretrainedConfig,
754
+ block_idx: Optional[int] = None,
755
+ ) -> None:
756
+ super().__init__()
757
+
758
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
759
+ self.resid_dropout = nn.Dropout(config.resid_pdrop)
760
+ self.block_idx = block_idx
761
+
762
+ self.mixer = MHA(config, layer_idx=block_idx)
763
+ self.mlp = MLP(config)
764
+
765
+ def forward(
766
+ self,
767
+ hidden_states: torch.FloatTensor,
768
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
769
+ attention_mask: Optional[torch.BoolTensor] = None,
770
+ **kwargs,
771
+ ) -> torch.FloatTensor:
772
+ residual = hidden_states
773
+ hidden_states = self.ln(hidden_states)
774
+
775
+ attn_outputs = self.mixer(
776
+ hidden_states,
777
+ past_key_values=past_key_values,
778
+ attention_mask=attention_mask,
779
+ )
780
+ if isinstance(attn_outputs, tuple):
781
+ attn_outputs = attn_outputs[0]
782
+
783
+ attn_outputs = self.resid_dropout(attn_outputs)
784
+ feed_forward_hidden_states = self.resid_dropout(self.mlp(hidden_states))
785
+
786
+ hidden_states = attn_outputs + feed_forward_hidden_states + residual
787
+
788
+ return hidden_states
789
+
790
+
791
+ class CausalLMHead(nn.Module):
792
+ """Causal Language Modeling head.
793
+
794
+ Reference:
795
+ Improving Language Understanding by Generative Pre-Training.
796
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
797
+
798
+ """
799
+
800
+ def __init__(self, config: PretrainedConfig) -> None:
801
+ super().__init__()
802
+
803
+ self.ln = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
804
+ self.linear = nn.Linear(config.n_embd, config.vocab_size)
805
+
806
+ def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
807
+ hidden_states = self.ln(hidden_states)
808
+ logits = self.linear(hidden_states).to(torch.float32)
809
+
810
+ return logits
811
+
812
+
813
+ class CausalLMLoss(nn.Module):
814
+ """Causal Language Modeling loss.
815
+
816
+ Reference:
817
+ Improving Language Understanding by Generative Pre-Training.
818
+ https://cdn.openai.com/research-covers/language-unsupervised/language_understanding_paper.pdf.
819
+
820
+ """
821
+
822
+ def __init__(self, shift_labels: bool = True) -> None:
823
+ super().__init__()
824
+
825
+ self.shift_labels = shift_labels
826
+ self.loss_fct = nn.CrossEntropyLoss()
827
+
828
+ def forward(self, logits: torch.FloatTensor, labels: torch.LongTensor) -> torch.FloatTensor:
829
+ if self.shift_labels:
830
+ logits = logits[..., :-1, :].contiguous()
831
+ labels = labels[..., 1:].contiguous()
832
+
833
+ loss = self.loss_fct(logits.view(-1, logits.size(-1)), labels.view(-1))
834
+
835
+ return loss
836
+
837
+
838
+ class PhiPreTrainedModel(PreTrainedModel):
839
+ """Phi pre-trained model."""
840
+
841
+ config_class = PhiConfig
842
+ base_model_prefix = "transformer"
843
+ supports_gradient_checkpointing = False
844
+ _no_split_modules = ["ParallelBlock"]
845
+
846
+ def __init__(self, *inputs, **kwargs) -> None:
847
+ super().__init__(*inputs, **kwargs)
848
+
849
+ def _init_weights(self, module: nn.Module) -> None:
850
+ if isinstance(module, (nn.Linear,)):
851
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
852
+ if module.bias is not None:
853
+ module.bias.data.zero_()
854
+ elif isinstance(module, nn.Embedding):
855
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
856
+ if module.padding_idx is not None:
857
+ module.weight.data[module.padding_idx].zero_()
858
+ elif isinstance(module, nn.LayerNorm):
859
+ if module.bias is not None:
860
+ module.bias.data.zero_()
861
+ module.weight.data.fill_(1.0)
862
+
863
+ def prepare_inputs_for_generation(
864
+ self,
865
+ input_ids: torch.LongTensor,
866
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
867
+ attention_mask: Optional[Union[torch.LongTensor, torch.BoolTensor]] = None,
868
+ **kwargs,
869
+ ) -> Dict[str, Any]:
870
+ if past_key_values is None or not (isinstance(past_key_values, InferenceParams)):
871
+ past_key_values = InferenceParams(
872
+ max_seqlen=self.config.n_positions,
873
+ max_batch_size=input_ids.shape[0],
874
+ seqlen_offset=0,
875
+ batch_size_offset=0,
876
+ key_value_memory_dict={},
877
+ lengths_per_sample=None,
878
+ )
879
+ else:
880
+ # Assume that `past_key_values` has cached all tokens up to the last token in `input_ids`
881
+ past_key_values.seqlen_offset = input_ids.shape[1] - 1
882
+ input_ids = input_ids[:, -1].unsqueeze(-1)
883
+
884
+ return {
885
+ "input_ids": input_ids,
886
+ "past_key_values": past_key_values,
887
+ "attention_mask": attention_mask,
888
+ }
889
+
890
+
891
+ class PhiModel(PhiPreTrainedModel):
892
+ """Phi model."""
893
+
894
+ _keys_to_ignore_on_load_missing = [""]
895
+ _keys_to_ignore_on_load_unexpected = [r"h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]
896
+
897
+ def __init__(self, config: PhiConfig) -> None:
898
+ super().__init__(config)
899
+
900
+ self.embd = Embedding(config)
901
+ self.embed_tokens = self.embd
902
+ self.h = nn.ModuleList([ParallelBlock(config, block_idx=i) for i in range(config.n_layer)])
903
+ self.gradient_checkpointing = False
904
+ self.post_init()
905
+
906
+ def get_input_embeddings(self) -> nn.Embedding:
907
+ return self.embd.wte
908
+
909
+ def set_input_embeddings(self, new_embeddings: nn.Embedding) -> None:
910
+ self.embd.wte = new_embeddings
911
+
912
+ def forward(
913
+ self,
914
+ input_ids: torch.LongTensor,
915
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
916
+ attention_mask: Optional[torch.BoolTensor] = None,
917
+ inputs_embeds: Optional[torch.FloatTensor] = None,
918
+ ) -> torch.FloatTensor:
919
+ if inputs_embeds is None:
920
+ hidden_states = self.embd(input_ids)
921
+ else:
922
+ hidden_states = inputs_embeds
923
+
924
+ for layer in self.h:
925
+ hidden_states = layer(
926
+ hidden_states,
927
+ past_key_values=past_key_values,
928
+ attention_mask=attention_mask,
929
+ )
930
+
931
+ return hidden_states
932
+
933
+
934
+ class PhiForCausalLM(PhiPreTrainedModel):
935
+ """Phi for Causal Language Modeling."""
936
+
937
+ _keys_to_ignore_on_load_missing = [""]
938
+ _keys_to_ignore_on_load_unexpected = [r"transformer\.h\.\d+\.mlp.(fc_in|fc_out)\.(weight|bias)"]
939
+
940
+ def __init__(self, config: PhiConfig) -> None:
941
+ super().__init__(config)
942
+
943
+ self.transformer = PhiModel(config)
944
+ self.lm_head = CausalLMHead(config)
945
+ self.loss = CausalLMLoss()
946
+
947
+ self.post_init()
948
+
949
+ def set_input_embeddings(self, value):
950
+ self.transformer.embd = value
951
+
952
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.set_decoder
953
+ def set_decoder(self, decoder):
954
+ self.transformer = decoder
955
+
956
+ # Copied from transformers.models.llama.modeling_llama.LlamaForCausalLM.get_decoder
957
+ def get_decoder(self):
958
+ return self.transformer
959
+
960
+ def get_input_embeddings(self):
961
+ return self.transformer.embd
962
+
963
+ def get_output_embeddings(self) -> nn.Linear:
964
+ return self.lm_head.linear
965
+
966
+ def set_output_embeddings(self, new_embeddings: nn.Linear) -> None:
967
+ self.lm_head.linear = new_embeddings
968
+
969
+ def forward(
970
+ self,
971
+ input_ids: torch.LongTensor,
972
+ past_key_values: Optional[Union[torch.FloatTensor, InferenceParams]] = None,
973
+ attention_mask: Optional[torch.BoolTensor] = None,
974
+ labels: Optional[torch.LongTensor] = None,
975
+ **kwargs,
976
+ ) -> CausalLMOutputWithPast:
977
+ hidden_states = self.transformer(input_ids, past_key_values=past_key_values, attention_mask=attention_mask)
978
+ lm_logits = self.lm_head(hidden_states)
979
+
980
+ loss = None
981
+ if labels is not None:
982
+ loss = self.loss(lm_logits, labels)
983
+
984
+ return CausalLMOutputWithPast(loss=loss, logits=lm_logits, past_key_values=past_key_values)
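The `CrossAttention` class above builds its causal mask with `cols > rows + seqlen_k - seqlen_q`, which aligns the last query with the last key so that, during cached decoding (`seqlen_k > seqlen_q`), every new query can still attend to the whole cached prefix. A minimal pure-Python sketch of that rule (the function name is illustrative, not from the source):

```python
def causal_mask(seqlen_q: int, seqlen_k: int):
    """Return a boolean mask where True marks key positions a query may NOT attend to.

    Mirrors the CrossAttention rule `cols > rows + seqlen_k - seqlen_q`:
    the last query row is aligned with the last key column, so when keys
    include a cached prefix, all queries still see the full past.
    """
    offset = seqlen_k - seqlen_q
    return [[col > row + offset for col in range(seqlen_k)]
            for row in range(seqlen_q)]

# Square case: strictly upper-triangular mask, i.e. standard causal attention.
square = causal_mask(3, 3)
# Decoding with a 3-token cache: the single new query masks nothing.
decode_step = causal_mask(1, 4)
```

With equal lengths this reduces to the usual `torch.triu(..., 1)` mask used by `SelfAttention` above; the offset only matters once a KV cache makes the key sequence longer than the query sequence.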
ChatUniVi/model/language_model/llama.py ADDED
@@ -0,0 +1,136 @@
1
+ from typing import List, Optional, Tuple, Union
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import CrossEntropyLoss
5
+ from transformers import AutoConfig, AutoModelForCausalLM, \
6
+ LlamaConfig, LlamaModel, LlamaForCausalLM
7
+ from transformers.modeling_outputs import CausalLMOutputWithPast
8
+ from models.tf.modeling_outputs import CausalLMOutputWithPastAndLabel
9
+
10
+ from ChatUniVi.model.arch import MetaModel, ChatUniViMetaForCausalLM
11
+
12
+
13
+ class ChatUniViConfig(LlamaConfig):
14
+ model_type = "ChatUniVi"
15
+
16
+
17
+ class ChatUniViLlamaModel(MetaModel, LlamaModel):
18
+ config_class = ChatUniViConfig
19
+
20
+ def __init__(self, config: LlamaConfig):
21
+ super(ChatUniViLlamaModel, self).__init__(config)
22
+
23
+
24
+ class ChatUniViLlamaForCausalLM(LlamaForCausalLM, ChatUniViMetaForCausalLM):
25
+ config_class = ChatUniViConfig
26
+
27
+ def __init__(self, config):
28
+ super(LlamaForCausalLM, self).__init__(config)
29
+ self.model = ChatUniViLlamaModel(config)
30
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
31
+ # Initialize weights and apply final processing
32
+ self.post_init()
33
+
34
+ def get_model(self):
35
+ return self.model
36
+
37
+ def forward(
38
+ self,
39
+ input_ids: torch.LongTensor = None,
40
+ attention_mask: Optional[torch.Tensor] = None,
41
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
42
+ inputs_embeds: Optional[torch.FloatTensor] = None,
43
+ labels: Optional[torch.LongTensor] = None,
44
+ use_cache: Optional[bool] = None,
45
+ output_attentions: Optional[bool] = None,
46
+ output_hidden_states: Optional[bool] = None,
47
+ images: Optional[torch.FloatTensor] = None,
48
+ return_dict: Optional[bool] = None,
49
+
50
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
51
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
52
+ output_hidden_states = (
53
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
54
+ )
55
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
56
+
57
+ # print(use_cache, output_attentions, return_dict)
58
+ # return 0
59
+ if inputs_embeds is None:
60
+ input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)
61
+ # else:
62
+ # print("prepare_inputs_labels_for_multimodal is not called")
63
+
64
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
65
+
66
+ outputs = self.model(
67
+ input_ids=input_ids,
68
+ attention_mask=attention_mask,
69
+ past_key_values=past_key_values,
70
+ inputs_embeds=inputs_embeds,
71
+ use_cache=use_cache,
72
+ output_attentions=output_attentions,
73
+ output_hidden_states=output_hidden_states,
74
+ return_dict=return_dict
75
+ )
76
+
77
+ hidden_states = outputs[0]
78
+ logits = self.lm_head(hidden_states)
79
+
80
+ loss = None
81
+ if labels is not None:
82
+ # Shift so that tokens < n predict n
83
+ shift_logits = logits[..., :-1, :].contiguous()
84
+ shift_labels = labels[..., 1:].contiguous()
85
+ # Flatten the tokens
86
+ loss_fct = CrossEntropyLoss()
87
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
88
+ shift_labels = shift_labels.view(-1)
89
+ # Enable model/pipeline parallelism
90
+ shift_labels = shift_labels.to(shift_logits.device)
91
+ loss = loss_fct(shift_logits, shift_labels)
92
+
93
+ if not return_dict:
94
+ output = (logits,) + outputs[1:]
95
+ return (loss,) + output if loss is not None else output
96
+
97
+ # return CausalLMOutputWithPast(
98
+ # loss=loss,
99
+ # logits=logits,
100
+ # past_key_values=outputs.past_key_values,
101
+ # hidden_states=outputs.hidden_states,
102
+ # attentions=outputs.attentions,
103
+ # )
104
+ return CausalLMOutputWithPastAndLabel(
105
+ loss=loss,
106
+ labels=labels,
107
+ logits=logits,
108
+ past_key_values=outputs.past_key_values,
109
+ hidden_states=outputs.hidden_states,
110
+ attentions=outputs.attentions,
111
+ )
112
+
113
+ def prepare_inputs_for_generation(
114
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
115
+ ):
116
+ if past_key_values:
117
+ input_ids = input_ids[:, -1:]
118
+
119
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
120
+ if inputs_embeds is not None and past_key_values is None:
121
+ model_inputs = {"inputs_embeds": inputs_embeds}
122
+ else:
123
+ model_inputs = {"input_ids": input_ids}
124
+
125
+ model_inputs.update(
126
+ {
127
+ "past_key_values": past_key_values,
128
+ "use_cache": kwargs.get("use_cache"),
129
+ "attention_mask": attention_mask,
130
+ "images": kwargs.get("images", None),
131
+ }
132
+ )
133
+ return model_inputs
134
+
135
+ AutoConfig.register("ChatUniVi", ChatUniViConfig)
136
+ AutoModelForCausalLM.register(ChatUniViConfig, ChatUniViLlamaForCausalLM)
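The loss computation in `forward` above shifts logits and labels so that the logits at position t are scored against the token at position t+1. A tiny sketch of that alignment on plain lists (names are illustrative):

```python
def shift_for_next_token(logits, labels):
    """Align sequences for next-token prediction, as in forward() above:
    position t's logits predict the label at t+1, so the last logit and
    the first label are dropped."""
    return logits[:-1], labels[1:]

# With 4 positions, 3 (logit, label) pairs remain.
shift_for_next_token(["l0", "l1", "l2", "l3"], [101, 102, 103, 104])
# → (['l0', 'l1', 'l2'], [102, 103, 104])
```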
ChatUniVi/model/language_model/phi.py ADDED
@@ -0,0 +1,142 @@
1
+ # Copyright 2023 Haotian Liu
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from typing import List, Optional, Tuple, Union
17
+
18
+ import torch
19
+ import torch.nn as nn
20
+ from torch.nn import CrossEntropyLoss
21
+ from transformers import AutoConfig, AutoModelForCausalLM
22
+ from .modeling_phi.modeling_phi import PhiModel, PhiForCausalLM, CausalLMHead, CausalLMLoss
23
+ from .modeling_phi.configuration_phi import PhiConfig
24
+ from transformers.modeling_outputs import CausalLMOutputWithPast
25
+
26
+ from ChatUniVi.model.arch import MetaModel, ChatUniViMetaForCausalLM
27
+
28
+
29
+ class ChatUniViConfig(PhiConfig):
30
+ model_type = "ChatUniViPhi2"
31
+
32
+
33
+ class ChatUniViPhiModel(MetaModel, PhiModel):
34
+ config_class = ChatUniViConfig
35
+
36
+ def __init__(self, config: PhiConfig):
37
+ super(ChatUniViPhiModel, self).__init__(config)
38
+
39
+
40
+ class ChatUniViPhiForCausalLM(PhiForCausalLM, ChatUniViMetaForCausalLM):
41
+ config_class = ChatUniViConfig
42
+ supports_gradient_checkpointing = True
43
+
44
+ def __init__(self, config):
45
+ super(PhiForCausalLM, self).__init__(config)
46
+ self.config = config
47
+ self.transformer = ChatUniViPhiModel(config)
48
+ self.lm_head = CausalLMHead(config)
49
+ self.loss = CausalLMLoss()
50
+
51
+ self.post_init()
52
+
53
+ def get_model(self):
54
+ return self.transformer
55
+
56
+ def _set_gradient_checkpointing(self, module, value=False):
57
+ module.gradient_checkpointing = value
58
+
59
+ def forward(
60
+ self,
61
+ input_ids: torch.LongTensor = None,
62
+ attention_mask: Optional[torch.Tensor] = None,
63
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
64
+ inputs_embeds: Optional[torch.FloatTensor] = None,
65
+ labels: Optional[torch.LongTensor] = None,
66
+ use_cache: Optional[bool] = None,
67
+ output_attentions: Optional[bool] = None,
68
+ output_hidden_states: Optional[bool] = None,
69
+ images: Optional[torch.FloatTensor] = None,
70
+ return_dict: Optional[bool] = None,
71
+ ) -> Union[Tuple, CausalLMOutputWithPast]:
72
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
73
+ output_hidden_states = (
74
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
75
+ )
76
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
77
+
78
+ input_ids, attention_mask, past_key_values, inputs_embeds, labels = self.prepare_inputs_labels_for_multimodal(input_ids, attention_mask, past_key_values, labels, images)
79
+
80
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
81
+
82
+ outputs = self.transformer(
83
+ input_ids=input_ids,
84
+ attention_mask=attention_mask,
85
+ past_key_values=past_key_values,
86
+ inputs_embeds=inputs_embeds,
87
+ )
88
+
89
+ hidden_states = outputs
90
+ logits = self.lm_head(hidden_states)
91
+
92
+ loss = None
93
+ if labels is not None:
94
+ # Shift so that tokens < n predict n
95
+ shift_logits = logits[..., :-1, :].contiguous()
96
+ shift_labels = labels[..., 1:].contiguous()
97
+ # Flatten the tokens
98
+ loss_fct = CrossEntropyLoss()
99
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
100
+ shift_labels = shift_labels.view(-1)
101
+ # Enable model/pipeline parallelism
102
+ shift_labels = shift_labels.to(shift_logits.device)
103
+ try:
104
+ loss = loss_fct(shift_logits, shift_labels)
105
+ except Exception:  # fall back to a zero, trainable loss rather than crashing
106
+ loss = torch.nn.Parameter(torch.zeros(1), requires_grad=True)
107
+
108
+ if not return_dict:
109
+ output = (logits, outputs)  # `outputs` is the hidden-states tensor, not a tuple
110
+ return (loss,) + output if loss is not None else output
111
+
112
+ return CausalLMOutputWithPast(
113
+ loss=loss,
114
+ logits=logits,
115
+ hidden_states=outputs,
116
+ )
117
+
118
+ def prepare_inputs_for_generation(
119
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
120
+ ):
121
+ if past_key_values:
122
+ input_ids = input_ids[:, -1:]
123
+
124
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
125
+ if inputs_embeds is not None and past_key_values is None:
126
+ model_inputs = {"inputs_embeds": inputs_embeds}
127
+ else:
128
+ model_inputs = {"input_ids": input_ids}
129
+
130
+ model_inputs.update(
131
+ {
132
+ "past_key_values": past_key_values,
133
+ "use_cache": kwargs.get("use_cache"),
134
+ "attention_mask": attention_mask,
135
+ "images": kwargs.get("images", None),
136
+ }
137
+ )
138
+ return model_inputs
139
+
140
+
141
+ AutoConfig.register("ChatUniViPhi2", ChatUniViConfig)
142
+ AutoModelForCausalLM.register(ChatUniViConfig, ChatUniViPhiForCausalLM)
ChatUniVi/model/make_delta.py ADDED
@@ -0,0 +1,52 @@
1
+ """
2
+ Usage:
3
+ python3 -m llava.model.make_delta --base ~/model_weights/llama-7b --target ~/model_weights/llava-7b --delta ~/model_weights/llava-7b-delta --hub-repo-id liuhaotian/llava-7b-delta
4
+ """
5
+ import argparse
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+ from transformers import AutoTokenizer, AutoModelForCausalLM
10
+ from llava.model.utils import auto_upgrade
11
+
12
+
13
+ def make_delta(base_model_path, target_model_path, delta_path, hub_repo_id):
14
+ print("Loading base model")
15
+ base = AutoModelForCausalLM.from_pretrained(
16
+ base_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
17
+
18
+ print("Loading target model")
19
+ auto_upgrade(target_model_path)
20
+ target = AutoModelForCausalLM.from_pretrained(target_model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True)
21
+
22
+ print("Calculating delta")
23
+ for name, param in tqdm(target.state_dict().items(), desc="Calculating delta"):
24
+ if name not in base.state_dict():
25
+ assert name in ['model.mm_projector.weight', 'model.mm_projector.bias'], f'{name} not in base model'
26
+ continue
27
+ if param.data.shape == base.state_dict()[name].shape:
28
+ param.data -= base.state_dict()[name]
29
+ else:
30
+ assert name in ['model.embed_tokens.weight', 'lm_head.weight'], f'{name} dimension mismatch: {param.data.shape} vs {base.state_dict()[name].shape}'
31
+ bparam = base.state_dict()[name]
32
+ param.data[:bparam.shape[0], :bparam.shape[1]] -= bparam
33
+
34
+ print("Saving delta")
35
+ if hub_repo_id:
36
+ kwargs = {"push_to_hub": True, "repo_id": hub_repo_id}
37
+ else:
38
+ kwargs = {}
39
+ target.save_pretrained(delta_path, **kwargs)
40
+ target_tokenizer = AutoTokenizer.from_pretrained(target_model_path)
41
+ target_tokenizer.save_pretrained(delta_path, **kwargs)
42
+
43
+
44
+ if __name__ == "__main__":
45
+ parser = argparse.ArgumentParser()
46
+ parser.add_argument("--base-model-path", type=str, required=True)
47
+ parser.add_argument("--target-model-path", type=str, required=True)
48
+ parser.add_argument("--delta-path", type=str, required=True)
49
+ parser.add_argument("--hub-repo-id", type=str, default=None)
50
+ args = parser.parse_args()
51
+
52
+ make_delta(args.base_model_path, args.target_model_path, args.delta_path, args.hub_repo_id)
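The delta script above stores, per tensor, only the difference between the fine-tuned target weights and the base weights; adding the delta back onto the base recovers the target. A minimal pure-Python sketch of that arithmetic (toy lists standing in for state-dict tensors; the function names here are illustrative, not from the repo):

```python
def make_delta_vec(base, target):
    # delta[i] = target[i] - base[i], mirroring `param.data -= base.state_dict()[name]`
    return [t - b for b, t in zip(base, target)]


def apply_delta_vec(base, delta):
    # the inverse step used at release time: base + delta == target
    return [b + d for b, d in zip(base, delta)]


base = [0.5, -1.0, 2.0]
target = [0.75, -0.5, 1.5]
delta = make_delta_vec(base, target)      # [0.25, 0.5, -0.5]
restored = apply_delta_vec(base, delta)   # equals `target`
```

Shapes that exist only in the target (e.g. the multimodal projector) have no delta and are shipped as-is, which is why the real script skips names missing from the base model.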
ChatUniVi/model/multimodal_encoder/builder.py ADDED
@@ -0,0 +1,14 @@
+ from .clip_encoder import CLIPVisionTower
+ from .eva_encoder import EVAVisionTower
+
+
+ def build_vision_tower(vision_tower_cfg, **kwargs):
+     vision_tower = getattr(vision_tower_cfg, 'mm_vision_tower', getattr(vision_tower_cfg, 'vision_tower', None))
+     # if vision_tower.startswith("openai") or vision_tower.startswith("laion"):
+     #     return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+     #
+     # elif vision_tower.startswith("eva_vit_g"):
+     #     return EVAVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
+     #
+     # raise ValueError(f'Unknown vision tower: {vision_tower}')
+     return CLIPVisionTower(vision_tower, args=vision_tower_cfg, **kwargs)
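The commented-out branch shows the intended dispatch: the checkpoint name prefix selects the encoder class (CLIP for `openai`/`laion` checkpoints, EVA for `eva_vit_g`), while the current code unconditionally builds a CLIP tower. A standalone sketch of that prefix dispatch (returning class names as strings for illustration):

```python
def pick_tower(name: str) -> str:
    # mirrors the commented-out dispatch in build_vision_tower
    if name.startswith("openai") or name.startswith("laion"):
        return "CLIPVisionTower"
    elif name.startswith("eva_vit_g"):
        return "EVAVisionTower"
    raise ValueError(f"Unknown vision tower: {name}")


choice = pick_tower("openai/clip-vit-large-patch14")  # "CLIPVisionTower"
```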
ChatUniVi/model/multimodal_encoder/clip_encoder.py ADDED
@@ -0,0 +1,83 @@
+ import torch
+ import torch.nn as nn
+
+ from transformers import CLIPVisionModel, CLIPImageProcessor, CLIPVisionConfig
+
+
+ class CLIPVisionTower(nn.Module):
+     def __init__(self, vision_tower, args=None, delay_load=False):
+         super().__init__()
+
+         self.is_loaded = False
+
+         self.vision_tower_name = vision_tower
+         if args is None:
+             self.select_layer = -2
+             self.select_feature = 'patch'
+         else:
+             self.select_layer = args.mm_vision_select_layer
+             self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+         if not delay_load:
+             self.load_model()
+         else:
+             self.cfg_only = CLIPVisionConfig.from_pretrained(self.vision_tower_name)
+
+     def load_model(self):
+         self.image_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+         self.image_eval_processor = CLIPImageProcessor.from_pretrained(self.vision_tower_name)
+         self.vision_tower = CLIPVisionModel.from_pretrained(self.vision_tower_name)
+         self.vision_tower.requires_grad_(False)
+
+         self.is_loaded = True
+
+     def feature_select(self, image_forward_outs, select_feature='patch'):
+         image_features = image_forward_outs.hidden_states[self.select_layer]
+         if select_feature == 'patch':
+             # drop the leading CLS token, keep only the patch tokens
+             image_features = image_features[:, 1:]
+         elif select_feature == 'cls_patch':
+             image_features = image_features
+         else:
+             raise ValueError(f'Unexpected select feature: {select_feature}')
+         return image_features
+
+     @torch.no_grad()
+     def forward(self, images, select_feature='patch'):
+         if type(images) is list:
+             image_features = []
+             for image in images:
+                 image_forward_out = self.vision_tower(image.to(device=self.device, dtype=self.dtype).unsqueeze(0), output_hidden_states=True)
+                 image_feature = self.feature_select(image_forward_out, select_feature).to(image.dtype)
+                 image_features.append(image_feature)
+         else:
+             image_forward_outs = self.vision_tower(images.to(device=self.device, dtype=self.dtype), output_hidden_states=True)
+             image_features = self.feature_select(image_forward_outs, select_feature).to(images.dtype)
+
+         return image_features
+
+     @property
+     def dummy_feature(self):
+         return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+     @property
+     def dtype(self):
+         return self.vision_tower.dtype
+
+     @property
+     def device(self):
+         return self.vision_tower.device
+
+     @property
+     def config(self):
+         if self.is_loaded:
+             return self.vision_tower.config
+         else:
+             return self.cfg_only
+
+     @property
+     def hidden_size(self):
+         return self.config.hidden_size
+
+     @property
+     def num_patches(self):
+         return (self.config.image_size // self.config.patch_size) ** 2
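The `num_patches` property encodes the standard ViT patch count: a square image of side `image_size` split into non-overlapping `patch_size × patch_size` patches yields `(image_size // patch_size)²` tokens. A quick sketch of that arithmetic for a few common CLIP configurations:

```python
def num_patches(image_size: int, patch_size: int) -> int:
    # a ViT splits the image into (image_size // patch_size) patches per side
    return (image_size // patch_size) ** 2


clip_l_336 = num_patches(336, 14)  # CLIP ViT-L/14 at 336px -> 576 patch tokens
clip_l_224 = num_patches(224, 14)  # CLIP ViT-L/14 at 224px -> 256 patch tokens
clip_b_224 = num_patches(224, 16)  # ViT-B/16 at 224px -> 196 patch tokens
```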
ChatUniVi/model/multimodal_encoder/eva_encoder.py ADDED
@@ -0,0 +1,81 @@
+ import torch
+ import torch.nn as nn
+ from .eva_vit import create_eva_vit_g, _cfg
+ from .processor import ImageTrainProcessor, ImageEvalProcessor
+
+
+ class EVAVisionTower(nn.Module):
+     def __init__(self, vision_tower, args, delay_load=False):
+         super().__init__()
+
+         self.is_loaded = False
+
+         self.vision_tower_name = vision_tower
+         self.select_layer = args.mm_vision_select_layer
+         self.select_feature = getattr(args, 'mm_vision_select_feature', 'patch')
+
+         if not delay_load:
+             self.load_model()
+         else:
+             self.cfg_only = _cfg()
+
+     def load_model(self):
+         self.image_processor = ImageTrainProcessor()
+         self.image_eval_processor = ImageEvalProcessor()
+         self.vision_tower = create_eva_vit_g(
+             img_size=224, drop_path_rate=0, use_checkpoint=False, precision="fp16"
+         )
+         # self.vision_tower.requires_grad_(False)
+
+         self.is_loaded = True
+
+     def feature_select(self, image_forward_outs, select_feature='patch'):
+         image_features = image_forward_outs[self.select_layer]
+         if select_feature == 'patch':
+             # drop the leading CLS token, keep only the patch tokens
+             image_features = image_features[:, 1:]
+         elif select_feature == 'cls_patch':
+             image_features = image_features
+         else:
+             raise ValueError(f'Unexpected select feature: {select_feature}')
+         return image_features
+
+     @torch.no_grad()
+     def forward(self, images, select_feature='patch'):
+         if type(images) is list:
+             image_features = []
+             for image in images:
+                 image_forward_out = self.vision_tower.get_intermediate_layers(image.to(device=self.device, dtype=self.dtype).unsqueeze(0))
+                 image_feature = self.feature_select(image_forward_out, select_feature).to(image.dtype)
+                 image_features.append(image_feature)
+         else:
+             image_forward_outs = self.vision_tower.get_intermediate_layers(images.to(device=self.device, dtype=self.dtype))
+             image_features = self.feature_select(image_forward_outs, select_feature).to(images.dtype)
+
+         return image_features
+
+     @property
+     def dummy_feature(self):
+         return torch.zeros(1, self.hidden_size, device=self.device, dtype=self.dtype)
+
+     @property
+     def dtype(self):
+         return self.vision_tower.cls_token.dtype
+
+     @property
+     def device(self):
+         return self.vision_tower.cls_token.device
+
+     @property
+     def config(self):
+         if self.is_loaded:
+             return self.vision_tower.config
+         else:
+             return self.cfg_only
+
+     @property
+     def hidden_size(self):
+         return self.vision_tower.num_features
+
+     @property
+     def num_patches(self):
+         return (self.config.image_size // self.config.patch_size) ** 2
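Both towers share the same `feature_select` convention: `select_layer` indexes a hidden layer (e.g. `-2` for the second-to-last), and `'patch'` mode drops token 0, the CLS token. A tiny pure-Python sketch of that selection on a toy token list (list slicing standing in for tensor slicing):

```python
def feature_select(tokens, mode="patch"):
    # tokens[0] is the CLS token; 'patch' keeps only the patch tokens
    if mode == "patch":
        return tokens[1:]
    elif mode == "cls_patch":
        return tokens
    raise ValueError(f"Unexpected select feature: {mode}")


tokens = ["cls", "p0", "p1", "p2"]
patches = feature_select(tokens)  # ["p0", "p1", "p2"]
```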
ChatUniVi/model/multimodal_encoder/eva_vit.py ADDED
@@ -0,0 +1,448 @@
+ # Based on EVA, BEIT, timm and DeiT code bases
+ # https://github.com/baaivision/EVA
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm
+ # https://github.com/microsoft/unilm/tree/master/beit
+ # https://github.com/facebookresearch/deit/
+ # https://github.com/facebookresearch/dino
+ # --------------------------------------------------------
+ import math
+ from functools import partial
+
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import torch.utils.checkpoint as checkpoint
+ from timm.models.layers import drop_path, to_2tuple, trunc_normal_
+ from timm.models.registry import register_model
+
+ from .utils import download_cached_file
+
+
+ def _cfg(url='', **kwargs):
+     return {
+         'url': url,
+         'num_classes': 1000, 'input_size': (3, 224, 224), 'pool_size': None,
+         'crop_pct': .9, 'interpolation': 'bicubic',
+         'mean': (0.5, 0.5, 0.5), 'std': (0.5, 0.5, 0.5),
+         **kwargs
+     }
+
+
+ class DropPath(nn.Module):
+     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+     """
+
+     def __init__(self, drop_prob=None):
+         super(DropPath, self).__init__()
+         self.drop_prob = drop_prob
+
+     def forward(self, x):
+         return drop_path(x, self.drop_prob, self.training)
+
+     def extra_repr(self) -> str:
+         return 'p={}'.format(self.drop_prob)
+
+
+ class Mlp(nn.Module):
+     def __init__(self, in_features, hidden_features=None, out_features=None, act_layer=nn.GELU, drop=0.):
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.fc1 = nn.Linear(in_features, hidden_features)
+         self.act = act_layer()
+         self.fc2 = nn.Linear(hidden_features, out_features)
+         self.drop = nn.Dropout(drop)
+
+     def forward(self, x):
+         x = self.fc1(x)
+         x = self.act(x)
+         # x = self.drop(x)
+         # commented out to match the original BERT implementation
+         x = self.fc2(x)
+         x = self.drop(x)
+         return x
+
+
+ class Attention(nn.Module):
+     def __init__(
+             self, dim, num_heads=8, qkv_bias=False, qk_scale=None, attn_drop=0.,
+             proj_drop=0., window_size=None, attn_head_dim=None):
+         super().__init__()
+         self.num_heads = num_heads
+         head_dim = dim // num_heads
+         if attn_head_dim is not None:
+             head_dim = attn_head_dim
+         all_head_dim = head_dim * self.num_heads
+         self.scale = qk_scale or head_dim ** -0.5
+
+         self.qkv = nn.Linear(dim, all_head_dim * 3, bias=False)
+         if qkv_bias:
+             self.q_bias = nn.Parameter(torch.zeros(all_head_dim))
+             self.v_bias = nn.Parameter(torch.zeros(all_head_dim))
+         else:
+             self.q_bias = None
+             self.v_bias = None
+
+         if window_size:
+             self.window_size = window_size
+             self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+             self.relative_position_bias_table = nn.Parameter(
+                 torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+             # cls to token & token to cls & cls to cls
+
+             # get pair-wise relative position index for each token inside the window
+             coords_h = torch.arange(window_size[0])
+             coords_w = torch.arange(window_size[1])
+             coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+             coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+             relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+             relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+             relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+             relative_coords[:, :, 1] += window_size[1] - 1
+             relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+             relative_position_index = \
+                 torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+             relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+             relative_position_index[0, 0:] = self.num_relative_distance - 3
+             relative_position_index[0:, 0] = self.num_relative_distance - 2
+             relative_position_index[0, 0] = self.num_relative_distance - 1
+
+             self.register_buffer("relative_position_index", relative_position_index)
+         else:
+             self.window_size = None
+             self.relative_position_bias_table = None
+             self.relative_position_index = None
+
+         self.attn_drop = nn.Dropout(attn_drop)
+         self.proj = nn.Linear(all_head_dim, dim)
+         self.proj_drop = nn.Dropout(proj_drop)
+
+     def forward(self, x, rel_pos_bias=None):
+         B, N, C = x.shape
+         qkv_bias = None
+         if self.q_bias is not None:
+             qkv_bias = torch.cat((self.q_bias, torch.zeros_like(self.v_bias, requires_grad=False), self.v_bias))
+         # qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
+         qkv = F.linear(input=x, weight=self.qkv.weight, bias=qkv_bias)
+         qkv = qkv.reshape(B, N, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4)
+         q, k, v = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+         q = q * self.scale
+         attn = (q @ k.transpose(-2, -1))
+
+         if self.relative_position_bias_table is not None:
+             relative_position_bias = \
+                 self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                     self.window_size[0] * self.window_size[1] + 1,
+                     self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+             relative_position_bias = relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+             attn = attn + relative_position_bias.unsqueeze(0)
+
+         if rel_pos_bias is not None:
+             attn = attn + rel_pos_bias
+
+         attn = attn.softmax(dim=-1)
+         attn = self.attn_drop(attn)
+
+         x = (attn @ v).transpose(1, 2).reshape(B, N, -1)
+         x = self.proj(x)
+         x = self.proj_drop(x)
+         return x
+
+
+ class Block(nn.Module):
+
+     def __init__(self, dim, num_heads, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop=0., attn_drop=0.,
+                  drop_path=0., init_values=None, act_layer=nn.GELU, norm_layer=nn.LayerNorm,
+                  window_size=None, attn_head_dim=None):
+         super().__init__()
+         self.norm1 = norm_layer(dim)
+         self.attn = Attention(
+             dim, num_heads=num_heads, qkv_bias=qkv_bias, qk_scale=qk_scale,
+             attn_drop=attn_drop, proj_drop=drop, window_size=window_size, attn_head_dim=attn_head_dim)
+         # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+         self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()
+         self.norm2 = norm_layer(dim)
+         mlp_hidden_dim = int(dim * mlp_ratio)
+         self.mlp = Mlp(in_features=dim, hidden_features=mlp_hidden_dim, act_layer=act_layer, drop=drop)
+
+         if init_values is not None and init_values > 0:
+             self.gamma_1 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
+             self.gamma_2 = nn.Parameter(init_values * torch.ones((dim)), requires_grad=True)
+         else:
+             self.gamma_1, self.gamma_2 = None, None
+
+     def forward(self, x, rel_pos_bias=None):
+         if self.gamma_1 is None:
+             x = x + self.drop_path(self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+             x = x + self.drop_path(self.mlp(self.norm2(x)))
+         else:
+             x = x + self.drop_path(self.gamma_1 * self.attn(self.norm1(x), rel_pos_bias=rel_pos_bias))
+             x = x + self.drop_path(self.gamma_2 * self.mlp(self.norm2(x)))
+         return x
+
+
+ class PatchEmbed(nn.Module):
+     """ Image to Patch Embedding
+     """
+
+     def __init__(self, img_size=224, patch_size=16, in_chans=3, embed_dim=768):
+         super().__init__()
+         img_size = to_2tuple(img_size)
+         patch_size = to_2tuple(patch_size)
+         num_patches = (img_size[1] // patch_size[1]) * (img_size[0] // patch_size[0])
+         self.patch_shape = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
+         self.img_size = img_size
+         self.patch_size = patch_size
+         self.num_patches = num_patches
+
+         self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_size, stride=patch_size)
+
+     def forward(self, x, **kwargs):
+         B, C, H, W = x.shape
+         # FIXME look at relaxing size constraints
+         assert H == self.img_size[0] and W == self.img_size[1], \
+             f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
+         x = self.proj(x).flatten(2).transpose(1, 2)
+         return x
+
+
+ class RelativePositionBias(nn.Module):
+
+     def __init__(self, window_size, num_heads):
+         super().__init__()
+         self.window_size = window_size
+         self.num_relative_distance = (2 * window_size[0] - 1) * (2 * window_size[1] - 1) + 3
+         self.relative_position_bias_table = nn.Parameter(
+             torch.zeros(self.num_relative_distance, num_heads))  # 2*Wh-1 * 2*Ww-1, nH
+         # cls to token & token to cls & cls to cls
+
+         # get pair-wise relative position index for each token inside the window
+         coords_h = torch.arange(window_size[0])
+         coords_w = torch.arange(window_size[1])
+         coords = torch.stack(torch.meshgrid([coords_h, coords_w]))  # 2, Wh, Ww
+         coords_flatten = torch.flatten(coords, 1)  # 2, Wh*Ww
+         relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :]  # 2, Wh*Ww, Wh*Ww
+         relative_coords = relative_coords.permute(1, 2, 0).contiguous()  # Wh*Ww, Wh*Ww, 2
+         relative_coords[:, :, 0] += window_size[0] - 1  # shift to start from 0
+         relative_coords[:, :, 1] += window_size[1] - 1
+         relative_coords[:, :, 0] *= 2 * window_size[1] - 1
+         relative_position_index = \
+             torch.zeros(size=(window_size[0] * window_size[1] + 1,) * 2, dtype=relative_coords.dtype)
+         relative_position_index[1:, 1:] = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
+         relative_position_index[0, 0:] = self.num_relative_distance - 3
+         relative_position_index[0:, 0] = self.num_relative_distance - 2
+         relative_position_index[0, 0] = self.num_relative_distance - 1
+
+         self.register_buffer("relative_position_index", relative_position_index)
+
+         # trunc_normal_(self.relative_position_bias_table, std=.02)
+
+     def forward(self):
+         relative_position_bias = \
+             self.relative_position_bias_table[self.relative_position_index.view(-1)].view(
+                 self.window_size[0] * self.window_size[1] + 1,
+                 self.window_size[0] * self.window_size[1] + 1, -1)  # Wh*Ww,Wh*Ww,nH
+         return relative_position_bias.permute(2, 0, 1).contiguous()  # nH, Wh*Ww, Wh*Ww
+
+
+ class VisionTransformer(nn.Module):
+     """ Vision Transformer with support for patch or hybrid CNN input stage
+     """
+
+     def __init__(self, img_size=224, patch_size=16, in_chans=3, num_classes=1000, embed_dim=768, depth=12,
+                  num_heads=12, mlp_ratio=4., qkv_bias=False, qk_scale=None, drop_rate=0., attn_drop_rate=0.,
+                  drop_path_rate=0., norm_layer=nn.LayerNorm, init_values=None,
+                  use_abs_pos_emb=True, use_rel_pos_bias=False, use_shared_rel_pos_bias=False,
+                  use_mean_pooling=True, init_scale=0.001, use_checkpoint=False):
+         super().__init__()
+         self.image_size = img_size
+         self.num_classes = num_classes
+         self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+
+         self.patch_embed = PatchEmbed(
+             img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
+         num_patches = self.patch_embed.num_patches
+
+         self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
+         if use_abs_pos_emb:
+             self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + 1, embed_dim))
+         else:
+             self.pos_embed = None
+         self.pos_drop = nn.Dropout(p=drop_rate)
+
+         if use_shared_rel_pos_bias:
+             self.rel_pos_bias = RelativePositionBias(window_size=self.patch_embed.patch_shape, num_heads=num_heads)
+         else:
+             self.rel_pos_bias = None
+         self.use_checkpoint = use_checkpoint
+
+         dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)]  # stochastic depth decay rule
+         self.use_rel_pos_bias = use_rel_pos_bias
+         self.blocks = nn.ModuleList([
+             Block(
+                 dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio, qkv_bias=qkv_bias, qk_scale=qk_scale,
+                 drop=drop_rate, attn_drop=attn_drop_rate, drop_path=dpr[i], norm_layer=norm_layer,
+                 init_values=init_values, window_size=self.patch_embed.patch_shape if use_rel_pos_bias else None)
+             for i in range(depth)])
+         # self.norm = nn.Identity() if use_mean_pooling else norm_layer(embed_dim)
+         # self.fc_norm = norm_layer(embed_dim) if use_mean_pooling else None
+         # self.head = nn.Linear(embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+         if self.pos_embed is not None:
+             trunc_normal_(self.pos_embed, std=.02)
+         trunc_normal_(self.cls_token, std=.02)
+         # trunc_normal_(self.mask_token, std=.02)
+         # if isinstance(self.head, nn.Linear):
+         #     trunc_normal_(self.head.weight, std=.02)
+         self.apply(self._init_weights)
+         self.fix_init_weight()
+
+         # if isinstance(self.head, nn.Linear):
+         #     self.head.weight.data.mul_(init_scale)
+         #     self.head.bias.data.mul_(init_scale)
+
+     def fix_init_weight(self):
+         def rescale(param, layer_id):
+             param.div_(math.sqrt(2.0 * layer_id))
+
+         for layer_id, layer in enumerate(self.blocks):
+             rescale(layer.attn.proj.weight.data, layer_id + 1)
+             rescale(layer.mlp.fc2.weight.data, layer_id + 1)
+
+     def _init_weights(self, m):
+         if isinstance(m, nn.Linear):
+             trunc_normal_(m.weight, std=.02)
+             if isinstance(m, nn.Linear) and m.bias is not None:
+                 nn.init.constant_(m.bias, 0)
+         elif isinstance(m, nn.LayerNorm):
+             nn.init.constant_(m.bias, 0)
+             nn.init.constant_(m.weight, 1.0)
+
+     def get_classifier(self):
+         return self.head
+
+     def reset_classifier(self, num_classes, global_pool=''):
+         self.num_classes = num_classes
+         self.head = nn.Linear(self.embed_dim, num_classes) if num_classes > 0 else nn.Identity()
+
+     def forward_features(self, x):
+         x = self.patch_embed(x)
+         batch_size, seq_len, _ = x.size()
+
+         cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+         x = torch.cat((cls_tokens, x), dim=1)
+         if self.pos_embed is not None:
+             x = x + self.pos_embed
+         x = self.pos_drop(x)
+
+         rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+         for blk in self.blocks:
+             if self.use_checkpoint:
+                 x = checkpoint.checkpoint(blk, x, rel_pos_bias)
+             else:
+                 x = blk(x, rel_pos_bias)
+         return x
+
+         # x = self.norm(x)
+
+         # if self.fc_norm is not None:
+         #     t = x[:, 1:, :]
+         #     return self.fc_norm(t.mean(1))
+         # else:
+         #     return x[:, 0]
+
+     def forward(self, x):
+         x = self.forward_features(x)
+         # x = self.head(x)
+         return x
+
+     def get_intermediate_layers(self, x):
+         x = self.patch_embed(x)
+         batch_size, seq_len, _ = x.size()
+
+         cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # stole cls_tokens impl from Phil Wang, thanks
+         x = torch.cat((cls_tokens, x), dim=1)
+         if self.pos_embed is not None:
+             x = x + self.pos_embed
+         x = self.pos_drop(x)
+
+         features = []
+         rel_pos_bias = self.rel_pos_bias() if self.rel_pos_bias is not None else None
+         for blk in self.blocks:
+             x = blk(x, rel_pos_bias)
+             features.append(x)
+
+         return features
+
+
+ def interpolate_pos_embed(model, checkpoint_model):
+     if 'pos_embed' in checkpoint_model:
+         pos_embed_checkpoint = checkpoint_model['pos_embed'].float()
+         embedding_size = pos_embed_checkpoint.shape[-1]
+         num_patches = model.patch_embed.num_patches
+         num_extra_tokens = model.pos_embed.shape[-2] - num_patches
+         # height (== width) for the checkpoint position embedding
+         orig_size = int((pos_embed_checkpoint.shape[-2] - num_extra_tokens) ** 0.5)
+         # height (== width) for the new position embedding
+         new_size = int(num_patches ** 0.5)
+         # class_token and dist_token are kept unchanged
+         if orig_size != new_size:
+             print("Position interpolate from %dx%d to %dx%d" % (orig_size, orig_size, new_size, new_size))
+             extra_tokens = pos_embed_checkpoint[:, :num_extra_tokens]
+             # only the position tokens are interpolated
+             pos_tokens = pos_embed_checkpoint[:, num_extra_tokens:]
+             pos_tokens = pos_tokens.reshape(-1, orig_size, orig_size, embedding_size).permute(0, 3, 1, 2)
+             pos_tokens = torch.nn.functional.interpolate(
+                 pos_tokens, size=(new_size, new_size), mode='bicubic', align_corners=False)
+             pos_tokens = pos_tokens.permute(0, 2, 3, 1).flatten(1, 2)
+             new_pos_embed = torch.cat((extra_tokens, pos_tokens), dim=1)
+             checkpoint_model['pos_embed'] = new_pos_embed
+
+
+ def convert_weights_to_fp16(model: nn.Module):
+     """Convert applicable model parameters to fp16"""
+
+     def _convert_weights_to_fp16(l):
+         if isinstance(l, (nn.Conv1d, nn.Conv2d, nn.Linear)):
+             l.weight.data = l.weight.data.half()
+             if l.bias is not None:
+                 l.bias.data = l.bias.data.half()
+
+         # if isinstance(l, (nn.MultiheadAttention, Attention)):
+         #     for attr in [*[f"{s}_proj_weight" for s in ["in", "q", "k", "v"]], "in_proj_bias", "bias_k", "bias_v"]:
+         #         tensor = getattr(l, attr)
+         #         if tensor is not None:
+         #             tensor.data = tensor.data.half()
+
+     model.apply(_convert_weights_to_fp16)
+
+
+ def create_eva_vit_g(img_size=224, drop_path_rate=0.4, use_checkpoint=False, precision="fp16"):
+     model = VisionTransformer(
+         img_size=img_size,
+         patch_size=14,
+         use_mean_pooling=False,
+         embed_dim=1408,
+         depth=39,
+         num_heads=1408 // 88,
+         mlp_ratio=4.3637,
+         qkv_bias=True,
+         drop_path_rate=drop_path_rate,
+         norm_layer=partial(nn.LayerNorm, eps=1e-6),
+         use_checkpoint=use_checkpoint,
+     )
+     url = "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/eva_vit_g.pth"
+     cached_file = download_cached_file(
+         url, check_hash=False, progress=True
+     )
+     state_dict = torch.load(cached_file, map_location="cpu")
+     interpolate_pos_embed(model, state_dict)
+
+     incompatible_keys = model.load_state_dict(state_dict, strict=False)
+     # print(incompatible_keys)
+
+     if precision == "fp16":
+         # model.to("cuda")
+         convert_weights_to_fp16(model)
+     return model
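`VisionTransformer.__init__` assigns each block a drop-path probability from the "stochastic depth decay rule": `torch.linspace(0, drop_path_rate, depth)`, so the first block never drops and the last drops at the full rate. A pure-Python sketch of that schedule (function name is illustrative):

```python
def drop_path_rates(max_rate: float, depth: int):
    # linear decay rule: rate grows from 0 at the first block to max_rate at the last,
    # mirroring `torch.linspace(0, drop_path_rate, depth)`
    if depth == 1:
        return [0.0]
    return [i * max_rate / (depth - 1) for i in range(depth)]


rates = drop_path_rates(0.4, 5)  # 0.0 at block 0, 0.4 at block 4
```

Deeper blocks see more regularization; EVA ViT-g uses `depth=39`, so the default `drop_path_rate=0.4` is spread across 39 blocks in this way (ChatUniVi loads it with `drop_path_rate=0`, disabling drop path entirely).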
ChatUniVi/model/multimodal_encoder/processor.py ADDED
@@ -0,0 +1,68 @@
+ import re
+ from torchvision import transforms
+ from torchvision.transforms.functional import InterpolationMode
+
+
+ class BaseProcessor:
+     def __init__(self, mean=None, std=None):
+         if mean is None:
+             mean = (0.48145466, 0.4578275, 0.40821073)
+         if std is None:
+             std = (0.26862954, 0.26130258, 0.27577711)
+
+         self.normalize = transforms.Normalize(mean, std)
+
+
+ class ImageTrainProcessor(BaseProcessor):
+     def __init__(self, image_size=224, mean=None, std=None, min_scale=0.5, max_scale=1.0):
+         super().__init__(mean=mean, std=std)
+
+         self.transform = transforms.Compose(
+             [
+                 transforms.Resize(
+                     (image_size, image_size), interpolation=InterpolationMode.BICUBIC
+                 ),
+                 transforms.ToTensor(),
+                 self.normalize,
+             ]
+         )
+
+     def preprocess(self, item, return_tensors):
+         return {'pixel_values': [self.transform(item)]}
+
+
+ class ImageEvalProcessor(BaseProcessor):
+     def __init__(self, image_size=224, mean=None, std=None):
+         super().__init__(mean=mean, std=std)
+
+         self.transform = transforms.Compose(
+             [
+                 transforms.Resize(
+                     (image_size, image_size), interpolation=InterpolationMode.BICUBIC
+                 ),
+                 transforms.ToTensor(),
+                 self.normalize,
+             ]
+         )
+
+     def preprocess(self, item, return_tensors):
+         return {'pixel_values': [self.transform(item)]}
+
+
+ class QWenImageProcessor(BaseProcessor):
+     def __init__(self, image_size=224, mean=None, std=None):
+         super().__init__(mean=mean, std=std)
+
+         mean = (0.48145466, 0.4578275, 0.40821073)
+         std = (0.26862954, 0.26130258, 0.27577711)
+         self.transform = transforms.Compose([
+             transforms.Resize(
+                 (448, 448),
+                 interpolation=InterpolationMode.BICUBIC
+             ),
+             transforms.ToTensor(),
+             transforms.Normalize(mean=mean, std=std),
+         ])
+
+     def preprocess(self, item, return_tensors):
+         return {'pixel_values': [self.transform(item)]}
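All three processors end with `transforms.Normalize`, which maps each channel value `x` to `(x - mean) / std` using the CLIP channel statistics defaulted in `BaseProcessor`. A scalar sketch of that per-channel step (function name is illustrative):

```python
def normalize_pixel(value, mean, std):
    # transforms.Normalize applies (x - mean) / std independently per channel
    return (value - mean) / std


# CLIP channel statistics used as defaults in BaseProcessor above
mean = (0.48145466, 0.4578275, 0.40821073)
std = (0.26862954, 0.26130258, 0.27577711)
red_white = normalize_pixel(1.0, mean[0], std[0])  # a pure-white red channel after ToTensor
```

`ToTensor` first scales pixels into [0, 1], so after normalization a white pixel's red channel becomes roughly 1.93 and a black pixel's roughly -1.79.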
ChatUniVi/model/multimodal_encoder/utils.py ADDED
@@ -0,0 +1,137 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
+ """
+  Copyright (c) 2022, salesforce.com, inc.
+  All rights reserved.
+  SPDX-License-Identifier: BSD-3-Clause
+  For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+ """
+
+ import datetime
+ import functools
+ import os
+
+ import torch
+ import torch.distributed as dist
+ import timm.models.hub as timm_hub
+
+
+ def setup_for_distributed(is_master):
+     """
+     This function disables printing when not in master process
+     """
+     import builtins as __builtin__
+
+     builtin_print = __builtin__.print
+
+     def print(*args, **kwargs):
+         force = kwargs.pop("force", False)
+         if is_master or force:
+             builtin_print(*args, **kwargs)
+
+     __builtin__.print = print
+
+
+ def is_dist_avail_and_initialized():
+     if not dist.is_available():
+         return False
+     if not dist.is_initialized():
+         return False
+     return True
+
+
+ def get_world_size():
+     if not is_dist_avail_and_initialized():
+         return 1
+     return dist.get_world_size()
+
+
+ def get_rank():
+     if not is_dist_avail_and_initialized():
+         return 0
+     return dist.get_rank()
+
+
+ def is_main_process():
+     return get_rank() == 0
+
+
+ def init_distributed_mode(args):
+     if "RANK" in os.environ and "WORLD_SIZE" in os.environ:
+         args.rank = int(os.environ["RANK"])
+         args.world_size = int(os.environ["WORLD_SIZE"])
+         args.gpu = int(os.environ["LOCAL_RANK"])
+     elif "SLURM_PROCID" in os.environ:
+         args.rank = int(os.environ["SLURM_PROCID"])
+         args.gpu = args.rank % torch.cuda.device_count()
+     else:
+         print("Not using distributed mode")
+         args.distributed = False
+         return
+
+     args.distributed = True
+
+     torch.cuda.set_device(args.gpu)
+     args.dist_backend = "nccl"
+     print(
+         "| distributed init (rank {}, world {}): {}".format(
+             args.rank, args.world_size, args.dist_url
+         ),
+         flush=True,
+     )
+     torch.distributed.init_process_group(
+         backend=args.dist_backend,
+         init_method=args.dist_url,
+         world_size=args.world_size,
+         rank=args.rank,
+         timeout=datetime.timedelta(
+             days=365
+         ),  # allow auto-downloading and de-compressing
+     )
+     torch.distributed.barrier()
+     setup_for_distributed(args.rank == 0)
+
+
+ def get_dist_info():
+     if torch.__version__ < "1.0":
+         initialized = dist._initialized
+     else:
+         initialized = dist.is_initialized()
+     if initialized:
+         rank = dist.get_rank()
+         world_size = dist.get_world_size()
+     else:  # non-distributed training
+         rank = 0
+         world_size = 1
+     return rank, world_size
+
+
+ def main_process(func):
+     @functools.wraps(func)
+     def wrapper(*args, **kwargs):
+         rank, _ = get_dist_info()
+         if rank == 0:
+             return func(*args, **kwargs)
+
+     return wrapper
+
+
+ def download_cached_file(url, check_hash=True, progress=False):
+     """
+     Download a file from a URL and cache it locally. If the file already exists, it is not downloaded again.
+     If distributed, only the main process downloads the file, and the other processes wait for the file to be downloaded.
+     """
+
+     def get_cached_file_path():
+         # a hack to sync the file path across processes
+         parts = torch.hub.urlparse(url)
+         filename = os.path.basename(parts.path)
+         cached_file = os.path.join(timm_hub.get_cache_dir(), filename)
+
+         return cached_file
+
+     if is_main_process():
+         timm_hub.download_cached_file(url, check_hash, progress)
+
+     if is_dist_avail_and_initialized():
+         dist.barrier()
+
+     return get_cached_file_path()
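The `main_process` decorator above gates a function so it only runs on rank 0. The pattern can be sketched standalone, with a stubbed `get_dist_info` (a hypothetical stand-in, no torch required), to show that rank-0 callers get the return value while other ranks would fall through to `None`:

```python
import functools

def get_dist_info_stub(rank=0, world_size=1):
    # Stand-in for get_dist_info() when torch.distributed is not initialized:
    # non-distributed runs report rank 0, world size 1.
    return rank, world_size

def main_process(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        rank, _ = get_dist_info_stub()
        if rank == 0:
            return func(*args, **kwargs)
        # Non-zero ranks fall through and implicitly return None.
    return wrapper

@main_process
def save_checkpoint(path):
    return f"saved to {path}"

print(save_checkpoint("ckpt.pt"))  # rank 0 -> "saved to ckpt.pt"
```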
ChatUniVi/model/multimodal_projector/builder.py ADDED
@@ -0,0 +1,52 @@
+ import torch
+ import torch.nn as nn
+ import re
+
+
+ class IdentityMap(nn.Module):
+     def __init__(self):
+         super().__init__()
+
+     def forward(self, x, *args, **kwargs):
+         return x
+
+     @property
+     def config(self):
+         return {"mm_projector_type": 'identity'}
+
+
+ class SimpleResBlock(nn.Module):
+     def __init__(self, channels):
+         super().__init__()
+         self.pre_norm = nn.LayerNorm(channels)
+
+         self.proj = nn.Sequential(
+             nn.Linear(channels, channels),
+             nn.GELU(),
+             nn.Linear(channels, channels)
+         )
+
+     def forward(self, x):
+         x = self.pre_norm(x)
+         return x + self.proj(x)
+
+
+ def build_vision_projector(config, delay_load=False, **kwargs):
+     projector_type = getattr(config, 'mm_projector_type', 'linear')
+
+     if projector_type == 'linear':
+         return nn.Linear(config.mm_hidden_size, config.hidden_size)
+
+     print("projector_type:", projector_type)
+     mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
+     if mlp_gelu_match:
+         mlp_depth = int(mlp_gelu_match.group(1))
+         modules = [nn.Linear(config.mm_hidden_size, config.hidden_size)]
+         for _ in range(1, mlp_depth):
+             modules.append(nn.GELU())
+             modules.append(nn.Linear(config.hidden_size, config.hidden_size))
+         return nn.Sequential(*modules)
+
+     if projector_type == 'identity':
+         return IdentityMap()
+
+     raise ValueError(f'Unknown projector type: {projector_type}')
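`build_vision_projector` encodes MLP depth in the projector-type string: `mlp2x_gelu` means a 2-layer MLP with GELU between the linears. The naming regex can be checked in isolation (a standalone sketch, no torch dependency; `parse_projector_depth` is a hypothetical helper mirroring the regex above):

```python
import re

def parse_projector_depth(projector_type):
    # Mirrors the regex in build_vision_projector: "mlp2x_gelu" -> depth 2,
    # anything else -> None (handled by the other branches of the factory).
    m = re.match(r'^mlp(\d+)x_gelu$', projector_type)
    return int(m.group(1)) if m else None

print(parse_projector_depth("mlp2x_gelu"))  # 2
print(parse_projector_depth("linear"))      # None
```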
ChatUniVi/train/llama_flash_attn_monkey_patch.py ADDED
@@ -0,0 +1,124 @@
+ from typing import List, Optional, Tuple
+ import logging
+
+ import torch
+ from torch import nn
+
+ import transformers
+ from transformers.models.llama.modeling_llama import apply_rotary_pos_emb
+
+ from einops import rearrange
+
+ try:
+     from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func
+ except ImportError:
+     from flash_attn.flash_attn_interface import flash_attn_varlen_qkvpacked_func as flash_attn_unpadded_qkvpacked_func
+ from flash_attn.bert_padding import unpad_input, pad_input
+
+
+ def forward(
+     self,
+     hidden_states: torch.Tensor,
+     attention_mask: Optional[torch.Tensor] = None,
+     position_ids: Optional[torch.Tensor] = None,
+     past_key_value: Optional[Tuple[torch.Tensor]] = None,
+     output_attentions: bool = False,
+     use_cache: bool = False,
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
+     """Input shape: Batch x Time x Channel
+
+     attention_mask: [bsz, q_len]
+     """
+     bsz, q_len, _ = hidden_states.size()
+
+     query_states = (
+         self.q_proj(hidden_states)
+         .view(bsz, q_len, self.num_heads, self.head_dim)
+         .transpose(1, 2)
+     )
+     key_states = (
+         self.k_proj(hidden_states)
+         .view(bsz, q_len, self.num_heads, self.head_dim)
+         .transpose(1, 2)
+     )
+     value_states = (
+         self.v_proj(hidden_states)
+         .view(bsz, q_len, self.num_heads, self.head_dim)
+         .transpose(1, 2)
+     )
+     # [bsz, q_len, nh, hd]
+     # [bsz, nh, q_len, hd]
+
+     kv_seq_len = key_states.shape[-2]
+     assert past_key_value is None, "past_key_value is not supported"
+
+     cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+     query_states, key_states = apply_rotary_pos_emb(
+         query_states, key_states, cos, sin, position_ids
+     )
+     # [bsz, nh, t, hd]
+     assert not output_attentions, "output_attentions is not supported"
+     assert not use_cache, "use_cache is not supported"
+
+     # Flash attention codes from
+     # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py
+
+     # transform the data into the format required by flash attention
+     qkv = torch.stack(
+         [query_states, key_states, value_states], dim=2
+     )  # [bsz, nh, 3, q_len, hd]
+     qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
+     # We have disabled _prepare_decoder_attention_mask in LlamaModel
+     # the attention_mask should be the same as the key_padding_mask
+     key_padding_mask = attention_mask
+
+     if key_padding_mask is None:
+         qkv = rearrange(qkv, "b s ... -> (b s) ...")
+         max_s = q_len
+         cu_q_lens = torch.arange(
+             0, (bsz + 1) * q_len, step=q_len, dtype=torch.int32, device=qkv.device
+         )
+         output = flash_attn_unpadded_qkvpacked_func(
+             qkv, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+         )
+         output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
+     else:
+         nheads = qkv.shape[-2]
+         x = rearrange(qkv, "b s three h d -> b s (three h d)")
+         x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
+         x_unpad = rearrange(
+             x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads
+         )
+         output_unpad = flash_attn_unpadded_qkvpacked_func(
+             x_unpad, cu_q_lens, max_s, 0.0, softmax_scale=None, causal=True
+         )
+         output = rearrange(
+             pad_input(
+                 rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz, q_len
+             ),
+             "b s (h d) -> b s h d",
+             h=nheads,
+         )
+     return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, None
+
+
+ # Disable the transformation of the attention mask in LlamaModel as the flash attention
+ # requires the attention mask to be the same as the key_padding_mask
+ def _prepare_decoder_attention_mask(
+     self, attention_mask, input_shape, inputs_embeds, past_key_values_length
+ ):
+     # [bsz, seq_len]
+     return attention_mask
+
+
+ def replace_llama_attn_with_flash_attn():
+     cuda_major, cuda_minor = torch.cuda.get_device_capability()
+     if cuda_major < 8:
+         logging.warning(
+             "Flash attention is only supported on A100 or H100 GPU during training due to head dim > 64 backward."
+             "ref: https://github.com/HazyResearch/flash-attention/issues/190#issuecomment-1523359593"
+         )
+     transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = (
+         _prepare_decoder_attention_mask
+     )
+     transformers.models.llama.modeling_llama.LlamaAttention.forward = forward
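In the unpadded branch above, `cu_q_lens` is the cumulative-sequence-length vector that the varlen flash-attention kernel expects: the start offset of each sequence in the flattened `(b s)` token dimension, plus a final entry for the total token count. For equal-length sequences this is just the arithmetic range produced by `torch.arange(0, (bsz + 1) * q_len, step=q_len)`, which can be sketched in pure Python:

```python
def cu_seqlens(bsz, q_len):
    # Start offset of each of the bsz sequences in the flattened token
    # dimension, plus a trailing entry equal to the total token count
    # (bsz * q_len), giving bsz + 1 entries overall.
    return list(range(0, (bsz + 1) * q_len, q_len))

print(cu_seqlens(3, 4))  # [0, 4, 8, 12]
```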
ChatUniVi/train/train.py ADDED
@@ -0,0 +1,1232 @@
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
+ # Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ #     http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import os
+ import copy
+ from dataclasses import dataclass, field
+ import json
+ import logging
+ import pathlib
+ from typing import Dict, Optional, Sequence, List
+ import torch
+ import transformers
+ from ChatUniVi.constants import *
+ from torch.utils.data import Dataset
+ from ChatUniVi.train.trainer import ChatUniViTrainer
+ from ChatUniVi import conversation as conversation_lib
+ from ChatUniVi.model import *
+ from ChatUniVi.mm_utils import tokenizer_image_token
+ from ChatUniVi.config import ModelConfig, DataConfig
+ from PIL import Image
+ import random
+ import numpy as np
+ from ChatUniVi.model.dataloader import _get_rawvideo_dec
+
+ local_rank = None
+
+
+ def rank0_print(*args):
+     if local_rank == 0:
+         print(*args)
+
+
+ @dataclass
+ class ModelArguments:
+     model_name_or_path: Optional[str] = field(default="facebook/opt-125m")
+     version: Optional[str] = field(default="v0")
+     freeze_backbone: bool = field(default=False)
+     tune_mm_mlp_adapter: bool = field(default=False)
+     vision_tower: Optional[str] = field(default=None)
+     mm_vision_select_layer: Optional[int] = field(default=-1)  # default to the last layer
+     pretrain_mm_mlp_adapter: Optional[str] = field(default=None)
+     mm_use_im_start_end: bool = field(default=False)
+     mm_use_im_patch_token: bool = field(default=True)
+     mm_vision_select_feature: Optional[str] = field(default="patch")
+
+     mm_projector_type: Optional[str] = field(default='linear')
+     model_use: str = field(default="BASE")
+     mm_use_box_start_end: bool = field(default=False)
+
+
+ @dataclass
+ class DataArguments:
+     lazy_preprocess: bool = False
+     is_multimodal: bool = False
+     image_aspect_ratio: str = 'square'
+     image_grid_pinpoints: Optional[str] = field(default=None)
+
+     dataset_use: str = field(default="Pretrain")
+
+
+ @dataclass
+ class TrainingArguments(transformers.TrainingArguments):
+     cache_dir: Optional[str] = field(default=None)
+     optim: str = field(default="adamw_torch")
+     remove_unused_columns: bool = field(default=False)
+     freeze_mm_mlp_adapter: bool = field(default=False)
+     mpt_attn_impl: Optional[str] = field(default="triton")
+     model_max_length: int = field(
+         default=512,
+         metadata={
+             "help":
+                 "Maximum sequence length. Sequences will be right padded (and possibly truncated)."
+         },
+     )
+     double_quant: bool = field(
+         default=True,
+         metadata={"help": "Compress the quantization statistics through double quantization."}
+     )
+     quant_type: str = field(
+         default="nf4",
+         metadata={"help": "Quantization data type to use. Should be one of `fp4` or `nf4`."}
+     )
+     bits: int = field(
+         default=16,
+         metadata={"help": "How many bits to use."}
+     )
+     lora_enable: bool = False
+     lora_r: int = 64
+     lora_alpha: int = 16
+     lora_dropout: float = 0.05
+     lora_weight_path: str = ""
+     lora_bias: str = "none"
+
+     seed = 42
+
+
+ def maybe_zero_3(param, ignore_status=False, name=None):
+     from deepspeed import zero
+     from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
+     if hasattr(param, "ds_id"):
+         if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
+             if not ignore_status:
+                 logging.warning(f"{name}: param.ds_status != ZeroParamStatus.NOT_AVAILABLE: {param.ds_status}")
+         with zero.GatheredParameters([param]):
+             param = param.data.detach().cpu().clone()
+     else:
+         param = param.detach().cpu().clone()
+     return param
+
+
+ # Borrowed from peft.utils.get_peft_model_state_dict
+ def get_peft_state_maybe_zero_3(named_params, bias):
+     if bias == "none":
+         to_return = {k: t for k, t in named_params if "lora_" in k}
+     elif bias == "all":
+         to_return = {k: t for k, t in named_params if "lora_" in k or "bias" in k}
+     elif bias == "lora_only":
+         to_return = {}
+         maybe_lora_bias = {}
+         lora_bias_names = set()
+         for k, t in named_params:
+             if "lora_" in k:
+                 to_return[k] = t
+                 bias_name = k.split("lora_")[0] + "bias"
+                 lora_bias_names.add(bias_name)
+             elif "bias" in k:
+                 maybe_lora_bias[k] = t
+         for k, t in maybe_lora_bias.items():
+             if k in lora_bias_names:
+                 to_return[k] = t
+     else:
+         raise NotImplementedError
+     to_return = {k: maybe_zero_3(v, name=k) for k, v in to_return.items()}
+     return to_return
+
+
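The `bias == "lora_only"` branch above keeps a bias tensor only when a LoRA weight with the same module prefix exists. The filtering logic can be sketched without torch, using hypothetical parameter names in place of real `model.named_parameters()` output:

```python
def filter_lora_only(named_params):
    # named_params: iterable of (name, tensor) pairs, as yielded by
    # model.named_parameters(). Keep LoRA weights, plus only those biases
    # whose module also carries a LoRA weight.
    to_return, maybe_lora_bias, lora_bias_names = {}, {}, set()
    for k, t in named_params:
        if "lora_" in k:
            to_return[k] = t
            # e.g. "q_proj.lora_A" -> expected sibling bias "q_proj.bias"
            lora_bias_names.add(k.split("lora_")[0] + "bias")
        elif "bias" in k:
            maybe_lora_bias[k] = t
    for k, t in maybe_lora_bias.items():
        if k in lora_bias_names:
            to_return[k] = t
    return to_return

params = [("q_proj.lora_A", 1), ("q_proj.bias", 2), ("v_proj.bias", 3)]
print(sorted(filter_lora_only(params)))  # ['q_proj.bias', 'q_proj.lora_A']
```

`v_proj.bias` is dropped because no `v_proj` LoRA weight is present, mirroring the intent of the upstream peft helper.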
+ def get_peft_state_non_lora_maybe_zero_3(named_params, require_grad_only=True):
+     to_return = {k: t for k, t in named_params if "lora_" not in k}
+     if require_grad_only:
+         to_return = {k: t for k, t in to_return.items() if t.requires_grad}
+     to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
+     return to_return
+
+
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
+     to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
+     to_return = {k: maybe_zero_3(v, ignore_status=True).cpu() for k, v in to_return.items()}
+     return to_return
+
+
+ def find_all_linear_names(model):
+     cls = torch.nn.Linear
+     lora_module_names = set()
+     for name, module in model.named_modules():
+         if isinstance(module, cls):
+             names = name.split('.')
+             lora_module_names.add(names[0] if len(names) == 1 else names[-1])
+
+     if 'lm_head' in lora_module_names:  # needed for 16-bit
+         lora_module_names.remove('lm_head')
+     return list(lora_module_names)
+
+
+ def safe_save_model_for_hf_trainer(trainer: transformers.Trainer,
+                                    output_dir: str):
+     """Collects the state dict and dump to disk."""
+
+     if getattr(trainer.args, "tune_mm_mlp_adapter", False):
+         # Only save Adapter
+         keys_to_match = ['mm_projector', "ctm", "block"]
+         if getattr(trainer.args, "use_im_start_end", False):
+             keys_to_match.extend(['embed_tokens', 'embed_in'])
+
+         weight_to_save = get_mm_adapter_state_maybe_zero_3(trainer.model.named_parameters(), keys_to_match)
+         trainer.model.config.save_pretrained(output_dir)
+
+         current_folder = output_dir.split('/')[-1]
+         parent_folder = os.path.dirname(output_dir)
+         if trainer.args.local_rank == 0 or trainer.args.local_rank == -1:
+             if current_folder.startswith('checkpoint-'):
+                 mm_projector_folder = os.path.join(parent_folder, "mm_projector")
+                 os.makedirs(mm_projector_folder, exist_ok=True)
+                 torch.save(weight_to_save, os.path.join(mm_projector_folder, f'{current_folder}.bin'))
+             else:
+                 torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
+
+     if trainer.deepspeed:
+         torch.cuda.synchronize()
+         trainer.save_model(output_dir)
+         return
+
+     state_dict = trainer.model.state_dict()
+     if trainer.args.should_save:
+         cpu_state_dict = {
+             key: value.cpu()
+             for key, value in state_dict.items()
+         }
+         del state_dict
+         trainer._save(output_dir, state_dict=cpu_state_dict)  # noqa
+
+
+ def smart_tokenizer_and_embedding_resize(
+     special_tokens_dict: Dict,
+     tokenizer: transformers.PreTrainedTokenizer,
+     model: transformers.PreTrainedModel,
+ ):
+     """Resize tokenizer and embedding.
+
+     Note: This is the unoptimized version that may make your embedding size not be divisible by 64.
+     """
+     num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
+     model.resize_token_embeddings(len(tokenizer))
+
+     if num_new_tokens > 0:
+         input_embeddings = model.get_input_embeddings().weight.data
+         output_embeddings = model.get_output_embeddings().weight.data
+
+         input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(
+             dim=0, keepdim=True)
+         output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(
+             dim=0, keepdim=True)
+
+         input_embeddings[-num_new_tokens:] = input_embeddings_avg
+         output_embeddings[-num_new_tokens:] = output_embeddings_avg
+
+
+ def _tokenize_fn(strings: Sequence[str],
+                  tokenizer: transformers.PreTrainedTokenizer) -> Dict:
+     """Tokenize a list of strings."""
+     tokenized_list = [
+         tokenizer(
+             text,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ) for text in strings
+     ]
+     input_ids = labels = [
+         tokenized.input_ids[0] for tokenized in tokenized_list
+     ]
+     input_ids_lens = labels_lens = [
+         tokenized.input_ids.ne(tokenizer.pad_token_id).sum().item()
+         for tokenized in tokenized_list
+     ]
+     return dict(
+         input_ids=input_ids,
+         labels=labels,
+         input_ids_lens=input_ids_lens,
+         labels_lens=labels_lens,
+     )
+
+
+ def _mask_targets(target, tokenized_lens, speakers):
+     # cur_idx = 0
+     cur_idx = tokenized_lens[0]
+     tokenized_lens = tokenized_lens[1:]
+     target[:cur_idx] = IGNORE_INDEX
+     for tokenized_len, speaker in zip(tokenized_lens, speakers):
+         if speaker == "human":
+             target[cur_idx + 2:cur_idx + tokenized_len] = IGNORE_INDEX
+         cur_idx += tokenized_len
+
+
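`_mask_targets` above overwrites spans of the label tensor with `IGNORE_INDEX` so the loss is only computed on assistant tokens. The indexing can be sketched on a plain Python list (assuming `IGNORE_INDEX` is -100, the usual value for HF cross-entropy labels, and hypothetical turn lengths):

```python
IGNORE_INDEX = -100  # assumed value, as in typical HF label masking

def mask_targets(target, tokenized_lens, speakers):
    # target: list of token ids; tokenized_lens[0] is the header length,
    # the rest are per-turn lengths aligned with speakers.
    cur_idx = tokenized_lens[0]
    target[:cur_idx] = [IGNORE_INDEX] * cur_idx
    for tokenized_len, speaker in zip(tokenized_lens[1:], speakers):
        if speaker == "human":
            # The cur_idx + 2 offset mirrors _mask_targets, which leaves
            # the first two tokens of a human round unmasked.
            n = tokenized_len - 2
            target[cur_idx + 2:cur_idx + tokenized_len] = [IGNORE_INDEX] * n
        cur_idx += tokenized_len
    return target

print(mask_targets(list(range(10)), [2, 4, 4], ["human", "gpt"]))
```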
+ def _add_speaker_and_signal(header, source, get_conversation=True):
+     """Add speaker and start/end signal on each round."""
+     BEGIN_SIGNAL = "### "
+     END_SIGNAL = "\n"
+     conversation = header
+     for sentence in source:
+         from_str = sentence["from"]
+         if from_str.lower() == "human":
+             from_str = conversation_lib.default_conversation.roles[0]
+         elif from_str.lower() == "gpt":
+             from_str = conversation_lib.default_conversation.roles[1]
+         else:
+             from_str = 'unknown'
+         sentence["value"] = (BEGIN_SIGNAL + from_str + ": " +
+                              sentence["value"] + END_SIGNAL)
+         if get_conversation:
+             conversation += sentence["value"]
+     conversation += BEGIN_SIGNAL
+     return conversation
+
+
+ def preprocess_multimodal(
+     sources: Sequence[str],
+     data_args: DataArguments,
+     image_token_num=1
+ ) -> Dict:
+     is_multimodal = data_args.is_multimodal
+     if not is_multimodal:
+         return sources
+
+     for source in sources:
+         for sentence in source:
+             if DEFAULT_IMAGE_TOKEN in sentence['value'] or DEFAULT_VIDEO_TOKEN in sentence['value']:
+                 sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN + '\n', DEFAULT_IMAGE_TOKEN).strip()
+                 sentence['value'] = sentence['value'].replace('\n' + DEFAULT_IMAGE_TOKEN, DEFAULT_IMAGE_TOKEN).strip()
+                 if sentence['value'].endswith(DEFAULT_IMAGE_TOKEN):
+                     IMAGE_TOKEN_NUM = sentence['value'].count(DEFAULT_IMAGE_TOKEN)
+                     sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN * IMAGE_TOKEN_NUM, '').strip()
+                     sentence['value'] = DEFAULT_IMAGE_TOKEN * IMAGE_TOKEN_NUM + sentence['value']
+                     sentence['value'] = sentence['value'].strip()
+                 if sentence['value'].endswith(DEFAULT_VIDEO_TOKEN):
+                     VIDEO_TOKEN_NUM = sentence['value'].count(DEFAULT_VIDEO_TOKEN)
+                     sentence['value'] = sentence['value'].replace(DEFAULT_VIDEO_TOKEN * VIDEO_TOKEN_NUM, '').strip()
+                     sentence['value'] = DEFAULT_VIDEO_TOKEN * VIDEO_TOKEN_NUM + sentence['value']
+                     sentence['value'] = sentence['value'].strip()
+
+                 if "mmtag" in conversation_lib.default_conversation.version:
+                     sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN,
+                                                                   '<Image>' + DEFAULT_IMAGE_TOKEN + '</Image>')
+
+             IMAGE_TOKEN_NUM = sentence['value'].count(DEFAULT_IMAGE_TOKEN)
+             if IMAGE_TOKEN_NUM > MAX_IMAGE_LENGTH:
+                 sentence['value'] = sentence['value'].replace(DEFAULT_IMAGE_TOKEN * IMAGE_TOKEN_NUM,
+                                                               DEFAULT_IMAGE_TOKEN * MAX_IMAGE_LENGTH).strip()
+
+             replace_token, vid_replace_token = DEFAULT_IMAGE_TOKEN, DEFAULT_IMAGE_TOKEN * image_token_num
+             if data_args.mm_use_im_start_end:
+                 replace_token = DEFAULT_IM_START_TOKEN + replace_token + DEFAULT_IM_END_TOKEN
+                 vid_replace_token = DEFAULT_VID_START_TOKEN + vid_replace_token + DEFAULT_VID_END_TOKEN
+
+             sentence["value"] = sentence["value"].replace(DEFAULT_IMAGE_TOKEN, replace_token + '\n')
+             sentence['value'] = sentence['value'].replace(DEFAULT_VIDEO_TOKEN, vid_replace_token + '\n')
+             sentence['value'] = sentence['value'].replace('\n\n', '\n')
+
+     return sources
+
+
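`preprocess_multimodal` normalizes the placement of image tokens in a turn and caps how many may appear. The string manipulation can be sketched standalone, with assumed constants (`<image>` as the token and a cap of 64, stand-ins for `DEFAULT_IMAGE_TOKEN` and `MAX_IMAGE_LENGTH` from `ChatUniVi.constants`):

```python
IMAGE_TOKEN = "<image>"   # assumed value of DEFAULT_IMAGE_TOKEN
MAX_IMAGE_LENGTH = 64     # assumed cap, mirrors MAX_IMAGE_LENGTH

def normalize_image_tokens(value):
    # Strip newlines adjacent to the token, then cap repeated tokens.
    value = value.replace(IMAGE_TOKEN + "\n", IMAGE_TOKEN).strip()
    value = value.replace("\n" + IMAGE_TOKEN, IMAGE_TOKEN).strip()
    n = value.count(IMAGE_TOKEN)
    if n > MAX_IMAGE_LENGTH:
        value = value.replace(IMAGE_TOKEN * n, IMAGE_TOKEN * MAX_IMAGE_LENGTH).strip()
    return value

print(normalize_image_tokens("<image>\nWhat is shown?"))  # <image>What is shown?
```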
+ def preprocess_llama_2(
+     sources,
+     tokenizer: transformers.PreTrainedTokenizer,
+     has_image: bool = False
+ ) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+     if has_image:
+         input_ids = torch.stack(
+             [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+     else:
+         input_ids = tokenizer(
+             conversations,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ).input_ids
+
+     targets = input_ids.clone()
+
+     assert conv.sep_style == conversation_lib.SeparatorStyle.LLAMA_2
+
+     # Mask targets
+     sep = "[/INST] "
+     for conversation, target in zip(conversations, targets):
+         total_len = int(target.ne(tokenizer.pad_token_id).sum())
+
+         rounds = conversation.split(conv.sep2)
+
+         cur_len = 1
+         target[:cur_len] = IGNORE_INDEX
+
+         for i, rou in enumerate(rounds):
+             if rou == "":
+                 break
+
+             parts = rou.split(sep)
+             if len(parts) != 2:
+                 break
+             parts[0] += sep
+
+             if has_image:
+                 round_len = len(tokenizer_image_token(rou, tokenizer))
+                 instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
+             else:
+                 round_len = len(tokenizer(rou).input_ids)
+                 instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+             target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
+
+             cur_len += round_len
+
+             if tokenizer.eos_token == tokenizer.pad_token:
+                 cur_len += 1
+
+         target[cur_len:] = IGNORE_INDEX
+
+         if cur_len < tokenizer.model_max_length:
+             if cur_len != total_len:
+                 target[:] = IGNORE_INDEX
+                 print(
+                     f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                     f" (ignored)"
+                 )
+
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess_v1(
+     sources,
+     tokenizer: transformers.PreTrainedTokenizer,
+     has_image: bool = False
+ ) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+     if has_image:
+         input_ids = torch.stack(
+             [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+     else:
+         input_ids = tokenizer(
+             conversations,
+             return_tensors="pt",
+             padding="longest",
+             max_length=tokenizer.model_max_length,
+             truncation=True,
+         ).input_ids
+
+     targets = input_ids.clone()
+     assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
+
+     # Mask targets
+     round_len_list = []
+     sep = conv.sep + conv.roles[1] + ": "
+     for conversation, target in zip(conversations, targets):
+         total_len = int(target.ne(tokenizer.pad_token_id).sum())
+
+         rounds = conversation.split(conv.sep2)
+         cur_len = 1
+         target[:cur_len] = IGNORE_INDEX
+         for i, rou in enumerate(rounds):
+             if rou == "":
+                 break
+
+             parts = rou.split(sep)
+             if len(parts) != 2:
+                 break
+             parts[0] += sep
+
+             if has_image:
+                 round_len = len(tokenizer_image_token(rou, tokenizer))
+                 instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
+             else:
+                 round_len = len(tokenizer(rou).input_ids)
+                 instruction_len = len(tokenizer(parts[0]).input_ids) - 2
+
+             target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
+             # print("rou:", rou)
+             # print(round_len, instruction_len)
+             # print(len(tokenizer(rou).input_ids), len(tokenizer_image_token(rou, tokenizer)))
+             cur_len += round_len
+             round_len_list.append(round_len)
+         target[cur_len:] = IGNORE_INDEX
+
+         if cur_len < tokenizer.model_max_length:
+             if cur_len != total_len:
+                 # print(conversations, target, round_len_list)
+                 target[:] = IGNORE_INDEX
+                 print(
+                     f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                     f" (ignored)"
+                 )
+                 # exit()
+     # print("ok", conversations, target, round_len_list)
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess_mpt(
+     sources,
+     tokenizer: transformers.PreTrainedTokenizer,
+ ) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+     input_ids = torch.stack([tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations],
+                             dim=0)
+     targets = input_ids.clone()
+     assert conv.sep_style == conversation_lib.SeparatorStyle.MPT
+
+     # Mask targets
+     sep = conv.sep + conv.roles[1]
+     for conversation, target in zip(conversations, targets):
+         total_len = int(target.ne(tokenizer.pad_token_id).sum())
+
+         rounds = conversation.split(conv.sep)
+         re_rounds = [conv.sep.join(rounds[:3])]  # system + user + gpt
+         for conv_idx in range(3, len(rounds), 2):
+             re_rounds.append(conv.sep.join(rounds[conv_idx:conv_idx + 2]))  # user + gpt
+         cur_len = 0
+         target[:cur_len] = IGNORE_INDEX
+         for i, rou in enumerate(re_rounds):
+             if rou == "":
+                 break
+
+             parts = rou.split(sep)
+             if len(parts) != 2:
+                 break
+             parts[0] += sep
+             round_len = len(tokenizer_image_token(rou, tokenizer)) + len(tokenizer_image_token(conv.sep, tokenizer))
+             instruction_len = len(tokenizer_image_token(parts[0], tokenizer))
+             target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
+
+             cur_len += round_len
+         target[cur_len:] = IGNORE_INDEX
+
+         if cur_len < tokenizer.model_max_length:
+             if cur_len != total_len:
+                 target[:] = IGNORE_INDEX
+                 print(
+                     f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
+                     f" (ignored)"
+                 )
+
+     return dict(
+         input_ids=input_ids,
+         labels=targets,
+     )
+
+
+ def preprocess_plain(
+     sources: Sequence[str],
+     tokenizer: transformers.PreTrainedTokenizer,
+ ) -> Dict:
+     # add end signal and concatenate together
+     conversations = []
+     for source in sources:
+         assert len(source) == 2
+         assert DEFAULT_IMAGE_TOKEN in source[0]['value']
+         source[0]['value'] = DEFAULT_IMAGE_TOKEN
+         conversation = source[0]['value'] + source[1]['value'] + conversation_lib.default_conversation.sep
+         conversations.append(conversation)
+     # tokenize conversations
+     input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
+     targets = copy.deepcopy(input_ids)
+     for target, source in zip(targets, sources):
+         tokenized_len = len(tokenizer_image_token(source[0]['value'], tokenizer))
+         target[:tokenized_len] = IGNORE_INDEX
+
+     return dict(input_ids=input_ids, labels=targets)
+
+
+ def preprocess_phi(
+     sources,
+     tokenizer: transformers.PreTrainedTokenizer,
+     has_image: bool = False
+ ) -> Dict:
+     conv = conversation_lib.default_conversation.copy()
+     roles = {"human": conv.roles[0], "gpt": conv.roles[1]}
+
+     # Apply prompt templates
+     conversations = []
+     for i, source in enumerate(sources):
+         if roles[source[0]["from"]] != conv.roles[0]:
+             # Skip the first one if it is not from human
+             source = source[1:]
+
+         conv.messages = []
+         for j, sentence in enumerate(source):
+             role = roles[sentence["from"]]
+             assert role == conv.roles[j % 2], f"{i}"
+             conv.append_message(role, sentence["value"])
+         conversations.append(conv.get_prompt())
+
+     # Tokenize conversations
+     if has_image:
+         input_ids = torch.stack(
+             [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations], dim=0)
+     else:
+         input_ids = tokenizer(
638
+ conversations,
639
+ return_tensors="pt",
640
+ padding="longest",
641
+ max_length=tokenizer.model_max_length,
642
+ truncation=True,
643
+ ).input_ids
644
+
645
+ targets = input_ids.clone()
646
+ assert conv.sep_style == conversation_lib.SeparatorStyle.TWO
647
+
648
+ # Mask targets
649
+ round_len_list = []
650
+ sep = conv.sep + conv.roles[1] + ": "
651
+ for conversation, target in zip(conversations, targets):
652
+ total_len = int(target.ne(tokenizer.pad_token_id).sum())
653
+
654
+ rounds = conversation.split(conv.sep2)
655
+ cur_len = 0
656
+ pre_len = 0
657
+ for i, rou in enumerate(rounds):
658
+ if rou == "":
659
+ break
660
+
661
+ parts = rou.split(sep)
662
+ if len(parts) != 2:
663
+ break
664
+ parts[0] += sep
665
+
666
+ cur_len += 1
667
+ target[pre_len: cur_len] = IGNORE_INDEX
668
+
669
+ if has_image:
670
+ round_len = len(tokenizer_image_token(rou, tokenizer))
671
+ instruction_len = len(tokenizer_image_token(parts[0], tokenizer)) - 2
672
+ else:
673
+ round_len = len(tokenizer(rou).input_ids)
674
+ instruction_len = len(tokenizer(parts[0]).input_ids) - 2
675
+
676
+ target[cur_len: cur_len + instruction_len] = IGNORE_INDEX
677
+ # print("rou:", rou)
678
+ # print(round_len, instruction_len)
679
+ # print(len(tokenizer(rou).input_ids), len(tokenizer_image_token(rou, tokenizer)))
680
+ cur_len += round_len
681
+ pre_len = cur_len
682
+ round_len_list.append(round_len)
683
+ target[cur_len:] = IGNORE_INDEX
684
+
685
+ if cur_len < tokenizer.model_max_length:
686
+ if cur_len != total_len + len(rounds) - 1:
687
+ # print(conversations, target, round_len_list)
688
+ target[:] = IGNORE_INDEX
689
+ print(
690
+ f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
691
+ f" (ignored)"
692
+ )
693
+ # exit()
694
+ # print("ok", conversations, target, round_len_list)
695
+ return dict(
696
+ input_ids=input_ids,
697
+ labels=targets,
698
+ )
699
+
700
+
701
+ def preprocess(
702
+ sources: Sequence[str],
703
+ tokenizer: transformers.PreTrainedTokenizer,
704
+ has_image: bool = False
705
+ ) -> Dict:
706
+ """
707
+ Given a list of sources, each is a conversation list. This transform:
708
+ 1. Add signal '### ' at the beginning of each sentence, with end signal '\n';
709
+ 2. Concatenate conversations together;
710
+ 3. Tokenize the concatenated conversation;
711
+ 4. Make a deepcopy as the target. Mask human words with IGNORE_INDEX.
712
+ """
713
+ if conversation_lib.default_conversation.version.startswith("phi"):
714
+ return preprocess_phi(sources, tokenizer, has_image=has_image)
715
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.PLAIN:
716
+ return preprocess_plain(sources, tokenizer)
717
+ if conversation_lib.default_conversation.sep_style == conversation_lib.SeparatorStyle.LLAMA_2:
718
+ return preprocess_llama_2(sources, tokenizer, has_image=has_image)
719
+ if conversation_lib.default_conversation.version.startswith("v1"):
720
+ return preprocess_v1(sources, tokenizer, has_image=has_image)
721
+ if conversation_lib.default_conversation.version == "mpt":
722
+ return preprocess_mpt(sources, tokenizer)
723
+ # add end signal and concatenate together
724
+ conversations = []
725
+ for source in sources:
726
+ header = f"{conversation_lib.default_conversation.system}\n\n"
727
+ conversation = _add_speaker_and_signal(header, source)
728
+ conversations.append(conversation)
729
+
730
+ # tokenize conversations
731
+ def get_tokenize_len(prompts):
732
+ return [len(tokenizer_image_token(prompt, tokenizer)) for prompt in prompts]
733
+
734
+ if has_image:
735
+ input_ids = [tokenizer_image_token(prompt, tokenizer, return_tensors='pt') for prompt in conversations]
736
+ else:
737
+ conversations_tokenized = _tokenize_fn(conversations, tokenizer)
738
+ input_ids = conversations_tokenized["input_ids"]
739
+
740
+ targets = copy.deepcopy(input_ids)
741
+ for target, source in zip(targets, sources):
742
+ if has_image:
743
+ tokenized_lens = get_tokenize_len([header] + [s["value"] for s in source])
744
+ else:
745
+ tokenized_lens = _tokenize_fn([header] + [s["value"] for s in source], tokenizer)["input_ids_lens"]
746
+ speakers = [sentence["from"] for sentence in source]
747
+ _mask_targets(target, tokenized_lens, speakers)
748
+
749
+ return dict(input_ids=input_ids, labels=targets)
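For the default branch above, the `_mask_targets` step hides everything except assistant replies from the loss. A minimal sketch of that idea (assuming the helper masks the header and each human turn; `mask_targets` here is illustrative, not the exact implementation, which also masks speaker tokens):

```python
IGNORE_INDEX = -100

def mask_targets(target, tokenized_lens, speakers):
    # Hide the header, then every human turn, by writing IGNORE_INDEX
    # so only assistant tokens contribute to the loss.
    cur = tokenized_lens[0]                 # header length
    target[:cur] = [IGNORE_INDEX] * cur
    for tok_len, speaker in zip(tokenized_lens[1:], speakers):
        if speaker == "human":
            target[cur:cur + tok_len] = [IGNORE_INDEX] * tok_len
        cur += tok_len
    return target

labels = mask_targets(list(range(10)), [2, 3, 5], ["human", "gpt"])
```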
750
+
751
+
752
+ class LazySupervisedDataset(Dataset):
753
+ """Dataset for supervised fine-tuning."""
754
+
755
+ def __init__(self, tokenizer: transformers.PreTrainedTokenizer,
756
+ data_args: DataArguments):
757
+ super(LazySupervisedDataset, self).__init__()
758
+
759
+ dataset_list = DataConfig[str(data_args.dataset_use)]
760
+ print(dataset_list)
761
+
762
+ self.max_length = MAX_IMAGE_LENGTH
763
+ list_data_dict = []
764
+ self.folder_dict = {}
765
+ for i in dataset_list:
766
+ list_data_dict += json.load(open(i["chat_path"], "r"))
767
+
768
+ image_folder = [folder for folder in i if folder != "chat_path"]
769
+
770
+ for folder in image_folder:
771
+ if folder not in self.folder_dict:
772
+ self.folder_dict[folder] = i[folder]
773
+
774
+ random.shuffle(list_data_dict)
775
+
776
+ rank0_print("Formatting inputs...Skip in lazy mode")
777
+ self.tokenizer = tokenizer
778
+ self.list_data_dict = list_data_dict
779
+ self.data_args = data_args
780
+
781
+ def __len__(self):
782
+ return len(self.list_data_dict)
783
+
784
+ def __getitem__(self, i) -> Dict[str, torch.Tensor]:
785
+ sources = self.list_data_dict[i]
786
+ if isinstance(i, int):
787
+ sources = [sources]
788
+ assert len(sources) == 1, "Don't know why it is wrapped to a list" # FIXME
789
+ if 'image' in sources[0]:
790
+ image_file = self.list_data_dict[i]['image']
791
+
792
+ file = image_file[0] if type(image_file) is list else image_file
793
+
794
+ if "llava_image" in file:
795
+ image_folder = self.folder_dict['llava']
796
+ elif "\\" in file:
797
+ image_folder = self.folder_dict['ScienceQA']
798
+ elif "CGD" in file:
799
+ image_folder = self.folder_dict['CDG']
800
+ elif "DC" in file:
801
+ image_folder = self.folder_dict['DC']
802
+ elif "LA" in file:
803
+ image_folder = self.folder_dict['LA']
804
+ elif "SD" in file:
805
+ image_folder = self.folder_dict['SD']
806
+ elif "SN" in file:
807
+ image_folder = self.folder_dict['SN']
808
+ elif "TVC" in file:
809
+ image_folder = self.folder_dict['TVC']
810
+ elif "VST" in file:
811
+ image_folder = self.folder_dict['VST']
812
+ elif "GCC" in file:
813
+ image_folder = self.folder_dict['CC3M']
814
+ elif "COCO_train2014" in file:
815
+ image_folder = self.folder_dict['COCO2014']
816
+ else:
817
+ image_folder = self.folder_dict['COCO2017']
818
+
819
+ processor = self.data_args.image_processor
820
+
821
+ if type(image_file) is list:
822
+ image = [Image.open(os.path.join(image_folder, file.replace("\\", "/"))).convert('RGB') for file in
823
+ image_file]
824
+ if self.data_args.image_aspect_ratio == 'pad':
825
+ def expand2square(pil_img, background_color):
826
+ width, height = pil_img.size
827
+ if width == height:
828
+ return pil_img
829
+ elif width > height:
830
+ result = Image.new(pil_img.mode, (width, width), background_color)
831
+ result.paste(pil_img, (0, (width - height) // 2))
832
+ return result
833
+ else:
834
+ result = Image.new(pil_img.mode, (height, height), background_color)
835
+ result.paste(pil_img, ((height - width) // 2, 0))
836
+ return result
837
+
838
+ image = [expand2square(i, tuple(int(x * 255) for x in processor.image_mean)) for i in image]
839
+ image = [processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
840
+ else:
841
+ image = [processor.preprocess(i, return_tensors='pt')['pixel_values'][0] for i in image]
842
+ else:
843
+ image = Image.open(os.path.join(image_folder, image_file.replace("\\", "/"))).convert('RGB')
844
+ if self.data_args.image_aspect_ratio == 'pad':
845
+ def expand2square(pil_img, background_color):
846
+ width, height = pil_img.size
847
+ if width == height:
848
+ return pil_img
849
+ elif width > height:
850
+ result = Image.new(pil_img.mode, (width, width), background_color)
851
+ result.paste(pil_img, (0, (width - height) // 2))
852
+ return result
853
+ else:
854
+ result = Image.new(pil_img.mode, (height, height), background_color)
855
+ result.paste(pil_img, ((height - width) // 2, 0))
856
+ return result
857
+
858
+ image = expand2square(image, tuple(int(x * 255) for x in processor.image_mean))
859
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
860
+ else:
861
+ image = processor.preprocess(image, return_tensors='pt')['pixel_values'][0]
862
+
863
+ sources = preprocess_multimodal(
864
+ copy.deepcopy([e["conversations"] for e in sources]),
865
+ self.data_args)
866
+
867
+ data_dict = preprocess(
868
+ sources,
869
+ self.tokenizer,
870
+ has_image=True)
871
+
872
+ elif "video" in sources[0]:
873
+ video_file = self.list_data_dict[i]['video']
874
+
875
+ if "valley" in video_file:
876
+ video_folder = self.folder_dict['valley']
877
+ else:
878
+ video_folder = self.folder_dict['VIDEO']
879
+ processor = self.data_args.image_processor
880
+
881
+ if os.path.exists(os.path.join(video_folder, video_file)):
882
+ image, image_token_num = _get_rawvideo_dec(os.path.join(video_folder, video_file), processor,
883
+ max_frames=MAX_IMAGE_LENGTH)
884
+ flag = 0
885
+ else:
886
+ crop_size = self.data_args.image_processor.crop_size
887
+ image, image_token_num = torch.zeros(3, crop_size['height'], crop_size['width']), 1
888
+ flag = 1
889
+
890
+ sources = preprocess_multimodal(
891
+ copy.deepcopy([e["conversations"] for e in sources]),
892
+ self.data_args, image_token_num=image_token_num)
893
+
894
+ data_dict = preprocess(
895
+ sources,
896
+ self.tokenizer,
897
+ has_image=True)
898
+
899
+ if flag:
900
+ data_dict["labels"][:] = IGNORE_INDEX
901
+ print(
902
+ f"WARNING: video load failed: {os.path.join(video_folder, video_file)}."
903
+ f" (ignored)"
904
+ )
905
+
906
+ else:
907
+ sources = copy.deepcopy([e["conversations"] for e in sources])
908
+
909
+ data_dict = preprocess(
910
+ sources,
911
+ self.tokenizer,
912
+ has_image=False)
913
+
914
+ if isinstance(i, int):
915
+ data_dict = dict(input_ids=data_dict["input_ids"][0],
916
+ labels=data_dict["labels"][0])
917
+
918
+ # image exist in the data
919
+ if 'image' in self.list_data_dict[i] or 'video' in self.list_data_dict[i]:
920
+ data_dict['image'] = image
921
+ elif self.data_args.is_multimodal:
922
+ # image does not exist in the data, but the model is multimodal
923
+ crop_size = self.data_args.image_processor.crop_size
924
+ data_dict['image'] = torch.zeros(3, crop_size['height'], crop_size['width'])
925
+ return data_dict
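The `expand2square` helper defined inside `__getitem__` pads the shorter side so the image becomes square (filled with the processor's mean color) before preprocessing. Its geometry can be sketched without PIL; `expand2square_box` below is an illustration, not part of the codebase:

```python
def expand2square_box(width, height):
    # Returns (canvas_size, paste_offset) matching expand2square above:
    # the canvas is max(width, height) squared, and the original image
    # is centered along the padded axis.
    if width == height:
        return (width, height), (0, 0)
    if width > height:
        return (width, width), (0, (width - height) // 2)
    return (height, height), ((height - width) // 2, 0)
```

For a 4x2 landscape image the canvas is 4x4 and the image is pasted one pixel down, which is exactly what `result.paste(pil_img, (0, (width - height) // 2))` does.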
926
+
927
+
928
+ @dataclass
929
+ class DataCollatorForSupervisedDataset(object):
930
+ """Collate examples for supervised fine-tuning."""
931
+
932
+ tokenizer: transformers.PreTrainedTokenizer
933
+
934
+ def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
935
+ input_ids, labels = tuple([instance[key] for instance in instances]
936
+ for key in ("input_ids", "labels"))
937
+ input_ids = torch.nn.utils.rnn.pad_sequence(
938
+ input_ids,
939
+ batch_first=True,
940
+ padding_value=self.tokenizer.pad_token_id)
941
+ labels = torch.nn.utils.rnn.pad_sequence(labels,
942
+ batch_first=True,
943
+ padding_value=IGNORE_INDEX)
944
+ input_ids = input_ids[:, :self.tokenizer.model_max_length]
945
+ labels = labels[:, :self.tokenizer.model_max_length]
946
+ batch = dict(
947
+ input_ids=input_ids,
948
+ labels=labels,
949
+ attention_mask=input_ids.ne(self.tokenizer.pad_token_id),
950
+ )
951
+
952
+ if 'image' in instances[0]:
953
+ images = [instance['image'] for instance in instances]
954
+
955
+ new_images = []
956
+ for image in images:
957
+ if type(image) is list:
958
+ for i in image:
959
+ new_images.append(i)
960
+ else:
961
+ new_images.append(image)
962
+ images = new_images
963
+
964
+ if all(x is not None and x.shape == images[0].shape for x in images):
965
+ batch['images'] = torch.stack(images)
966
+ else:
967
+ batch['images'] = images
968
+
969
+ return batch
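The collator pads `input_ids` with the pad token, pads `labels` with `IGNORE_INDEX`, and derives the attention mask from the pad positions. The same logic in plain Python (a sketch without torch; real batches are tensors and the pad id comes from the tokenizer):

```python
IGNORE_INDEX = -100
PAD_TOKEN_ID = 0  # illustrative; the real value is tokenizer.pad_token_id

def collate(batch):
    # Right-pad every example to the longest sequence in the batch.
    max_len = max(len(ex["input_ids"]) for ex in batch)
    out = {"input_ids": [], "labels": [], "attention_mask": []}
    for ex in batch:
        n_pad = max_len - len(ex["input_ids"])
        out["input_ids"].append(ex["input_ids"] + [PAD_TOKEN_ID] * n_pad)
        out["labels"].append(ex["labels"] + [IGNORE_INDEX] * n_pad)
        out["attention_mask"].append([True] * len(ex["input_ids"]) + [False] * n_pad)
    return out

batch = collate([
    {"input_ids": [5, 6, 7], "labels": [-100, 6, 7]},
    {"input_ids": [8, 9], "labels": [-100, 9]},
])
```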
970
+
971
+
972
+ def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer,
973
+ data_args) -> Dict:
974
+ """Make dataset and collator for supervised fine-tuning."""
975
+ train_dataset = LazySupervisedDataset(tokenizer=tokenizer, data_args=data_args)
976
+ data_collator = DataCollatorForSupervisedDataset(tokenizer=tokenizer)
977
+ return dict(train_dataset=train_dataset,
978
+ eval_dataset=None,
979
+ data_collator=data_collator)
980
+
981
+
982
+ def train():
983
+ global local_rank
984
+
985
+ parser = transformers.HfArgumentParser(
986
+ (ModelArguments, DataArguments, TrainingArguments))
987
+ model_args, data_args, training_args = parser.parse_args_into_dataclasses()
988
+ local_rank = training_args.local_rank
989
+ compute_dtype = (torch.float16 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
990
+
991
+ random.seed(training_args.seed)
992
+ os.environ['PYTHONHASHSEED'] = str(training_args.seed)
993
+ np.random.seed(training_args.seed)
994
+ torch.manual_seed(training_args.seed)
995
+ torch.cuda.manual_seed(training_args.seed)
996
+ torch.cuda.manual_seed_all(training_args.seed) # if you are using multi-GPU.
997
+ torch.backends.cudnn.benchmark = False
998
+ torch.backends.cudnn.deterministic = True
999
+
1000
+ bnb_model_from_pretrained_args = {}
1001
+ if training_args.bits in [4, 8]:
1002
+ from transformers import BitsAndBytesConfig
1003
+ bnb_model_from_pretrained_args.update(dict(
1004
+ device_map={"": training_args.device},
1005
+ load_in_4bit=training_args.bits == 4,
1006
+ load_in_8bit=training_args.bits == 8,
1007
+ quantization_config=BitsAndBytesConfig(
1008
+ load_in_4bit=training_args.bits == 4,
1009
+ load_in_8bit=training_args.bits == 8,
1010
+ llm_int8_threshold=6.0,
1011
+ llm_int8_has_fp16_weight=False,
1012
+ bnb_4bit_compute_dtype=compute_dtype,
1013
+ bnb_4bit_use_double_quant=training_args.double_quant,
1014
+ bnb_4bit_quant_type=training_args.quant_type # {'fp4', 'nf4'}
1015
+ )
1016
+ ))
1017
+
1018
+ if model_args.vision_tower is not None:
1019
+ if "phi" in model_args.model_name_or_path.lower():
1020
+ from ChatUniVi.model.language_model.phi import ChatUniViPhiForCausalLM
1021
+ model = ChatUniViPhiForCausalLM.from_pretrained(
1022
+ model_args.model_name_or_path,
1023
+ cache_dir=training_args.cache_dir,
1024
+ **bnb_model_from_pretrained_args
1025
+ )
1026
+ else:
1027
+ model = ChatUniViLlamaForCausalLM.from_pretrained(
1028
+ model_args.model_name_or_path,
1029
+ cache_dir=training_args.cache_dir,
1030
+ **bnb_model_from_pretrained_args
1031
+ )
1032
+ else:
1033
+ model = transformers.LlamaForCausalLM.from_pretrained(
1034
+ model_args.model_name_or_path,
1035
+ cache_dir=training_args.cache_dir,
1036
+ **bnb_model_from_pretrained_args
1037
+ )
1038
+ model.config.use_cache = False
1039
+
1040
+ if model_args.freeze_backbone:
1041
+ model.model.requires_grad_(False)
1042
+
1043
+ if training_args.bits in [4, 8]:
1044
+ from peft import prepare_model_for_kbit_training
1045
+ model.config.torch_dtype = (
1046
+ torch.float32 if training_args.fp16 else (torch.bfloat16 if training_args.bf16 else torch.float32))
1047
+ model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=training_args.gradient_checkpointing)
1048
+
1049
+ if training_args.gradient_checkpointing:
1050
+ if hasattr(model, "enable_input_require_grads"):
1051
+ model.enable_input_require_grads()
1052
+ else:
1053
+ def make_inputs_require_grad(module, input, output):
1054
+ output.requires_grad_(True)
1055
+
1056
+ model.get_input_embeddings().register_forward_hook(make_inputs_require_grad)
1057
+
1058
+ if training_args.lora_enable:
1059
+ from peft import LoraConfig, get_peft_model
1060
+ lora_config = LoraConfig(
1061
+ r=training_args.lora_r,
1062
+ lora_alpha=training_args.lora_alpha,
1063
+ target_modules=find_all_linear_names(model),
1064
+ lora_dropout=training_args.lora_dropout,
1065
+ bias=training_args.lora_bias,
1066
+ task_type="CAUSAL_LM",
1067
+ )
1068
+ if training_args.bits == 16:
1069
+ if training_args.bf16:
1070
+ model.to(torch.bfloat16)
1071
+ if training_args.fp16:
1072
+ model.to(torch.float16)
1073
+ rank0_print("Adding LoRA adapters...")
1074
+ model = get_peft_model(model, lora_config)
1075
+
1076
+ if 'mpt' in model_args.model_name_or_path:
1077
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
1078
+ model_args.model_name_or_path,
1079
+ cache_dir=training_args.cache_dir,
1080
+ model_max_length=training_args.model_max_length,
1081
+ padding_side="right"
1082
+ )
1083
+ else:
1084
+ tokenizer = transformers.AutoTokenizer.from_pretrained(
1085
+ model_args.model_name_or_path,
1086
+ cache_dir=training_args.cache_dir,
1087
+ model_max_length=training_args.model_max_length,
1088
+ padding_side="right",
1089
+ use_fast=True,
1090
+ )
1091
+
1092
+ if model_args.version == "v0":
1093
+ if tokenizer.pad_token is None:
1094
+ smart_tokenizer_and_embedding_resize(
1095
+ special_tokens_dict=dict(pad_token="[PAD]"),
1096
+ tokenizer=tokenizer,
1097
+ model=model,
1098
+ )
1099
+ if "llama" in model_args.model_name_or_path.lower():
1100
+ tokenizer.add_special_tokens({
1101
+ "eos_token": "</s>",
1102
+ "bos_token": "<s>",
1103
+ "unk_token": "<unk>",
1104
+ })
1105
+ elif model_args.version == "v0.5":
1106
+ tokenizer.pad_token = tokenizer.unk_token
1107
+ elif model_args.version == "phi":
1108
+ tokenizer.pad_token = tokenizer.unk_token
1109
+ conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
1110
+ else:
1111
+ tokenizer.pad_token = tokenizer.unk_token
1112
+ if model_args.version in conversation_lib.conv_templates:
1113
+ conversation_lib.default_conversation = conversation_lib.conv_templates[model_args.version]
1114
+ else:
1115
+ conversation_lib.default_conversation = conversation_lib.conv_templates["vicuna_v1"]
1116
+
1117
+ if model_args.vision_tower is not None:
1118
+ model.get_model().initialize_vision_modules(
1119
+ model_args=model_args,
1120
+ fsdp=training_args.fsdp
1121
+ )
1122
+
1123
+ vision_tower = model.get_vision_tower()
1124
+ vision_tower.to(dtype=torch.float16, device=training_args.device)
1125
+
1126
+ data_args.image_processor = vision_tower.image_processor
1127
+ data_args.is_multimodal = True
1128
+
1129
+ model.config.image_aspect_ratio = data_args.image_aspect_ratio
1130
+ model.config.image_grid_pinpoints = data_args.image_grid_pinpoints
1131
+
1132
+ model.config.tune_mm_mlp_adapter = training_args.tune_mm_mlp_adapter = model_args.tune_mm_mlp_adapter
1133
+ if model_args.tune_mm_mlp_adapter:
1134
+ model.requires_grad_(False)
1135
+ for p in model.get_model().mm_projector.parameters():
1136
+ p.requires_grad = True
1137
+
1138
+ model.config.freeze_mm_mlp_adapter = training_args.freeze_mm_mlp_adapter
1139
+ if training_args.freeze_mm_mlp_adapter:
1140
+ for p in model.get_model().mm_projector.parameters():
1141
+ p.requires_grad = False
1142
+
1143
+ if training_args.bits in [4, 8]:
1144
+ model.get_model().mm_projector.to(dtype=compute_dtype, device=training_args.device)
1145
+
1146
+ model.config.mm_use_im_start_end = data_args.mm_use_im_start_end = model_args.mm_use_im_start_end
1147
+ training_args.use_im_start_end = model_args.mm_use_im_start_end
1148
+
1149
+ model.config.mm_use_box_start_end = data_args.mm_use_box_start_end = model_args.mm_use_box_start_end
1150
+ training_args.use_box_start_end = model_args.mm_use_box_start_end
1151
+
1152
+ model.config.mm_use_im_patch_token = model_args.mm_use_im_patch_token
1153
+ model.initialize_vision_tokenizer(model_args, tokenizer=tokenizer)
1154
+
1155
+ model_config = ModelConfig[str(model_args.model_use)]
1156
+ model.config.architectures = "LlavaLlamaForCausalLM"
1157
+
1158
+ model.config.config = model_config
1159
+ model_args.use_cluster = model_config["use_cluster"]
1160
+ model_args.spatial_cluster_rate0 = model_config["spatial_cluster_rate0"]
1161
+ model_args.spatial_cluster_rate1 = model_config["spatial_cluster_rate1"]
1162
+ model_args.spatial_cluster_rate2 = model_config["spatial_cluster_rate2"]
1163
+ model_args.temporal_cluster_rate = model_config.get("temporal_cluster_rate", 1 / 16)
1164
+ model.get_model().initialize_cluster_modules(model_args)
1165
+
1166
+ if model_args.use_cluster:
1167
+ for n, p in model.named_parameters():
1168
+ if "block" in n or "ctm" in n:
1169
+ p.requires_grad = True
1170
+
1171
+ if model.config.config["freeze"]:
1172
+ for n, p in model.named_parameters():
1173
+ if "block" not in n and "ctm" not in n:
1174
+ p.requires_grad = False
1175
+
1176
+ if model.config.config["mm_tune"]:
1177
+ for p in model.get_model().mm_projector.parameters():
1178
+ p.requires_grad = True
1179
+
1180
+ model_args.vision_tune = model_config["vision_tune"]
1181
+ for p in model.get_vision_tower().parameters():
1182
+ p.requires_grad = model_args.vision_tune
1183
+
1184
+ params_need_grad = [n for n, p in model.named_parameters() if p.requires_grad]
1185
+ print("Parameters require gradients: {}".format(params_need_grad))
1186
+
1187
+ if training_args.bits in [4, 8]:
1188
+ from peft.tuners.lora import LoraLayer
1189
+ for name, module in model.named_modules():
1190
+ if isinstance(module, LoraLayer):
1191
+ if training_args.bf16:
1192
+ module = module.to(torch.bfloat16)
1193
+ if 'norm' in name:
1194
+ module = module.to(torch.float32)
1195
+ if 'lm_head' in name or 'embed_tokens' in name:
1196
+ if hasattr(module, 'weight'):
1197
+ if training_args.bf16 and module.weight.dtype == torch.float32:
1198
+ module = module.to(torch.bfloat16)
1199
+
1200
+ data_module = make_supervised_data_module(tokenizer=tokenizer,
1201
+ data_args=data_args)
1202
+
1203
+ trainer = ChatUniViTrainer(model=model,
1204
+ tokenizer=tokenizer,
1205
+ args=training_args,
1206
+ **data_module)
1207
+
1208
+ if list(pathlib.Path(training_args.output_dir).glob("checkpoint-*")):
1209
+ trainer.train(resume_from_checkpoint=True)
1210
+ else:
1211
+ trainer.train()
1212
+
1213
+ model.config.use_cache = True
1214
+
1215
+ if training_args.lora_enable:
1216
+ state_dict = get_peft_state_maybe_zero_3(
1217
+ model.named_parameters(), training_args.lora_bias
1218
+ )
1219
+ non_lora_state_dict = get_peft_state_non_lora_maybe_zero_3(
1220
+ model.named_parameters()
1221
+ )
1222
+ if training_args.local_rank == 0 or training_args.local_rank == -1:
1223
+ model.config.save_pretrained(training_args.output_dir)
1224
+ model.save_pretrained(training_args.output_dir, state_dict=state_dict)
1225
+ torch.save(non_lora_state_dict, os.path.join(training_args.output_dir, 'non_lora_trainables.bin'))
1226
+ else:
1227
+ safe_save_model_for_hf_trainer(trainer=trainer,
1228
+ output_dir=training_args.output_dir)
1229
+
1230
+
1231
+ if __name__ == "__main__":
1232
+ train()
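`train()` passes `target_modules=find_all_linear_names(model)` to `LoraConfig`. A hedged sketch of that helper's idea, with module types passed in as strings so it runs without torch (the real helper walks `model.named_modules()` and checks `isinstance(m, nn.Linear)`; names and skip list here are assumptions):

```python
def find_linear_names(modules, skip=("mm_projector", "vision_tower", "lm_head")):
    # Collect leaf names of Linear modules, excluding multimodal parts
    # and the LM head so LoRA only wraps language-model projections.
    names = set()
    for name, kind in modules:
        if kind == "Linear" and not any(s in name for s in skip):
            names.add(name.split(".")[-1])
    return sorted(names)

targets = find_linear_names([
    ("model.layers.0.self_attn.q_proj", "Linear"),
    ("model.layers.0.self_attn.k_proj", "Linear"),
    ("model.mm_projector.0", "Linear"),
    ("model.layers.0.input_layernorm", "LayerNorm"),
])
```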
ChatUniVi/train/train_mem.py ADDED
@@ -0,0 +1,13 @@
1
+ # Adopted from https://github.com/lm-sys/FastChat. Below is the original copyright:
2
+ # Adopted from tatsu-lab@stanford_alpaca. Below is the original copyright:
3
+ # Make it more memory efficient by monkey patching the LLaMA model with FlashAttn.
4
+
5
+ # Need to call this before importing transformers.
6
+ from ChatUniVi.train.llama_flash_attn_monkey_patch import replace_llama_attn_with_flash_attn
7
+
8
+ replace_llama_attn_with_flash_attn()
9
+
10
+ from ChatUniVi.train.train import train
11
+
12
+ if __name__ == "__main__":
13
+ train()
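`replace_llama_attn_with_flash_attn()` works by monkey patching: it reassigns the attention class's `forward` before transformers binds it, which is why it must run before any transformers import. The pattern in miniature (class and function names here are illustrative only):

```python
class Attention:
    def forward(self, x):
        return "slow"

def flash_forward(self, x):
    return "flash"

def replace_attn_with_flash_attn():
    # Reassign at the class level, so every existing and future
    # instance picks up the patched method.
    Attention.forward = flash_forward

replace_attn_with_flash_attn()
```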
ChatUniVi/train/trainer.py ADDED
@@ -0,0 +1,53 @@
1
+ import os
2
+ import torch
3
+ from transformers import Trainer
4
+ from typing import Optional
5
+
6
+
7
+ def maybe_zero_3(param, ignore_status=False, name=None):
8
+ from deepspeed import zero
9
+ from deepspeed.runtime.zero.partition_parameters import ZeroParamStatus
10
+ if hasattr(param, "ds_id"):
11
+ if param.ds_status == ZeroParamStatus.NOT_AVAILABLE:
12
+ if not ignore_status:
13
+ print(name, 'no ignore status')
14
+ with zero.GatheredParameters([param]):
15
+ param = param.data.detach().cpu().clone()
16
+ else:
17
+ param = param.detach().cpu().clone()
18
+ return param
19
+
20
+
21
+ def get_mm_adapter_state_maybe_zero_3(named_params, keys_to_match):
22
+ to_return = {k: t for k, t in named_params if any(key_match in k for key_match in keys_to_match)}
23
+ to_return = {k: maybe_zero_3(v, ignore_status=True, name=k).cpu() for k, v in to_return.items()}
24
+ return to_return
25
+
26
+
27
+ class ChatUniViTrainer(Trainer):
28
+ def _save_checkpoint(self, model, trial, metrics=None):
29
+ if 0 and getattr(self.args, 'tune_mm_mlp_adapter', False):
30
+ from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR
31
+ checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
32
+
33
+ run_dir = self._get_output_dir(trial=trial)
34
+ output_dir = os.path.join(run_dir, checkpoint_folder)
35
+
36
+ # Only save Adapter
37
+ keys_to_match = ['mm_projector', "ctm", "block"]
38
+ if getattr(self.args, "use_im_start_end", False):
39
+ keys_to_match.extend(['embed_tokens', 'embed_in'])
40
+
41
+ weight_to_save = get_mm_adapter_state_maybe_zero_3(self.model.named_parameters(), keys_to_match)
42
+
43
+ if self.args.local_rank == 0 or self.args.local_rank == -1:
44
+ self.model.config.save_pretrained(output_dir)
45
+ torch.save(weight_to_save, os.path.join(output_dir, f'mm_projector.bin'))
46
+ else:
47
+ super(ChatUniViTrainer, self)._save_checkpoint(model, trial, metrics)
48
+
49
+ def _save(self, output_dir: Optional[str] = None, state_dict=None):
50
+ if 0 and getattr(self.args, 'tune_mm_mlp_adapter', False):
51
+ pass
52
+ else:
53
+ super(ChatUniViTrainer, self)._save(output_dir, state_dict)
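Minus the DeepSpeed ZeRO-3 gathering, `get_mm_adapter_state_maybe_zero_3` is a substring filter over `named_parameters()`. A minimal sketch of that filtering step (parameter names below are hypothetical):

```python
def get_adapter_state(named_params, keys_to_match):
    # Keep only parameters whose name contains one of the adapter keys
    # (mm_projector / ctm / block in the trainer above).
    return {k: v for k, v in named_params if any(key in k for key in keys_to_match)}

adapter = get_adapter_state(
    [
        ("model.mm_projector.0.weight", 1),
        ("model.layers.0.self_attn.q_proj.weight", 2),
        ("model.ctm0.conf.weight", 3),
    ],
    ["mm_projector", "ctm", "block"],
)
```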
configs/__init__.py ADDED
@@ -0,0 +1 @@
1
+ from .config import args
configs/config.py ADDED
@@ -0,0 +1,84 @@
1
2
+ import os
3
+
4
+ import sys
5
+ BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
6
+ sys.path.append(BASE_DIR)
7
+
8
+ import cv2 # type: ignore
9
+
10
+ import argparse
11
+ import json
12
13
+ from typing import Any, Dict, List
14
+
15
+ # Dataset directory layout
16
+ file_arch = """
17
+ ./REFAVS/data
18
+ - /media
19
+ - /gt_mask
20
+ - /metadata.csv
21
+ - /audio_embed
22
+ - /image_embed
23
+ """
24
+ # print(f">>> File arch: {file_arch}")
25
+
26
+ parser = argparse.ArgumentParser(
27
+ description=(
28
+ "SimToken"
29
+ )
30
+ )
31
+
32
+
33
+
34
+ parser.add_argument("--vision_pretrained", type=str, default='/workspace/SimToken/models/segment_anything/sam_vit_h_4b8939.pth')
36
+ parser.add_argument("--vision_tower", type=str, default='openai/clip-vit-large-patch14')
37
+ parser.add_argument("--mllm", type=str, default='Chat-UniVi/Chat-UniVi-7B-v1.5')
37
+
38
+ parser.add_argument("--conv_template", type=int, default=1)
39
+ parser.add_argument("--ct_weight", type=float, default=0.1)
40
+ parser.add_argument("--input_type", type=str, default='refer')
41
+ parser.add_argument("--compress", action='store_false', default=True)
42
+ parser.add_argument("--start", type=int, default=0)
43
+
44
+
45
+ parser.add_argument("--name", type=str, default='testrun')
46
+ # path to ref-avs dataset
47
+ parser.add_argument("--data_dir", type=str, default='/workspace/SimToken/data', help=f"The data parent dir. File arch should be: {file_arch}")
48
+ # path to pretrained checkpoints
49
+ parser.add_argument("--saved_model", type=str, default='/workspace/SimToken/checkpoints/simtoken_pretrained.pth', help="Path to the pretrained SimToken checkpoint.")
50
+
51
+
52
+ parser.add_argument("--log_root", type=str, default='log', help="Where to save logs during training.")
54
+ parser.add_argument("--checkpoint_root", type=str, default='checkpoints', help="Where to save trained checkpoints during training.")
54
+
55
+ parser.add_argument("--visualization_root", type=str, default='visualization', help="Where to save visualization results during test.")
56
+
57
+
58
+
59
+
60
+ # parser.add_argument("--show_params", action='store_true', help=f"Show params names with Requires_grad==True.")
61
+
62
+ # learning rate
63
+ parser.add_argument("--lr", type=float, default=5e-5, help='lr to fine tuning adapters.')
64
+ # epochs
65
+ parser.add_argument("--epochs", type=int, default=10, help='epochs to fine tuning adapters.')
66
+ parser.add_argument("--batch_size", type=int, default=8)
67
+
68
+
69
+ parser.add_argument("--gpu_id", type=str, default="0", help="The GPU device to run generation on.")
70
+
71
+ parser.add_argument("--run", type=str, default='train', help="train, test")
72
+
73
+ parser.add_argument("--frame_n", type=int, default=10, help="Frame num of each video. Fixed to 10.")
74
+ parser.add_argument("--text_max_len", type=int, default=25, help="Maximum textual reference length.")
75
+ parser.add_argument("--max_eval_rows", type=int, default=-1, help="Max samples per split during eval; -1 = all.")
76
+ parser.add_argument("--eval_split", type=str, default="test_u", help="Which split to evaluate: test_s, test_u, test_n.")
77
+
78
+
79
+
80
+ args = parser.parse_args()
81
+
82
+ # os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
83
+ os.environ["CUDA_VISIBLE_DEVICES"] = args.gpu_id
84
+ # print(f'>>> Sys: set "CUDA_VISIBLE_DEVICES" - GPU: {args.gpu_id}')
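One subtlety worth noting in the parser above: `--compress` is declared with `action='store_false'`, so passing the flag turns compression *off*, while omitting it leaves the default `True`. A minimal reproduction of that behavior:

```python
import argparse

parser = argparse.ArgumentParser(description="SimToken")
parser.add_argument("--compress", action="store_false", default=True)
parser.add_argument("--run", type=str, default="train")

# With no flags, defaults apply; passing --compress stores False.
args = parser.parse_args([])
```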
data/metadata.csv ADDED
The diff for this file is too large to render. See raw diff