ydshieh committed
Commit 0c9b4f3
1 Parent(s): 5306066

update debug.py

Files changed (1)
  debug.py +78 -119
debug.py CHANGED
@@ -299,17 +299,45 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
     if shuffle:
         batch_idx = jax.random.permutation(rng, len(dataset))
     else:
-        batch_idx = jnp.arange(len(dataset))
+        s = time.time()
+        # batch_idx = jnp.arange(len(dataset))
+        batch_idx = np.arange(len(dataset))
+        e = time.time()
+        print(f'get arange indices with np - time: {e-s}')
 
+    s = time.time()
     batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
+    e = time.time()
+    print(f'skip incomplete batch - time: {e-s}')
+
+    s = time.time()
     batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
+    e = time.time()
+    print(f'reshape batch indices - time: {e-s}')
 
     for idx in batch_idx:
+
+        print(f'type idx: {type(idx)}')
+
+        print(f'pixel values type: {type(dataset["pixel_values"])}')
+        print(f'pixel values shape: {dataset["pixel_values"].shape}')
+
+        s = time.time()
         batch = dataset[idx]
+        e = time.time()
+        print(f'get one batch - time: {e-s}')
+
+        exit(0)
+
+        s = time.time()
         batch = {k: jnp.array(v) for k, v in batch.items()}
+        e = time.time()
+        print(f'convert one batch from np to jax - time: {e-s}')
 
+        s = time.time()
         batch = shard(batch)
-
+        e = time.time()
+        print(f'shard one batch with jax - time: {e-s}')
         yield batch
 
 
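Note: the hunk above swaps jnp.arange for np.arange and instruments every stage of the loader. Below is a minimal, self-contained sketch of the comparison it is probing: how long fancy-indexing a datasets.Dataset takes with a NumPy index array versus a JAX one. The synthetic dataset and shapes are illustrative; only the "pixel_values" column name comes from debug.py, and whether a JAX array is accepted directly as an index depends on the datasets version (debug.py does pass one).

    import time

    import jax
    import numpy as np
    from datasets import Dataset

    # Synthetic stand-in for the real dataset; shapes are made up.
    ds = Dataset.from_dict({"pixel_values": np.zeros((4096, 8), dtype=np.float32)})
    ds = ds.with_format("numpy")

    batch_size = 256
    idx_np = np.random.permutation(len(ds))[:batch_size]
    idx_jax = jax.random.permutation(jax.random.PRNGKey(0), len(ds))[:batch_size]

    for name, idx in (("np", idx_np), ("jax", idx_jax)):
        s = time.time()
        batch = ds[idx]  # fetch one batch via fancy indexing
        e = time.time()
        print(f"get one batch with {name} index - time: {e - s}")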
@@ -750,126 +778,43 @@ def main():
     )
 
     if training_args.do_train:
-
         if "train" not in dataset:
-            raise ValueError("--do_train requires a train dataset")
+            raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
-        train_dataset = datasets.concatenate_datasets([train_dataset] * 205)
-
         # remove problematic examples
-        s = time.time()
         train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
-        e = time.time()
-        print(f'filter time: {e-s}')
-        print(len(train_dataset))
-
-        rng = jax.random.PRNGKey(training_args.seed)
-        rng, input_rng = jax.random.split(rng)
-
-        s = time.time()
-        indices_jax = jax.random.permutation(input_rng, len(train_dataset))
-        e = time.time()
-        print(f'get permutation indices for the whole dataset with jax - time: {e-s}')
-
-        s = time.time()
-        indices_np = np.random.permutation(len(train_dataset))
-        e = time.time()
-        print(f'get permutation indices for the whole dataset with np - time: {e-s}')
-
-        # indices = jnp.arange(len(ds))
-
-        block_size = 4096
-        for idx in range(4):
-
-            start_idx = block_size * idx
-            end_idx = block_size * (idx + 1)
-
-            s = time.time()
-            selected_indices_jax = indices_jax[start_idx:end_idx]
-            e = time.time()
-            print(f'get block indices with jax - time: {e-s}')
-            print(type(selected_indices_jax))
-
-            s = time.time()
-            selected_indices_np = indices_np[start_idx:end_idx]
-            e = time.time()
-            print(f'get block indices with np - time: {e-s}')
-            print(type(selected_indices_np))
-
-
-            s = time.time()
-            _ds = train_dataset.select(selected_indices_jax)
-            e = time.time()
-            print(f'select block with jax - time: {e-s}')
-
-            s = time.time()
-            _ds = train_dataset.select(selected_indices_np)
-            e = time.time()
-            print(f'select block with np - time: {e-s}')
-
-            s = time.time()
-            _selected_indices_np = np.array(selected_indices_jax)
-            e = time.time()
-            print(f'convert jax to np - time: {e-s}')
-
-
-            batch_size = 256
-
-            steps_per_epoch = len(_ds) // batch_size
-
-            s = time.time()
-            batch_idx_jax = jax.random.permutation(rng, len(_ds))
-            e = time.time()
-            print(f'get permutation indices for the block with jax - time: {e-s}')
-            # batch_idx = jnp.arange(len(dataset))
-
-            s = time.time()
-            batch_idx_np = np.random.permutation(len(_ds))
-            e = time.time()
-            print(f'get permutation indices for the block with np - time: {e-s}')
-
-            s = time.time()
-            batch_idx_jax = batch_idx_jax[: steps_per_epoch * batch_size]  # Skip incomplete batch.
-            e = time.time()
-            print(f'skip incomplete batch with jax - time: {e-s}')
-
-            s = time.time()
-            batch_idx_np = batch_idx_np[: steps_per_epoch * batch_size]  # Skip incomplete batch.
-            e = time.time()
-            print(f'skip incomplete batch with np - time: {e-s}')
-
-            s = time.time()
-            batch_idx_jax = batch_idx_jax.reshape((steps_per_epoch, batch_size))
-            e = time.time()
-            print(f'reshape block indices with jax - time: {e-s}')
-
-            s = time.time()
-            batch_idx_np = batch_idx_np.reshape((steps_per_epoch, batch_size))
-            e = time.time()
-            print(f'reshape block indices with np - time: {e-s}')
-
-            for idx in batch_idx_jax:
-
-                s = time.time()
-                batch = _ds[idx]
-                e = time.time()
-                print(f'get one batch with jax - time: {e-s}')
-
-                # s = time.time()
-                # batch = {k: jnp.array(v) for k, v in batch.items()}
-                # e = time.time()
-                # print(f'convert one batch to jnp time: {e-s}')
-
-            for idx in batch_idx_np:
-
-                s = time.time()
-                batch = _ds[idx]
-                e = time.time()
-                print(f'get one batch with np - time: {e-s}')
-
-
-            exit(0)
+        train_dataset = datasets.concatenate_datasets([train_dataset] * 205)
+        if data_args.max_train_samples is not None:
+            train_dataset = train_dataset.select(range(data_args.max_train_samples))
+        train_dataset = train_dataset.map(
+            tokenization_fn,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            # kept image paths
+            remove_columns=[x for x in column_names if x != image_column],
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on train dataset",
+            fn_kwargs={"max_target_length": data_args.max_target_length},
+        )
 
+    if training_args.do_eval:
+        if "validation" not in dataset:
+            raise ValueError("--do_eval requires a validation dataset")
+        eval_dataset = dataset["validation"]
+        # remove problematic examples
+        eval_dataset = eval_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
+        if data_args.max_eval_samples is not None:
+            eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
+        eval_dataset = eval_dataset.map(
+            tokenization_fn,
+            batched=True,
+            num_proc=data_args.preprocessing_num_workers,
+            # kept image paths
+            remove_columns=[x for x in column_names if x != image_column],
+            load_from_cache_file=not data_args.overwrite_cache,
+            desc="Running tokenizer on validation dataset",
+            fn_kwargs={"max_target_length": data_args.val_max_target_length},
+        )
 
     if training_args.do_predict:
         if "test" not in dataset:
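Note: the block deleted above was a one-off micro-benchmark: permute all row indices (once with JAX, once with NumPy), slice 4096-row blocks, then time Dataset.select and per-batch indexing for each index type. A condensed, runnable sketch of that pattern follows; the dataset and sizes are illustrative, not the script's real data.

    import time

    import jax
    import numpy as np
    from datasets import Dataset

    ds = Dataset.from_dict({"x": list(range(20_000))})  # illustrative size
    block_size = 4096

    s = time.time()
    indices_jax = jax.random.permutation(jax.random.PRNGKey(0), len(ds))
    print(f"permutation with jax - time: {time.time() - s}")

    s = time.time()
    indices_np = np.random.permutation(len(ds))
    print(f"permutation with np - time: {time.time() - s}")

    for name, indices in (("jax", indices_jax), ("np", indices_np)):
        s = time.time()
        _ds = ds.select(indices[:block_size])  # builds an indices mapping, no data copy
        print(f"select block with {name} - time: {time.time() - s}")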
@@ -929,7 +874,11 @@ def main():
     num_splits = steps // steps_per_split + int(steps % steps_per_split > 0)
 
     if shuffle:
-        indices = jax.random.permutation(input_rng, len(ds))
+        s = time.time()
+        # indices = jax.random.permutation(input_rng, len(ds))
+        indices = np.random.permutation(len(ds))
+        e = time.time()
+        print(f'get permutation indices for the whole dataset with np - time: {e-s}')
     else:
         indices = jnp.arange(len(ds))
 
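Note: the hunk above swaps jax.random.permutation for np.random.permutation when building the shuffle indices. A small sketch for timing the two calls in isolation; block_until_ready() forces JAX's asynchronous dispatch to finish so the measurement is honest, and absolute numbers depend on backend and device. The size here is illustrative.

    import time

    import jax
    import numpy as np

    n = 500_000  # illustrative dataset size
    key = jax.random.PRNGKey(0)

    s = time.time()
    idx_jax = jax.random.permutation(key, n).block_until_ready()
    print(f"jax.random.permutation - time: {time.time() - s}")

    s = time.time()
    idx_np = np.random.permutation(n)
    print(f"np.random.permutation - time: {time.time() - s}")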
@@ -938,9 +887,15 @@ def main():
         start_idx = block_size * idx
         end_idx = block_size * (idx + 1)
 
+        s = time.time()
         selected_indices = indices[start_idx:end_idx]
+        e = time.time()
+        print(f'get block indices - time: {e-s}')
 
+        s = time.time()
         _ds = ds.select(selected_indices)
+        e = time.time()
+        print(f'select block - time: {e-s}')
 
         names = {
             "train": "train",
@@ -948,6 +903,7 @@ def main():
             "test": "prediction",
         }
 
+        s = time.time()
         _ds = _ds.map(
             feature_extraction_fn,
             batched=True,
@@ -955,10 +911,13 @@ def main():
             remove_columns=[image_column],
             load_from_cache_file=not data_args.overwrite_cache,
             features=features,
-            keep_in_memory=keep_in_memory,
+            # keep_in_memory=keep_in_memory,
+            keep_in_memory=False,
             desc=f"Running feature extraction on {names[split]} dataset".replace("  ", " "),
         )
         _ds = _ds.with_format("numpy")
+        e = time.time()
+        print(f'map feature extraction - time: {e-s}')
 
         # No need to shuffle here
         loader = data_loader(rng, _ds, batch_size=batch_size, shuffle=False)
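Note: the final hunks pin keep_in_memory=False and time the feature-extraction map plus the numpy formatting. A minimal sketch of that map-then-read pattern; extract_fn, the column names, and the shapes are placeholders, not the script's real feature_extraction_fn.

    import numpy as np
    from datasets import Dataset

    def extract_fn(examples):
        # placeholder for feature_extraction_fn in debug.py
        return {"pixel_values": [np.zeros((3, 4, 4), dtype=np.float32) for _ in examples["image"]]}

    ds = Dataset.from_dict({"image": [f"img_{i}.jpg" for i in range(512)]})
    ds = ds.map(
        extract_fn,
        batched=True,
        remove_columns=["image"],
        keep_in_memory=False,  # write results to the on-disk Arrow cache, not RAM
    )
    ds = ds.with_format("numpy")  # rows now come back as numpy arrays
    print(ds[0]["pixel_values"].shape)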
 