ydshieh committed · Commit 7245cb4 · Parent(s): 9ca46fa

update debug.py
debug.py CHANGED
@@ -298,46 +298,22 @@ def data_loader(rng: jax.random.PRNGKey, dataset: Dataset, batch_size: int, shuf
 
     if shuffle:
         batch_idx = jax.random.permutation(rng, len(dataset))
+        batch_idx = np.asarray(batch_idx)
     else:
-        s = time.time()
-        # batch_idx = jnp.arange(len(dataset))
         batch_idx = np.arange(len(dataset))
-        e = time.time()
-        print(f'get permutation indices for the block with jax - time: {e-s}')
 
-    s = time.time()
     batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
-    e = time.time()
-    print(f'skip incomplete batch with jax - time: {e-s}')
-
-    s = time.time()
     batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
-    e = time.time()
-    print(f'reshape block indices with np - time: {e-s}')
 
     for idx in batch_idx:
-
-        print(f'type idx: {type(idx)}')
-
-        print(f'pixel values type: {type(dataset["pixel_values"])}')
-        print(f'pixel values shape: {dataset["pixel_values"].shape}')
-
         s = time.time()
         batch = dataset[idx]
        e = time.time()
-        print(f'
-
-        exit(0)
-
-        s = time.time()
+        print(f'fetch batch time: {e-s}')
         batch = {k: jnp.array(v) for k, v in batch.items()}
-        e = time.time()
-        print(f'convert one batch from np to jax - time: {e-s}')
 
-        s = time.time()
         batch = shard(batch)
-
-        print(f'shard one batch with jax - time: {e-s}')
+
         yield batch
 
 
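The substantive change in data_loader is the np.asarray(batch_idx) added after jax.random.permutation: presumably so that batch indexing runs with a host-side NumPy index array rather than a JAX device array, which is what the removed debug prints were timing. A minimal sketch of the cleaned-up loader, using a plain dict of NumPy arrays as a stand-in for the script's datasets.Dataset:

# Minimal sketch of the cleaned-up batching pattern. `dataset` here is a
# plain dict of NumPy arrays standing in for the script's datasets.Dataset;
# batch_size is assumed divisible by jax.local_device_count() for shard().
import jax
import jax.numpy as jnp
import numpy as np
from flax.training.common_utils import shard

def data_loader(rng, dataset, batch_size, shuffle=False):
    n = len(dataset["pixel_values"])
    steps_per_epoch = n // batch_size
    if shuffle:
        batch_idx = jax.random.permutation(rng, n)
        # Convert once up front: the fancy indexing below then stays on the
        # host instead of pulling a device array back for every batch.
        batch_idx = np.asarray(batch_idx)
    else:
        batch_idx = np.arange(n)
    batch_idx = batch_idx[: steps_per_epoch * batch_size]  # Skip incomplete batch.
    batch_idx = batch_idx.reshape((steps_per_epoch, batch_size))
    for idx in batch_idx:
        batch = {k: jnp.array(v[idx]) for k, v in dataset.items()}
        yield shard(batch)  # Split the leading batch axis across local devices.

With a dict of in-memory arrays the conversion matters little; with an Arrow-backed datasets.Dataset, a NumPy index array avoids the per-batch device round-trip the old prints were measuring.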
@@ -781,9 +757,9 @@ def main():
         if "train" not in dataset:
             raise ValueError("--do_train requires a train dataset")
         train_dataset = dataset["train"]
+        train_dataset = datasets.concatenate_datasets([train_dataset] * 205)
         # remove problematic examples
         train_dataset = train_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
-        train_dataset = datasets.concatenate_datasets([train_dataset] * 205)
         if data_args.max_train_samples is not None:
             train_dataset = train_dataset.select(range(data_args.max_train_samples))
         train_dataset = train_dataset.map(
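All three splits use the same inflation trick: concatenate_datasets repeats the split (205x for train and eval, 1024x for test in the hunks below) to stress-test the pipeline, and this hunk moves the train-split copy ahead of the filter. A toy illustration, with hypothetical column names:

# Toy illustration of the dataset-inflation trick; columns are hypothetical.
import datasets

base = datasets.Dataset.from_dict({"image_id": [0, 1, 2], "caption": ["a", "b", "c"]})
big = datasets.concatenate_datasets([base] * 205)
print(len(base), len(big))  # 3 615

concatenate_datasets joins the underlying Arrow tables, so repeating a dataset this way should be far cheaper than physically copying rows.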
@@ -803,6 +779,7 @@ def main():
         eval_dataset = dataset["validation"]
         # remove problematic examples
         eval_dataset = eval_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
+        eval_dataset = datasets.concatenate_datasets([eval_dataset] * 205)
         if data_args.max_eval_samples is not None:
             eval_dataset = eval_dataset.select(range(data_args.max_eval_samples))
         eval_dataset = eval_dataset.map(
@@ -820,6 +797,7 @@ def main():
         if "test" not in dataset:
             raise ValueError("--do_predict requires a test dataset")
         predict_dataset = dataset["test"]
+        predict_dataset = datasets.concatenate_datasets([predict_dataset] * 1024)
         # remove problematic examples
         predict_dataset = predict_dataset.filter(filter_fn, batched=True, num_proc=data_args.preprocessing_num_workers)
         if data_args.max_predict_samples is not None:
@@ -840,7 +818,7 @@ def main():
     # Split the dataset into several chunks - each chunk is processed (.map) without cache to create a
     # data loader separately (in a sequential order).
     block_size = training_args.block_size
-
+
     # Store some constant
 
     train_batch_size = int(training_args.per_device_train_batch_size) * jax.device_count()
@@ -874,28 +852,22 @@ def main():
         num_splits = steps // steps_per_split + int(steps % steps_per_split > 0)
 
         if shuffle:
-
-
-            indices = np.random.permutation(len(train_dataset))
-            e = time.time()
-            print(f'get permutation indices for the whole dataset with jax - time: {e-s}')
+            indices = jax.random.permutation(rng, len(train_dataset))
+            indices = np.asarray(indices)
         else:
-            indices =
+            indices = np.arange(len(ds))
 
         for idx in range(num_splits):
 
             start_idx = block_size * idx
             end_idx = block_size * (idx + 1)
 
-            s = time.time()
             selected_indices = indices[start_idx:end_idx]
-            e = time.time()
-            print(f'get block indices with jax - time: {e-s}')
 
             s = time.time()
             _ds = ds.select(selected_indices)
             e = time.time()
-            print(f'select block
+            print(f'select block time: {e-s}')
 
             names = {
                 "train": "train",
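The shuffle branch now draws one global permutation with the seeded jax.random.permutation and converts it to NumPy once; each block then slices its index window and materializes it with ds.select. A compact sketch of that loop, where iter_blocks is a hypothetical helper and deriving num_splits from len(ds) is an assumption (the script derives it from its step counts):

# Sketch of the block-wise split; iter_blocks is a hypothetical helper, and
# computing num_splits from len(ds) is an assumption made for self-containment.
import numpy as np
import jax

def iter_blocks(ds, rng, block_size, shuffle=True):
    num_splits = len(ds) // block_size + int(len(ds) % block_size > 0)
    if shuffle:
        # One seeded permutation for the whole dataset, converted to NumPy once.
        indices = np.asarray(jax.random.permutation(rng, len(ds)))
    else:
        indices = np.arange(len(ds))
    for idx in range(num_splits):
        selected_indices = indices[block_size * idx : block_size * (idx + 1)]
        # Dataset.select builds a lightweight view over the chosen rows.
        yield ds.select(selected_indices)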
@@ -904,20 +876,19 @@ def main():
             }
 
             s = time.time()
-            _ds =_ds.map(
+            _ds = _ds.map(
                 feature_extraction_fn,
                 batched=True,
                 num_proc=data_args.preprocessing_num_workers,
                 remove_columns=[image_column],
                 load_from_cache_file=not data_args.overwrite_cache,
                 features=features,
-
-                keep_in_memory=False,
+                keep_in_memory=keep_in_memory,
                 desc=f"Running feature extraction on {names[split]} dataset".replace("  ", " "),
             )
             _ds = _ds.with_format("numpy")
             e = time.time()
-            print(f'map
+            print(f'map time: {e-s}')
 
             # No need to shuffle here
             loader = data_loader(rng, _ds, batch_size=batch_size, shuffle=False)
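Each block is feature-extracted with .map, and keep_in_memory is now a variable rather than a hard-coded False, so transient blocks can skip writing Arrow cache files. A self-contained sketch, with a stub feature_extraction_fn standing in for the real image-processor call:

# Self-contained sketch of the per-block map. The toy dataset and the stub
# feature_extraction_fn are hypothetical stand-ins for the script's real ones.
import numpy as np
import datasets

block = datasets.Dataset.from_dict({"image_path": ["a.jpg", "b.jpg"]})

def feature_extraction_fn(examples):
    # Stand-in: the real function would run the image processor on each file.
    n = len(examples["image_path"])
    return {"pixel_values": np.zeros((n, 3, 224, 224), dtype=np.float32)}

block = block.map(
    feature_extraction_fn,
    batched=True,
    remove_columns=["image_path"],
    keep_in_memory=True,        # no cache file is written for a transient block
    load_from_cache_file=False,
)
block = block.with_format("numpy")  # indexing now returns NumPy arrays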