diff --git a/checkpoints/latest_checkpoint.txt b/checkpoints/latest_checkpoint.txt new file mode 100644 index 0000000000000000000000000000000000000000..75904707d7bf3507e9a1cc9cf21f9f54584fe970 --- /dev/null +++ b/checkpoints/latest_checkpoint.txt @@ -0,0 +1 @@ +iter_000000500.pt diff --git a/checkpoints/model/iter_000000025.pt b/checkpoints/model/iter_000000025.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d71e0a62c21c402bd801ecf99f2a48bb08470bd --- /dev/null +++ b/checkpoints/model/iter_000000025.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0806c269ff25cba0671b4fe8affac9aae0993927076fd500a9e947d2b6ec4214 +size 4005911812 diff --git a/checkpoints/model/iter_000000025_fused.pt b/checkpoints/model/iter_000000025_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d1c68a50e5a5f6dac5dcb5333628dbbbb46ddd0 --- /dev/null +++ b/checkpoints/model/iter_000000025_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79e66d1cbeef94f937c880f667214cd611fc19afa62d78e848d7fc3af1280249 +size 3913047915 diff --git a/checkpoints/model/iter_000000051.pt b/checkpoints/model/iter_000000051.pt new file mode 100644 index 0000000000000000000000000000000000000000..beb70948476cf33e96e8022c729031638a237df5 --- /dev/null +++ b/checkpoints/model/iter_000000051.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2dfc462f87a94e141f7d3c06a34b31a5e6dca3a492d9eb72cf53b780297939b1 +size 4005911812 diff --git a/checkpoints/model/iter_000000051_fused.pt b/checkpoints/model/iter_000000051_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..a87eef012477eaf289f01064ddae4c1a50ccebc5 --- /dev/null +++ b/checkpoints/model/iter_000000051_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8494a54ca197a2c7d94c912fa4564f10a0a9b9afc9598b03dffc384695816ad6 +size 3913047915 diff --git a/checkpoints/model/iter_000000076.pt b/checkpoints/model/iter_000000076.pt new file mode 100644 index 0000000000000000000000000000000000000000..64e225e477067e12021fc6437cfa12193059888b --- /dev/null +++ b/checkpoints/model/iter_000000076.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc1ac5a2bb6fcdcc84f2bc6e3820bfca56c480a0219acc628d810008dadfbbfe +size 4005911812 diff --git a/checkpoints/model/iter_000000076_fused.pt b/checkpoints/model/iter_000000076_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..056e918b0b9bb38e8ec1d1620101b42a6fb27973 --- /dev/null +++ b/checkpoints/model/iter_000000076_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3057d7c76a87a80e77a0d3113cc8f4d6fe87eaa6a18148eb4c64560503a7a6a3 +size 3913047915 diff --git a/checkpoints/model/iter_000000100.pt b/checkpoints/model/iter_000000100.pt new file mode 100644 index 0000000000000000000000000000000000000000..2e8b2b114e04354d0452aabb5e182723cf051810 --- /dev/null +++ b/checkpoints/model/iter_000000100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58917ad5c68973c9600da3148b7684fff8a3025cb4205ea9013dc3f9ec991cea +size 4005911812 diff --git a/checkpoints/model/iter_000000100_fused.pt b/checkpoints/model/iter_000000100_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e80176987c38db939d765775646e687eea7b4af --- /dev/null +++ b/checkpoints/model/iter_000000100_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f841d42386b6694c5774ba0608ff3783b6277fa588f96e022c7fb1bbfbccdab3 +size 3913047915 diff --git a/checkpoints/model/iter_000000200.pt b/checkpoints/model/iter_000000200.pt new file mode 100644 index 0000000000000000000000000000000000000000..98ea0425dce295646ef7411f68b577c3f4d64f77 --- /dev/null +++ b/checkpoints/model/iter_000000200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:64bf4160f39549bddbe6caa2b4de2b7cb973144fb37e3db0c4d003d7686caea0 +size 4005911812 diff --git a/checkpoints/model/iter_000000200_fused.pt b/checkpoints/model/iter_000000200_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..5a1b175e1617825b0f98e694664bd377481010cb --- /dev/null +++ b/checkpoints/model/iter_000000200_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4162ac913cc47cfe34ed8bb56b1fb4b837e9c457df1a2e363f933a1ab3f7fae +size 3913047915 diff --git a/checkpoints/model/iter_000000300.pt b/checkpoints/model/iter_000000300.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d66fb06ad38d9344f5282e8125545a0f9f9d551 --- /dev/null +++ b/checkpoints/model/iter_000000300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4a74f8376bc17b22573a3960923ca0845549f3de2935b58612cd0d5deb8b51b +size 4005911812 diff --git a/checkpoints/model/iter_000000300_fused.pt b/checkpoints/model/iter_000000300_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..c5a4acf4f4018c85a84416b4b92b0b2e2722b2b9 --- /dev/null +++ b/checkpoints/model/iter_000000300_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c20aee2f40f1c5408143827f966fc87031fbc75513242d52b637b7209afb4a6 +size 3913047915 diff --git a/checkpoints/model/iter_000000400.pt b/checkpoints/model/iter_000000400.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f2b42e23606c5703b6e8e09e368535164146bfa --- /dev/null +++ b/checkpoints/model/iter_000000400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06bf49aee4d60a046f655c0c56ae9c0fabd08a35d18b2a9eec699031924f05c8 +size 4005911812 diff --git a/checkpoints/model/iter_000000400_fused.pt b/checkpoints/model/iter_000000400_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..27804cf1d21aaeebbbecf657d8bac3da03098086 --- /dev/null +++ b/checkpoints/model/iter_000000400_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ae1cffb116bc0ef4245cea33ee879e81dfdbce8df9c37ad1453ff206eeeb3e0 +size 3913047915 diff --git a/checkpoints/model/iter_000000500.pt b/checkpoints/model/iter_000000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..a6599278a76282fc30f72b352c9f1e561722f83a --- /dev/null +++ b/checkpoints/model/iter_000000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40e29afa9169b2b851de8be213152ba5ce76fcebf786f4f9ba61184d9e51f7ca +size 4005911812 diff --git a/checkpoints/model/iter_000000500_fused.pt b/checkpoints/model/iter_000000500_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..6ef89d5cee512c4a942b4eff34e25d8406a1990b --- /dev/null +++ b/checkpoints/model/iter_000000500_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26834697bd0af171b961d2e484a05babf83e0d20928b18e419934094638085bd +size 3913047915 diff --git a/checkpoints/optim/iter_000000025.pt b/checkpoints/optim/iter_000000025.pt new file mode 100644 index 0000000000000000000000000000000000000000..e4c35f2e87512ad88fb7cab38b35b076295cd06c --- /dev/null +++ b/checkpoints/optim/iter_000000025.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac6a24553367916b3fdeacb55cf5a3e1bd63540795a467ef553487f1d4c65295 +size 1792982648 diff --git a/checkpoints/optim/iter_000000025_fused.pt b/checkpoints/optim/iter_000000025_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6699270f5db635312f93d6fbdab4b33e3bb7d27 --- /dev/null +++ b/checkpoints/optim/iter_000000025_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3623e37a83b73ddf4cc351c8cca32428d1fa631a04787e1891737fdf1876785 +size 1792991522 diff --git a/checkpoints/optim/iter_000000051.pt b/checkpoints/optim/iter_000000051.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb41257cfe9865044eb769274d36983ada533747 --- /dev/null +++ b/checkpoints/optim/iter_000000051.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf7cc68a1e555c3fb9d670764db2ada75aa289ac71ba10dfe48b909101838891 +size 1792982648 diff --git a/checkpoints/optim/iter_000000051_fused.pt b/checkpoints/optim/iter_000000051_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..870db0396afd2c2ef980835252debc68ab9cf843 --- /dev/null +++ b/checkpoints/optim/iter_000000051_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b769379f94368a5b27fb22c4aa6916279f8891ab4546021679995f6a317b7ca +size 1792991522 diff --git a/checkpoints/optim/iter_000000076.pt b/checkpoints/optim/iter_000000076.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9fb8be6cced9738adcbf5bd5343964ea61c4be3 --- /dev/null +++ b/checkpoints/optim/iter_000000076.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acb0cbb96fad5368da491fdca7a444424470d2c13255046d79bccb0b874d795a +size 1792982648 diff --git a/checkpoints/optim/iter_000000076_fused.pt b/checkpoints/optim/iter_000000076_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..8698db85a5427e0b025036738142c3aebd3ad18e --- /dev/null +++ b/checkpoints/optim/iter_000000076_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03f8842a76640d2038816ca8732a228e401cb881d96655223e126248a1fdd22f +size 1792991522 diff --git a/checkpoints/optim/iter_000000100.pt b/checkpoints/optim/iter_000000100.pt new file mode 100644 index 0000000000000000000000000000000000000000..4392a6b45752e611d7f0370fb9328873fe5b5e72 --- /dev/null +++ b/checkpoints/optim/iter_000000100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31bd0ba91065449be0b88fc2e606b759db6e1c3d1df7b2cc3ccc11e97c59c764 +size 1792982648 diff --git a/checkpoints/optim/iter_000000100_fused.pt b/checkpoints/optim/iter_000000100_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..77086349dbe69dbed9d60d93a267dcbd4d0a858e --- /dev/null +++ b/checkpoints/optim/iter_000000100_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5c28ab9ce3ef767bc43ad8a7986b96c0410e850652d92c946dc7ea903017d32 +size 1792991522 diff --git a/checkpoints/optim/iter_000000200.pt b/checkpoints/optim/iter_000000200.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fa30ac3ba3a68565abece7f404d3cf9565daf81 --- /dev/null +++ b/checkpoints/optim/iter_000000200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2654ed7279a81b2f2b1f31e640fc7d763cd9cc029fba7a3b6c87a663effa45c6 +size 1792982648 diff --git a/checkpoints/optim/iter_000000200_fused.pt b/checkpoints/optim/iter_000000200_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..cd9c696fce694400ae726930a75624ed94f74182 --- /dev/null +++ b/checkpoints/optim/iter_000000200_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:658590331947610c7824c37103f2478a4915c1eaec357bf8594dd7f5fdb5775f +size 1792991522 diff --git a/checkpoints/optim/iter_000000300.pt b/checkpoints/optim/iter_000000300.pt new file mode 100644 index 0000000000000000000000000000000000000000..a24524c2632317e2dbdebaff0e2c815051baad92 --- /dev/null +++ b/checkpoints/optim/iter_000000300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:011fc93fd95a84b64478954568b59cc4324082968ed2b4b996a0b99ad8a37d36 +size 1792982648 diff --git a/checkpoints/optim/iter_000000300_fused.pt b/checkpoints/optim/iter_000000300_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c2265b91f75d0e8ab440cece8de93389c303634 --- /dev/null +++ b/checkpoints/optim/iter_000000300_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aac7941c82a2ec8157ae74e12159e51e6d5338877a1a8cf1436d951e02f6d2fd +size 1792991522 diff --git a/checkpoints/optim/iter_000000400.pt b/checkpoints/optim/iter_000000400.pt new file mode 100644 index 0000000000000000000000000000000000000000..b63559c0955923602b1085950894ea9ee3982f15 --- /dev/null +++ b/checkpoints/optim/iter_000000400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a93d2c52fce4b9901788771d07443b4371e8eacc0ea286487aaca51f15669de +size 1792982648 diff --git a/checkpoints/optim/iter_000000400_fused.pt b/checkpoints/optim/iter_000000400_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..e59b105dae65d540d7ffbbc20b7383c77eacfa9c --- /dev/null +++ b/checkpoints/optim/iter_000000400_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:313816a8a3ecfdad312f4aa8472023b1564d3d38ce5cbc08c07294860fb38a82 +size 1792991522 diff --git a/checkpoints/optim/iter_000000500.pt b/checkpoints/optim/iter_000000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..0860f2ea05376b0d06cecfb53bfa2b35b9119acf --- /dev/null +++ b/checkpoints/optim/iter_000000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e655c9c4c142ab4beb867b126459a68ea3a2003e4424e4a13fdd042e9be3df +size 1792982648 diff --git a/checkpoints/optim/iter_000000500_fused.pt b/checkpoints/optim/iter_000000500_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3b7dae116f098b8115613c2ecb89cd26020d6ff --- /dev/null +++ b/checkpoints/optim/iter_000000500_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67f276d494d89ec55ff6fd4885bf8f09c6b1b8c43176b3329ff39618baaafeca +size 1792991522 diff --git a/checkpoints/scheduler/iter_000000025.pt b/checkpoints/scheduler/iter_000000025.pt new file mode 100644 index 0000000000000000000000000000000000000000..deb0487038625370be11338174a47df71a48140a --- /dev/null +++ b/checkpoints/scheduler/iter_000000025.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a254e72a7c866c0748983cfa3eb9e6508f97c03f1770a5bb56057b44cb485bd +size 1602 diff --git a/checkpoints/scheduler/iter_000000025_fused.pt b/checkpoints/scheduler/iter_000000025_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..2a864773e4aae5a33788ef7d1891f1201ae4eb3e --- /dev/null +++ b/checkpoints/scheduler/iter_000000025_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:311a2b8f7b3f641ba16c633a7596860b36f31c63c8666f423ed806a2ddd2f381 +size 1638 diff --git a/checkpoints/scheduler/iter_000000051.pt b/checkpoints/scheduler/iter_000000051.pt new file mode 100644 index 0000000000000000000000000000000000000000..010a5e55a1ac778cc64f2507d968f9419d19973f --- /dev/null +++ b/checkpoints/scheduler/iter_000000051.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bf15bb3b5167142b908f9004afe5144549818a71300ba889aec5efb96bb25ee +size 1602 diff --git a/checkpoints/scheduler/iter_000000051_fused.pt b/checkpoints/scheduler/iter_000000051_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ac415be54a52eb713acf06964774fa9359de95f --- /dev/null +++ b/checkpoints/scheduler/iter_000000051_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a63c0ca08eae7130ea1624369c54329802c03125a49f878ee80ddb2d65f2ff3f +size 1638 diff --git a/checkpoints/scheduler/iter_000000076.pt b/checkpoints/scheduler/iter_000000076.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b956794e35592ce67d5d4c01ef98e64078a77a2 --- /dev/null +++ b/checkpoints/scheduler/iter_000000076.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:246ed66eb50daf0654363d8a1ed49d76ea0e4802841ef736abe8328ab9be47bf +size 1602 diff --git a/checkpoints/scheduler/iter_000000076_fused.pt b/checkpoints/scheduler/iter_000000076_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..151665c6fdfe77c79bff270f4545467cc17bfd57 --- /dev/null +++ b/checkpoints/scheduler/iter_000000076_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c447c43025cf33e7cae294928dbcca69fc1346587df47d5f5fa6f8e92152f614 +size 1638 diff --git a/checkpoints/scheduler/iter_000000100.pt b/checkpoints/scheduler/iter_000000100.pt new file mode 100644 index 0000000000000000000000000000000000000000..35e0b70d992eb2b3418aa943c8c5138f624eadbe --- /dev/null +++ b/checkpoints/scheduler/iter_000000100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:597be89529ea32cca3bee5b5b2db6c29ca6ba8bfbae9e4f7d4a00d53f7a79e68 +size 1602 diff --git a/checkpoints/scheduler/iter_000000100_fused.pt b/checkpoints/scheduler/iter_000000100_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..13625d2753abb09c7e5a10763ee8fe9425031163 --- /dev/null +++ b/checkpoints/scheduler/iter_000000100_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d38a9ca72bf26fc4a109b5754ef036e798fd0144cfe7c6805e6b37f45f756e56 +size 1638 diff --git a/checkpoints/scheduler/iter_000000200.pt b/checkpoints/scheduler/iter_000000200.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c5049fbb2f95a24861f4e22724d8764789445e0 --- /dev/null +++ b/checkpoints/scheduler/iter_000000200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae54e16af9c807da6478e4a3fca2a80ea75a5f92da834b419034c99072f61dcc +size 1602 diff --git a/checkpoints/scheduler/iter_000000200_fused.pt b/checkpoints/scheduler/iter_000000200_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..f82931fa4bcbac6ca7daa18113d975bfd71e65d7 --- /dev/null +++ b/checkpoints/scheduler/iter_000000200_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ac94fecfb1ea82d2f13fea39b66ce04bf9b19500ab8eadd8ab5593545ac7420 +size 1638 diff --git a/checkpoints/scheduler/iter_000000300.pt b/checkpoints/scheduler/iter_000000300.pt new file mode 100644 index 0000000000000000000000000000000000000000..b66f530f4c8db885e39f559add7d2adc49205b80 --- /dev/null +++ b/checkpoints/scheduler/iter_000000300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be8020106efa8f2672705817bb4a752a0109c978e7ceaca4a6ef5e86a610645f +size 1602 diff --git a/checkpoints/scheduler/iter_000000300_fused.pt b/checkpoints/scheduler/iter_000000300_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..49d9841e49fd32c9d5fb45f75f945f9977c58001 --- /dev/null +++ b/checkpoints/scheduler/iter_000000300_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be09924cf42d3dc6b004cb5c2e7be9a4d072fe068e336595ada2e7671271eb94 +size 1638 diff --git a/checkpoints/scheduler/iter_000000400.pt b/checkpoints/scheduler/iter_000000400.pt new file mode 100644 index 0000000000000000000000000000000000000000..9537736a476ca10c383c17d172c8e19fd5a14a06 --- /dev/null +++ b/checkpoints/scheduler/iter_000000400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b52198677f3b6ec1e3502b799737f56ea5e765f55a17c4c03ae5ff3506abb759 +size 1602 diff --git a/checkpoints/scheduler/iter_000000400_fused.pt b/checkpoints/scheduler/iter_000000400_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff4b535eadc822d52c5c2acaddc6ada2e3d30c08 --- /dev/null +++ b/checkpoints/scheduler/iter_000000400_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eec1e315bb13b61bc944a240cb39d39d67bed926bb51220df7db0ce9fe1e83be +size 1638 diff --git a/checkpoints/scheduler/iter_000000500.pt b/checkpoints/scheduler/iter_000000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..4d0aba60a452785ae7983aca85b8c12c6b96081d --- /dev/null +++ b/checkpoints/scheduler/iter_000000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7707bd114a1ddc31b5bb345eff4ade8980838f6b9148064b308bf31ac5b137d9 +size 1602 diff --git a/checkpoints/scheduler/iter_000000500_fused.pt b/checkpoints/scheduler/iter_000000500_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..6a8cab41b2edb5a393121f8f6a03baf56f741459 --- /dev/null +++ b/checkpoints/scheduler/iter_000000500_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad21fbda6d416dd07304cb219c57c26e21a6b85955b47a608f2cbf9043616b74 +size 1638 diff --git a/checkpoints/trainer/iter_000000025.pt b/checkpoints/trainer/iter_000000025.pt new file mode 100644 index 0000000000000000000000000000000000000000..29025d6820bd2daef0a2712dcedac27d0df8ff21 --- /dev/null +++ b/checkpoints/trainer/iter_000000025.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a4c8e73dc2ca57f4854bbad8c20ec7fd372662df4357b4597c12692199137e +size 892 diff --git a/checkpoints/trainer/iter_000000025_fused.pt b/checkpoints/trainer/iter_000000025_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..745c762d43554a05e73a9763cc98261da44bec19 --- /dev/null +++ b/checkpoints/trainer/iter_000000025_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b1470a9ae005f0ecc18ad210a6e566826f18c48945570babe61f05a9218a147 +size 916 diff --git a/checkpoints/trainer/iter_000000051.pt b/checkpoints/trainer/iter_000000051.pt new file mode 100644 index 0000000000000000000000000000000000000000..40110ea4f4c166067cf4f818564b121e2539a5e7 --- /dev/null +++ b/checkpoints/trainer/iter_000000051.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2122b3e0cb4e1223e22d3231cfd6f4a3431ba93d77fcd6306b6c9481134ca90 +size 892 diff --git a/checkpoints/trainer/iter_000000051_fused.pt b/checkpoints/trainer/iter_000000051_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..681d6c3672c4a10b9124e57e8092f3ac8e57e6d5 --- /dev/null +++ b/checkpoints/trainer/iter_000000051_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a17899abd9dc781d9522aeeaa211834200844fc309b5909b6402f5c1af494a1 +size 916 diff --git a/checkpoints/trainer/iter_000000076.pt b/checkpoints/trainer/iter_000000076.pt new file mode 100644 index 0000000000000000000000000000000000000000..81534addf67055b417b00deb2f31077b5e1f0097 --- /dev/null +++ b/checkpoints/trainer/iter_000000076.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe25f4831c6056f32cd92f4d8f08464f96ed7bfea0809badebf64bb706c62028 +size 892 diff --git a/checkpoints/trainer/iter_000000076_fused.pt b/checkpoints/trainer/iter_000000076_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..9a3091fd70263bc0c15c38f9818bf80c09b1b826 --- /dev/null +++ b/checkpoints/trainer/iter_000000076_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ed883fc9ae0b7c0b024a1b557a062a411f510cd2cb3776efc73cbebb39eee33 +size 916 diff --git a/checkpoints/trainer/iter_000000100.pt b/checkpoints/trainer/iter_000000100.pt new file mode 100644 index 0000000000000000000000000000000000000000..5535bc0b7e67ce40094bbcc144c6bba3d7f8fe8b --- /dev/null +++ b/checkpoints/trainer/iter_000000100.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d950795f379aef8f4dff7386679d77b4a65ca4291bed09afced61b39d3dfa49f +size 892 diff --git a/checkpoints/trainer/iter_000000100_fused.pt b/checkpoints/trainer/iter_000000100_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..240f9a3f3c5845be77ebca484947db859557af29 --- /dev/null +++ b/checkpoints/trainer/iter_000000100_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f460c4ac29fbb35b81cbec7619f7e411d4055327364ecb6556e42e35993dee26 +size 916 diff --git a/checkpoints/trainer/iter_000000200.pt b/checkpoints/trainer/iter_000000200.pt new file mode 100644 index 0000000000000000000000000000000000000000..813312ba9fab3ba8936aacc34d410dd62c4765be --- /dev/null +++ b/checkpoints/trainer/iter_000000200.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b63e3d448feca7bea909fef7514faae3cbe7ada857a5e211a6e46fdf3d005d22 +size 892 diff --git a/checkpoints/trainer/iter_000000200_fused.pt b/checkpoints/trainer/iter_000000200_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e2286c465952448056e86a29958df7d319d97ea --- /dev/null +++ b/checkpoints/trainer/iter_000000200_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3e46bbba3ff1e79a6676041b66e5a6d49ed133e519363e77ff03439c94a313a +size 916 diff --git a/checkpoints/trainer/iter_000000300.pt b/checkpoints/trainer/iter_000000300.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3f845ce292621fbaed110272c9ea22d422efc3b --- /dev/null +++ b/checkpoints/trainer/iter_000000300.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f362aa11769c1a4c5261f596d15c1d4dd509a0eb98fbded2add580ad97b56c +size 892 diff --git a/checkpoints/trainer/iter_000000300_fused.pt b/checkpoints/trainer/iter_000000300_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fb9f2621633bdbf0039a1ddc1444371ebfbf053 --- /dev/null +++ b/checkpoints/trainer/iter_000000300_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499bc3f48dc91966ec22778fb3193b61340e20e2cce59e134660aa264dddf016 +size 916 diff --git a/checkpoints/trainer/iter_000000400.pt b/checkpoints/trainer/iter_000000400.pt new file mode 100644 index 0000000000000000000000000000000000000000..3241ce9c9934f0aef3cebd5590452542a3cea57b --- /dev/null +++ b/checkpoints/trainer/iter_000000400.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3962658429705a75a1c67650f778c2937060352be8d407ae95199f003fb716e8 +size 892 diff --git a/checkpoints/trainer/iter_000000400_fused.pt b/checkpoints/trainer/iter_000000400_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..459ad26304e6b5763f6dd295c98593ae41843816 --- /dev/null +++ b/checkpoints/trainer/iter_000000400_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25c6fcc7907f7ee9f2607ed913ce5912785798194494671aa78442591a99cb85 +size 916 diff --git a/checkpoints/trainer/iter_000000500.pt b/checkpoints/trainer/iter_000000500.pt new file mode 100644 index 0000000000000000000000000000000000000000..06e7e7a7fabd9aae34dcc411a98e243673da6aba --- /dev/null +++ b/checkpoints/trainer/iter_000000500.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0907f99ff4186b13c44c403126adb5aad8058db62ed10d75282f43f93a2c79e2 +size 892 diff --git a/checkpoints/trainer/iter_000000500_fused.pt b/checkpoints/trainer/iter_000000500_fused.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f97821fcf41f7d0f4bb42f9631cd837a1ab1571 --- /dev/null +++ b/checkpoints/trainer/iter_000000500_fused.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6454bc811aa9ce15be437117ac12f7d1981e50a536a83f5c8fce9ec01d8149a6 +size 916 diff --git a/config.pkl b/config.pkl new file mode 100644 index 0000000000000000000000000000000000000000..c0e84e3f3ce64dfe3a1eb2193cc1967ebe825cae --- /dev/null +++ b/config.pkl @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0a162f682c91eaf7c3e58d1407c48ac638b032196c5e4cc279d90fe84084e07 +size 59402 diff --git a/config.yaml b/config.yaml new file mode 100644 index 0000000000000000000000000000000000000000..020ebd9044dee0a5ee963f4b9131202c160dd631 --- /dev/null +++ b/config.yaml @@ -0,0 +1,427 @@ +checkpoint: + broadcast_via_filesystem: 'False' + dcp_allow_mismatched_size: 'False' + dcp_async_mode_enabled: 'False' + jit: + device: cuda + dtype: bfloat16 + enabled: 'False' + input_shape: null + strict: 'True' + keys_not_to_resume: [] + load_ema_to_reg: 'False' + load_path: '' + load_training_state: 'False' + only_load_scheduler_state: 'False' + save_iter: '100' + strict_resume: 'True' + type: + _target_: + callbacks: null + verbose: 'True' +data_config: null +dataloader_train: + _target_: + batch_sampler: null + batch_size: '2' + collate_fn: null + dataset: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'False' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' + drop_last: 'True' + generator: null + in_order: 'False' + multiprocessing_context: null + num_workers: '12' + persistent_workers: 'True' + pin_memory: 'True' + pin_memory_device: '' + prefetch_factor: '8' + sampler: + _target_: + dataset: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'False' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' + shuffle: null + timeout: '0' + worker_init_fn: null +dataloader_val: + _target_: + batch_sampler: null + batch_size: '1' + collate_fn: null + dataset: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'True' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' + drop_last: 'False' + generator: null + in_order: 'False' + multiprocessing_context: null + num_workers: '0' + persistent_workers: 'False' + pin_memory: 'False' + pin_memory_device: '' + prefetch_factor: null + sampler: + _target_: + dataset: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'True' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' + shuffle: null + timeout: '0' + worker_init_fn: null +defaults: +- _self_ +- data_config: null +- video_dataset_train: null +- video_dataset_val: null +- dataloader_train: null +- dataloader_val: null +- world2action_pipe: null +- optimizer: fusedadamw +- scheduler: constant +- model: null +- callbacks: + - basic +- net: null +- ema: null +- checkpoint: null +- ckpt_type: null +- experiment: null +job: + group: video2world + name: v2w_push_lora_rank32_lr1.778e-04_bsz32 + project: posttraining +model: + _recursive_: 'False' + _target_: + config: + adjust_video_noise: true + debug_without_randomness: false + fsdp_shard_size: 0 + high_sigma_ratio: 0.05 + init_lora_weights: true + input_image_key: images + input_video_key: video + lora_alpha: 32 + lora_rank: 32 + lora_target_modules: q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2 + loss_reduce: mean + loss_scale: 100.0 + model_manager_config: + _target_: cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig + dit_path: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt + text_encoder_path: '' + pipe_config: + adjust_video_noise: true + conditioner: + _target_: + fps: + _target_: + dropout_rate: '0.0' + dtype: null + input_key: fps + output_key: fps + padding_mask: + _target_: + dropout_rate: '0.0' + dtype: null + input_key: padding_mask + output_key: padding_mask + text: + _target_: + dropout_rate: '0.0' + input_key: + - obs/language_embedding + use_video_condition: + _target_: + dropout_rate: '0.0' + input_key: fps + output_key: use_video_condition + conditioning_strategy: frame_replace + ema: + _target_: cosmos_predict2.configs.defaults.ema.EMAConfig + enabled: 'False' + iteration_shift: '0' + rate: '0.1' + guardrail_config: + checkpoint_dir: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints + enabled: false + offload_model_to_cpu: true + input_image_key: images + input_video_key: video + max_num_conditional_frames: 2 + min_num_conditional_frames: 1 + net: + _target_: + adaln_lora_dim: '256' + atten_backend: minimal_a2a + concat_padding_mask: 'True' + extra_per_block_abs_pos_emb: 'False' + in_channels: '16' + max_frames: '128' + max_img_h: '240' + max_img_w: '240' + model_channels: '2048' + num_blocks: '28' + num_heads: '16' + out_channels: '16' + patch_spatial: '2' + patch_temporal: '1' + pos_emb_cls: rope3d + pos_emb_interpolation: crop + pos_emb_learnable: 'True' + rope_enable_fps_modulation: 'False' + rope_h_extrapolation_ratio: '3.0' + rope_t_extrapolation_ratio: '1.0' + rope_w_extrapolation_ratio: '3.0' + sac_config: + _target_: cosmos_predict2.models.text2image_dit.SACConfig + every_n_blocks: '1' + mode: predict2_2b_720 + use_adaln_lora: 'True' + precision: bfloat16 + rectified_flow_loss_weight_uniform: true + rectified_flow_t_scaling_factor: 1.0 + resize_online: false + resolution: '480' + sigma_conditional: 0.0001 + sigma_data: 1.0 + state_ch: 16 + state_t: 16 + text_encoder: + cls: !!python/object/apply:imaginaire.constants.TextEncoderClass + - t5 + t5: + ckpt_path: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b + embed_dim: 1024 + num_tokens: 512 + timestamps: + is_forward: false + nfe: 35 + order: 7.0 + t_max: 80.0 + t_min: 0.002 + tokenizer: + _target_: + chunk_duration: '81' + load_mean_std: 'False' + name: tokenizer + temporal_window: '16' + vae_pth: /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth + precision: bfloat16 + train_architecture: lora +model_parallel: + _cpu_offloading_context: null + async_tensor_model_parallel_allreduce: false + autocast_dtype: torch.float32 + barrier_with_L1_time: true + batch_p2p_comm: true + batch_p2p_sync: true + bf16: false + context_parallel_size: 1 + cpu_offloading: false + cpu_offloading_activations: false + cpu_offloading_num_layers: 0 + cpu_offloading_weights: false + cross_entropy_fusion_impl: native + cross_entropy_loss_fusion: false + deallocate_pipeline_outputs: false + defer_embedding_wgrad_compute: false + deterministic_mode: false + enable_autocast: false + expert_model_parallel_size: 1 + expert_tensor_parallel_size: 1 + finalize_model_grads_func: null + fp16: false + grad_scale_func: null + grad_sync_func: null + gradient_accumulation_fusion: false + hierarchical_context_parallel_sizes: null + microbatch_group_size_per_vp_stage: 1 + moe_extended_tp: false + no_sync_func: null + num_microbatches_with_partial_activation_checkpoints: null + overlap_p2p_comm: false + overlap_p2p_comm_warmup_flush: false + param_sync_func: null + params_dtype: torch.float32 + perform_initialization: true + pipeline_dtype: null + pipeline_model_parallel_comm_backend: null + pipeline_model_parallel_size: 1 + pipeline_model_parallel_split_rank: null + sequence_parallel: false + tensor_model_parallel_size: 1 + timers: null + tp_comm_atomic_ag: false + tp_comm_atomic_rs: false + tp_comm_bootstrap_backend: nccl + tp_comm_bulk_dgrad: true + tp_comm_bulk_wgrad: true + tp_comm_overlap: false + tp_comm_overlap_ag: true + tp_comm_overlap_disable_fc1: false + tp_comm_overlap_disable_qkv: false + tp_comm_overlap_rs: true + tp_comm_overlap_rs_dgrad: false + tp_comm_split_ag: true + tp_comm_split_rs: true + use_cpu_initialization: false + use_ring_exchange_p2p: false + use_te_rng_tracker: false + variable_seq_lengths: false + virtual_pipeline_model_parallel_size: null + wgrad_deferral_limit: 0 +optimizer: + _target_: + betas: + - '0.9' + - '0.99' + capturable: 'True' + eps: 1e-08 + lr: '4.445e-05' + master_weights: 'True' + model: null + optim_type: fusedadam + weight_decay: '0.1' +scheduler: + _target_: +trainer: + callbacks: + device_monitor: + _target_: + every_n: '1000' + log_memory_detail: 'True' + step_size: '1' + ema: + _target_: + config: null + trainer: null + grad_clip: + _target_: + clip_norm: '10.0' + force_finite: 'True' + log_wandb: 'False' + iter_speed: + _target_: + every_n: '1000' + hit_thres: '5' + low_prec: + _target_: + config: null + trainer: null + update_iter: '1' + manual_gc: + _target_: + every_n: '5' + warm_up: '5' + progress_bar: + _target_: + config: null + trainer: null + video_eval: + _target_: + fuse_lora: 'True' + cudnn: + benchmark: 'True' + deterministic: 'False' + ddp: + broadcast_buffers: 'True' + find_unused_parameters: 'False' + static_graph: 'True' + distributed_parallelism: ddp + grad_accum_iter: '4' + grad_scaler_args: + enabled: 'False' + logging_iter: '1000' + max_iter: '500' + max_val_iter: null + memory_format: torch.preserve_format + profiling: + enable_memory_snapshot: 'False' + enable_profiling: 'False' + first_n_rank: '4' + profile_freq: '1' + profile_memory: 'True' + record_shape: 'True' + with_modules: 'True' + with_stack: 'True' + run_validation: 'False' + seed: '0' + timeout_period: '999999999' + type: + validation_iter: '999999999' +video_dataset_train: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'False' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' +video_dataset_val: + _target_: + data_fps: '30.0' + dataset_dir: /home/ubuntu/pdt-mimic/data/push-mimic + exclude_with_substring: null + include_only_with_substrings: null + is_multi_img: 'False' + is_val: 'True' + num_frames: '61' + obs_history: '5' + val_ratio: '0.0' + video_size: + - '480' + - '640' +world2action_pipe: null diff --git a/stdout.log b/stdout.log new file mode 100644 index 0000000000000000000000000000000000000000..9d40b561abfc73386b03dd6566d119e218704678 --- /dev/null +++ b/stdout.log @@ -0,0 +1,506 @@ +[05-07 17:16:59|INFO|imaginaire/trainer.py:116:__init__] Config: +* model: {'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': }, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': }, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': }, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': }, '_target_': }, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': }, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': }, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': , 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': } +* world2action_pipe: None +* optimizer: {'optim_type': 'fusedadam', 'model': None, 'lr': 0.0001778279410038923, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': } +* scheduler: {'_target_': } +* data_config: None +* video_dataset_train: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': } +* video_dataset_val: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': } +* dataloader_train: {'batch_size': 4, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': } +* dataloader_val: {'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': } +* job: + * project: posttraining + * group: video2world + * name: v2w_push_lora_rank32_lr1.778e-04_bsz32 +* trainer: + * type:  + * callbacks: {'ema': {'config': None, 'trainer': None, '_target_': }, 'progress_bar': {'config': None, 'trainer': None, '_target_': }, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': }, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': }, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': }, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': }, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': }, 'video_eval': {'fuse_lora': True, '_target_': }} + * distributed_parallelism: ddp + * ddp: + * find_unused_parameters: False + * static_graph: True + * broadcast_buffers: True + * cudnn: + * deterministic: False + * benchmark: True + * seed: 0 + * grad_scaler_args: {'enabled': False} + * max_iter: 500 + * max_val_iter: None + * logging_iter: 1000 + * run_validation: False + * validation_iter: 999999999 + * timeout_period: 999999999 + * memory_format: torch.preserve_format + * grad_accum_iter: 8 + * profiling: + * enable_profiling: False + * enable_memory_snapshot: False + * profile_freq: 1 + * first_n_rank: 4 + * record_shape: True + * profile_memory: True + * with_stack: True + * with_modules: True +* model_parallel: ModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True) +* checkpoint: + * type: {'callbacks': None, '_target_': } + * dcp_async_mode_enabled: False + * save_iter: 100 + * load_path:  + * load_training_state: False + * only_load_scheduler_state: False + * strict_resume: True + * jit: + * enabled: False + * input_shape: None + * device: cuda + * dtype: bfloat16 + * strict: True + * verbose: True + * keys_not_to_resume: [] + * broadcast_via_filesystem: False + * load_ema_to_reg: False + * dcp_allow_mismatched_size: False +* defaults: ['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}] +[05-07 17:16:59|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable TORCH_HOME not set! +[05-07 17:16:59|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable IMAGINAIRE_OUTPUT_ROOT: /home/ubuntu/checkpoints +[05-07 17:16:59|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': } +[05-07 17:16:59|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': } +[05-07 17:16:59|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 +[05-07 17:16:59|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 +[05-07 17:17:00|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:17:00|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:17:00|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. +[05-07 17:17:00|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:17:04|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 +[05-07 17:17:05|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. +[05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:17:05|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:17:05|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 +[05-07 17:17:05|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True +[05-07 17:17:05|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. +[05-07 17:17:05|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0013 seconds +[05-07 17:17:05|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp +[05-07 17:17:06|INFO|imaginaire/trainer.py:186:train] Starting training... +[05-07 17:17:06|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor +[05-07 17:17:09|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. +[05-07 17:18:29|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.0617 | Time: 89.90s +[05-07 17:19:00|INFO|imaginaire/trainer.py:116:__init__] Config: +* model: {'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': }, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': }, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': }, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': }, '_target_': }, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': }, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': }, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': , 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': } +* world2action_pipe: None +* optimizer: {'optim_type': 'fusedadam', 'model': None, 'lr': 0.0001778279410038923, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': } +* scheduler: {'_target_': } +* data_config: None +* video_dataset_train: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': } +* video_dataset_val: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': } +* dataloader_train: {'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': } +* dataloader_val: {'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': } +* job: + * project: posttraining + * group: video2world + * name: v2w_push_lora_rank32_lr1.778e-04_bsz32 +* trainer: + * type:  + * callbacks: {'ema': {'config': None, 'trainer': None, '_target_': }, 'progress_bar': {'config': None, 'trainer': None, '_target_': }, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': }, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': }, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': }, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': }, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': }, 'video_eval': {'fuse_lora': True, '_target_': }} + * distributed_parallelism: ddp + * ddp: + * find_unused_parameters: False + * static_graph: True + * broadcast_buffers: True + * cudnn: + * deterministic: False + * benchmark: True + * seed: 0 + * grad_scaler_args: {'enabled': False} + * max_iter: 500 + * max_val_iter: None + * logging_iter: 1000 + * run_validation: False + * validation_iter: 999999999 + * timeout_period: 999999999 + * memory_format: torch.preserve_format + * grad_accum_iter: 4 + * profiling: + * enable_profiling: False + * enable_memory_snapshot: False + * profile_freq: 1 + * first_n_rank: 4 + * record_shape: True + * profile_memory: True + * with_stack: True + * with_modules: True +* model_parallel: ModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True) +* checkpoint: + * type: {'callbacks': None, '_target_': } + * dcp_async_mode_enabled: False + * save_iter: 100 + * load_path:  + * load_training_state: False + * only_load_scheduler_state: False + * strict_resume: True + * jit: + * enabled: False + * input_shape: None + * device: cuda + * dtype: bfloat16 + * strict: True + * verbose: True + * keys_not_to_resume: [] + * broadcast_via_filesystem: False + * load_ema_to_reg: False + * dcp_allow_mismatched_size: False +* defaults: ['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}] +[05-07 17:19:00|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable TORCH_HOME not set! +[05-07 17:19:00|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable IMAGINAIRE_OUTPUT_ROOT: /home/ubuntu/checkpoints +[05-07 17:19:00|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': } +[05-07 17:19:00|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': } +[05-07 17:19:00|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 +[05-07 17:19:00|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 +[05-07 17:19:00|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:19:00|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:19:00|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. +[05-07 17:19:00|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:19:05|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:19:05|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 +[05-07 17:19:06|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. +[05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:19:06|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:19:06|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 +[05-07 17:19:06|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True +[05-07 17:19:06|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. +[05-07 17:19:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0014 seconds +[05-07 17:19:06|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp +[05-07 17:19:06|INFO|imaginaire/trainer.py:186:train] Starting training... +[05-07 17:19:06|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor +[05-07 17:19:08|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. +[05-07 17:19:35|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.5430 | Time: 35.34s +[05-07 17:19:52|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 2: Hit counter: 2/5 | Loss: 7.1425 | Time: 17.27s +[05-07 17:20:09|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 3: Hit counter: 3/5 | Loss: 4.8634 | Time: 17.24s +[05-07 17:20:27|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 4: Hit counter: 4/5 | Loss: 4.9483 | Time: 17.32s +[05-07 17:21:34|INFO|imaginaire/trainer.py:116:__init__] Config: +* model: {'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': }, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': }, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': }, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': }, '_target_': }, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': }, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': }, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': , 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': } +* world2action_pipe: None +* optimizer: {'optim_type': 'fusedadam', 'model': None, 'lr': 4.445e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': } +* scheduler: {'_target_': } +* data_config: None +* video_dataset_train: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': } +* video_dataset_val: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': } +* dataloader_train: {'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': } +* dataloader_val: {'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': } +* job: + * project: posttraining + * group: video2world + * name: v2w_push_lora_rank32_lr1.778e-04_bsz32 +* trainer: + * type:  + * callbacks: {'ema': {'config': None, 'trainer': None, '_target_': }, 'progress_bar': {'config': None, 'trainer': None, '_target_': }, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': }, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': }, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': }, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': }, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': }, 'video_eval': {'fuse_lora': True, '_target_': }} + * distributed_parallelism: ddp + * ddp: + * find_unused_parameters: False + * static_graph: True + * broadcast_buffers: True + * cudnn: + * deterministic: False + * benchmark: True + * seed: 0 + * grad_scaler_args: {'enabled': False} + * max_iter: 500 + * max_val_iter: None + * logging_iter: 1000 + * run_validation: False + * validation_iter: 999999999 + * timeout_period: 999999999 + * memory_format: torch.preserve_format + * grad_accum_iter: 4 + * profiling: + * enable_profiling: False + * enable_memory_snapshot: False + * profile_freq: 1 + * first_n_rank: 4 + * record_shape: True + * profile_memory: True + * with_stack: True + * with_modules: True +* model_parallel: ModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True) +* checkpoint: + * type: {'callbacks': None, '_target_': } + * dcp_async_mode_enabled: False + * save_iter: 100 + * load_path:  + * load_training_state: False + * only_load_scheduler_state: False + * strict_resume: True + * jit: + * enabled: False + * input_shape: None + * device: cuda + * dtype: bfloat16 + * strict: True + * verbose: True + * keys_not_to_resume: [] + * broadcast_via_filesystem: False + * load_ema_to_reg: False + * dcp_allow_mismatched_size: False +* defaults: ['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}] +[05-07 17:21:34|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable TORCH_HOME not set! +[05-07 17:21:34|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable IMAGINAIRE_OUTPUT_ROOT: /home/ubuntu/checkpoints +[05-07 17:21:34|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': } +[05-07 17:21:34|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': } +[05-07 17:21:34|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 +[05-07 17:21:34|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 +[05-07 17:21:34|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:21:34|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 17:21:34|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. +[05-07 17:21:34|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:21:39|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 +[05-07 17:21:40|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. +[05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 17:21:40|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 17:21:40|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 +[05-07 17:21:40|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True +[05-07 17:21:40|INFO|cosmos_predict2/checkpointer.py:288:load] Training from scratch. +[05-07 17:21:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 0.0014 seconds +[05-07 17:21:40|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp +[05-07 17:21:40|INFO|imaginaire/trainer.py:186:train] Starting training... +[05-07 17:21:40|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor +[05-07 17:21:42|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. +[05-07 17:22:09|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 1: Hit counter: 1/5 | Loss: 4.1693 | Time: 35.39s +[05-07 17:22:26|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 2: Hit counter: 2/5 | Loss: 6.1576 | Time: 17.27s +[05-07 17:22:44|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 3: Hit counter: 3/5 | Loss: 4.6536 | Time: 17.24s +[05-07 17:23:01|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 4: Hit counter: 4/5 | Loss: 3.3145 | Time: 17.34s +[05-07 17:23:18|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 5: Hit counter: 5/5 | Loss: 4.0082 | Time: 17.26s +[05-07 17:29:03|CRITICAL|imaginaire/callbacks/manual_gc.py:48:every_n_impl] Garbage collection disabled +[05-07 17:29:17|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000025.pt +[05-07 17:29:26|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 11.6441 seconds +[05-07 17:29:27|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000025.pt +[05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.5409 seconds +[05-07 17:29:29|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000025.pt +[05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds +[05-07 17:29:29|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000025.pt +[05-07 17:29:29|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 17:36:54|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000051.pt +[05-07 17:37:02|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 11.2918 seconds +[05-07 17:37:04|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000051.pt +[05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.2635 seconds +[05-07 17:37:06|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000051.pt +[05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0018 seconds +[05-07 17:37:06|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000051.pt +[05-07 17:37:06|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0022 seconds +[05-07 17:44:30|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000076.pt +[05-07 17:44:37|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.5314 seconds +[05-07 17:44:38|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000076.pt +[05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.4604 seconds +[05-07 17:44:40|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000076.pt +[05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds +[05-07 17:44:40|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000076.pt +[05-07 17:44:40|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds +[05-07 17:51:31|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt +[05-07 17:51:38|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 10.0176 seconds +[05-07 17:51:39|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt +[05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.3693 seconds +[05-07 17:51:42|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt +[05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0019 seconds +[05-07 17:51:42|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt +[05-07 17:51:42|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 18:13:39|INFO|imaginaire/trainer.py:116:__init__] Config: +* model: {'config': {'train_architecture': 'lora', 'lora_rank': 32, 'lora_alpha': 32, 'lora_target_modules': 'q_proj,k_proj,v_proj,output_proj,x_embedder.proj.1,linear_1,linear_2,mlp.layer1,mlp.layer2', 'init_lora_weights': True, 'precision': 'bfloat16', 'input_video_key': 'video', 'input_image_key': 'images', 'loss_reduce': 'mean', 'loss_scale': 100.0, 'adjust_video_noise': True, 'model_manager_config': {'dit_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt', 'text_encoder_path': '', '_target_': 'cosmos_predict2.models.video2world_model.Predict2ModelManagerConfig'}, 'pipe_config': {'adjust_video_noise': True, 'conditioner': {'fps': {'output_key': 'fps', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'fps', '_target_': }, 'padding_mask': {'output_key': 'padding_mask', 'dropout_rate': 0.0, 'dtype': None, 'input_key': 'padding_mask', '_target_': }, 'text': {'dropout_rate': 0.0, 'input_key': ['obs/language_embedding'], '_target_': }, 'use_video_condition': {'output_key': 'use_video_condition', 'dropout_rate': 0.0, 'input_key': 'fps', '_target_': }, '_target_': }, 'conditioning_strategy': 'frame_replace', 'min_num_conditional_frames': 1, 'max_num_conditional_frames': 2, 'sigma_conditional': 0.0001, 'net': {'max_img_h': 240, 'max_img_w': 240, 'max_frames': 128, 'in_channels': 16, 'out_channels': 16, 'patch_spatial': 2, 'patch_temporal': 1, 'concat_padding_mask': True, 'model_channels': 2048, 'num_blocks': 28, 'num_heads': 16, 'atten_backend': 'minimal_a2a', 'pos_emb_cls': 'rope3d', 'pos_emb_learnable': True, 'pos_emb_interpolation': 'crop', 'use_adaln_lora': True, 'adaln_lora_dim': 256, 'rope_h_extrapolation_ratio': 3.0, 'rope_w_extrapolation_ratio': 3.0, 'rope_t_extrapolation_ratio': 1.0, 'extra_per_block_abs_pos_emb': False, 'rope_enable_fps_modulation': False, 'sac_config': {'mode': 'predict2_2b_720', 'every_n_blocks': 1, '_target_': 'cosmos_predict2.models.text2image_dit.SACConfig'}, '_target_': }, 'tokenizer': {'chunk_duration': 81, 'load_mean_std': False, 'temporal_window': 16, 'name': 'tokenizer', 'vae_pth': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth', '_target_': }, 'guardrail_config': {'checkpoint_dir': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints', 'offload_model_to_cpu': True, 'enabled': False}, 'precision': 'bfloat16', 'rectified_flow_t_scaling_factor': 1.0, 'rectified_flow_loss_weight_uniform': True, 'resize_online': False, 'resolution': '480', 'ema': {'enabled': False, 'rate': 0.1, 'iteration_shift': 0, '_target_': 'cosmos_predict2.configs.defaults.ema.EMAConfig'}, 'sigma_data': 1.0, 'state_ch': 16, 'state_t': 16, 'text_encoder': {'cls': , 't5': {'ckpt_path': '/home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/text_encoder/t5-11b', 'num_tokens': 512, 'embed_dim': 1024}}, 'input_video_key': 'video', 'input_image_key': 'images', 'timestamps': {'nfe': 35, 't_min': 0.002, 't_max': 80.0, 'order': 7.0, 'is_forward': False}}, 'debug_without_randomness': False, 'fsdp_shard_size': 0, 'high_sigma_ratio': 0.05}, '_recursive_': False, '_target_': } +* world2action_pipe: None +* optimizer: {'optim_type': 'fusedadam', 'model': None, 'lr': 4.445e-05, 'weight_decay': 0.1, 'betas': [0.9, 0.99], 'eps': 1e-08, 'master_weights': True, 'capturable': True, '_target_': } +* scheduler: {'_target_': } +* data_config: None +* video_dataset_train: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': } +* video_dataset_val: {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': } +* dataloader_train: {'batch_size': 2, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 12, 'collate_fn': None, 'pin_memory': True, 'drop_last': True, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': 8, 'persistent_workers': True, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': False, '_target_': }, '_target_': } +* dataloader_val: {'batch_size': 1, 'shuffle': None, 'sampler': {'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': }, 'batch_sampler': None, 'num_workers': 0, 'collate_fn': None, 'pin_memory': False, 'drop_last': False, 'timeout': 0, 'worker_init_fn': None, 'multiprocessing_context': None, 'generator': None, 'prefetch_factor': None, 'persistent_workers': False, 'pin_memory_device': '', 'in_order': False, 'dataset': {'include_only_with_substrings': None, 'exclude_with_substring': None, 'data_fps': 30.0, 'obs_history': 5, 'is_multi_img': False, 'val_ratio': 0.0, 'dataset_dir': '/home/ubuntu/pdt-mimic/data/push-mimic', 'num_frames': 61, 'video_size': [480, 640], 'is_val': True, '_target_': }, '_target_': } +* job: + * project: posttraining + * group: video2world + * name: v2w_push_lora_rank32_lr1.778e-04_bsz32 +* trainer: + * type:  + * callbacks: {'ema': {'config': None, 'trainer': None, '_target_': }, 'progress_bar': {'config': None, 'trainer': None, '_target_': }, 'low_prec': {'config': None, 'trainer': None, 'update_iter': 1, '_target_': }, 'iter_speed': {'hit_thres': 5, 'every_n': 1000, '_target_': }, 'device_monitor': {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': }, 'manual_gc': {'warm_up': 5, 'every_n': 5, '_target_': }, 'grad_clip': {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': }, 'video_eval': {'fuse_lora': True, '_target_': }} + * distributed_parallelism: ddp + * ddp: + * find_unused_parameters: False + * static_graph: True + * broadcast_buffers: True + * cudnn: + * deterministic: False + * benchmark: True + * seed: 0 + * grad_scaler_args: {'enabled': False} + * max_iter: 500 + * max_val_iter: None + * logging_iter: 1000 + * run_validation: False + * validation_iter: 999999999 + * timeout_period: 999999999 + * memory_format: torch.preserve_format + * grad_accum_iter: 4 + * profiling: + * enable_profiling: False + * enable_memory_snapshot: False + * profile_freq: 1 + * first_n_rank: 4 + * record_shape: True + * profile_memory: True + * with_stack: True + * with_modules: True +* model_parallel: ModelParallelConfig(tensor_model_parallel_size=1, pipeline_model_parallel_comm_backend=None, pipeline_model_parallel_size=1, virtual_pipeline_model_parallel_size=None, sequence_parallel=False, context_parallel_size=1, hierarchical_context_parallel_sizes=None, expert_model_parallel_size=1, expert_tensor_parallel_size=1, moe_extended_tp=False, perform_initialization=True, use_cpu_initialization=False, fp16=False, bf16=False, params_dtype=torch.float32, timers=None, finalize_model_grads_func=None, grad_scale_func=None, no_sync_func=None, grad_sync_func=None, param_sync_func=None, deterministic_mode=False, enable_autocast=False, autocast_dtype=torch.float32, num_microbatches_with_partial_activation_checkpoints=None, gradient_accumulation_fusion=False, async_tensor_model_parallel_allreduce=False, use_te_rng_tracker=False, tp_comm_overlap=False, tp_comm_bulk_wgrad=True, tp_comm_bulk_dgrad=True, tp_comm_overlap_ag=True, tp_comm_overlap_rs=True, tp_comm_overlap_rs_dgrad=False, tp_comm_split_ag=True, tp_comm_atomic_ag=False, tp_comm_split_rs=True, tp_comm_atomic_rs=False, cross_entropy_loss_fusion=False, cross_entropy_fusion_impl='native', tp_comm_overlap_disable_qkv=False, tp_comm_overlap_disable_fc1=False, tp_comm_bootstrap_backend='nccl', pipeline_dtype=None, variable_seq_lengths=False, overlap_p2p_comm=False, batch_p2p_comm=True, batch_p2p_sync=True, use_ring_exchange_p2p=False, deallocate_pipeline_outputs=False, defer_embedding_wgrad_compute=False, wgrad_deferral_limit=0, pipeline_model_parallel_split_rank=None, overlap_p2p_comm_warmup_flush=False, microbatch_group_size_per_vp_stage=1, cpu_offloading=False, cpu_offloading_num_layers=0, _cpu_offloading_context=None, cpu_offloading_activations=False, cpu_offloading_weights=False, barrier_with_L1_time=True) +* checkpoint: + * type: {'callbacks': None, '_target_': } + * dcp_async_mode_enabled: False + * save_iter: 100 + * load_path:  + * load_training_state: False + * only_load_scheduler_state: False + * strict_resume: True + * jit: + * enabled: False + * input_shape: None + * device: cuda + * dtype: bfloat16 + * strict: True + * verbose: True + * keys_not_to_resume: [] + * broadcast_via_filesystem: False + * load_ema_to_reg: False + * dcp_allow_mismatched_size: False +* defaults: ['_self_', {'data_config': None}, {'video_dataset_train': None}, {'video_dataset_val': None}, {'dataloader_train': None}, {'dataloader_val': None}, {'world2action_pipe': None}, {'optimizer': 'fusedadamw'}, {'scheduler': 'constant'}, {'model': None}, {'callbacks': ['basic']}, {'net': None}, {'ema': None}, {'checkpoint': None}, {'ckpt_type': None}, {'experiment': None}] +[05-07 18:13:39|WARNING|imaginaire/utils/misc.py:127:print_environ_variables] Environment variable TORCH_HOME not set! +[05-07 18:13:39|INFO|imaginaire/utils/misc.py:125:print_environ_variables] Environment variable IMAGINAIRE_OUTPUT_ROOT: /home/ubuntu/checkpoints +[05-07 18:13:39|INFO|imaginaire/utils/misc.py:139:set_random_seed] Using random seed 0. +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback ema: {'config': None, 'trainer': None, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback progress_bar: {'config': None, 'trainer': None, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback low_prec: {'config': None, 'trainer': None, 'update_iter': 1, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback iter_speed: {'hit_thres': 5, 'every_n': 1000, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback device_monitor: {'every_n': 1000, 'step_size': 1, 'log_memory_detail': True, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback manual_gc: {'warm_up': 5, 'every_n': 5, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback grad_clip: {'clip_norm': 10.0, 'force_finite': True, 'log_wandb': False, '_target_': } +[05-07 18:13:39|CRITICAL|imaginaire/utils/callback.py:77:__init__] Instantiating callback video_eval: {'fuse_lora': True, '_target_': } +[05-07 18:13:39|CRITICAL|cosmos_predict2/models/video2world_model.py:138:__init__] Using mean loss reduce with loss scale 100.0 +[05-07 18:13:39|WARNING|cosmos_predict2/pipelines/video2world.py:277:from_config] precision torch.bfloat16 +[05-07 18:13:40|INFO|cosmos_predict2/tokenizers/tokenizer.py:687:_video_vae] Loading /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 18:13:40|SUCCESS|cosmos_predict2/tokenizers/tokenizer.py:689:_video_vae] Successfully loaded /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//tokenizer/tokenizer.pth +[05-07 18:13:40|INFO|imaginaire/utils/distributed.py:430:sync_model_states] Synchronizing model states from rank 0 to all ranks in process group [0]. +[05-07 18:13:40|INFO|cosmos_predict2/pipelines/video2world.py:335:from_config] Loading DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 18:13:45|SUCCESS|cosmos_predict2/pipelines/video2world.py:354:from_config] Successfully loaded DiT from /home/ubuntu/pdt-mimic/mimic-video/model/checkpoints/video_backbone//v2w_pretrained_cosmos.pt +[05-07 18:13:45|INFO|cosmos_predict2/models/video2world_model.py:321:add_lora_to_model] Adding LoRA adapters: rank=32, alpha=32, targets=['q_proj', 'k_proj', 'v_proj', 'output_proj', 'x_embedder.proj.1', 'linear_1', 'linear_2', 'mlp.layer1', 'mlp.layer2'] +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:345:add_lora_to_model] LoRA injection successful: 46,336,256 trainable parameters out of 2,002,749,696 total (2.314%) +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:363:_log_lora_statistics] LoRA parameter breakdown: +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_A: 22,153,472 parameters +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:365:_log_lora_statistics] lora_B: 24,182,784 parameters +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:366:_log_lora_statistics] Total LoRA: 46,336,256 parameters +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:185:__init__] Total parameters: 2.00B, Frozen parameters: 1,778,679,808, Trainable parameters: 224,069,888 +[05-07 18:13:46|INFO|cosmos_predict2/models/video2world_model.py:202:__init__] FSDP (Fully Sharded Data Parallel) is disabled. +[05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:102:__init__] 204 videos in total +[05-07 18:13:46|INFO|cosmos_predict2/data/dataset_video.py:112:__init__] 204 videos in train. +[05-07 18:13:46|CRITICAL|cosmos_predict2/utils/optim_instantiate_dtensor.py:49:get_base_optimizer] total num parameters : 224,069,888 +[05-07 18:13:46|WARNING|cosmos_predict2/utils/fused_adam_dtensor.py:103:__init__] FusedAdam master_weights: True capturable: True +[05-07 18:13:46|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt +[05-07 18:13:48|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000100.pt +[05-07 18:13:48|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt +[05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000100.pt +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt +[05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000100.pt +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:206:load] Loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt +[05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:208:load] Complete loading checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000100.pt +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:212:load] - Loading the model... +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:260:load] - Loading the scheduler... +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:267:load] - Loading the optimizer... +[05-07 18:13:49|INFO|cosmos_predict2/checkpointer.py:280:load] - Loading the gradient scaler... +[05-07 18:13:49|SUCCESS|cosmos_predict2/checkpointer.py:282:load] Done with loading the checkpoint (iteration 100). +[05-07 18:13:49|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint loading: 3.4351 seconds +[05-07 18:13:49|CRITICAL|imaginaire/trainer.py:178:train] Distributed parallelism mode: ddp +[05-07 18:13:50|INFO|imaginaire/trainer.py:186:train] Starting training... +[05-07 18:13:50|INFO|cosmos_predict2/callbacks/device_monitor.py:92:on_train_start] DeviceMonitor callback: local_dir: /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/DeviceMonitor +[05-07 18:13:51|WARNING|imaginaire/utils/distributed.py:284:ddp_sync_grad] DDP static_graph=True is incompatible with sync_grad(). Performance will be reduced. +[05-07 18:14:13|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 101: Hit counter: 1/5 | Loss: 4.8886 | Time: 33.45s +[05-07 18:14:30|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 102: Hit counter: 2/5 | Loss: 5.8927 | Time: 17.23s +[05-07 18:14:49|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 103: Hit counter: 3/5 | Loss: 4.2789 | Time: 19.49s +[05-07 18:15:07|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 104: Hit counter: 4/5 | Loss: 3.6165 | Time: 17.32s +[05-07 18:15:24|INFO|cosmos_predict2/callbacks/iter_speed.py:51:on_training_step_end] Iteration 105: Hit counter: 5/5 | Loss: 4.6682 | Time: 17.27s +[05-07 18:21:09|CRITICAL|imaginaire/callbacks/manual_gc.py:48:every_n_impl] Garbage collection disabled +[05-07 18:42:54|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000200.pt +[05-07 18:43:03|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 12.1174 seconds +[05-07 18:43:04|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000200.pt +[05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.6517 seconds +[05-07 18:43:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000200.pt +[05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 18:43:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000200.pt +[05-07 18:43:07|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 19:12:02|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000300.pt +[05-07 19:12:09|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.9800 seconds +[05-07 19:12:10|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000300.pt +[05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.4690 seconds +[05-07 19:12:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000300.pt +[05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds +[05-07 19:12:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000300.pt +[05-07 19:12:12|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 19:41:07|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000400.pt +[05-07 19:41:14|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 10.2649 seconds +[05-07 19:41:16|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000400.pt +[05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.5255 seconds +[05-07 19:41:18|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000400.pt +[05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 19:41:18|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000400.pt +[05-07 19:41:18|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0023 seconds +[05-07 20:10:12|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/model/iter_000000500.pt +[05-07 20:10:19|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 9.8569 seconds +[05-07 20:10:20|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/optim/iter_000000500.pt +[05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 3.6050 seconds +[05-07 20:10:22|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/scheduler/iter_000000500.pt +[05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0020 seconds +[05-07 20:10:22|SUCCESS|cosmos_predict2/checkpointer.py:139:_save_worker_local] Saved checkpoint (local): /home/ubuntu/checkpoints/posttraining/video2world/v2w_push_lora_rank32_lr1.778e-04_bsz32/checkpoints/trainer/iter_000000500.pt +[05-07 20:10:22|INFO|imaginaire/utils/misc.py:266:wrapper] Time spent on checkpoint saving (local): 0.0021 seconds +[05-07 20:10:22|SUCCESS|imaginaire/trainer.py:288:train] Done with training.