Update comment to reflect use_local_synchronization behavior [skip-build]
torch-ext/optimizer/distributed/utils.py CHANGED
@@ -207,8 +207,10 @@ def construct_shard_mesh(
     assert len(shard_placements) == len(set(shard_placements))

     # -- Step 4: Create/retrieve ProcessGroup for current rank's sub-mesh. --
-    #
-    #
+    # Each rank only creates the group it belongs to, using
+    # use_local_synchronization=True so that only group members need to
+    # coordinate. This avoids deadlocks when different PP stages call
+    # construct_shard_mesh for different parameters.
     def _cache_key(t: torch.Tensor) -> tuple:
         return (*t.shape, *t.flatten().tolist())
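For context, a minimal sketch of the pattern the new comment describes, assuming PyTorch >= 2.1 (where torch.distributed.new_group accepts use_local_synchronization). The create_my_subgroup helper and its contiguous-block grouping rule are hypothetical illustrations, not the repo's actual construct_shard_mesh logic:

import torch.distributed as dist

def create_my_subgroup(group_size: int) -> dist.ProcessGroup:
    """Each rank creates only the one sub-group it belongs to."""
    rank = dist.get_rank()
    # Hypothetical grouping rule for illustration: consecutive blocks of
    # `group_size` ranks form one sub-group.
    first = (rank // group_size) * group_size
    my_ranks = list(range(first, first + group_size))
    # With use_local_synchronization=True, only the member ranks take part
    # in creating this group; ranks outside `my_ranks` never call
    # new_group for it, so ranks building *different* groups (e.g.
    # different PP stages) cannot block on each other.
    return dist.new_group(ranks=my_ranks, use_local_synchronization=True)

Without use_local_synchronization, new_group is collective over the whole default group: every rank must call it for every group being created, in the same order, even as a non-member. That is exactly the coordination requirement that can deadlock when different pipeline stages reach construct_shard_mesh with different parameters, and what the updated comment says the code now avoids.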