AbstractPhil committed on
Commit
70328ac
·
verified ·
1 Parent(s): b207f78

Update svd_triton_gram_newton.py

Browse files
Files changed (1) hide show
  1. svd_triton_gram_newton.py +158 -95
svd_triton_gram_newton.py CHANGED
@@ -492,25 +492,23 @@ def projected_svd_quality(A, target_rank=24):
492
 
493
 
494
  def procrustes_alignment_quality(N=48, k=24, n_samples=5000):
495
- """The real test: does rank-k Procrustes produce the same alignment as rank-N?
496
-
497
- Simulates the actual use case:
498
- 1. Generate two embedding spaces (source, target) with shared structure
499
- 2. Align with full N-d Procrustes
500
- 3. Align with projected k-d Procrustes
501
- 4. Compare: rotation agreement, aligned embedding cosine, downstream task impact
502
-
503
- Returns dict of comparison metrics.
504
  """
505
  device = 'cuda'
506
 
507
  # Create two embedding spaces with shared low-rank structure + noise
508
- # This simulates two expert encoders that agree on major directions
509
- shared_rank = min(N // 2, 32) # true shared structure
510
  shared_basis = torch.randn(shared_rank, N, device=device)
511
- shared_basis = torch.linalg.qr(shared_basis.T).Q.T # orthonormal rows
512
 
513
- # Source and target share the basis but with different coefficients + noise
514
  coeffs_src = torch.randn(n_samples, shared_rank, device=device)
515
  coeffs_tgt = torch.randn(n_samples, shared_rank, device=device) * 0.8 + coeffs_src * 0.5
516
  noise_scale = 0.3
@@ -518,119 +516,184 @@ def procrustes_alignment_quality(N=48, k=24, n_samples=5000):
518
  source = coeffs_src @ shared_basis + noise_scale * torch.randn(n_samples, N, device=device)
519
  target = coeffs_tgt @ shared_basis + noise_scale * torch.randn(n_samples, N, device=device)
520
 
521
- # Center
522
  source = source - source.mean(0, keepdim=True)
523
  target = target - target.mean(0, keepdim=True)
524
 
525
- # ═══ Full N-d Procrustes ═══
526
- # Cross-covariance β†’ SVD β†’ rotation
527
- C_full = source.T @ target # (N, N)
528
- U_f, S_f, Vh_f = torch.linalg.svd(C_full)
529
- R_full = U_f @ Vh_f # (N, N) optimal rotation
530
-
531
  aligned_full = source @ R_full
532
- cos_full = F.cosine_similarity(aligned_full, target, dim=-1) # (n_samples,)
533
 
534
  # ═══ Projected k-d Procrustes ═══
535
- # Project both spaces to k-d, align there, lift back
536
- # Random projection
537
  P = torch.randn(N, k, device=device) / math.sqrt(k)
538
-
539
- src_proj = source @ P # (n_samples, k)
540
- tgt_proj = target @ P # (n_samples, k)
541
-
542
- C_proj = src_proj.T @ tgt_proj # (k, k)
543
- U_p, S_p, Vh_p = torch.linalg.svd(C_proj)
544
- R_proj_k = U_p @ Vh_p # (k, k) rotation in projected space
545
-
546
- # Lift rotation back to N-d: R_N = P @ R_k @ P^T (pseudoinverse)
547
- # More precisely: align in projected space, then evaluate in full space
548
- aligned_proj_k = src_proj @ R_proj_k # aligned in k-d
549
- # Lift back: find best N-d rotation that maps source to target
550
- # using only the k-d alignment as guidance
551
- # R_lifted = P @ R_k @ pinv(P)
552
- P_pinv = torch.linalg.pinv(P) # (k, N)
553
- R_lifted = P @ R_proj_k @ P_pinv # (N, N)
554
- aligned_lifted = source @ R_lifted
555
- cos_lifted = F.cosine_similarity(aligned_lifted, target, dim=-1)
556
-
557
- # ═══ Also test: Procrustes in k-d only (don't lift, compare in k-d) ═══
558
- cos_proj_space = F.cosine_similarity(aligned_proj_k, tgt_proj, dim=-1)
559
- # Reference: full Procrustes projected to k-d
560
- aligned_full_proj = aligned_full @ P
561
- cos_full_proj = F.cosine_similarity(aligned_full_proj, tgt_proj, dim=-1)
562
-
563
- # ═══ Rotation agreement: how similar are R_full and R_lifted? ═══
564
- # Frobenius norm of difference
565
- rot_frob = (R_full - R_lifted).norm().item() / (R_full.norm().item() + 1e-8)
566
- # Trace agreement: tr(R_full^T @ R_lifted) / N β€” 1.0 if identical
567
- rot_trace = (R_full.T @ R_lifted).trace().item() / N
568
-
569
- # ═══ Downstream proxy: classification agreement ═══
570
- # If we classify by nearest-neighbor in aligned space, do both agree?
571
- # Use first 100 as "anchors", rest as queries
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
572
  n_anchor = min(100, n_samples // 2)
573
- anchors_full = aligned_full[:n_anchor]
574
- anchors_lift = aligned_lifted[:n_anchor]
575
- queries_full = aligned_full[n_anchor:]
576
- queries_lift = aligned_lifted[n_anchor:]
577
 
578
- nn_full = (queries_full @ anchors_full.T).argmax(-1)
579
- nn_lift = (queries_lift @ anchors_lift.T).argmax(-1)
580
- nn_agreement = (nn_full == nn_lift).float().mean().item()
 
 
 
 
 
 
 
 
581
 
582
  return {
583
  'N': N, 'k': k,
584
- 'cos_full_mean': cos_full.mean().item(), # alignment quality: full Procrustes
585
- 'cos_lifted_mean': cos_lifted.mean().item(), # alignment quality: projected + lifted
586
- 'cos_proj_space': cos_proj_space.mean().item(), # alignment in k-d only
587
- 'cos_full_proj': cos_full_proj.mean().item(), # full Procrustes seen from k-d
588
- 'rot_frob_rel': rot_frob, # rotation matrix difference (relative)
589
- 'rot_trace_norm': rot_trace, # rotation trace agreement (1.0 = perfect)
590
- 'nn_agreement': nn_agreement, # nearest-neighbor classification agreement
 
 
591
  }
592
 
593
 
594
  def profile_procrustes_quality():
595
- """Compare Procrustes alignment quality: full N-d vs projected k-d."""
596
- print(f"\n{'='*100}")
597
- print(f" PROCRUSTES ALIGNMENT QUALITY: full N-d vs projected k-d")
598
- print(f" Does rank-k alignment produce the same rotation as rank-N?")
599
- print(f"{'='*100}")
 
600
 
601
  configs = [
602
- (32, [8, 12, 16, 24]),
603
- (48, [8, 12, 16, 24, 32]),
604
- (64, [8, 12, 16, 24, 32]),
605
- (96, [8, 16, 24, 32, 48]),
606
- (128, [8, 16, 24, 32, 48, 64]),
607
  ]
608
 
609
  all_results = []
610
 
611
  for N, ranks in configs:
612
  print(f"\n N={N}:")
613
- print(f" {'k':>5} {'cos_full':>9} {'cos_lifted':>11} {'cos_k-d':>9}"
614
- f" {'rot_trace':>10} {'rot_frob':>10} {'NN_agree':>9}")
615
- print(f" {'─'*76}")
 
616
 
617
  for k in ranks:
618
  if k >= N:
619
  continue
620
  q = procrustes_alignment_quality(N=N, k=k)
621
- print(f" {k:>5} {q['cos_full_mean']:>9.4f} {q['cos_lifted_mean']:>11.4f}"
622
- f" {q['cos_proj_space']:>9.4f} {q['rot_trace_norm']:>10.4f}"
623
- f" {q['rot_frob_rel']:>10.4f} {q['nn_agreement']:>9.4f}")
 
 
 
 
 
 
624
  all_results.append(q)
625
 
626
- # Summary
627
- print(f"\n {'─'*76}")
628
- print(f" KEY: cos_full = full Procrustes alignment cosine (ceiling)")
629
- print(f" cos_lifted = projected Procrustes lifted back to N-d (what we get)")
630
- print(f" rot_trace = tr(R_full^T @ R_proj)/N (1.0 = same rotation)")
631
- print(f" NN_agree = nearest-neighbor classification agreement (task proxy)")
632
- print(f" {'─'*76}")
633
- print(f" If cos_lifted β‰ˆ cos_full and NN_agree > 0.95, projection is safe.")
 
 
 
 
 
 
 
 
 
 
 
634
 
635
  return all_results
636
 
 
492
 
493
 
494
  def procrustes_alignment_quality(N=48, k=24, n_samples=5000):
495
+ """Compare 5 methods of applying rank-k Procrustes back to N-d.
496
+
497
+ Methods:
498
+ 1. full: Full N-d Procrustes (ceiling)
499
+ 2. pinv: P @ R_k @ pinv(P) β€” naive lift (broken baseline)
500
+ 3. lerp: (1-Ξ±)I + Ξ±*(P @ R_k @ pinv(P)) β€” blend with identity
501
+ 4. slerp: matrix_exp(Ξ± * matrix_log(R_lifted)) β€” geodesic on SO(N)
502
+ 5. subspace: Rotate in-subspace component, preserve orthogonal complement
503
+ 6. stay_k: Don't lift β€” compare in k-d (reference for k-d quality)
504
  """
505
  device = 'cuda'
506
 
507
  # Create two embedding spaces with shared low-rank structure + noise
508
+ shared_rank = min(N // 2, 32)
 
509
  shared_basis = torch.randn(shared_rank, N, device=device)
510
+ shared_basis = torch.linalg.qr(shared_basis.T).Q.T
511
 
 
512
  coeffs_src = torch.randn(n_samples, shared_rank, device=device)
513
  coeffs_tgt = torch.randn(n_samples, shared_rank, device=device) * 0.8 + coeffs_src * 0.5
514
  noise_scale = 0.3
 
516
  source = coeffs_src @ shared_basis + noise_scale * torch.randn(n_samples, N, device=device)
517
  target = coeffs_tgt @ shared_basis + noise_scale * torch.randn(n_samples, N, device=device)
518
 
 
519
  source = source - source.mean(0, keepdim=True)
520
  target = target - target.mean(0, keepdim=True)
521
 
522
+ # ═══ Full N-d Procrustes (ceiling) ═══
523
+ C_full = source.T @ target
524
+ U_f, _, Vh_f = torch.linalg.svd(C_full)
525
+ R_full = U_f @ Vh_f
 
 
526
  aligned_full = source @ R_full
527
+ cos_full = F.cosine_similarity(aligned_full, target, dim=-1).mean().item()
528
 
529
  # ═══ Projected k-d Procrustes ═══
 
 
530
  P = torch.randn(N, k, device=device) / math.sqrt(k)
531
+ # Orthogonalize P for cleaner subspace decomposition
532
+ P = torch.linalg.qr(P).Q # (N, k) orthonormal columns
533
+
534
+ src_proj = source @ P
535
+ tgt_proj = target @ P
536
+
537
+ C_proj = src_proj.T @ tgt_proj
538
+ U_p, _, Vh_p = torch.linalg.svd(C_proj)
539
+ R_k = U_p @ Vh_p # (k, k) optimal rotation in k-d
540
+
541
+ # ═══ Method 1: Naive pinv lift (broken baseline) ═══
542
+ P_pinv = torch.linalg.pinv(P)
543
+ R_pinv = P @ R_k @ P_pinv
544
+ aligned_pinv = source @ R_pinv
545
+ cos_pinv = F.cosine_similarity(aligned_pinv, target, dim=-1).mean().item()
546
+
547
+ # ═══ Method 2: LERP β€” blend projected rotation with identity ═══
548
+ # Test multiple Ξ± values, pick best
549
+ I_N = torch.eye(N, device=device)
550
+ best_lerp_cos = -1.0
551
+ best_lerp_alpha = 0.0
552
+ lerp_results = {}
553
+ for alpha in [0.3, 0.5, 0.7, 0.9, 1.0]:
554
+ R_lerp = (1.0 - alpha) * I_N + alpha * R_pinv
555
+ aligned_lerp = source @ R_lerp
556
+ c = F.cosine_similarity(aligned_lerp, target, dim=-1).mean().item()
557
+ lerp_results[alpha] = c
558
+ if c > best_lerp_cos:
559
+ best_lerp_cos = c
560
+ best_lerp_alpha = alpha
561
+ # Also get NN agreement for best lerp
562
+ R_lerp_best = (1.0 - best_lerp_alpha) * I_N + best_lerp_alpha * R_pinv
563
+ aligned_lerp_best = source @ R_lerp_best
564
+
565
+ # ═══ Method 3: SLERP β€” geodesic interpolation on rotation manifold ═══
566
+ # R_pinv may not be exactly orthogonal, so clean it first
567
+ U_clean, _, Vh_clean = torch.linalg.svd(R_pinv)
568
+ R_ortho = U_clean @ Vh_clean # closest orthogonal matrix
569
+
570
+ best_slerp_cos = -1.0
571
+ best_slerp_alpha = 0.0
572
+ try:
573
+ log_R = torch.linalg.matrix_log(R_ortho.to(torch.complex64)).real
574
+ slerp_works = True
575
+ except Exception:
576
+ slerp_works = False
577
+ log_R = None
578
+
579
+ if slerp_works:
580
+ for alpha in [0.3, 0.5, 0.7, 0.9, 1.0]:
581
+ R_slerp = torch.matrix_exp(alpha * log_R)
582
+ aligned_slerp = source @ R_slerp
583
+ c = F.cosine_similarity(aligned_slerp, target, dim=-1).mean().item()
584
+ if c > best_slerp_cos:
585
+ best_slerp_cos = c
586
+ best_slerp_alpha = alpha
587
+ R_slerp_best = torch.matrix_exp(best_slerp_alpha * log_R)
588
+ aligned_slerp_best = source @ R_slerp_best
589
+ else:
590
+ best_slerp_cos = cos_pinv
591
+ best_slerp_alpha = -1.0
592
+ aligned_slerp_best = aligned_pinv
593
+
594
+ # ═══ Method 4: Subspace-preserving rotation ═══
595
+ # Decompose source into in-subspace and orthogonal complement
596
+ # P @ P^T is the projector onto the k-d subspace (P has orthonormal columns)
597
+ src_in = source @ P # (n, k) β€” coefficients in subspace
598
+ src_perp = source - src_in @ P.T # (n, N) β€” orthogonal complement
599
+
600
+ # Rotate only the in-subspace component
601
+ src_in_rotated = src_in @ R_k # (n, k) β€” rotated in k-d
602
+ aligned_subspace = src_in_rotated @ P.T + src_perp # lift rotated + add perp back
603
+ cos_subspace = F.cosine_similarity(aligned_subspace, target, dim=-1).mean().item()
604
+
605
+ # ═══ Method 5: Stay in k-d (don't lift, reference) ═══
606
+ aligned_k = src_proj @ R_k
607
+ cos_stay_k = F.cosine_similarity(aligned_k, tgt_proj, dim=-1).mean().item()
608
+
609
+ # ═══ NN agreement for all methods ═══
610
  n_anchor = min(100, n_samples // 2)
 
 
 
 
611
 
612
+ def _nn_agree(aligned_a, aligned_b):
613
+ anc_a, anc_b = aligned_a[:n_anchor], aligned_b[:n_anchor]
614
+ q_a, q_b = aligned_a[n_anchor:], aligned_b[n_anchor:]
615
+ nn_a = (q_a @ anc_a.T).argmax(-1)
616
+ nn_b = (q_b @ anc_b.T).argmax(-1)
617
+ return (nn_a == nn_b).float().mean().item()
618
+
619
+ nn_pinv = _nn_agree(aligned_full, aligned_pinv)
620
+ nn_lerp = _nn_agree(aligned_full, aligned_lerp_best)
621
+ nn_slerp = _nn_agree(aligned_full, aligned_slerp_best)
622
+ nn_subspace = _nn_agree(aligned_full, aligned_subspace)
623
 
624
  return {
625
  'N': N, 'k': k,
626
+ 'cos_full': cos_full,
627
+ 'cos_pinv': cos_pinv,
628
+ 'cos_lerp': best_lerp_cos, 'lerp_alpha': best_lerp_alpha,
629
+ 'cos_slerp': best_slerp_cos, 'slerp_alpha': best_slerp_alpha,
630
+ 'cos_subspace': cos_subspace,
631
+ 'cos_stay_k': cos_stay_k,
632
+ 'nn_pinv': nn_pinv, 'nn_lerp': nn_lerp,
633
+ 'nn_slerp': nn_slerp, 'nn_subspace': nn_subspace,
634
+ 'lerp_all': lerp_results,
635
  }
636
 
637
 
638
  def profile_procrustes_quality():
639
+ """Compare all Procrustes lift-back methods."""
640
+ print(f"\n{'='*120}")
641
+ print(f" PROCRUSTES ALIGNMENT: 5 methods of applying rank-k rotation to N-d space")
642
+ print(f" cos = mean cosine similarity after alignment (higher = better, full = ceiling)")
643
+ print(f" NN = nearest-neighbor agreement with full Procrustes (1.0 = identical downstream)")
644
+ print(f"{'='*120}")
645
 
646
  configs = [
647
+ (32, [8, 16, 24]),
648
+ (48, [8, 16, 24, 32]),
649
+ (64, [8, 16, 24, 32]),
650
+ (96, [16, 24, 32, 48]),
651
+ (128, [16, 24, 32, 48, 64]),
652
  ]
653
 
654
  all_results = []
655
 
656
  for N, ranks in configs:
657
  print(f"\n N={N}:")
658
+ print(f" {'k':>5} {'full':>7} {'pinv':>7} {'lerp':>7} {'(Ξ±)':>4}"
659
+ f" {'slerp':>7} {'(Ξ±)':>4} {'subspc':>7} {'stay_k':>7}"
660
+ f" β”‚ {'nn_pv':>6} {'nn_lr':>6} {'nn_sl':>6} {'nn_ss':>6}")
661
+ print(f" {'─'*105}")
662
 
663
  for k in ranks:
664
  if k >= N:
665
  continue
666
  q = procrustes_alignment_quality(N=N, k=k)
667
+
668
+ sl_alpha = f"{q['slerp_alpha']:.1f}" if q['slerp_alpha'] >= 0 else " err"
669
+
670
+ print(f" {k:>5} {q['cos_full']:>7.4f} {q['cos_pinv']:>7.4f}"
671
+ f" {q['cos_lerp']:>7.4f} {q['lerp_alpha']:>3.1f}"
672
+ f" {q['cos_slerp']:>7.4f} {sl_alpha:>4}"
673
+ f" {q['cos_subspace']:>7.4f} {q['cos_stay_k']:>7.4f}"
674
+ f" β”‚ {q['nn_pinv']:>6.3f} {q['nn_lerp']:>6.3f}"
675
+ f" {q['nn_slerp']:>6.3f} {q['nn_subspace']:>6.3f}")
676
  all_results.append(q)
677
 
678
+ # Winner summary
679
+ print(f"\n {'═'*105}")
680
+ print(f" WINNER PER CONFIG (closest cos to full, highest NN agreement):")
681
+ print(f" {'═'*105}")
682
+ for q in all_results:
683
+ methods = {
684
+ 'pinv': q['cos_pinv'], 'lerp': q['cos_lerp'],
685
+ 'slerp': q['cos_slerp'], 'subspace': q['cos_subspace'],
686
+ }
687
+ best_method = max(methods, key=methods.get)
688
+ best_cos = methods[best_method]
689
+ gap = q['cos_full'] - best_cos
690
+ nn_methods = {
691
+ 'pinv': q['nn_pinv'], 'lerp': q['nn_lerp'],
692
+ 'slerp': q['nn_slerp'], 'subspace': q['nn_subspace'],
693
+ }
694
+ best_nn_method = max(nn_methods, key=nn_methods.get)
695
+ print(f" N={q['N']:>3} k={q['k']:>3}: best_cos={best_method:>8} ({best_cos:.4f}, gap={gap:.4f})"
696
+ f" best_nn={best_nn_method:>8} ({nn_methods[best_nn_method]:.3f})")
697
 
698
  return all_results
699