esunAI committed (verified)
Commit 37158e8 · 1 Parent(s): 016900a

Add generate_amps.py

Files changed (1)
  1. src/generate_amps.py +389 -0
src/generate_amps.py ADDED
@@ -0,0 +1,389 @@
+ import torch
+ import torch.nn.functional as F
+ import numpy as np
+ from tqdm import tqdm
+ import os
+ from datetime import datetime
+ # Import torchdiffeq for proper ODE solving
+ try:
+     from torchdiffeq import odeint
+     TORCHDIFFEQ_AVAILABLE = True
+     print("✓ torchdiffeq available for proper ODE solving")
+ except ImportError:
+     TORCHDIFFEQ_AVAILABLE = False
+     print("⚠️ torchdiffeq not available, using manual Euler integration")
+ 
+ # Import project components
+ from compressor_with_embeddings import Compressor, Decompressor
+ from final_flow_model import AMPFlowMatcherCFGConcat, AMPProtFlowPipelineCFG
+ 
+ class AMPGenerator:
+     """
+     Generate AMP samples using a trained ProtFlow model.
+     """
+ 
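+     # Minimal usage sketch (hedged; the checkpoint path is this script's own
+     # default from main(), not verified here):
+     #   generator = AMPGenerator('/data2/edwardsun/flow_checkpoints/amp_flow_model_best_optimized.pth')
+     #   embeddings = generator.generate_amps(num_samples=32, cfg_scale=7.5)
+ 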
+     def __init__(self, model_path, device='cuda'):
+         self.device = device
+ 
+         # Load models
+         self._load_models(model_path)
+ 
+         # Load preprocessing statistics
+         self.stats = torch.load('normalization_stats.pt', map_location=device)
+ 
+     def _load_models(self, model_path):
+         """Load trained models."""
+         print("Loading trained models...")
+ 
+         # Load compressor and decompressor
+         self.compressor = Compressor().to(self.device)
+         self.decompressor = Decompressor().to(self.device)
+ 
+         self.compressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_compressor_model.pth', map_location=self.device))
+         self.decompressor.load_state_dict(torch.load('/data2/edwardsun/flow_amp/models/final_decompressor_model.pth', map_location=self.device))
+ 
+         # Load flow matching model with CFG
+         self.flow_model = AMPFlowMatcherCFGConcat(
+             hidden_dim=480,
+             compressed_dim=80,  # 1280 // 16
+             n_layers=12,
+             n_heads=16,
+             dim_ff=3072,
+             max_seq_len=25,
+             use_cfg=True
+         ).to(self.device)
+ 
+         checkpoint = torch.load(model_path, map_location=self.device, weights_only=False)
+ 
+         # Handle PyTorch compilation wrapper (torch.compile prefixes keys with '_orig_mod.')
+         state_dict = checkpoint['flow_model_state_dict']
+         new_state_dict = {}
+ 
+         for key, value in state_dict.items():
+             # Remove _orig_mod prefix if present
+             if key.startswith('_orig_mod.'):
+                 new_key = key[10:]  # Remove '_orig_mod.' prefix
+             else:
+                 new_key = key
+             new_state_dict[new_key] = value
+ 
+         self.flow_model.load_state_dict(new_state_dict)
+ 
+         print(f"✓ All models loaded successfully from step {checkpoint['step']}!")
+         print(f" Loss at checkpoint: {checkpoint['loss']:.6f}")
+ 
+         # Report ODE solving capabilities
+         if TORCHDIFFEQ_AVAILABLE:
+             print("✓ Enhanced with proper ODE solving (torchdiffeq)")
+         else:
+             print("⚠️ Using fallback Euler integration")
+ 
+     def _create_ode_func(self, cfg_scale=7.5):
+         """Create ODE function for torchdiffeq integration."""
+ 
+         def ode_func(t, x):
+             """
+             ODE function: dx/dt = v_theta(x, t)
+ 
+             Args:
+                 t: scalar time (0-dim tensor supplied by the solver)
+                 x: state tensor [B*L*D] (flattened)
+             Returns:
+                 dx/dt: derivative [B*L*D] (flattened)
+             """
+             # Reshape x back to [B, L, D]
+             batch_size, seq_len, dim = self.current_shape
+             x = x.view(batch_size, seq_len, dim)
+ 
+             # Create time tensor for batch (t arrives as a 0-dim tensor)
+             t_tensor = torch.full((batch_size,), float(t), device=self.device, dtype=x.dtype)
+ 
+             # Compute vector field with CFG
+             if cfg_scale > 0:
+                 # With AMP condition
+                 amp_labels = torch.full((batch_size,), 0, device=self.device)  # 0 = AMP
+                 vt_cond = self.flow_model(x, t_tensor, labels=amp_labels)
+ 
+                 # Without condition (mask)
+                 mask_labels = torch.full((batch_size,), 2, device=self.device)  # 2 = Mask
+                 vt_uncond = self.flow_model(x, t_tensor, labels=mask_labels)
+ 
+                 # CFG interpolation: v = v_uncond + scale * (v_cond - v_uncond)
+                 vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
+             else:
+                 # No CFG, use mask label
+                 mask_labels = torch.full((batch_size,), 2, device=self.device)
+                 vt = self.flow_model(x, t_tensor, labels=mask_labels)
+ 
+             # Return flattened derivative
+             return vt.view(-1)
+ 
+         return ode_func
+ 
+     def generate_amps(self, num_samples=100, num_steps=25, batch_size=32, cfg_scale=7.5,
+                       ode_method='dopri5', rtol=1e-5, atol=1e-6):
+         """
+         Generate AMP samples using flow matching with CFG and improved ODE solving.
+ 
+         Args:
+             num_samples: Number of AMP samples to generate
+             num_steps: Number of integration steps (25 for good quality, 1 for reflow)
+             batch_size: Batch size for generation
+             cfg_scale: CFG guidance scale (higher = stronger conditioning)
+             ode_method: ODE solver method ('dopri5', 'rk4', 'euler', 'adaptive_heun')
+             rtol: Relative tolerance for adaptive solvers
+             atol: Absolute tolerance for adaptive solvers
+         """
+         method_str = f"{ode_method} ODE solver" if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler' else "manual Euler integration"
+         print(f"Generating {num_samples} AMP samples with {method_str} (CFG scale: {cfg_scale})...")
+         if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler':
+             print(f" Method: {ode_method}, rtol={rtol}, atol={atol}")
+ 
+         self.flow_model.eval()
+         self.compressor.eval()
+         self.decompressor.eval()
+ 
+         all_generated = []
+ 
+         with torch.no_grad():
+             for i in tqdm(range(0, num_samples, batch_size), desc="Generating with improved ODE"):
+                 current_batch = min(batch_size, num_samples - i)
+ 
+                 # Sample random noise (starting point at t=1)
+                 eps = torch.randn(current_batch, 25, 80, device=self.device)  # [B, L', COMP_DIM]
+ 
+                 # Choose ODE solving method
+                 if TORCHDIFFEQ_AVAILABLE and ode_method != 'euler':
+                     # Use proper ODE solver
+                     try:
+                         # Store shape for ODE function
+                         self.current_shape = eps.shape
+ 
+                         # Create ODE function
+                         ode_func = self._create_ode_func(cfg_scale=cfg_scale)
+ 
+                         # Time span: from t=1 (noise) to t=0 (data)
+                         t_span = torch.tensor([1.0, 0.0], device=self.device, dtype=eps.dtype)
+ 
+                         # Flatten initial condition for torchdiffeq
+                         y0 = eps.view(-1)
+ 
+                         # Solve ODE with the requested solver
+                         if ode_method in ['dopri5', 'adaptive_heun']:
+                             # Adaptive solvers
+                             solution = odeint(
+                                 ode_func, y0, t_span,
+                                 method=ode_method,
+                                 rtol=rtol,
+                                 atol=atol,
+                                 options={'max_num_steps': 1000}
+                             )
+                         else:
+                             # Fixed-step solvers: honor the requested num_steps
+                             solution = odeint(
+                                 ode_func, y0, t_span,
+                                 method=ode_method,
+                                 options={'step_size': 1.0 / num_steps}
+                             )
+ 
+                         # Get final solution (at t=0)
+                         xt = solution[-1].view(self.current_shape)
+ 
+                     except Exception as e:
+                         print(f"⚠️ ODE solving failed for batch {i//batch_size + 1}: {e}")
+                         print("Falling back to Euler method...")
+                         # Fall through to Euler method
+                         xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)
+                 else:
+                     # Use manual Euler integration (original method)
+                     xt = self._generate_with_euler(eps, current_batch, cfg_scale, num_steps)
+ 
+                 # Decompress to get embeddings
+                 decompressed = self.decompressor(xt)  # [B, L, ESM_DIM]
+ 
+                 # Apply reverse preprocessing
+                 m, s, mn, mx = self.stats['mean'], self.stats['std'], self.stats['min'], self.stats['max']
+                 decompressed = decompressed * (mx - mn + 1e-8) + mn
+                 decompressed = decompressed * s + m
+ 
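+                 # Assumption: forward preprocessing standardized (mean/std) first
+                 # and min-max scaled second, so the two inverses are applied in
+                 # reverse order above.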
+                 all_generated.append(decompressed.cpu())
+ 
+         # Concatenate all batches
+         generated_embeddings = torch.cat(all_generated, dim=0)
+ 
+         print(f"✓ Generated {generated_embeddings.shape[0]} AMP embeddings")
+         print(f" Shape: {generated_embeddings.shape}")
+         print(f" Stats - Mean: {generated_embeddings.mean():.4f}, Std: {generated_embeddings.std():.4f}")
+ 
+         return generated_embeddings
+ 
+     def _generate_with_euler(self, eps, current_batch, cfg_scale, num_steps):
+         """Fallback Euler integration method (original implementation)."""
+         xt = eps.clone()
+         amp_labels = torch.full((current_batch,), 0, device=self.device)  # 0 = AMP
+         mask_labels = torch.full((current_batch,), 2, device=self.device)  # 2 = Mask
+ 
+         for step in range(num_steps):
+             t = torch.ones(current_batch, device=self.device) * (1.0 - step / num_steps)
+ 
+             # CFG: generate with condition and without condition
+             if cfg_scale > 0:
+                 # With AMP condition
+                 vt_cond = self.flow_model(xt, t, labels=amp_labels)
+ 
+                 # Without condition (mask)
+                 vt_uncond = self.flow_model(xt, t, labels=mask_labels)
+ 
+                 # CFG interpolation
+                 vt = vt_uncond + cfg_scale * (vt_cond - vt_uncond)
+             else:
+                 # No CFG, use mask label
+                 vt = self.flow_model(xt, t, labels=mask_labels)
+ 
+             # Euler step for backward integration (t: 1 -> 0)
+             dt = -1.0 / num_steps
+             xt = xt + vt * dt
+ 
+         return xt
+ 
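+     # The Euler fallback above applies x_{k+1} = x_k + v_theta(x_k, t_k) * dt
+     # with dt = -1/num_steps, stepping t from 1 down to 0 in uniform steps.
+ 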
+     def compare_ode_methods(self, num_samples=20, cfg_scale=7.5):
+         """
+         Compare different ODE solving methods for quality assessment.
+         """
+         if not TORCHDIFFEQ_AVAILABLE:
+             print("⚠️ torchdiffeq not available, cannot compare ODE methods")
+             return self.generate_amps(num_samples=num_samples, cfg_scale=cfg_scale)
+ 
+         methods = ['euler', 'rk4', 'dopri5', 'adaptive_heun']
+         results = {}
+ 
+         print("🔬 Comparing ODE solving methods...")
+ 
+         for method in methods:
+             print(f"\n--- Testing {method} ---")
+             try:
+                 start_time = torch.cuda.Event(enable_timing=True)
+                 end_time = torch.cuda.Event(enable_timing=True)
+ 
+                 start_time.record()
+                 embeddings = self.generate_amps(
+                     num_samples=num_samples,
+                     batch_size=10,
+                     cfg_scale=cfg_scale,
+                     ode_method=method
+                 )
+                 end_time.record()
+ 
+                 torch.cuda.synchronize()
+                 elapsed_time = start_time.elapsed_time(end_time) / 1000.0  # Convert to seconds
+ 
+                 results[method] = {
+                     'embeddings': embeddings,
+                     'time': elapsed_time,
+                     'mean': embeddings.mean().item(),
+                     'std': embeddings.std().item(),
+                     'success': True
+                 }
+ 
+                 print(f"✓ {method}: {elapsed_time:.2f}s, mean={embeddings.mean():.4f}, std={embeddings.std():.4f}")
+ 
+             except Exception as e:
+                 print(f"❌ {method} failed: {e}")
+                 results[method] = {'success': False, 'error': str(e)}
+ 
+         return results
+ 
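+     # Note: the CUDA-event timing above assumes a GPU; on CPU, a pair of
+     # time.perf_counter() calls would be a drop-in replacement.
+ 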
+     def generate_with_reflow(self, num_samples=100):
+         """
+         Generate AMP samples using 1-step reflow (if you have a reflow model).
+         """
+         print(f"Generating {num_samples} AMP samples with 1-step reflow...")
+ 
+         # This would use the reflow implementation.
+         # For now, approximate with a single Euler step of the base model
+         # (the adaptive default would ignore num_steps).
+         return self.generate_amps(num_samples=num_samples, num_steps=1, batch_size=32, ode_method='euler')
+ 
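+     # Background: reflow (rectified flow) retrains on (noise, sample) pairs to
+     # straighten ODE trajectories, which is what makes true 1-step generation
+     # possible; the method above only approximates it with one Euler step.
+ 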
+ def main():
+     """Main generation function."""
+     print("=== AMP Generation Pipeline with CFG ===")
+ 
+     # Use the best model from training (lowest validation loss: 0.017183)
+     model_path = '/data2/edwardsun/flow_checkpoints/amp_flow_model_best_optimized.pth'
+ 
+     # Check if checkpoint exists
+     try:
+         checkpoint = torch.load(model_path, map_location='cpu', weights_only=False)
+         print(f"✓ Found best model at step {checkpoint['step']} with loss {checkpoint['loss']:.6f}")
+         print(f" Global step: {checkpoint['global_step']}")
+         print(f" Total samples: {checkpoint['total_samples']:,}")
+     except Exception as e:
+         print(f"❌ Best model not found: {model_path} ({e})")
+         print("Please train the flow matching model first using amp_flow_training.py")
+         return
+ 
+     # Initialize generator
+     generator = AMPGenerator(model_path, device='cuda')
+ 
+     # Compare ODE methods if torchdiffeq is available
+     if TORCHDIFFEQ_AVAILABLE:
+         print("\n🔬 Comparing ODE solving methods...")
+         comparison_results = generator.compare_ode_methods(num_samples=10, cfg_scale=7.5)
+ 
+         # Use best method for generation
+         best_method = 'dopri5'  # Recommended method
+         print(f"\n🚀 Using {best_method} for main generation...")
+     else:
+         best_method = 'euler'
+         print("\n⚠️ Using fallback Euler integration...")
+ 
+     # Generate samples with different CFG scales using improved ODE solving
+     print("\n1. Generating with CFG scale 0.0 (no conditioning)...")
+     samples_no_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=0.0, ode_method=best_method)
+ 
+     print("\n2. Generating with CFG scale 3.0 (weak conditioning)...")
+     samples_weak_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=3.0, ode_method=best_method)
+ 
+     print("\n3. Generating with CFG scale 7.5 (strong conditioning)...")
+     samples_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=7.5, ode_method=best_method)
+ 
+     print("\n4. Generating with CFG scale 15.0 (very strong conditioning)...")
+     samples_very_strong_cfg = generator.generate_amps(num_samples=20, num_steps=25, cfg_scale=15.0, ode_method=best_method)
+ 
+     # Create output directory if it doesn't exist
+     output_dir = '/data2/edwardsun/generated_samples'
+     os.makedirs(output_dir, exist_ok=True)
+ 
+     # Get today's date for filenames
+     today = datetime.now().strftime('%Y%m%d')
+ 
+     # Save generated samples with date
+     torch.save(samples_no_cfg, os.path.join(output_dir, f'generated_amps_best_model_no_cfg_{today}.pt'))
+     torch.save(samples_weak_cfg, os.path.join(output_dir, f'generated_amps_best_model_weak_cfg_{today}.pt'))
+     torch.save(samples_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_strong_cfg_{today}.pt'))
+     torch.save(samples_very_strong_cfg, os.path.join(output_dir, f'generated_amps_best_model_very_strong_cfg_{today}.pt'))
+ 
+     print("\n✓ Generation complete!")
+     print(f"Generated samples saved (Date: {today}):")
+     print(f" - generated_amps_best_model_no_cfg_{today}.pt (no conditioning)")
+     print(f" - generated_amps_best_model_weak_cfg_{today}.pt (weak CFG)")
+     print(f" - generated_amps_best_model_strong_cfg_{today}.pt (strong CFG)")
+     print(f" - generated_amps_best_model_very_strong_cfg_{today}.pt (very strong CFG)")
+ 
+     print("\nCFG Analysis:")
+     print(" - CFG scale 0.0: No conditioning, generates diverse sequences")
+     print(" - CFG scale 3.0: Weak AMP conditioning")
+     print(" - CFG scale 7.5: Strong AMP conditioning (recommended)")
+     print(" - CFG scale 15.0: Very strong AMP conditioning (may be too restrictive)")
+ 
+     print("\nNext steps:")
+     print("1. Decode embeddings back to sequences using ESM-2 decoder")
+     print("2. Evaluate with ProtFlow metrics (FPD, MMD, ESM-2 perplexity)")
+     print("3. Compare sequences generated with different CFG scales")
+     print("4. Evaluate AMP properties (antimicrobial activity, toxicity)")
+     if TORCHDIFFEQ_AVAILABLE:
+         print(f"5. ✓ Enhanced generation with {best_method} ODE solver")
+     else:
+         print("5. Install torchdiffeq for improved ODE solving: pip install torchdiffeq")
+ 
+ 
+ if __name__ == "__main__":
+     main()