Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- check_vocab.py +94 -0
- chr1_dna.txt +3 -0
- chr1_phyloP_norm.npy +3 -0
- consensus_string.py +35 -0
- extract.py +286 -0
- extract_log.txt +0 -0
- run_job.sh +21 -0
- run_train.py +274 -0
- save/sparse_ae_50bp_epoch1.pt +3 -0
- save/sparse_ae_50bp_epoch2.pt +3 -0
- save/sparse_ae_50bp_epoch3.pt +3 -0
- save/sparse_ae_50bp_epoch4.pt +3 -0
- save/sparse_ae_50bp_epoch5.pt +3 -0
- sparse_ae_50bp_epoch1.pt +3 -0
- sparse_ae_50bp_epoch2.pt +3 -0
- sparse_ae_50bp_epoch3.pt +3 -0
- sparse_ae_50bp_epoch4.pt +3 -0
- sparse_ae_50bp_epoch5.pt +3 -0
- summarize_tokens.py +62 -0
- token1248_phy.npy +3 -0
- token1248_pwm.npy +3 -0
- token1312_phy.npy +3 -0
- token1312_pwm.npy +3 -0
- token138_phy.npy +3 -0
- token138_pwm.npy +3 -0
- token1417_phy.npy +3 -0
- token1417_pwm.npy +3 -0
- token1448_phy.npy +3 -0
- token1448_pwm.npy +3 -0
- token1487_phy.npy +3 -0
- token1487_pwm.npy +3 -0
- token1494_phy.npy +3 -0
- token1494_pwm.npy +3 -0
- token1503_phy.npy +3 -0
- token1503_pwm.npy +3 -0
- token1721_phy.npy +3 -0
- token1721_pwm.npy +3 -0
- token175_phy.npy +3 -0
- token175_pwm.npy +3 -0
- token1831_phy.npy +3 -0
- token1831_pwm.npy +3 -0
- token192_phy.npy +3 -0
- token192_pwm.npy +3 -0
- token296_phy.npy +3 -0
- token296_pwm.npy +3 -0
- token363_phy.npy +3 -0
- token363_pwm.npy +3 -0
- token468_phy.npy +3 -0
- token468_pwm.npy +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
chr1_dna.txt filter=lfs diff=lfs merge=lfs -text
|
check_vocab.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
# BUG FIX: `L` was previously defined only further down this script (in the
# "Setup" section), *after* it is used here in `INPUT_DIM = L * 5`, which
# raises NameError the moment this file is imported/run. Define the window
# length before its first use; the later `L = 50` assignment becomes a
# harmless re-assignment of the same value.
L = 50              # window length in bp (must match training)
INPUT_DIM = L * 5   # 4 one-hot DNA channels + 1 phyloP channel per bp
LATENT_DIM = 2048   # sparse latent size = token vocabulary size
HIDDEN_DIM = 1024


class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp windows of (one-hot DNA, phyloP).

    Architecture must match run_train.py exactly so saved state_dicts load.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder: input -> hidden -> latent; latent ReLU keeps codes
        # non-negative so an L1 penalty drives sparsity during training.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared trunk
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads: per-bp base logits and per-bp phyloP regression
        self.dec_dna = nn.Linear(hidden_dim, L * 4)
        self.dec_phy = nn.Linear(hidden_dim, L * 1)

    def forward(self, dna, phy):
        """Encode/decode one batch.

        Args:
            dna: (B, L, 4) one-hot DNA (all-zero rows mark 'N' bases).
            phy: (B, L) phyloP values, assumed normalized to [-1, 1].

        Returns:
            recon_dna: (B, L, 4) raw base logits.
            recon_phy: (B, L) tanh-bounded phyloP reconstruction.
            h: (B, LATENT_DIM) non-negative sparse latent code.
        """
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4)
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L)

        return recon_dna, recon_phy, h
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Setup
L = 50  # window length in bp; must match the value used at training time
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SparseAE().to(device)

# Load the final checkpoint
# NOTE(review): loads epoch 3 although epoch-5 checkpoints exist on disk —
# confirm this is the intended "final" checkpoint.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()
print("Model loaded.")

# --- GENERATE FAKE DATA (Or load real if you prefer) ---
print("Generating test data...")
# Create random DNA (approximate genomic distribution)
N_SAMPLES = 10000
probs = torch.tensor([0.25, 0.25, 0.25, 0.25])  # A, C, G, T
test_dna_idx = torch.multinomial(probs, N_SAMPLES * L, replacement=True).view(N_SAMPLES, L)
test_dna = F.one_hot(test_dna_idx, num_classes=4).float().to(device)
# Random stand-in for phyloP. NOTE(review): the real track is normalized to
# [-1, 1] while randn is unbounded — confirm that is acceptable for this check.
test_phy = torch.randn(N_SAMPLES, L).to(device)

# --- RUN INFERENCE ---
print("Running inference...")
with torch.no_grad():
    # Run model to get latent 'h'
    # Note: If you used Top-K in training, ensure you use it here too.
    # If you used standard L1/KL, just get 'h' from encoder.
    B = test_dna.size(0)
    # Flatten and concatenate exactly as SparseAE.forward does.
    x = torch.cat([test_dna.reshape(B, -1), test_phy.reshape(B, -1)], dim=1)
    h = model.encoder(x)


# --- ANALYZE VOCABULARY ---
h_np = h.cpu().numpy()

# 1. How often is each token used? (Frequency)
# We count a neuron as "firing" if it > 0.1
neuron_firing_counts = np.sum(h_np > 0.1, axis=0)  # Shape (2048,)

# 2. Sort them (descending; kept for interactive inspection)
sorted_counts = np.sort(neuron_firing_counts)[::-1]

print("\n--- VOCABULARY HEALTH CHECK ---")
print(f"Total Neurons: 2048")
print(f"Dead Neurons (Never fire): {np.sum(neuron_firing_counts == 0)}")
print(f"Rare Neurons (Fire < 10 times): {np.sum(neuron_firing_counts < 10)}")
print(f"Common Neurons (Fire > 1000 times): {np.sum(neuron_firing_counts > 1000)}")
|
chr1_dna.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74c534c06853bf3f631c627e6b026bd2a29cade1926c19eda6fa03e462f86f02
|
| 3 |
+
size 248932432
|
chr1_phyloP_norm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36f76db186a5031c277f406f4df0bf24fae1d5db15b715daa0303b6cbcebeb06
|
| 3 |
+
size 995729856
|
consensus_string.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import glob

# Index -> nucleotide letter for argmax decoding.
idx_to_base = ['A', 'C', 'G', 'T']

# Minimum per-position probability for a confident consensus call.
# BUG FIX: the original compared against 0.25 while its own comment said
# "e.g., < 0.4". With 4 bases the max probability in a column is always
# >= 0.25, so the low-confidence "." branch was effectively unreachable
# (a fully uniform column still printed a letter). Use the documented 0.4.
CONS_THRESHOLD = 0.4


def pwm_to_consensus(pwm, threshold=CONS_THRESHOLD):
    """Return the consensus string for a PWM of shape (L, 4).

    Each row is [prob_A, prob_C, prob_G, prob_T]. Positions whose maximum
    probability is below `threshold` are rendered as '.' (background).
    """
    consensus = []
    for row in pwm:
        max_idx = int(np.argmax(row))
        if row[max_idx] < threshold:
            consensus.append(".")  # low confidence / near-uniform column
        else:
            consensus.append(idx_to_base[max_idx])
    return "".join(consensus)


# Find the per-token PWM files produced by extract.py.
pwm_files = sorted(glob.glob("token*_pwm.npy"))

print(f"{'Token ID':<10} | {'Consensus Sequence (50bp)':<55}")
print("-" * 70)

for pwm_file in pwm_files:
    # Token ID is embedded in the filename: token<ID>_pwm.npy
    tid = pwm_file.split("token")[1].split("_")[0]

    # Load Matrix (50, 4)
    pwm = np.load(pwm_file)

    print(f"{tid:<10} | {pwm_to_consensus(pwm)}")
|
extract.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader, RandomSampler
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# =====================
# 1. SETUP & DATA LOADING
# =====================
print("Loading sequence / phyloP data...")

# Absolute paths to the chromosome-1 inputs: the raw base string and the
# pre-normalized phyloP conservation track (one value per bp).
dna_path = "/home/n5huang/dna_token/SparseAE/chr1_dna.txt"
phy_path = "/home/n5huang/dna_token/SparseAE/chr1_phyloP_norm.npy"

with open(dna_path) as f:
    sequence = f.read().strip()

phy_norm = np.load(phy_path)

# The two tracks must be positionally aligned, base for base.
assert len(sequence) == len(phy_norm), "DNA and phyloP length mismatch!"
chrom_len = len(sequence)
print(f"Chromosome 1 length: {chrom_len:,} bp")

# =====================
# 2. DNA ENCODING (HANDLE 'N')
# =====================
print("Encoding DNA to one-hot (with N handling)...")

mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Map bases to ints, using 4 as "N/unknown"
dna_int = np.fromiter((mapping.get(b, 4) for b in sequence), dtype=np.int8)
num_N = np.sum(dna_int == 4)
print(f"Number of N bases: {num_N:,}")

# One-hot with an extra row for N
# 0=A,1=C,2=G,3=T,4=[0,0,0,0,1]
temp_onehot = np.eye(5, dtype=np.float32)[dna_int]

# Slice to first 4 columns: N -> [0,0,0,0]
# (an all-zero row downstream therefore marks an unknown base)
dna_onehot = temp_onehot[:, :4]  # shape (chrom_len, 4)

# =====================
# 3. PHYLOP CHECK + COMBINE
# =====================
print("Preparing combined tensor...")

# Assume phy_norm is already in [-1,1]; warn if not.
max_abs_phy = np.max(np.abs(phy_norm))
if max_abs_phy > 1.1:
    print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; "
          f"data may not be normalized as expected.")

phy_norm = phy_norm.astype(np.float32)
phy_col = phy_norm.reshape(-1, 1)  # (chrom_len, 1)

# Channels 0-3: one-hot DNA; channel 4: phyloP.
combined_np = np.concatenate([dna_onehot, phy_col], axis=1)  # (chrom_len, 5)
combined_tensor = torch.from_numpy(combined_np)  # CPU tensor; shares memory with combined_np

print(f"Master tensor shape: {combined_tensor.shape}")
|
| 66 |
+
|
| 67 |
+
# =====================
# 4. DATASET: CHUNKED WINDOWING
# =====================
L = 50

class ChunkedChr1Dataset(Dataset):
    """Sliding-window view over the (chrom_len, 5) combined track.

    Item i is the 50bp window starting at genomic position i, split into
    its one-hot DNA part, its phyloP track, and the start index itself.
    """

    def __init__(self, combined, L=50):
        self.combined = combined
        self.L = L
        # The last window must fit entirely, so there are chrom_len - L
        # valid start positions.
        self.N = combined.shape[0] - L

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        stop = idx + self.L
        window = self.combined[idx:stop]          # (L, 5) view
        return window[:, :4], window[:, 4], idx   # dna (L, 4), phy (L,), start
|
| 87 |
+
|
| 88 |
+
dataset = ChunkedChr1Dataset(combined_tensor, L=L)
print(f"Dataset length (#windows): {len(dataset):,}")

# =====================
# 5. DATALOADER (SEQUENTIAL, FULL COVERAGE)
# =====================
BATCH_SIZE = 1024

# BUG FIX: extraction must visit EVERY window exactly once, in genomic
# order. The previous version attached RandomSampler(replacement=True,
# num_samples=5_000_000), which (a) covered only a random subset of the
# ~250M windows and (b) yielded them in random order — so token ids filled
# downstream did not correspond to genome positions, despite the explicit
# "shuffle=False for mapping back to genome" intent. A plain sequential
# DataLoader guarantees full, ordered coverage.
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,    # <--- MUST BE FALSE for mapping back to genome
    drop_last=False,  # <--- Process every last bit
    num_workers=0,    # safer on large dataset
    pin_memory=True   # faster host->device copies
)

print("DataLoader ready.")
|
| 114 |
+
|
| 115 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
INPUT_DIM = L * 5  # 4 DNA + 1 phyloP
LATENT_DIM = 2048  # latent code size = token vocabulary size
HIDDEN_DIM = 1024

class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp (one-hot DNA, phyloP) windows.

    Must match the architecture in run_train.py so the saved state_dict
    loads without key/shape mismatches.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, L * 4)  # per-bp base logits
        self.dec_phy = nn.Linear(hidden_dim, L * 1)  # per-bp phyloP regression

    def forward(self, dna, phy):
        """dna: (B, L, 4), phy: (B, L) -> (recon_dna, recon_phy, latent h)."""
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4) raw logits
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L), bounded to [-1, 1]

        return recon_dna, recon_phy, h
|
| 156 |
+
|
| 157 |
+
########################################
# 4. LOAD CHECKPOINT
########################################

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = SparseAE().to(device)
# NOTE(review): loads epoch 3 even though epoch-5 checkpoints exist on
# disk — confirm which checkpoint the extraction should use.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()  # inference mode (no dropout/batchnorm in this model, but good hygiene)
print("Model loaded.")
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
########################################
# 5. TOKEN EXTRACTION
########################################

print("Extracting tokens...")

# One token id per valid window start position on the chromosome.
all_token_ids = np.zeros(len(dataset), dtype=np.int32)
#h_values = np.zeros((len(dataset), LATENT_DIM), dtype=np.float32)

with torch.no_grad():
    for dna_batch, phy_batch, idx_batch in tqdm(loader):
        dna_batch = dna_batch.to(device).float()
        phy_batch = phy_batch.to(device).float()

        _, _, h = model(dna_batch, phy_batch)

        # argmax over the latent code = the window's discrete token id
        token_ids = h.argmax(dim=1).cpu().numpy().astype(np.int32)

        # BUG FIX: write each result at its window's true genomic start
        # index. The original wrote sequentially at a running offset, which
        # only maps tokens to genome positions if the loader happens to be
        # strictly sequential; scattering by idx_batch (which the dataset
        # already returns) is correct for ANY sampler order.
        all_token_ids[idx_batch.numpy()] = token_ids
        #h_values[idx_batch.numpy()] = h.cpu().numpy()

print("Token extraction complete.")
|
| 197 |
+
|
| 198 |
+
np.save("token_ids.npy", all_token_ids)
#np.save("latent_h.npy", h_values)


# Histogram of token usage across the whole chromosome.
hist = np.bincount(all_token_ids, minlength=LATENT_DIM)
np.save("token_hist.npy", hist)

print("Top tokens:")
# BUG FIX: np.argsort sorts ascending, so the previous
# `np.argsort(hist)[:20]` printed the 20 LEAST used tokens under a
# "Top tokens" heading. Reverse to get the 20 MOST frequent tokens
# (this matches the line that was left commented out).
top_tokens = np.argsort(hist)[::-1][:20]

for t in top_tokens:
    print(f"Token {t}: count={hist[t]}")
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
########################################
# 6. MOTIF SUMMARY FOR TOP TOKENS
########################################

print("\nBuilding PWM + average PhyloP for top tokens...")

# PERF FIX: the original accumulated window statistics with a pure-Python
# loop over every one of the ~250M genomic windows (self-described as
# "15-30 mins"). Instead, loop over the 50 window OFFSETS and scatter-add
# at C speed with np.add.at: for offset j, every window with token t
# contributes combined_np[start + j] to that token's position-j stats.
limit = len(all_token_ids) - L
tokens = all_token_ids[:limit]

# float64 accumulators avoid precision loss over hundreds of millions of adds.
pwm_acc = np.zeros((LATENT_DIM, L, 4), dtype=np.float64)
phy_acc = np.zeros((LATENT_DIM, L), dtype=np.float64)

print("Accumulating statistics (vectorized over window offsets)...")
for j in tqdm(range(L)):
    strip = combined_np[j : j + limit]           # track shifted by offset j
    np.add.at(pwm_acc[:, j, :], tokens, strip[:, :4])  # one-hot base counts
    np.add.at(phy_acc[:, j], tokens, strip[:, 4])      # phyloP sums

window_counts = np.bincount(tokens, minlength=LATENT_DIM)

# Preserve the dict-based interface consumed by sections 6A/6B below.
pwm_sum = {t: pwm_acc[t].astype(np.float32) for t in range(LATENT_DIM)}
phy_sum = {t: phy_acc[t].astype(np.float32) for t in range(LATENT_DIM)}
counts = {t: int(window_counts[t]) for t in range(LATENT_DIM)}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
########################################
# 6A. Save per-token PWMs & phylo profiles
########################################

print("Saving profiles...")

for t in range(LATENT_DIM):
    if counts[t] == 0:
        # Token never used anywhere on the chromosome: nothing to save.
        continue

    # Normalize running sums into a position weight matrix and a mean
    # per-position conservation profile.
    pwm = pwm_sum[t] / counts[t]
    avg_phy = phy_sum[t] / counts[t]

    np.save(f"token{t}_pwm.npy", pwm)
    np.save(f"token{t}_phy.npy", avg_phy)


########################################
# 6B. Rank tokens by PhyloP and rarity
########################################

avg_phylop_per_token = np.zeros(LATENT_DIM)
count_per_token = np.zeros(LATENT_DIM)

for t in range(LATENT_DIM):
    if counts[t] > 0:
        # Mean over the 50 positions of the token's average phyloP profile.
        avg_phylop_per_token[t] = (phy_sum[t] / counts[t]).mean()
        count_per_token[t] = counts[t]
    else:
        # Sentinel so unused tokens sort to the bottom of the ranking.
        avg_phylop_per_token[t] = -999
        count_per_token[t] = 0

# Rank by PhyloP (high to low)
tokens_by_phylop = np.argsort(avg_phylop_per_token)[::-1]
top_phy_tokens = tokens_by_phylop[:20]

# Rank by rarity (low to high)
# NOTE(review): tokens with count 0 dominate this list; if the intent is
# "rarest token actually used", filter counts > 0 first.
rare_tokens = np.argsort(count_per_token)[:20]

print("Top 20 conserved tokens:", top_phy_tokens)
print("Top 20 rarest tokens:", rare_tokens)



print("\n=== Extraction Completed Successfully ===")
|
| 286 |
+
|
extract_log.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
run_job.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch wrapper: activates the conda env, pins one GPU, and runs the
# extraction (or training) script from its own directory.

# --- 0. Activate Conda Environment ---
echo "Activating Conda environment..."
# Source conda's shell hooks so `conda activate` works in a non-interactive shell.
source /home/n5huang/miniconda3/etc/profile.d/conda.sh
conda activate dnabert_v2

# --- 1. Configuration ---
# Set the GPU ID (makes only physical GPU 6 visible to the process)
export CUDA_VISIBLE_DEVICES=6
echo "Assigned GPU: $CUDA_VISIBLE_DEVICES"

# --- 2. Correct Working Directory ---
# Move to the folder where the script actually lives to ensure relative paths work (if any)
cd /home/n5huang/dna_token/SparseAE/

# --- 3. Run the Python Script ---
# We use absolute path just to be safe
# -u ensures logs are written immediately (unbuffered stdout/stderr)
echo "Starting Python training..."
python -u extract.py #run_train.py
|
run_train.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader, RandomSampler
|
| 6 |
+
|
| 7 |
+
# =====================
# 1. SETUP & DATA LOADING
# =====================
print("Loading sequence / phyloP data...")

# Absolute paths to the chromosome-1 inputs: the raw base string and the
# pre-normalized phyloP conservation track (one value per bp).
dna_path = "/home/n5huang/dna_token/SparseAE/chr1_dna.txt"
phy_path = "/home/n5huang/dna_token/SparseAE/chr1_phyloP_norm.npy"

with open(dna_path) as f:
    sequence = f.read().strip()

phy_norm = np.load(phy_path)

# The two tracks must be positionally aligned, base for base.
assert len(sequence) == len(phy_norm), "DNA and phyloP length mismatch!"
chrom_len = len(sequence)
print(f"Chromosome 1 length: {chrom_len:,} bp")

# =====================
# 2. DNA ENCODING (HANDLE 'N')
# =====================
print("Encoding DNA to one-hot (with N handling)...")

mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Map bases to ints, using 4 as "N/unknown"
dna_int = np.fromiter((mapping.get(b, 4) for b in sequence), dtype=np.int8)
num_N = np.sum(dna_int == 4)
print(f"Number of N bases: {num_N:,}")

# One-hot with an extra row for N
# 0=A,1=C,2=G,3=T,4=[0,0,0,0,1]
temp_onehot = np.eye(5, dtype=np.float32)[dna_int]

# Slice to first 4 columns: N -> [0,0,0,0]
# (the training loop masks these all-zero rows out of both losses)
dna_onehot = temp_onehot[:, :4]  # shape (chrom_len, 4)

# =====================
# 3. PHYLOP CHECK + COMBINE
# =====================
print("Preparing combined tensor...")

# Assume phy_norm is already in [-1,1]; warn if not.
max_abs_phy = np.max(np.abs(phy_norm))
if max_abs_phy > 1.1:
    print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; "
          f"data may not be normalized as expected.")

phy_norm = phy_norm.astype(np.float32)
phy_col = phy_norm.reshape(-1, 1)  # (chrom_len, 1)

# Channels 0-3: one-hot DNA; channel 4: phyloP.
combined_np = np.concatenate([dna_onehot, phy_col], axis=1)  # (chrom_len, 5)
combined_tensor = torch.from_numpy(combined_np)  # CPU tensor; shares memory with combined_np

print(f"Master tensor shape: {combined_tensor.shape}")
|
| 61 |
+
|
| 62 |
+
# =====================
# 4. DATASET: CHUNKED WINDOWING
# =====================
L = 50

class ChunkedChr1Dataset(Dataset):
    """Sliding-window dataset over the (chrom_len, 5) combined track.

    Item i is the 50bp window starting at position i, split into its
    one-hot DNA channels and its phyloP channel for training.
    """

    def __init__(self, combined, L=50):
        self.combined = combined
        self.L = L
        # A window must fit entirely, leaving chrom_len - L valid starts.
        self.N = combined.shape[0] - L

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        end = idx + self.L
        window = self.combined[idx:end]     # (L, 5) view
        return window[:, :4], window[:, 4]  # dna (L, 4), phy (L,)
|
| 82 |
+
|
| 83 |
+
dataset = ChunkedChr1Dataset(combined_tensor, L=L)
print(f"Dataset length (#windows): {len(dataset):,}")

# =====================
# 5. DATALOADER WITH RANDOM SAMPLER
# =====================
BATCH_SIZE = 1024
SAMPLES_PER_EPOCH = 5_000_000  # number of windows per epoch (tunable)

# Sample windows uniformly WITH replacement: a training "epoch" is 5M
# random windows out of the ~250M possible, not a full pass over the genome.
sampler = RandomSampler(
    dataset,
    replacement=True,
    num_samples=SAMPLES_PER_EPOCH
)

loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    drop_last=True,   # every batch exactly BATCH_SIZE (simplifies stats)
    num_workers=0,    # safer on large dataset
    pin_memory=True   # faster host->GPU copies
)

print("DataLoader ready.")
|
| 108 |
+
|
| 109 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
INPUT_DIM = L * 5  # 4 DNA + 1 phyloP
LATENT_DIM = 2048  # latent code size = token vocabulary size
HIDDEN_DIM = 1024

class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp (one-hot DNA, phyloP) windows.

    The latent vector h is trained to be sparse (L1 + KL penalties in the
    training loop below); downstream scripts treat argmax(h) as a token id.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, L * 4)  # per-bp base logits
        self.dec_phy = nn.Linear(hidden_dim, L * 1)  # per-bp phyloP regression

    def forward(self, dna, phy):
        """dna: (B, L, 4), phy: (B, L) -> (recon_dna, recon_phy, latent h)."""
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4) raw logits
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L), bounded to [-1, 1]

        return recon_dna, recon_phy, h
|
| 150 |
+
|
| 151 |
+
# =====================
# 7. TRAINING LOOP
# =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on device: {device}")

model = SparseAE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

#lambda_l1 = 0.01 # slightly stronger sparsity
# L1 weight is annealed linearly from start to end over the epochs
# (computed per-epoch inside the loop below).
lambda_l1_start = 0.02
lambda_l1_end = 0.005
phy_weight = 10.0   # phyloP MSE upweighted relative to DNA cross-entropy
num_epochs = 5
PRINT_EVERY = 1000  # batches
# KL sparsity weight warm-up, one entry per epoch (clamped to last entry).
beta_kl_schedule = [0.0, 0.01, 0.02, 0.05, 0.1]  # per epoch



print("Starting training...")
|
| 171 |
+
|
| 172 |
+
for epoch in range(num_epochs):
    # One pass over the loader: masked reconstruction of DNA (CE) and phyloP
    # (MSE), plus KL + L1 sparsity penalties on the latent code h.
    model.train()
    total_loss = 0.0
    total_dna = 0.0
    total_phy = 0.0
    total_active = 0.0

    batch_count = 0
    for dna_batch, phy_batch in loader:
        batch_count += 1

        dna_batch = dna_batch.to(device, non_blocking=True).float()  # (B, L, 4)
        phy_batch = phy_batch.to(device, non_blocking=True).float()  # (B, L)

        optimizer.zero_grad()

        recon_dna, recon_phy, h = model(dna_batch, phy_batch)

        # Mask positions that are 'N' (all-zero one-hot)
        mask = dna_batch.sum(dim=-1) > 0  # (B, L), True where valid base
        n_valid = mask.sum()              # hoisted: shared by both masked losses

        # --- DNA loss (masked CE) ---
        true_dna_cls = dna_batch.argmax(dim=-1)  # (B, L)
        dna_logits = recon_dna.permute(0, 2, 1)  # (B, 4, L)
        loss_dna_raw = F.cross_entropy(dna_logits, true_dna_cls, reduction='none')  # (B, L)

        if n_valid > 0:
            loss_dna = (loss_dna_raw * mask).sum() / n_valid
        else:
            loss_dna = torch.tensor(0.0, device=device)

        # --- PhyloP loss (masked MSE) ---
        loss_phy_raw = F.mse_loss(recon_phy, phy_batch, reduction='none')  # (B, L)

        if n_valid > 0:
            loss_phy = (loss_phy_raw * mask).sum() / n_valid
        else:
            loss_phy = torch.tensor(0.0, device=device)

        # --- KL sparsity penalty ---
        # Bernoulli KL(rho || rho_hat) per latent unit, where rho_hat is the
        # mean activation of that unit over the batch.
        rho = 0.02  # target sparsity
        eps = 1e-12
        rho_hat = torch.mean(h, dim=0)
        rho_hat = torch.clamp(rho_hat, min=1e-6, max=1 - 1e-6)
        kl_per_unit = (
            rho * torch.log((rho + eps) / (rho_hat + eps)) +
            (1 - rho) * torch.log(((1 - rho) + eps) / ((1 - rho_hat) + eps))
        )
        beta_kl = beta_kl_schedule[min(epoch, len(beta_kl_schedule) - 1)]
        loss_kl = beta_kl * kl_per_unit.sum()

        # Linear ramp of the L1 weight from lambda_l1_start to lambda_l1_end.
        # max(..., 1) guards against ZeroDivisionError when num_epochs == 1.
        lambda_l1 = (
            lambda_l1_start
            + (lambda_l1_end - lambda_l1_start) * (epoch / max(num_epochs - 1, 1))
        )

        # --- L1 sparsity on latent ---
        loss_l1 = lambda_l1 * torch.mean(torch.abs(h))

        # Total loss
        loss = loss_dna + phy_weight * loss_phy + loss_l1 + loss_kl

        loss.backward()
        optimizer.step()

        # Logging accumulators (sample-weighted so epoch averages are per-sample)
        B = dna_batch.size(0)
        total_loss += loss.item() * B
        total_dna += loss_dna.item() * B
        total_phy += loss_phy.item() * B

        # approximate number of active neurons (h > threshold)
        active_count = (h > 0.01).float().sum(dim=1).mean().item()
        total_active += active_count * B

        if batch_count % PRINT_EVERY == 0:
            print(
                f"Epoch {epoch+1} | Batch {batch_count} | "
                f"Loss={loss.item():.4f} | DNA_CE={loss_dna.item():.4f} | "
                f"Phy_MSE={loss_phy.item():.5f} | Active={active_count:.1f}"
            )

    # Epoch summary
    N = SAMPLES_PER_EPOCH  # effective number of samples this epoch
    avg_loss = total_loss / N
    avg_dna = total_dna / N
    avg_phy = total_phy / N
    avg_active = total_active / N

    print(f"\n=== Epoch {epoch+1}/{num_epochs} COMPLETE ===")
    print(
        f"Avg Loss={avg_loss:.4f} | Avg DNA_CE={avg_dna:.4f} | "
        f"Avg Phy_MSE={avg_phy:.5f} | "
        f"Avg Active Neurons={avg_active:.1f} / {LATENT_DIM} "
        f"({100.0 * avg_active / LATENT_DIM:.1f}%)"
    )

    # Save checkpoint
    ckpt_path = f"sparse_ae_50bp_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}\n")
|
save/sparse_ae_50bp_epoch1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:783330441dbcb8954e41b73c14c9f2c6b5b7d4391b2a34f497efbaffea185061
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f951d20a5e142004926a72c9fcf5eb930b60fecd6c1016bc937f9590ce1acf4f
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8b3568994c3c777b37bea3b781c8d79e8c8883c9fbceee001f6247d7e21def0
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1625cf96716fac10180bff026b4a22bb3141c9d7599ffac03963b908e9310d3
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aed9a40b48ddc84c91d5fee7ed1917922a39138c6de011655dc9789d6d94939
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0524bbaf5d3890d983e6a5a0950700caa1edb079b4ba8d351f10aedda4e4cbfe
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49e44d486cf65d22baef24d6d35c480e2b0d1ac54216ee62089be54acf28cee8
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28ea09c9c217e70bce16a914aacda6d8a864f95e8497eb690e02b7c84e6a036
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b188cb2204f93983c0942d697f8f04736db3bf56012b9324e507e4d83a4673d
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cc638b7535deeb9c4c10bb9a6c47d746ff09d2aa51cf223b2d262ec468a018a
|
| 3 |
+
size 18845511
|
summarize_tokens.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import glob
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
LOG_FILE = "extract_log.txt"

# 1. Parse N hits from log file
token_counts = {}
pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)")

with open(LOG_FILE, "r") as f:
    for raw_line in f:
        hit = pattern.search(raw_line)
        if hit:
            # Record "Saved token <id> (N=<count>)" occurrences.
            token_counts[int(hit.group(1))] = int(hit.group(2))

print(f"Found {len(token_counts)} tokens with counts from log.")

# 2. For each token, load PWM + phyloP, compute entropy + avg phyloP
rows = []
|
| 24 |
+
|
| 25 |
+
def pwm_entropy(pwm, eps=1e-8):
    """Mean per-position Shannon entropy of a PWM, in bits.

    pwm: (L, 4) array of mean one-hot probabilities.
    Returns the entropy averaged over the L positions.
    """
    # Re-normalize each row; eps keeps all-zero rows from dividing by zero.
    row_totals = pwm.sum(axis=1, keepdims=True)
    probs = pwm / (row_totals + eps)
    per_position = -(probs * np.log2(probs + eps)).sum(axis=1)  # (L,)
    return per_position.mean()
|
| 33 |
+
|
| 34 |
+
for pwm_file in glob.glob("token*_pwm.npy"):
    # token ID from filename
    match = re.search(r"token(\d+)_pwm\.npy", pwm_file)
    if match is None:
        continue
    token_id = int(match.group(1))

    pwm = np.load(pwm_file)                    # (L, 4)
    phy = np.load(f"token{token_id}_phy.npy")  # (L,)

    rows.append({
        "token_id": token_id,
        "N_hits": token_counts.get(token_id, None),
        "pwm_entropy_bits": pwm_entropy(pwm),
        "avg_phyloP": float(phy.mean()),
    })

df = pd.DataFrame(rows)
# Sharpest motifs (low entropy) first; ties broken by high conservation.
df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False])

print(df.head(20))

df.to_csv("token_summary.tsv", sep="\t", index=False)
print("\nSaved summary to token_summary.tsv")
|
| 62 |
+
|
token1248_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0ff04bef228fbb4034063112451e0ff3f61ea5010085c896dde823c80995944
|
| 3 |
+
size 328
|
token1248_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48ecfc793f3988fb8ac3f44fceeb7e19c78e65c153d9f8f3d8ef84ba95901fce
|
| 3 |
+
size 928
|
token1312_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e7f7df3737fb75cf04dba3b9dace53c6288b5df1c8f1504e756795d16ec587e
|
| 3 |
+
size 328
|
token1312_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:636a911b43f7f1117d8f1350966a3832b21f4ec02891bd96386cc0e3f297bae9
|
| 3 |
+
size 928
|
token138_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94821977d4c17a4f60989babf4c9404e650160ce56ef0f109e3ac078e4292aaa
|
| 3 |
+
size 328
|
token138_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00274837ae1e7e223ce799a649edb1ca7e01998780ec0b836e731427e1739ef6
|
| 3 |
+
size 928
|
token1417_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c51f148b5df3cf4bce2fde39242187600d0d8fe47d10094b2dccfd4c3715870
|
| 3 |
+
size 328
|
token1417_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a294d6621fc0b33bedbf4580ecf3c50f5ae779e15a0c60365015f4a81b9801bd
|
| 3 |
+
size 928
|
token1448_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7818a614c3797dff04b89ae64493fc348c32d774a34c068cc6212d097e02cca8
|
| 3 |
+
size 328
|
token1448_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:328c466c07fa92f28d23a19accbf8a852f584116a234b9e62c96eec4d1243c0b
|
| 3 |
+
size 928
|
token1487_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e511783f52ad148ca2fb1eaef348fd19c844b9d8ceb9c9ffef07efeb48097960
|
| 3 |
+
size 328
|
token1487_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7859c6c5085e90a0b2fe26c5f63ce02dbb2f9e56e8e20a77ddf664630a4c6849
|
| 3 |
+
size 928
|
token1494_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc33acaa4223bb72be2190dc196840f870c6c5128ce06cb46c86b7aacc8011d1
|
| 3 |
+
size 328
|
token1494_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d05893b76c9a7d253d025f8a09f600ec8e6e78914793dd866ed5aae58028330
|
| 3 |
+
size 928
|
token1503_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb5354c0f0721175f73d3e5a76f30ca9aa4f76ef20d918ca9faf6b4040c3ce55
|
| 3 |
+
size 328
|
token1503_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c4ddd567056af4a22fe33e4783afb65e0037c2fda7778825a7b3efcffbce4c0
|
| 3 |
+
size 928
|
token1721_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e1b524b3870dd824a06a9113ecd1e08d1f78e133e39cf6700c6734a5d8e487f
|
| 3 |
+
size 328
|
token1721_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5df963ff0dc52052663c9122812da7708299e263f749a2accbb6297b42bfd78
|
| 3 |
+
size 928
|
token175_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d20851e9bed217d47d86f6ef752922ba756a17dcb5f1ba826523161827ea4af0
|
| 3 |
+
size 328
|
token175_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5ce7c68421a2db5fc5285296599945917c37be115a0eeda6ec9de965be25624
|
| 3 |
+
size 928
|
token1831_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68b1ef67ff02631fb4b32165652ef6d97c4edb3b1ca292f03f644098637b821a
|
| 3 |
+
size 328
|
token1831_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43dc041a1c64ea71b531c7ab1b1d4ea30bd77b015b6a5d3997dee5ddb00f5fea
|
| 3 |
+
size 928
|
token192_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98cff173bd0ec8c9d3f02d217a5bfa96ee0cca52d03f624dd29939eaee52a8df
|
| 3 |
+
size 328
|
token192_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24c99c84a3db007261a0ab245020b7db930dcc53e8e646e16a77a3bdc31bd259
|
| 3 |
+
size 928
|
token296_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd142b806afb9d3ecf36bc07eeb4ac05e69f48f64513620059023ad7ea59a5d0
|
| 3 |
+
size 328
|
token296_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d9722d77f41184dbbc8089b0c909f00121fb114d33b0b09f5fdacc419aa724d
|
| 3 |
+
size 928
|
token363_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66a992eb090aed56f742f8b84e2bc36eee9dba0849b5de454f65c67c8d433c83
|
| 3 |
+
size 328
|
token363_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd011451a85cc30d6f4aa5123fab4f6a9a401a74a84ae30859ac1e29262c026c
|
| 3 |
+
size 928
|
token468_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:471273cce914188e0b266f884091cd1b85c8f1c554943bd0d5db3a423c3b6cb5
|
| 3 |
+
size 328
|
token468_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84493201ce2efb598ceaf6eb6f86b25168b2a4da76611bee68897a677a732fa1
|
| 3 |
+
size 928
|