diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..ace7146a491525e7efc3d69767e6a4ffcbcca4e3 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +chr1_dna.txt filter=lfs diff=lfs merge=lfs -text diff --git a/check_vocab.py b/check_vocab.py new file mode 100644 index 0000000000000000000000000000000000000000..21efc29acb636cb28f1c88f02adf2bc3afba5886 --- /dev/null +++ b/check_vocab.py @@ -0,0 +1,94 @@ +import torch +import torch.nn as nn +import numpy as np +import matplotlib.pyplot as plt +import torch.nn.functional as F + +# ===================== +# 6. MODEL: SPARSE AUTOENCODER +# ===================== +L = 50 # window length (must match training) +INPUT_DIM = L * 5 # 4 DNA + 1 phyloP +LATENT_DIM = 2048 +HIDDEN_DIM = 1024 + +class SparseAE(nn.Module): + def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM): + super().__init__() + + # Encoder + self.encoder = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, latent_dim), + nn.ReLU() # ReLU helps sparsity with L1 + ) + + # Decoder shared + self.dec_hidden = nn.Linear(latent_dim, hidden_dim) + + # Decoder heads + self.dec_dna = nn.Linear(hidden_dim, L * 4) + self.dec_phy = nn.Linear(hidden_dim, L * 1) + + def forward(self, dna, phy): + B = dna.size(0) + + x = torch.cat( + [dna.reshape(B, -1), phy.reshape(B, -1)], + dim=1 + ) # (B, INPUT_DIM) + + h = self.encoder(x) + dec = F.relu(self.dec_hidden(h)) + + recon_dna = self.dec_dna(dec).reshape(B, L, 4) # (B, L, 4) + recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L) # (B, L) + + return recon_dna, recon_phy, h + + +# Setup +device = "cuda" if torch.cuda.is_available() else "cpu" +model = SparseAE().to(device) + +# Load the epoch-3 checkpoint +model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device)) +model.eval() +print("Model loaded.") + +# --- GENERATE FAKE DATA (Or load real if you prefer) --- +print("Generating test data...") +# Create random DNA (uniform base distribution) +N_SAMPLES = 10000 +probs = torch.tensor([0.25, 0.25, 0.25, 0.25]) # A, C, G, T +test_dna_idx = torch.multinomial(probs, N_SAMPLES * L, replacement=True).view(N_SAMPLES, L) +test_dna = F.one_hot(test_dna_idx, num_classes=4).float().to(device) +test_phy = torch.randn(N_SAMPLES, L).to(device) + +# --- RUN INFERENCE --- +print("Running inference...") +with torch.no_grad(): + # Run model to get latent 'h' + # Note: If you used Top-K in training, ensure you use it here too. + # If you used standard L1/KL, just get 'h' from encoder. + B = test_dna.size(0) + x = torch.cat([test_dna.reshape(B, -1), test_phy.reshape(B, -1)], dim=1) + h = model.encoder(x) + + +# --- ANALYZE VOCABULARY --- +h_np = h.cpu().numpy() + +# 1. How often is each token used? (Frequency) +# We count a neuron as "firing" if its activation > 0.1 +neuron_firing_counts = np.sum(h_np > 0.1, axis=0) # Shape (2048,) + +# 2. 
Sort them sorted_counts = np.sort(neuron_firing_counts)[::-1] + +print("\n--- VOCABULARY HEALTH CHECK ---") +print(f"Total Neurons: 2048") +print(f"Dead Neurons (Never fire): {np.sum(neuron_firing_counts == 0)}") +print(f"Rare Neurons (Fire < 10 times): {np.sum(neuron_firing_counts < 10)}") +print(f"Common Neurons (Fire > 1000 times): {np.sum(neuron_firing_counts > 1000)}") diff --git a/chr1_dna.txt b/chr1_dna.txt new file mode 100644 index 0000000000000000000000000000000000000000..57119842b296195b73d0edc7a4f314eed4a85847 --- /dev/null +++ b/chr1_dna.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c534c06853bf3f631c627e6b026bd2a29cade1926c19eda6fa03e462f86f02 +size 248932432 diff --git a/chr1_phyloP_norm.npy b/chr1_phyloP_norm.npy new file mode 100644 index 0000000000000000000000000000000000000000..159d492666ed2c5213072d7ebaf796155b53eacf --- /dev/null +++ b/chr1_phyloP_norm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36f76db186a5031c277f406f4df0bf24fae1d5db15b715daa0303b6cbcebeb06 +size 995729856 diff --git a/consensus_string.py b/consensus_string.py new file mode 100644 index 0000000000000000000000000000000000000000..51b384d267bf84d65f5903dfd80fe0b89caa08e2 --- /dev/null +++ b/consensus_string.py @@ -0,0 +1,35 @@ +import numpy as np +import glob + +# Helper to map index to letter +idx_to_base = ['A', 'C', 'G', 'T'] + +# Find your files +pwm_files = glob.glob("token*_pwm.npy") +pwm_files.sort() + +print(f"{'Token ID':<10} | {'Consensus Sequence (50bp)':<55}") +print("-" * 70) + +for pwm_file in pwm_files: + # Get ID + tid = pwm_file.split("token")[1].split("_")[0] + + # Load Matrix (50, 4) + pwm = np.load(pwm_file) + + # Generate Consensus String + consensus = [] + for row in pwm: + # row is [prob_A, prob_C, prob_G, prob_T] + max_idx = np.argmax(row) + max_val = row[max_idx] + + # If the max probability is low (< 0.4), treat the position as noise/background + if max_val < 0.4: + consensus.append(".") # Low confidence + else: + consensus.append(idx_to_base[max_idx]) + + seq_str = "".join(consensus) + print(f"{tid:<10} | {seq_str}") diff --git a/extract.py b/extract.py new file mode 100644 index 0000000000000000000000000000000000000000..693783e113b806f8146c587564e77744123dffd2 --- /dev/null +++ b/extract.py @@ -0,0 +1,286 @@ +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import Dataset, DataLoader +from collections import defaultdict +from tqdm import tqdm + + + + +# ===================== +# 1. SETUP & DATA LOADING +# ===================== +print("Loading sequence / phyloP data...") + +dna_path = "/home/n5huang/dna_token/SparseAE/chr1_dna.txt" +phy_path = "/home/n5huang/dna_token/SparseAE/chr1_phyloP_norm.npy" + +with open(dna_path) as f: + sequence = f.read().strip() + +phy_norm = np.load(phy_path) + +assert len(sequence) == len(phy_norm), "DNA and phyloP length mismatch!" +chrom_len = len(sequence) +print(f"Chromosome 1 length: {chrom_len:,} bp") + +# ===================== +# 2. 
DNA ENCODING (HANDLE 'N') +# ===================== +print("Encoding DNA to one-hot (with N handling)...") + +mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3} + +# Map bases to ints, using 4 as "N/unknown" +dna_int = np.fromiter((mapping.get(b, 4) for b in sequence), dtype=np.int8) +num_N = np.sum(dna_int == 4) +print(f"Number of N bases: {num_N:,}") + +# One-hot with an extra row for N +# 0=A,1=C,2=G,3=T,4=[0,0,0,0,1] +temp_onehot = np.eye(5, dtype=np.float32)[dna_int] + +# Slice to first 4 columns: N -> [0,0,0,0] +dna_onehot = temp_onehot[:, :4] # shape (chrom_len, 4) + +# ===================== +# 3. PHYLOP CHECK + COMBINE +# ===================== +print("Preparing combined tensor...") + +# Assume phy_norm is already in [-1,1]; warn if not. +max_abs_phy = np.max(np.abs(phy_norm)) +if max_abs_phy > 1.1: + print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; " + f"data may not be normalized as expected.") + +phy_norm = phy_norm.astype(np.float32) +phy_col = phy_norm.reshape(-1, 1) # (chrom_len, 1) + +combined_np = np.concatenate([dna_onehot, phy_col], axis=1) # (chrom_len, 5) +combined_tensor = torch.from_numpy(combined_np) # CPU tensor + +print(f"Master tensor shape: {combined_tensor.shape}") + +# ===================== +# 4. DATASET: CHUNKED WINDOWING +# ===================== +L = 50 + +class ChunkedChr1Dataset(Dataset): + def __init__(self, combined, L=50): + self.combined = combined + self.L = L + self.N = combined.shape[0] - L # number of valid start positions + + def __len__(self): + return self.N + + def __getitem__(self, idx): + # window: (L, 5) + window = self.combined[idx : idx + self.L] + dna = window[:, :4] # (L, 4) + phy = window[:, 4] # (L,) + return dna, phy, idx + +dataset = ChunkedChr1Dataset(combined_tensor, L=L) +print(f"Dataset length (#windows): {len(dataset):,}") + +# ===================== +# 5. DATALOADER (SEQUENTIAL FULL PASS) +# ===================== +BATCH_SIZE = 1024 + +# Extraction needs every window exactly once, in genomic order, so that the +# running offset in the token loop equals the window start position. +# (The RandomSampler used for training would scramble that mapping.) +loader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + shuffle=False, # <--- MUST BE FALSE for mapping back to genome + drop_last=False, # <--- Process every last bit + num_workers=0, # safer on large dataset + pin_memory=True +) + +print("DataLoader ready.") + +# ===================== +# 6. MODEL: SPARSE AUTOENCODER +# ===================== +INPUT_DIM = L * 5 # 4 DNA + 1 phyloP +LATENT_DIM = 2048 +HIDDEN_DIM = 1024 + +class SparseAE(nn.Module): + def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM): + super().__init__() + + # Encoder + self.encoder = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, latent_dim), + nn.ReLU() # ReLU helps sparsity with L1 + ) + + # Decoder shared + self.dec_hidden = nn.Linear(latent_dim, hidden_dim) + + # Decoder heads + self.dec_dna = nn.Linear(hidden_dim, L * 4) + self.dec_phy = nn.Linear(hidden_dim, L * 1) + + def forward(self, dna, phy): + B = dna.size(0) + + x = torch.cat( + [dna.reshape(B, -1), phy.reshape(B, -1)], + dim=1 + ) # (B, INPUT_DIM) + + h = self.encoder(x) + dec = F.relu(self.dec_hidden(h)) + + recon_dna = self.dec_dna(dec).reshape(B, L, 4) # (B, L, 4) + recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L) # (B, L) + + return recon_dna, recon_phy, h + +######################################## +# 4. 
LOAD CHECKPOINT +######################################## + +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Using device: {device}") + +model = SparseAE().to(device) +model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device)) +model.eval() +print("Model loaded.") + + +######################################## +# 5. TOKEN EXTRACTION +######################################## + +print("Extracting tokens...") + +all_token_ids = np.zeros(len(dataset), dtype=np.int32) +#h_values = np.zeros((len(dataset), LATENT_DIM), dtype=np.float32) + +with torch.no_grad(): + # Sequential loader: batches arrive in genomic order, so 'offset' is the window start index. + offset = 0 + for dna_batch, phy_batch, idx_batch in tqdm(loader): + dna_batch = dna_batch.to(device).float() + phy_batch = phy_batch.to(device).float() + + _, _, h = model(dna_batch, phy_batch) + h_cpu = h.cpu().numpy() + + # argmax token + token_ids = np.argmax(h_cpu, axis=1) + + all_token_ids[offset : offset + len(token_ids)] = token_ids + #h_values[offset : offset + len(token_ids)] = h_cpu + + offset += len(token_ids) + +print("Token extraction complete.") + +np.save("token_ids.npy", all_token_ids) +#np.save("latent_h.npy", h_values) + + + +# Histogram +hist = np.bincount(all_token_ids, minlength=LATENT_DIM) +np.save("token_hist.npy", hist) + +print("Top tokens:") +top_tokens = np.argsort(hist)[::-1][:20] # most frequent first + +for t in top_tokens: + print(f"Token {t}: count={hist[t]}") + + +######################################## +# 6. MOTIF SUMMARY PER TOKEN +######################################## + +print("\nBuilding PWM + average PhyloP for all tokens...") +# Initialize accumulators for ALL tokens +pwm_sum = {t: np.zeros((L, 4), dtype=np.float32) for t in range(LATENT_DIM)} +phy_sum = {t: np.zeros(L, dtype=np.float32) for t in range(LATENT_DIM)} +counts = {t: 0 for t in range(LATENT_DIM)} + +print("Accumulating statistics (this may take 15-30 mins)...") + +limit = len(all_token_ids) - L + +for i in tqdm(range(limit)): + t = all_token_ids[i] + + # Always accumulate + window = combined_np[i : i+L] + + pwm_sum[t] += window[:, :4] + phy_sum[t] += window[:, 4] + counts[t] += 1 + + + +######################################## +# 6A. Save per-token PWMs & phylo profiles +######################################## + +print("Saving profiles...") + +for t in range(LATENT_DIM): + if counts[t] == 0: + continue + + pwm = pwm_sum[t] / counts[t] + avg_phy = phy_sum[t] / counts[t] + + np.save(f"token{t}_pwm.npy", pwm) + np.save(f"token{t}_phy.npy", avg_phy) + + +######################################## +# 6B. 
Rank tokens by PhyloP and rarity +######################################## + +avg_phylop_per_token = np.zeros(LATENT_DIM) +count_per_token = np.zeros(LATENT_DIM) + +for t in range(LATENT_DIM): + if counts[t] > 0: + avg_phylop_per_token[t] = (phy_sum[t] / counts[t]).mean() + count_per_token[t] = counts[t] + else: + avg_phylop_per_token[t] = -999 + count_per_token[t] = 0 + +# Rank by PhyloP (high to low) +tokens_by_phylop = np.argsort(avg_phylop_per_token)[::-1] +top_phy_tokens = tokens_by_phylop[:20] + +# Rank by rarity (low to high) +rare_tokens = np.argsort(count_per_token)[:20] + +print("Top 20 conserved tokens:", top_phy_tokens) +print("Top 20 rarest tokens:", rare_tokens) + + + +print("\n=== Extraction Completed Successfully ===") + diff --git a/extract_log.txt b/extract_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..7c703f8bff64ee5cfe0e60935a9907dc3a08b8df --- /dev/null +++ b/extract_log.txt @@ -0,0 +1,42 @@ +nohup: ignoring input +Activating Conda environment... +Assigned GPU: 6 +Starting Python training... +Loading sequence / phyloP data... +Chromosome 1 length: 248,932,432 bp +Encoding DNA to one-hot (with N handling)... +Number of N bases: 18,455,410 +Preparing combined tensor... +Master tensor shape: torch.Size([248932432, 5]) +Dataset length (#windows): 248,932,382 +DataLoader ready. +Using device: cuda +Model loaded. +Extracting tokens... + 0%| | 0/4883 [00:00 [0,0,0,0] +dna_onehot = temp_onehot[:, :4] # shape (chrom_len, 4) + +# ===================== +# 3. PHYLOP CHECK + COMBINE +# ===================== +print("Preparing combined tensor...") + +# Assume phy_norm is already in [-1,1]; warn if not. +max_abs_phy = np.max(np.abs(phy_norm)) +if max_abs_phy > 1.1: + print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; " + f"data may not be normalized as expected.") + +phy_norm = phy_norm.astype(np.float32) +phy_col = phy_norm.reshape(-1, 1) # (chrom_len, 1) + +combined_np = np.concatenate([dna_onehot, phy_col], axis=1) # (chrom_len, 5) +combined_tensor = torch.from_numpy(combined_np) # CPU tensor + +print(f"Master tensor shape: {combined_tensor.shape}") + +# ===================== +# 4. DATASET: CHUNKED WINDOWING +# ===================== +L = 50 + +class ChunkedChr1Dataset(Dataset): + def __init__(self, combined, L=50): + self.combined = combined + self.L = L + self.N = combined.shape[0] - L # number of valid start positions + + def __len__(self): + return self.N + + def __getitem__(self, idx): + # window: (L, 5) + window = self.combined[idx : idx + self.L] + dna = window[:, :4] # (L, 4) + phy = window[:, 4] # (L,) + return dna, phy + +dataset = ChunkedChr1Dataset(combined_tensor, L=L) +print(f"Dataset length (#windows): {len(dataset):,}") + +# ===================== +# 5. DATALOADER WITH RANDOM SAMPLER +# ===================== +BATCH_SIZE = 1024 +SAMPLES_PER_EPOCH = 5_000_000 # number of windows per epoch (tunable) + +sampler = RandomSampler( + dataset, + replacement=True, + num_samples=SAMPLES_PER_EPOCH +) + +loader = DataLoader( + dataset, + batch_size=BATCH_SIZE, + sampler=sampler, + drop_last=True, + num_workers=0, # safer on large dataset + pin_memory=True +) + +print("DataLoader ready.") + +# ===================== +# 6. 
MODEL: SPARSE AUTOENCODER +# ===================== +INPUT_DIM = L * 5 # 4 DNA + 1 phyloP +LATENT_DIM = 2048 +HIDDEN_DIM = 1024 + +class SparseAE(nn.Module): + def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM): + super().__init__() + + # Encoder + self.encoder = nn.Sequential( + nn.Linear(input_dim, hidden_dim), + nn.ReLU(), + nn.Linear(hidden_dim, latent_dim), + nn.ReLU() # ReLU helps sparsity with L1 + ) + + # Decoder shared + self.dec_hidden = nn.Linear(latent_dim, hidden_dim) + + # Decoder heads + self.dec_dna = nn.Linear(hidden_dim, L * 4) + self.dec_phy = nn.Linear(hidden_dim, L * 1) + + def forward(self, dna, phy): + B = dna.size(0) + + x = torch.cat( + [dna.reshape(B, -1), phy.reshape(B, -1)], + dim=1 + ) # (B, INPUT_DIM) + + h = self.encoder(x) + dec = F.relu(self.dec_hidden(h)) + + recon_dna = self.dec_dna(dec).reshape(B, L, 4) # (B, L, 4) + recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L) # (B, L) + + return recon_dna, recon_phy, h + +# ===================== +# 7. TRAINING LOOP +# ===================== +device = "cuda" if torch.cuda.is_available() else "cpu" +print(f"Training on device: {device}") + +model = SparseAE().to(device) +optimizer = torch.optim.Adam(model.parameters(), lr=1e-4) + +#lambda_l1 = 0.01 # slightly stronger sparsity +lambda_l1_start = 0.02 +lambda_l1_end = 0.005 +phy_weight = 10.0 +num_epochs = 5 +PRINT_EVERY = 1000 # batches +beta_kl_schedule = [0.0, 0.01, 0.02, 0.05, 0.1] # per epoch + + + +print("Starting training...") + +for epoch in range(num_epochs): + model.train() + total_loss = 0.0 + total_dna = 0.0 + total_phy = 0.0 + total_active = 0.0 + + batch_count = 0 + for dna_batch, phy_batch in loader: + batch_count += 1 + + dna_batch = dna_batch.to(device, non_blocking=True).float() # (B, L, 4) + phy_batch = phy_batch.to(device, non_blocking=True).float() # (B, L) + + optimizer.zero_grad() + + recon_dna, recon_phy, h = model(dna_batch, phy_batch) + + # Mask positions that are 'N' (all-zero one-hot) + mask = dna_batch.sum(dim=-1) > 0 # (B, L), True where valid base + + # --- DNA loss (masked CE) --- + true_dna_cls = dna_batch.argmax(dim=-1) # (B, L) + dna_logits = recon_dna.permute(0, 2, 1) # (B, 4, L) + loss_dna_raw = F.cross_entropy(dna_logits, true_dna_cls, reduction='none') # (B, L) + + if mask.sum() > 0: + loss_dna = (loss_dna_raw * mask).sum() / mask.sum() + else: + loss_dna = torch.tensor(0.0, device=device) + + # --- PhyloP loss (masked MSE) --- + loss_phy_raw = F.mse_loss(recon_phy, phy_batch, reduction='none') # (B, L) + + if mask.sum() > 0: + loss_phy = (loss_phy_raw * mask).sum() / mask.sum() + else: + loss_phy = torch.tensor(0.0, device=device) + + # --- KL sparsity penalty --- + rho = 0.02 # target sparsity + eps = 1e-12 + rho_hat = torch.mean(h, dim=0) + rho_hat = torch.clamp(rho_hat, min=1e-6, max=1-1e-6) + kl_per_unit = ( + rho * torch.log((rho + eps) / (rho_hat + eps)) + + (1 - rho) * torch.log(((1 - rho) + eps) / ((1 - rho_hat) + eps)) + ) + beta_kl = beta_kl_schedule[min(epoch, len(beta_kl_schedule)-1)] + #loss_kl = 1 * kl_per_unit.sum() # β = 1 regularization weight + loss_kl = beta_kl * kl_per_unit.sum() + + + lambda_l1 = ( + lambda_l1_start + + (lambda_l1_end - lambda_l1_start) * (epoch / (num_epochs - 1)) + ) + + # --- L1 sparsity on latent --- + loss_l1 = lambda_l1 * torch.mean(torch.abs(h)) + + # Total loss + loss = loss_dna + phy_weight * loss_phy + loss_l1 + loss_kl + + loss.backward() + optimizer.step() + + # Logging accumulators + B = dna_batch.size(0) + total_loss += loss.item() * B + 
total_dna += loss_dna.item() * B + total_phy += loss_phy.item() * B + + # approximate number of active neurons (h > threshold) + active_count = (h > 0.01).float().sum(dim=1).mean().item() + total_active += active_count * B + + if batch_count % PRINT_EVERY == 0: + print( + f"Epoch {epoch+1} | Batch {batch_count} | " + f"Loss={loss.item():.4f} | DNA_CE={loss_dna.item():.4f} | " + f"Phy_MSE={loss_phy.item():.5f} | Active={active_count:.1f}" + ) + + # Epoch summary + N = SAMPLES_PER_EPOCH # effective number of samples this epoch + avg_loss = total_loss / N + avg_dna = total_dna / N + avg_phy = total_phy / N + avg_active = total_active / N + + print(f"\n=== Epoch {epoch+1}/{num_epochs} COMPLETE ===") + print( + f"Avg Loss={avg_loss:.4f} | Avg DNA_CE={avg_dna:.4f} | " + f"Avg Phy_MSE={avg_phy:.5f} | " + f"Avg Active Neurons={avg_active:.1f} / {LATENT_DIM} " + f"({100.0 * avg_active / LATENT_DIM:.1f}%)" + ) + + # Save checkpoint + ckpt_path = f"sparse_ae_50bp_epoch{epoch+1}.pt" + torch.save(model.state_dict(), ckpt_path) + print(f"Saved checkpoint to {ckpt_path}\n") diff --git a/save/sparse_ae_50bp_epoch1.pt b/save/sparse_ae_50bp_epoch1.pt new file mode 100644 index 0000000000000000000000000000000000000000..26fae7e13c476dd2033dbb1bc8b4da7a523884bc --- /dev/null +++ b/save/sparse_ae_50bp_epoch1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:783330441dbcb8954e41b73c14c9f2c6b5b7d4391b2a34f497efbaffea185061 +size 18845511 diff --git a/save/sparse_ae_50bp_epoch2.pt b/save/sparse_ae_50bp_epoch2.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed7bedb0245d3f4832a52b7ed99a33e0930a21bd --- /dev/null +++ b/save/sparse_ae_50bp_epoch2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f951d20a5e142004926a72c9fcf5eb930b60fecd6c1016bc937f9590ce1acf4f +size 18845511 diff --git a/save/sparse_ae_50bp_epoch3.pt b/save/sparse_ae_50bp_epoch3.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1ec2922be1bc0c218333cead7e77bf56006f0fc --- /dev/null +++ b/save/sparse_ae_50bp_epoch3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d8b3568994c3c777b37bea3b781c8d79e8c8883c9fbceee001f6247d7e21def0 +size 18845511 diff --git a/save/sparse_ae_50bp_epoch4.pt b/save/sparse_ae_50bp_epoch4.pt new file mode 100644 index 0000000000000000000000000000000000000000..93822464bf8a875ca9097aa54e04aef2a0c994ec --- /dev/null +++ b/save/sparse_ae_50bp_epoch4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1625cf96716fac10180bff026b4a22bb3141c9d7599ffac03963b908e9310d3 +size 18845511 diff --git a/save/sparse_ae_50bp_epoch5.pt b/save/sparse_ae_50bp_epoch5.pt new file mode 100644 index 0000000000000000000000000000000000000000..5af5f929fb2a21cc3bd63951941c93d4fb955209 --- /dev/null +++ b/save/sparse_ae_50bp_epoch5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9aed9a40b48ddc84c91d5fee7ed1917922a39138c6de011655dc9789d6d94939 +size 18845511 diff --git a/sparse_ae_50bp_epoch1.pt b/sparse_ae_50bp_epoch1.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6c36398444fc96bcbe2036fd427087a483d64be --- /dev/null +++ b/sparse_ae_50bp_epoch1.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0524bbaf5d3890d983e6a5a0950700caa1edb079b4ba8d351f10aedda4e4cbfe +size 18845511 diff --git a/sparse_ae_50bp_epoch2.pt b/sparse_ae_50bp_epoch2.pt new file mode 100644 index 
0000000000000000000000000000000000000000..35d276d6e7b3d6bb56d0ce6fdd71aa6a1e58ca22 --- /dev/null +++ b/sparse_ae_50bp_epoch2.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49e44d486cf65d22baef24d6d35c480e2b0d1ac54216ee62089be54acf28cee8 +size 18845511 diff --git a/sparse_ae_50bp_epoch3.pt b/sparse_ae_50bp_epoch3.pt new file mode 100644 index 0000000000000000000000000000000000000000..a060376378993f8f2c8173f87c636e336dc72f46 --- /dev/null +++ b/sparse_ae_50bp_epoch3.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28ea09c9c217e70bce16a914aacda6d8a864f95e8497eb690e02b7c84e6a036 +size 18845511 diff --git a/sparse_ae_50bp_epoch4.pt b/sparse_ae_50bp_epoch4.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8b0fe8df4842dc45f6b8884816aca0317a46db2 --- /dev/null +++ b/sparse_ae_50bp_epoch4.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b188cb2204f93983c0942d697f8f04736db3bf56012b9324e507e4d83a4673d +size 18845511 diff --git a/sparse_ae_50bp_epoch5.pt b/sparse_ae_50bp_epoch5.pt new file mode 100644 index 0000000000000000000000000000000000000000..40bcc5bc4c4ed032dbfe0e4d4c7e4bbce86093fc --- /dev/null +++ b/sparse_ae_50bp_epoch5.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cc638b7535deeb9c4c10bb9a6c47d746ff09d2aa51cf223b2d262ec468a018a +size 18845511 diff --git a/summarize_tokens.py b/summarize_tokens.py new file mode 100644 index 0000000000000000000000000000000000000000..0312560d2410e646b9de3e8476a274f3a57d0eeb --- /dev/null +++ b/summarize_tokens.py @@ -0,0 +1,62 @@ +import re +import glob +import numpy as np +import pandas as pd + +LOG_FILE = "extract_log.txt" + +# 1. Parse N hits from log file +token_counts = {} +pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)") + +with open(LOG_FILE, "r") as f: + for line in f: + m = pattern.search(line) + if m: + t = int(m.group(1)) + n = int(m.group(2)) + token_counts[t] = n + +print(f"Found {len(token_counts)} tokens with counts from log.") + +# 2. 
For each token, load PWM + phyloP, compute entropy + avg phyloP +rows = [] + +def pwm_entropy(pwm, eps=1e-8): + """ + pwm: (L, 4) array of mean one-hot probs + returns: mean Shannon entropy across positions, in bits + """ + p = pwm / (pwm.sum(axis=1, keepdims=True) + eps) # normalize safety + H = -np.sum(p * np.log2(p + eps), axis=1) # (L,) + return H.mean() + +for pwm_path in glob.glob("token*_pwm.npy"): + # token ID from filename + m = re.search(r"token(\d+)_pwm\.npy", pwm_path) + if not m: + continue + t = int(m.group(1)) + + pwm = np.load(pwm_path) # (L, 4) + phy = np.load(f"token{t}_phy.npy") # (L,) + + H = pwm_entropy(pwm) + avg_phy = float(phy.mean()) + N_hits = token_counts.get(t, None) + + rows.append({ + "token_id": t, + "N_hits": N_hits, + "pwm_entropy_bits": H, + "avg_phyloP": avg_phy + }) + +df = pd.DataFrame(rows) +df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False]) + +print(df.head(20)) + +df.to_csv("token_summary.tsv", sep="\t", index=False) +print("\nSaved summary to token_summary.tsv") + diff --git a/token1248_phy.npy b/token1248_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..9a3f55759c379fe63b1546522d9ef0f2332e52da --- /dev/null +++ b/token1248_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0ff04bef228fbb4034063112451e0ff3f61ea5010085c896dde823c80995944 +size 328 diff --git a/token1248_pwm.npy b/token1248_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..134c70f372defcfa57e689b080bebc17a90d9f6a --- /dev/null +++ b/token1248_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48ecfc793f3988fb8ac3f44fceeb7e19c78e65c153d9f8f3d8ef84ba95901fce +size 928 diff --git a/token1312_phy.npy b/token1312_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..17edbac10d299c41f2e6b4fb1eef572e4178b083 --- /dev/null +++ b/token1312_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e7f7df3737fb75cf04dba3b9dace53c6288b5df1c8f1504e756795d16ec587e +size 328 diff --git a/token1312_pwm.npy b/token1312_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..5fe0976f7f4867043e5787611b306b5fc184f66a --- /dev/null +++ b/token1312_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:636a911b43f7f1117d8f1350966a3832b21f4ec02891bd96386cc0e3f297bae9 +size 928 diff --git a/token138_phy.npy b/token138_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..cfa2b8125814f094b70fb199b6c40ec818b2b5f5 --- /dev/null +++ b/token138_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94821977d4c17a4f60989babf4c9404e650160ce56ef0f109e3ac078e4292aaa +size 328 diff --git a/token138_pwm.npy b/token138_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..1cb834cecdd1ed0f32a5cd21aff3d3d280cdb1be --- /dev/null +++ b/token138_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00274837ae1e7e223ce799a649edb1ca7e01998780ec0b836e731427e1739ef6 +size 928 diff --git a/token1417_phy.npy b/token1417_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..fb85b37a465e1471019ebf73b06b2f9a4687aa84 --- /dev/null +++ b/token1417_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c51f148b5df3cf4bce2fde39242187600d0d8fe47d10094b2dccfd4c3715870 +size 328 diff --git a/token1417_pwm.npy b/token1417_pwm.npy new file mode 100644 index 
0000000000000000000000000000000000000000..776600ac966fd9d53c3327fde1111da1e377c439 --- /dev/null +++ b/token1417_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a294d6621fc0b33bedbf4580ecf3c50f5ae779e15a0c60365015f4a81b9801bd +size 928 diff --git a/token1448_phy.npy b/token1448_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..4b36c1524d94cb7cf2dcb899ffa687b2fa97e9ac --- /dev/null +++ b/token1448_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7818a614c3797dff04b89ae64493fc348c32d774a34c068cc6212d097e02cca8 +size 328 diff --git a/token1448_pwm.npy b/token1448_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..b5d5bdb83394c43edeed774d8a67c09ab5e537a7 --- /dev/null +++ b/token1448_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:328c466c07fa92f28d23a19accbf8a852f584116a234b9e62c96eec4d1243c0b +size 928 diff --git a/token1487_phy.npy b/token1487_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..b3bdb31af12aeba73de2e7ad99b7861652f2dff8 --- /dev/null +++ b/token1487_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e511783f52ad148ca2fb1eaef348fd19c844b9d8ceb9c9ffef07efeb48097960 +size 328 diff --git a/token1487_pwm.npy b/token1487_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..949e2bf6fe0abdc5e6f6961a5b87df002ea5301b --- /dev/null +++ b/token1487_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7859c6c5085e90a0b2fe26c5f63ce02dbb2f9e56e8e20a77ddf664630a4c6849 +size 928 diff --git a/token1494_phy.npy b/token1494_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..c19f1af89858390080aa988c52d044378ddfb90f --- /dev/null +++ b/token1494_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc33acaa4223bb72be2190dc196840f870c6c5128ce06cb46c86b7aacc8011d1 +size 328 diff --git a/token1494_pwm.npy b/token1494_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..4d0fe814228381ffa574653465b237a3999470dc --- /dev/null +++ b/token1494_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d05893b76c9a7d253d025f8a09f600ec8e6e78914793dd866ed5aae58028330 +size 928 diff --git a/token1503_phy.npy b/token1503_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..c70b7499c41d4f124cee7c21aefcc931ab41ed11 --- /dev/null +++ b/token1503_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb5354c0f0721175f73d3e5a76f30ca9aa4f76ef20d918ca9faf6b4040c3ce55 +size 328 diff --git a/token1503_pwm.npy b/token1503_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..29b48c3e373c38a087cd976930b76ab14e08a3ba --- /dev/null +++ b/token1503_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c4ddd567056af4a22fe33e4783afb65e0037c2fda7778825a7b3efcffbce4c0 +size 928 diff --git a/token1721_phy.npy b/token1721_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..762cdd27355d8f01b77e31b2e9b631844ebbfbaf --- /dev/null +++ b/token1721_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e1b524b3870dd824a06a9113ecd1e08d1f78e133e39cf6700c6734a5d8e487f +size 328 diff --git a/token1721_pwm.npy b/token1721_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..ec6f3e43a1c331784850030c2104b24acc88372e --- 
/dev/null +++ b/token1721_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5df963ff0dc52052663c9122812da7708299e263f749a2accbb6297b42bfd78 +size 928 diff --git a/token175_phy.npy b/token175_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..9a5d1a502579f866546b6ad581a752471e77353e --- /dev/null +++ b/token175_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d20851e9bed217d47d86f6ef752922ba756a17dcb5f1ba826523161827ea4af0 +size 328 diff --git a/token175_pwm.npy b/token175_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..2c40c2391620ee6e20c3da1265d02515bc9450ad --- /dev/null +++ b/token175_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5ce7c68421a2db5fc5285296599945917c37be115a0eeda6ec9de965be25624 +size 928 diff --git a/token1831_phy.npy b/token1831_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..294e5ab47663f74e240ca09a55f51ddeab7163f0 --- /dev/null +++ b/token1831_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:68b1ef67ff02631fb4b32165652ef6d97c4edb3b1ca292f03f644098637b821a +size 328 diff --git a/token1831_pwm.npy b/token1831_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..324d1cc1a1757ed5edf26dff99c4d67b77d48f47 --- /dev/null +++ b/token1831_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43dc041a1c64ea71b531c7ab1b1d4ea30bd77b015b6a5d3997dee5ddb00f5fea +size 928 diff --git a/token192_phy.npy b/token192_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..d291fbfe538208939607ea22d0d3344af79d365a --- /dev/null +++ b/token192_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98cff173bd0ec8c9d3f02d217a5bfa96ee0cca52d03f624dd29939eaee52a8df +size 328 diff --git a/token192_pwm.npy b/token192_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..530e91d4f9825ddcbeea7f515191cde2d50f5d1b --- /dev/null +++ b/token192_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24c99c84a3db007261a0ab245020b7db930dcc53e8e646e16a77a3bdc31bd259 +size 928 diff --git a/token296_phy.npy b/token296_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..10f0df77c45967a72eb0f3dcb4ff43b5fcb10a41 --- /dev/null +++ b/token296_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd142b806afb9d3ecf36bc07eeb4ac05e69f48f64513620059023ad7ea59a5d0 +size 328 diff --git a/token296_pwm.npy b/token296_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..f5fe23047f1104c25db26d6d34a640947edc5c12 --- /dev/null +++ b/token296_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9722d77f41184dbbc8089b0c909f00121fb114d33b0b09f5fdacc419aa724d +size 928 diff --git a/token363_phy.npy b/token363_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..457da5c9d9bff080bb03af0d99cdfa2d04022c48 --- /dev/null +++ b/token363_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66a992eb090aed56f742f8b84e2bc36eee9dba0849b5de454f65c67c8d433c83 +size 328 diff --git a/token363_pwm.npy b/token363_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..8180aaf6e3bf5f457d5d3313aea29ac4da8dc9ec --- /dev/null +++ b/token363_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:dd011451a85cc30d6f4aa5123fab4f6a9a401a74a84ae30859ac1e29262c026c +size 928 diff --git a/token468_phy.npy b/token468_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..5c3b8ecd94c63bd718bfc87e33df07dc5c4aea30 --- /dev/null +++ b/token468_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:471273cce914188e0b266f884091cd1b85c8f1c554943bd0d5db3a423c3b6cb5 +size 328 diff --git a/token468_pwm.npy b/token468_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..9e8ac5922f3d8e48cc6baba008c78bb6a46f2b44 --- /dev/null +++ b/token468_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84493201ce2efb598ceaf6eb6f86b25168b2a4da76611bee68897a677a732fa1 +size 928 diff --git a/token472_phy.npy b/token472_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..e8824fe495443dd647c5fa30b9ba9116c32d3ebc --- /dev/null +++ b/token472_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be183b28938cafb1050a0a159280ac5d645419d4731568fc0d2153628bbbfcbf +size 328 diff --git a/token472_pwm.npy b/token472_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..fa792d97037742f9409badeb891265bae7e1e2b8 --- /dev/null +++ b/token472_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bcd1cc017d517018089c138477055a228fb3a965cc1efd92959dbb62491aa7f +size 928 diff --git a/token517_phy.npy b/token517_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..a720e82a8f46b7dd61dec3a4087b83838672ce28 --- /dev/null +++ b/token517_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f51bd5c7d8ec0ac51253f4eec8a526584b156adc4cd2e02d72283b559d2316e1 +size 528 diff --git a/token517_pwm.npy b/token517_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..ce97d0f941972740d99aad3c53eb01fe4ce9dfef --- /dev/null +++ b/token517_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1fe3f77c1c52895bb475cd5b4b77f8a88e4134297a931698559961ef9a0aabe +size 1728 diff --git a/token64_phy.npy b/token64_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..4b72d4f4de7d235c51043c4263566755e598a3e7 --- /dev/null +++ b/token64_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fa86ef8aa8737163127740a9f7dfa23ae273e071d9470c08f1c5de543e59e41 +size 328 diff --git a/token64_pwm.npy b/token64_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..d3ab736909999ca05964674e29ee672ad213f095 --- /dev/null +++ b/token64_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4e5c784227d093b39c836f3e937cc7edd965ef53cdb49014b0e41396cdf441b +size 928 diff --git a/token75_phy.npy b/token75_phy.npy new file mode 100644 index 0000000000000000000000000000000000000000..f936ece145431791dae2ec08bcbf77bd2df46888 --- /dev/null +++ b/token75_phy.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8674ab895cf8178516444dc167fac1fd7c9bbcb452b7d96b7ddc5d204b625573 +size 328 diff --git a/token75_pwm.npy b/token75_pwm.npy new file mode 100644 index 0000000000000000000000000000000000000000..0df40d967b534f7fc0882350b85994bd85e47d67 --- /dev/null +++ b/token75_pwm.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97719893641e2ce4ecbcfddddc562f233db51b1a87aa1048863d87e80f52199f +size 928 diff --git a/token_hist.npy b/token_hist.npy new file 
mode 100644 index 0000000000000000000000000000000000000000..64c026bca57132b870821cf476cf469e12a21a3a --- /dev/null +++ b/token_hist.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f54d902bd6990262d8687161a01a5000e8da24a6ae88e01e91414b9506cc63e +size 16512 diff --git a/token_ids.npy b/token_ids.npy new file mode 100644 index 0000000000000000000000000000000000000000..20cbc581827ded449da62391bb5b23061ad353a6 --- /dev/null +++ b/token_ids.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c5b373966723c998c986e72c6dbac80f0c31c16488d4616fd25d8ef329f19eb +size 995729656 diff --git a/token_summary.tsv b/token_summary.tsv new file mode 100644 index 0000000000000000000000000000000000000000..ff5503c3254f268d08f07a62199d4ddacfecfceb --- /dev/null +++ b/token_summary.tsv @@ -0,0 +1,20 @@ +token_id N_hits pwm_entropy_bits avg_phyloP +1448 27015 1.9964770078659058 -0.00574650289490819 +138 25555 1.9965238571166992 -0.004780031740665436 +1487 43973 1.9965260028839111 -0.007170177064836025 +175 51890 1.9965900182724 -0.005986363161355257 +472 24937 1.9966228008270264 -0.00586310401558876 +1248 27006 1.9966228008270264 -0.007577949669212103 +75 65104 1.9966543912887573 -0.006708875298500061 +64 31841 1.9966717958450317 -0.004744209814816713 +1494 33241 1.996689796447754 -0.004931221250444651 +1721 25598 1.996769666671753 -0.00491239782422781 +1417 26209 1.9967825412750244 -0.0050399755127727985 +363 38579 1.996788501739502 -0.005597930401563644 +468 31002 1.9968034029006958 -0.006207745056599379 +1312 30509 1.99680495262146 -0.00598685909062624 +517 369671 1.9968288616759156 -0.005221050817240619 +1503 28326 1.9969356060028076 -0.005365537479519844 +296 33085 1.996943712234497 -0.005549528170377016 +192 25466 1.99696683883667 -0.005745543632656336 +1831 25243 1.9971096515655518 -0.005123969167470932 diff --git a/train_log.txt b/train_log.txt new file mode 100644 index 0000000000000000000000000000000000000000..fc98261b0b9553784a2d04fd0226645389848326 --- /dev/null +++ b/train_log.txt @@ -0,0 +1,59 @@ +nohup: ignoring input +Activating Conda environment... +Assigned GPU: 6 +Starting Python training... +Loading sequence / phyloP data... +Chromosome 1 length: 248,932,432 bp +Encoding DNA to one-hot (with N handling)... +Number of N bases: 18,455,410 +Preparing combined tensor... +Master tensor shape: torch.Size([248932432, 5]) +Dataset length (#windows): 248,932,382 +DataLoader ready. +Training on device: cuda +Starting training... 
+Epoch 1 | Batch 1000 | Loss=0.3124 | DNA_CE=0.2716 | Phy_MSE=0.00249 | Active=1963.1 +Epoch 1 | Batch 2000 | Loss=0.0886 | DNA_CE=0.0479 | Phy_MSE=0.00234 | Active=1950.1 +Epoch 1 | Batch 3000 | Loss=0.0620 | DNA_CE=0.0216 | Phy_MSE=0.00238 | Active=1929.0 +Epoch 1 | Batch 4000 | Loss=0.0552 | DNA_CE=0.0120 | Phy_MSE=0.00270 | Active=1916.5 + +=== Epoch 1/5 COMPLETE === +Avg Loss=0.2745 | Avg DNA_CE=0.2078 | Avg Phy_MSE=0.00510 | Avg Active Neurons=1932.7 / 2048 (94.4%) +Saved checkpoint to sparse_ae_50bp_epoch1.pt + +Epoch 2 | Batch 1000 | Loss=2.5741 | DNA_CE=0.2930 | Phy_MSE=0.02092 | Active=266.3 +Epoch 2 | Batch 2000 | Loss=2.1532 | DNA_CE=0.1783 | Phy_MSE=0.01576 | Active=264.3 +Epoch 2 | Batch 3000 | Loss=1.9267 | DNA_CE=0.1264 | Phy_MSE=0.01145 | Active=261.0 +Epoch 2 | Batch 4000 | Loss=1.8043 | DNA_CE=0.0932 | Phy_MSE=0.00857 | Active=274.1 + +=== Epoch 2/5 COMPLETE === +Avg Loss=2.2968 | Avg DNA_CE=0.2133 | Avg Phy_MSE=0.01868 | Avg Active Neurons=267.0 / 2048 (13.0%) +Saved checkpoint to sparse_ae_50bp_epoch2.pt + +Epoch 3 | Batch 1000 | Loss=3.1683 | DNA_CE=0.0731 | Phy_MSE=0.00600 | Active=256.2 +Epoch 3 | Batch 2000 | Loss=3.0996 | DNA_CE=0.0480 | Phy_MSE=0.00432 | Active=271.0 +Epoch 3 | Batch 3000 | Loss=3.0520 | DNA_CE=0.0359 | Phy_MSE=0.00359 | Active=279.7 +Epoch 3 | Batch 4000 | Loss=3.0216 | DNA_CE=0.0256 | Phy_MSE=0.00301 | Active=285.8 + +=== Epoch 3/5 COMPLETE === +Avg Loss=3.0960 | Avg DNA_CE=0.0486 | Avg Phy_MSE=0.00451 | Avg Active Neurons=271.4 / 2048 (13.3%) +Saved checkpoint to sparse_ae_50bp_epoch3.pt + +Epoch 4 | Batch 1000 | Loss=7.3649 | DNA_CE=0.0198 | Phy_MSE=0.00302 | Active=280.3 +Epoch 4 | Batch 2000 | Loss=7.3400 | DNA_CE=0.0166 | Phy_MSE=0.00270 | Active=282.7 +Epoch 4 | Batch 3000 | Loss=7.3057 | DNA_CE=0.0130 | Phy_MSE=0.00259 | Active=285.4 +Epoch 4 | Batch 4000 | Loss=7.2911 | DNA_CE=0.0112 | Phy_MSE=0.00253 | Active=284.6 + +=== Epoch 4/5 COMPLETE === +Avg Loss=7.3314 | Avg DNA_CE=0.0163 | Avg Phy_MSE=0.00287 | Avg Active Neurons=280.1 / 2048 (13.7%) +Saved checkpoint to sparse_ae_50bp_epoch4.pt + +Epoch 5 | Batch 1000 | Loss=14.4925 | DNA_CE=0.0095 | Phy_MSE=0.00282 | Active=280.6 +Epoch 5 | Batch 2000 | Loss=14.3056 | DNA_CE=0.0081 | Phy_MSE=0.00265 | Active=280.5 +Epoch 5 | Batch 3000 | Loss=14.2537 | DNA_CE=0.0071 | Phy_MSE=0.00294 | Active=286.3 +Epoch 5 | Batch 4000 | Loss=14.1657 | DNA_CE=0.0062 | Phy_MSE=0.00288 | Active=287.3 + +=== Epoch 5/5 COMPLETE === +Avg Loss=14.3153 | Avg DNA_CE=0.0077 | Avg Phy_MSE=0.00278 | Avg Active Neurons=283.2 / 2048 (13.8%) +Saved checkpoint to sparse_ae_50bp_epoch5.pt + diff --git a/windows.py.save b/windows.py.save new file mode 100644 index 0000000000000000000000000000000000000000..36b3c6c39f29e5b8b20092ab7310d6149400391a --- /dev/null +++ b/windows.py.save @@ -0,0 +1,57 @@ +import numpy as np +import torch +from torch.utils.data import Dataset, DataLoader + + +# Load phyloP normalized +phy_norm = np.load("chr1_phy_norm.npy") + +# Load DNA sequence +with open("chr1_dna.txt") as f: + sequence = f.read().strip() + +print(len(sequence), phy_norm.shape) + +L = 50 +mapping = {"A":0, "C":1, "G":2, "T":3} + +windows_dna = [] +windows_phy = [] + +seq_len = len(sequence) + +for i in range(seq_len - L): + window_seq = sequence[i : i+L] + if "N" in window_seq: + continue + + # One-hot encode + onehot = np.zeros((L,4), dtype=np.float32) + for t, base in enumerate(window_seq): + if base in mapping: + onehot[t, mapping[base]] = 1.0 + + windows_dna.append(onehot) + windows_phy.append(phy_norm[i : i+L]) + 
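+# NOTE: this loop materializes every 50 bp window in RAM (hundreds of millions of arrays); +# the ChunkedChr1Dataset in extract.py slices windows on the fly to avoid that. 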
+windows_dna = torch.tensor(np.array(windows_dna)) +windows_phy = torch.tensor(np.array(windows_phy)) + +print(windows_dna.shape, windows_phy.shape) +# (N, 50, 4), (N, 50) + +class Chr1Dataset(Dataset): + def __init__(self, dna, phy): + self.dna = dna + self.phy = phy + + def __len__(self): + return len(self.dna) + + def __getitem__(self, idx): + return self.dna[idx], self.phy[idx] + +dataset = Chr1Dataset(windows_dna, windows_phy) + +loader = DataLoader(dataset, batch_size=1024, shuffle=True, drop_last=True) +
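A possible usage sketch, not part of the committed files above: mapping the extracted vocabulary back to genomic coordinates, assuming token_ids.npy holds one int32 token id per 0-based chr1 window start (as written by extract.py) with L = 50; the token id and output filename here are placeholders.

import numpy as np

L = 50
TOKEN = 517  # placeholder: the highest-count token in token_summary.tsv

# One token id per 0-based window start on chr1 (see extract.py).
token_ids = np.load("token_ids.npy")

# All window start positions assigned to this token.
starts = np.where(token_ids == TOKEN)[0]

# Write 0-based, half-open BED intervals for genome-browser inspection.
with open(f"token{TOKEN}_chr1.bed", "w") as out:
    for s in starts:
        out.write(f"chr1\t{s}\t{s + L}\ttoken{TOKEN}\n")

print(f"Wrote {len(starts):,} windows for token {TOKEN}")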