Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +1 -0
- check_vocab.py +94 -0
- chr1_dna.txt +3 -0
- chr1_phyloP_norm.npy +3 -0
- consensus_string.py +35 -0
- extract.py +286 -0
- extract_log.txt +0 -0
- run_job.sh +21 -0
- run_train.py +274 -0
- save/sparse_ae_50bp_epoch1.pt +3 -0
- save/sparse_ae_50bp_epoch2.pt +3 -0
- save/sparse_ae_50bp_epoch3.pt +3 -0
- save/sparse_ae_50bp_epoch4.pt +3 -0
- save/sparse_ae_50bp_epoch5.pt +3 -0
- sparse_ae_50bp_epoch1.pt +3 -0
- sparse_ae_50bp_epoch2.pt +3 -0
- sparse_ae_50bp_epoch3.pt +3 -0
- sparse_ae_50bp_epoch4.pt +3 -0
- sparse_ae_50bp_epoch5.pt +3 -0
- summarize_tokens.py +62 -0
- token1248_phy.npy +3 -0
- token1248_pwm.npy +3 -0
- token1312_phy.npy +3 -0
- token1312_pwm.npy +3 -0
- token138_phy.npy +3 -0
- token138_pwm.npy +3 -0
- token1417_phy.npy +3 -0
- token1417_pwm.npy +3 -0
- token1448_phy.npy +3 -0
- token1448_pwm.npy +3 -0
- token1487_phy.npy +3 -0
- token1487_pwm.npy +3 -0
- token1494_phy.npy +3 -0
- token1494_pwm.npy +3 -0
- token1503_phy.npy +3 -0
- token1503_pwm.npy +3 -0
- token1721_phy.npy +3 -0
- token1721_pwm.npy +3 -0
- token175_phy.npy +3 -0
- token175_pwm.npy +3 -0
- token1831_phy.npy +3 -0
- token1831_pwm.npy +3 -0
- token192_phy.npy +3 -0
- token192_pwm.npy +3 -0
- token296_phy.npy +3 -0
- token296_pwm.npy +3 -0
- token363_phy.npy +3 -0
- token363_pwm.npy +3 -0
- token468_phy.npy +3 -0
- token468_pwm.npy +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
chr1_dna.txt filter=lfs diff=lfs merge=lfs -text
|
check_vocab.py
ADDED
|
@@ -0,0 +1,94 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import numpy as np
|
| 4 |
+
import matplotlib.pyplot as plt
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
# BUG FIX: `L` was previously defined only further down this script (in the
# "Setup" section), *after* it is used here in `INPUT_DIM = L * 5`, which
# raises NameError the moment this file is imported/run. Define the window
# length before its first use; the later `L = 50` assignment becomes a
# harmless re-assignment of the same value.
L = 50              # window length in bp (must match training)
INPUT_DIM = L * 5   # 4 one-hot DNA channels + 1 phyloP channel per bp
LATENT_DIM = 2048   # sparse latent size = token vocabulary size
HIDDEN_DIM = 1024


class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp windows of (one-hot DNA, phyloP).

    Architecture must match run_train.py exactly so saved state_dicts load.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder: input -> hidden -> latent; latent ReLU keeps codes
        # non-negative so an L1 penalty drives sparsity during training.
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared trunk
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads: per-bp base logits and per-bp phyloP regression
        self.dec_dna = nn.Linear(hidden_dim, L * 4)
        self.dec_phy = nn.Linear(hidden_dim, L * 1)

    def forward(self, dna, phy):
        """Encode/decode one batch.

        Args:
            dna: (B, L, 4) one-hot DNA (all-zero rows mark 'N' bases).
            phy: (B, L) phyloP values, assumed normalized to [-1, 1].

        Returns:
            recon_dna: (B, L, 4) raw base logits.
            recon_phy: (B, L) tanh-bounded phyloP reconstruction.
            h: (B, LATENT_DIM) non-negative sparse latent code.
        """
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4)
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L)

        return recon_dna, recon_phy, h
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Setup
L = 50  # window length in bp; must match the value used at training time
device = "cuda" if torch.cuda.is_available() else "cpu"
model = SparseAE().to(device)

# Load the final checkpoint
# NOTE(review): loads epoch 3 although epoch-5 checkpoints exist on disk —
# confirm this is the intended "final" checkpoint.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()
print("Model loaded.")

# --- GENERATE FAKE DATA (Or load real if you prefer) ---
print("Generating test data...")
# Create random DNA (approximate genomic distribution)
N_SAMPLES = 10000
probs = torch.tensor([0.25, 0.25, 0.25, 0.25])  # A, C, G, T
test_dna_idx = torch.multinomial(probs, N_SAMPLES * L, replacement=True).view(N_SAMPLES, L)
test_dna = F.one_hot(test_dna_idx, num_classes=4).float().to(device)
# Random stand-in for phyloP. NOTE(review): the real track is normalized to
# [-1, 1] while randn is unbounded — confirm that is acceptable for this check.
test_phy = torch.randn(N_SAMPLES, L).to(device)

# --- RUN INFERENCE ---
print("Running inference...")
with torch.no_grad():
    # Run model to get latent 'h'
    # Note: If you used Top-K in training, ensure you use it here too.
    # If you used standard L1/KL, just get 'h' from encoder.
    B = test_dna.size(0)
    # Flatten and concatenate exactly as SparseAE.forward does.
    x = torch.cat([test_dna.reshape(B, -1), test_phy.reshape(B, -1)], dim=1)
    h = model.encoder(x)


# --- ANALYZE VOCABULARY ---
h_np = h.cpu().numpy()

# 1. How often is each token used? (Frequency)
# We count a neuron as "firing" if it > 0.1
neuron_firing_counts = np.sum(h_np > 0.1, axis=0)  # Shape (2048,)

# 2. Sort them (descending; kept for interactive inspection)
sorted_counts = np.sort(neuron_firing_counts)[::-1]

print("\n--- VOCABULARY HEALTH CHECK ---")
print(f"Total Neurons: 2048")
print(f"Dead Neurons (Never fire): {np.sum(neuron_firing_counts == 0)}")
print(f"Rare Neurons (Fire < 10 times): {np.sum(neuron_firing_counts < 10)}")
print(f"Common Neurons (Fire > 1000 times): {np.sum(neuron_firing_counts > 1000)}")
|
chr1_dna.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:74c534c06853bf3f631c627e6b026bd2a29cade1926c19eda6fa03e462f86f02
|
| 3 |
+
size 248932432
|
chr1_phyloP_norm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:36f76db186a5031c277f406f4df0bf24fae1d5db15b715daa0303b6cbcebeb06
|
| 3 |
+
size 995729856
|
consensus_string.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import glob

# Index -> nucleotide letter for argmax decoding.
idx_to_base = ['A', 'C', 'G', 'T']

# Minimum per-position probability for a confident consensus call.
# BUG FIX: the original compared against 0.25 while its own comment said
# "e.g., < 0.4". With 4 bases the max probability in a column is always
# >= 0.25, so the low-confidence "." branch was effectively unreachable
# (a fully uniform column still printed a letter). Use the documented 0.4.
CONS_THRESHOLD = 0.4


def pwm_to_consensus(pwm, threshold=CONS_THRESHOLD):
    """Return the consensus string for a PWM of shape (L, 4).

    Each row is [prob_A, prob_C, prob_G, prob_T]. Positions whose maximum
    probability is below `threshold` are rendered as '.' (background).
    """
    consensus = []
    for row in pwm:
        max_idx = int(np.argmax(row))
        if row[max_idx] < threshold:
            consensus.append(".")  # low confidence / near-uniform column
        else:
            consensus.append(idx_to_base[max_idx])
    return "".join(consensus)


# Find the per-token PWM files produced by extract.py.
pwm_files = sorted(glob.glob("token*_pwm.npy"))

print(f"{'Token ID':<10} | {'Consensus Sequence (50bp)':<55}")
print("-" * 70)

for pwm_file in pwm_files:
    # Token ID is embedded in the filename: token<ID>_pwm.npy
    tid = pwm_file.split("token")[1].split("_")[0]

    # Load Matrix (50, 4)
    pwm = np.load(pwm_file)

    print(f"{tid:<10} | {pwm_to_consensus(pwm)}")
|
extract.py
ADDED
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader, RandomSampler
|
| 6 |
+
from collections import defaultdict
|
| 7 |
+
from tqdm import tqdm
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# =====================
# 1. SETUP & DATA LOADING
# =====================
print("Loading sequence / phyloP data...")

# Absolute paths to the chromosome-1 inputs: the raw base string and the
# pre-normalized phyloP conservation track (one value per bp).
dna_path = "/home/n5huang/dna_token/SparseAE/chr1_dna.txt"
phy_path = "/home/n5huang/dna_token/SparseAE/chr1_phyloP_norm.npy"

with open(dna_path) as f:
    sequence = f.read().strip()

phy_norm = np.load(phy_path)

# The two tracks must be positionally aligned, base for base.
assert len(sequence) == len(phy_norm), "DNA and phyloP length mismatch!"
chrom_len = len(sequence)
print(f"Chromosome 1 length: {chrom_len:,} bp")

# =====================
# 2. DNA ENCODING (HANDLE 'N')
# =====================
print("Encoding DNA to one-hot (with N handling)...")

mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Map bases to ints, using 4 as "N/unknown"
dna_int = np.fromiter((mapping.get(b, 4) for b in sequence), dtype=np.int8)
num_N = np.sum(dna_int == 4)
print(f"Number of N bases: {num_N:,}")

# One-hot with an extra row for N
# 0=A,1=C,2=G,3=T,4=[0,0,0,0,1]
temp_onehot = np.eye(5, dtype=np.float32)[dna_int]

# Slice to first 4 columns: N -> [0,0,0,0]
# (an all-zero row downstream therefore marks an unknown base)
dna_onehot = temp_onehot[:, :4]  # shape (chrom_len, 4)

# =====================
# 3. PHYLOP CHECK + COMBINE
# =====================
print("Preparing combined tensor...")

# Assume phy_norm is already in [-1,1]; warn if not.
max_abs_phy = np.max(np.abs(phy_norm))
if max_abs_phy > 1.1:
    print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; "
          f"data may not be normalized as expected.")

phy_norm = phy_norm.astype(np.float32)
phy_col = phy_norm.reshape(-1, 1)  # (chrom_len, 1)

# Channels 0-3: one-hot DNA; channel 4: phyloP.
combined_np = np.concatenate([dna_onehot, phy_col], axis=1)  # (chrom_len, 5)
combined_tensor = torch.from_numpy(combined_np)  # CPU tensor; shares memory with combined_np

print(f"Master tensor shape: {combined_tensor.shape}")
|
| 66 |
+
|
| 67 |
+
# =====================
# 4. DATASET: CHUNKED WINDOWING
# =====================
L = 50

class ChunkedChr1Dataset(Dataset):
    """Sliding-window view over the (chrom_len, 5) combined track.

    Item i is the 50bp window starting at genomic position i, split into
    its one-hot DNA part, its phyloP track, and the start index itself.
    """

    def __init__(self, combined, L=50):
        self.combined = combined
        self.L = L
        # The last window must fit entirely, so there are chrom_len - L
        # valid start positions.
        self.N = combined.shape[0] - L

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        stop = idx + self.L
        window = self.combined[idx:stop]          # (L, 5) view
        return window[:, :4], window[:, 4], idx   # dna (L, 4), phy (L,), start
|
| 87 |
+
|
| 88 |
+
dataset = ChunkedChr1Dataset(combined_tensor, L=L)
print(f"Dataset length (#windows): {len(dataset):,}")

# =====================
# 5. DATALOADER (SEQUENTIAL, FULL COVERAGE)
# =====================
BATCH_SIZE = 1024

# BUG FIX: extraction must visit EVERY window exactly once, in genomic
# order. The previous version attached RandomSampler(replacement=True,
# num_samples=5_000_000), which (a) covered only a random subset of the
# ~250M windows and (b) yielded them in random order — so token ids filled
# downstream did not correspond to genome positions, despite the explicit
# "shuffle=False for mapping back to genome" intent. A plain sequential
# DataLoader guarantees full, ordered coverage.
loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,    # <--- MUST BE FALSE for mapping back to genome
    drop_last=False,  # <--- Process every last bit
    num_workers=0,    # safer on large dataset
    pin_memory=True   # faster host->device copies
)

print("DataLoader ready.")
|
| 114 |
+
|
| 115 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
INPUT_DIM = L * 5  # 4 DNA + 1 phyloP
LATENT_DIM = 2048  # latent code size = token vocabulary size
HIDDEN_DIM = 1024

class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp (one-hot DNA, phyloP) windows.

    Must match the architecture in run_train.py so the saved state_dict
    loads without key/shape mismatches.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, L * 4)  # per-bp base logits
        self.dec_phy = nn.Linear(hidden_dim, L * 1)  # per-bp phyloP regression

    def forward(self, dna, phy):
        """dna: (B, L, 4), phy: (B, L) -> (recon_dna, recon_phy, latent h)."""
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4) raw logits
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L), bounded to [-1, 1]

        return recon_dna, recon_phy, h
|
| 156 |
+
|
| 157 |
+
########################################
# 4. LOAD CHECKPOINT
########################################

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

model = SparseAE().to(device)
# NOTE(review): loads epoch 3 even though epoch-5 checkpoints exist on
# disk — confirm which checkpoint the extraction should use.
model.load_state_dict(torch.load("sparse_ae_50bp_epoch3.pt", map_location=device))
model.eval()  # inference mode (no dropout/batchnorm in this model, but good hygiene)
print("Model loaded.")
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
########################################
# 5. TOKEN EXTRACTION
########################################

print("Extracting tokens...")

# One token id per valid window start position on the chromosome.
all_token_ids = np.zeros(len(dataset), dtype=np.int32)
#h_values = np.zeros((len(dataset), LATENT_DIM), dtype=np.float32)

with torch.no_grad():
    for dna_batch, phy_batch, idx_batch in tqdm(loader):
        dna_batch = dna_batch.to(device).float()
        phy_batch = phy_batch.to(device).float()

        _, _, h = model(dna_batch, phy_batch)

        # argmax over the latent code = the window's discrete token id
        token_ids = h.argmax(dim=1).cpu().numpy().astype(np.int32)

        # BUG FIX: write each result at its window's true genomic start
        # index. The original wrote sequentially at a running offset, which
        # only maps tokens to genome positions if the loader happens to be
        # strictly sequential; scattering by idx_batch (which the dataset
        # already returns) is correct for ANY sampler order.
        all_token_ids[idx_batch.numpy()] = token_ids
        #h_values[idx_batch.numpy()] = h.cpu().numpy()

print("Token extraction complete.")
|
| 197 |
+
|
| 198 |
+
np.save("token_ids.npy", all_token_ids)
#np.save("latent_h.npy", h_values)


# Histogram of token usage across the whole chromosome.
hist = np.bincount(all_token_ids, minlength=LATENT_DIM)
np.save("token_hist.npy", hist)

print("Top tokens:")
# BUG FIX: np.argsort sorts ascending, so the previous
# `np.argsort(hist)[:20]` printed the 20 LEAST used tokens under a
# "Top tokens" heading. Reverse to get the 20 MOST frequent tokens
# (this matches the line that was left commented out).
top_tokens = np.argsort(hist)[::-1][:20]

for t in top_tokens:
    print(f"Token {t}: count={hist[t]}")
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
########################################
# 6. MOTIF SUMMARY FOR TOP TOKENS
########################################

print("\nBuilding PWM + average PhyloP for top tokens...")

# PERF FIX: the original accumulated window statistics with a pure-Python
# loop over every one of the ~250M genomic windows (self-described as
# "15-30 mins"). Instead, loop over the 50 window OFFSETS and scatter-add
# at C speed with np.add.at: for offset j, every window with token t
# contributes combined_np[start + j] to that token's position-j stats.
limit = len(all_token_ids) - L
tokens = all_token_ids[:limit]

# float64 accumulators avoid precision loss over hundreds of millions of adds.
pwm_acc = np.zeros((LATENT_DIM, L, 4), dtype=np.float64)
phy_acc = np.zeros((LATENT_DIM, L), dtype=np.float64)

print("Accumulating statistics (vectorized over window offsets)...")
for j in tqdm(range(L)):
    strip = combined_np[j : j + limit]           # track shifted by offset j
    np.add.at(pwm_acc[:, j, :], tokens, strip[:, :4])  # one-hot base counts
    np.add.at(phy_acc[:, j], tokens, strip[:, 4])      # phyloP sums

window_counts = np.bincount(tokens, minlength=LATENT_DIM)

# Preserve the dict-based interface consumed by sections 6A/6B below.
pwm_sum = {t: pwm_acc[t].astype(np.float32) for t in range(LATENT_DIM)}
phy_sum = {t: phy_acc[t].astype(np.float32) for t in range(LATENT_DIM)}
counts = {t: int(window_counts[t]) for t in range(LATENT_DIM)}
|
| 238 |
+
|
| 239 |
+
|
| 240 |
+
|
| 241 |
+
########################################
# 6A. Save per-token PWMs & phylo profiles
########################################

print("Saving profiles...")

for t in range(LATENT_DIM):
    if counts[t] == 0:
        # Token never used anywhere on the chromosome: nothing to save.
        continue

    # Normalize running sums into a position weight matrix and a mean
    # per-position conservation profile.
    pwm = pwm_sum[t] / counts[t]
    avg_phy = phy_sum[t] / counts[t]

    np.save(f"token{t}_pwm.npy", pwm)
    np.save(f"token{t}_phy.npy", avg_phy)


########################################
# 6B. Rank tokens by PhyloP and rarity
########################################

avg_phylop_per_token = np.zeros(LATENT_DIM)
count_per_token = np.zeros(LATENT_DIM)

for t in range(LATENT_DIM):
    if counts[t] > 0:
        # Mean over the 50 positions of the token's average phyloP profile.
        avg_phylop_per_token[t] = (phy_sum[t] / counts[t]).mean()
        count_per_token[t] = counts[t]
    else:
        # Sentinel so unused tokens sort to the bottom of the ranking.
        avg_phylop_per_token[t] = -999
        count_per_token[t] = 0

# Rank by PhyloP (high to low)
tokens_by_phylop = np.argsort(avg_phylop_per_token)[::-1]
top_phy_tokens = tokens_by_phylop[:20]

# Rank by rarity (low to high)
# NOTE(review): tokens with count 0 dominate this list; if the intent is
# "rarest token actually used", filter counts > 0 first.
rare_tokens = np.argsort(count_per_token)[:20]

print("Top 20 conserved tokens:", top_phy_tokens)
print("Top 20 rarest tokens:", rare_tokens)



print("\n=== Extraction Completed Successfully ===")
|
| 286 |
+
|
extract_log.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
run_job.sh
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Launch wrapper: activates the conda env, pins one GPU, and runs the
# extraction (or training) script from its own directory.

# --- 0. Activate Conda Environment ---
echo "Activating Conda environment..."
# Source conda's shell hooks so `conda activate` works in a non-interactive shell.
source /home/n5huang/miniconda3/etc/profile.d/conda.sh
conda activate dnabert_v2

# --- 1. Configuration ---
# Set the GPU ID (makes only physical GPU 6 visible to the process)
export CUDA_VISIBLE_DEVICES=6
echo "Assigned GPU: $CUDA_VISIBLE_DEVICES"

# --- 2. Correct Working Directory ---
# Move to the folder where the script actually lives to ensure relative paths work (if any)
cd /home/n5huang/dna_token/SparseAE/

# --- 3. Run the Python Script ---
# We use absolute path just to be safe
# -u ensures logs are written immediately (unbuffered stdout/stderr)
echo "Starting Python training..."
python -u extract.py #run_train.py
|
run_train.py
ADDED
|
@@ -0,0 +1,274 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
import torch.nn.functional as F
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader, RandomSampler
|
| 6 |
+
|
| 7 |
+
# =====================
# 1. SETUP & DATA LOADING
# =====================
print("Loading sequence / phyloP data...")

# Absolute paths to the chromosome-1 inputs: the raw base string and the
# pre-normalized phyloP conservation track (one value per bp).
dna_path = "/home/n5huang/dna_token/SparseAE/chr1_dna.txt"
phy_path = "/home/n5huang/dna_token/SparseAE/chr1_phyloP_norm.npy"

with open(dna_path) as f:
    sequence = f.read().strip()

phy_norm = np.load(phy_path)

# The two tracks must be positionally aligned, base for base.
assert len(sequence) == len(phy_norm), "DNA and phyloP length mismatch!"
chrom_len = len(sequence)
print(f"Chromosome 1 length: {chrom_len:,} bp")

# =====================
# 2. DNA ENCODING (HANDLE 'N')
# =====================
print("Encoding DNA to one-hot (with N handling)...")

mapping = {'A': 0, 'C': 1, 'G': 2, 'T': 3}

# Map bases to ints, using 4 as "N/unknown"
dna_int = np.fromiter((mapping.get(b, 4) for b in sequence), dtype=np.int8)
num_N = np.sum(dna_int == 4)
print(f"Number of N bases: {num_N:,}")

# One-hot with an extra row for N
# 0=A,1=C,2=G,3=T,4=[0,0,0,0,1]
temp_onehot = np.eye(5, dtype=np.float32)[dna_int]

# Slice to first 4 columns: N -> [0,0,0,0]
# (the training loop masks these all-zero rows out of both losses)
dna_onehot = temp_onehot[:, :4]  # shape (chrom_len, 4)

# =====================
# 3. PHYLOP CHECK + COMBINE
# =====================
print("Preparing combined tensor...")

# Assume phy_norm is already in [-1,1]; warn if not.
max_abs_phy = np.max(np.abs(phy_norm))
if max_abs_phy > 1.1:
    print(f"WARNING: phy_norm max abs={max_abs_phy:.3f} > 1.1; "
          f"data may not be normalized as expected.")

phy_norm = phy_norm.astype(np.float32)
phy_col = phy_norm.reshape(-1, 1)  # (chrom_len, 1)

# Channels 0-3: one-hot DNA; channel 4: phyloP.
combined_np = np.concatenate([dna_onehot, phy_col], axis=1)  # (chrom_len, 5)
combined_tensor = torch.from_numpy(combined_np)  # CPU tensor; shares memory with combined_np

print(f"Master tensor shape: {combined_tensor.shape}")
|
| 61 |
+
|
| 62 |
+
# =====================
# 4. DATASET: CHUNKED WINDOWING
# =====================
L = 50

class ChunkedChr1Dataset(Dataset):
    """Sliding-window dataset over the (chrom_len, 5) combined track.

    Item i is the 50bp window starting at position i, split into its
    one-hot DNA channels and its phyloP channel for training.
    """

    def __init__(self, combined, L=50):
        self.combined = combined
        self.L = L
        # A window must fit entirely, leaving chrom_len - L valid starts.
        self.N = combined.shape[0] - L

    def __len__(self):
        return self.N

    def __getitem__(self, idx):
        end = idx + self.L
        window = self.combined[idx:end]     # (L, 5) view
        return window[:, :4], window[:, 4]  # dna (L, 4), phy (L,)
|
| 82 |
+
|
| 83 |
+
dataset = ChunkedChr1Dataset(combined_tensor, L=L)
print(f"Dataset length (#windows): {len(dataset):,}")

# =====================
# 5. DATALOADER WITH RANDOM SAMPLER
# =====================
BATCH_SIZE = 1024
SAMPLES_PER_EPOCH = 5_000_000  # number of windows per epoch (tunable)

# Sample windows uniformly WITH replacement: a training "epoch" is 5M
# random windows out of the ~250M possible, not a full pass over the genome.
sampler = RandomSampler(
    dataset,
    replacement=True,
    num_samples=SAMPLES_PER_EPOCH
)

loader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    sampler=sampler,
    drop_last=True,   # every batch exactly BATCH_SIZE (simplifies stats)
    num_workers=0,    # safer on large dataset
    pin_memory=True   # faster host->GPU copies
)

print("DataLoader ready.")
|
| 108 |
+
|
| 109 |
+
# =====================
# 6. MODEL: SPARSE AUTOENCODER
# =====================
INPUT_DIM = L * 5  # 4 DNA + 1 phyloP
LATENT_DIM = 2048  # latent code size = token vocabulary size
HIDDEN_DIM = 1024

class SparseAE(nn.Module):
    """Sparse autoencoder over 50bp (one-hot DNA, phyloP) windows.

    The latent vector h is trained to be sparse (L1 + KL penalties in the
    training loop below); downstream scripts treat argmax(h) as a token id.
    """

    def __init__(self, input_dim=INPUT_DIM, latent_dim=LATENT_DIM, hidden_dim=HIDDEN_DIM):
        super().__init__()

        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, latent_dim),
            nn.ReLU()  # ReLU helps sparsity with L1
        )

        # Decoder shared
        self.dec_hidden = nn.Linear(latent_dim, hidden_dim)

        # Decoder heads
        self.dec_dna = nn.Linear(hidden_dim, L * 4)  # per-bp base logits
        self.dec_phy = nn.Linear(hidden_dim, L * 1)  # per-bp phyloP regression

    def forward(self, dna, phy):
        """dna: (B, L, 4), phy: (B, L) -> (recon_dna, recon_phy, latent h)."""
        B = dna.size(0)

        x = torch.cat(
            [dna.reshape(B, -1), phy.reshape(B, -1)],
            dim=1
        )  # (B, INPUT_DIM)

        h = self.encoder(x)
        dec = F.relu(self.dec_hidden(h))

        recon_dna = self.dec_dna(dec).reshape(B, L, 4)           # (B, L, 4) raw logits
        recon_phy = torch.tanh(self.dec_phy(dec)).reshape(B, L)  # (B, L), bounded to [-1, 1]

        return recon_dna, recon_phy, h
|
| 150 |
+
|
| 151 |
+
# =====================
# 7. TRAINING LOOP
# =====================
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Training on device: {device}")

model = SparseAE().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

#lambda_l1 = 0.01 # slightly stronger sparsity
# L1 weight is annealed linearly from start to end over the epochs
# (computed per-epoch inside the loop below).
lambda_l1_start = 0.02
lambda_l1_end = 0.005
phy_weight = 10.0   # phyloP MSE upweighted relative to DNA cross-entropy
num_epochs = 5
PRINT_EVERY = 1000  # batches
# KL sparsity weight warm-up, one entry per epoch (clamped to last entry).
beta_kl_schedule = [0.0, 0.01, 0.02, 0.05, 0.1]  # per epoch



print("Starting training...")
|
| 171 |
+
|
| 172 |
+
for epoch in range(num_epochs):
    # One pass over the loader: masked reconstruction of DNA (CE) and phyloP
    # (MSE), plus KL + L1 sparsity penalties on the latent code h.
    model.train()
    total_loss = 0.0
    total_dna = 0.0
    total_phy = 0.0
    total_active = 0.0

    batch_count = 0
    for dna_batch, phy_batch in loader:
        batch_count += 1

        dna_batch = dna_batch.to(device, non_blocking=True).float()  # (B, L, 4)
        phy_batch = phy_batch.to(device, non_blocking=True).float()  # (B, L)

        optimizer.zero_grad()

        recon_dna, recon_phy, h = model(dna_batch, phy_batch)

        # Mask positions that are 'N' (all-zero one-hot)
        mask = dna_batch.sum(dim=-1) > 0  # (B, L), True where valid base
        n_valid = mask.sum()              # hoisted: shared by both masked losses

        # --- DNA loss (masked CE) ---
        true_dna_cls = dna_batch.argmax(dim=-1)  # (B, L)
        dna_logits = recon_dna.permute(0, 2, 1)  # (B, 4, L)
        loss_dna_raw = F.cross_entropy(dna_logits, true_dna_cls, reduction='none')  # (B, L)

        if n_valid > 0:
            loss_dna = (loss_dna_raw * mask).sum() / n_valid
        else:
            loss_dna = torch.tensor(0.0, device=device)

        # --- PhyloP loss (masked MSE) ---
        loss_phy_raw = F.mse_loss(recon_phy, phy_batch, reduction='none')  # (B, L)

        if n_valid > 0:
            loss_phy = (loss_phy_raw * mask).sum() / n_valid
        else:
            loss_phy = torch.tensor(0.0, device=device)

        # --- KL sparsity penalty ---
        # Bernoulli KL(rho || rho_hat) per latent unit, where rho_hat is the
        # mean activation of that unit over the batch.
        rho = 0.02  # target sparsity
        eps = 1e-12
        rho_hat = torch.mean(h, dim=0)
        rho_hat = torch.clamp(rho_hat, min=1e-6, max=1 - 1e-6)
        kl_per_unit = (
            rho * torch.log((rho + eps) / (rho_hat + eps)) +
            (1 - rho) * torch.log(((1 - rho) + eps) / ((1 - rho_hat) + eps))
        )
        beta_kl = beta_kl_schedule[min(epoch, len(beta_kl_schedule) - 1)]
        loss_kl = beta_kl * kl_per_unit.sum()

        # Linear ramp of the L1 weight from lambda_l1_start to lambda_l1_end.
        # max(..., 1) guards against ZeroDivisionError when num_epochs == 1.
        lambda_l1 = (
            lambda_l1_start
            + (lambda_l1_end - lambda_l1_start) * (epoch / max(num_epochs - 1, 1))
        )

        # --- L1 sparsity on latent ---
        loss_l1 = lambda_l1 * torch.mean(torch.abs(h))

        # Total loss
        loss = loss_dna + phy_weight * loss_phy + loss_l1 + loss_kl

        loss.backward()
        optimizer.step()

        # Logging accumulators (sample-weighted so epoch averages are per-sample)
        B = dna_batch.size(0)
        total_loss += loss.item() * B
        total_dna += loss_dna.item() * B
        total_phy += loss_phy.item() * B

        # approximate number of active neurons (h > threshold)
        active_count = (h > 0.01).float().sum(dim=1).mean().item()
        total_active += active_count * B

        if batch_count % PRINT_EVERY == 0:
            print(
                f"Epoch {epoch+1} | Batch {batch_count} | "
                f"Loss={loss.item():.4f} | DNA_CE={loss_dna.item():.4f} | "
                f"Phy_MSE={loss_phy.item():.5f} | Active={active_count:.1f}"
            )

    # Epoch summary
    N = SAMPLES_PER_EPOCH  # effective number of samples this epoch
    avg_loss = total_loss / N
    avg_dna = total_dna / N
    avg_phy = total_phy / N
    avg_active = total_active / N

    print(f"\n=== Epoch {epoch+1}/{num_epochs} COMPLETE ===")
    print(
        f"Avg Loss={avg_loss:.4f} | Avg DNA_CE={avg_dna:.4f} | "
        f"Avg Phy_MSE={avg_phy:.5f} | "
        f"Avg Active Neurons={avg_active:.1f} / {LATENT_DIM} "
        f"({100.0 * avg_active / LATENT_DIM:.1f}%)"
    )

    # Save checkpoint
    ckpt_path = f"sparse_ae_50bp_epoch{epoch+1}.pt"
    torch.save(model.state_dict(), ckpt_path)
    print(f"Saved checkpoint to {ckpt_path}\n")
|
save/sparse_ae_50bp_epoch1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:783330441dbcb8954e41b73c14c9f2c6b5b7d4391b2a34f497efbaffea185061
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f951d20a5e142004926a72c9fcf5eb930b60fecd6c1016bc937f9590ce1acf4f
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8b3568994c3c777b37bea3b781c8d79e8c8883c9fbceee001f6247d7e21def0
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e1625cf96716fac10180bff026b4a22bb3141c9d7599ffac03963b908e9310d3
|
| 3 |
+
size 18845511
|
save/sparse_ae_50bp_epoch5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9aed9a40b48ddc84c91d5fee7ed1917922a39138c6de011655dc9789d6d94939
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch1.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0524bbaf5d3890d983e6a5a0950700caa1edb079b4ba8d351f10aedda4e4cbfe
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch2.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:49e44d486cf65d22baef24d6d35c480e2b0d1ac54216ee62089be54acf28cee8
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch3.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b28ea09c9c217e70bce16a914aacda6d8a864f95e8497eb690e02b7c84e6a036
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch4.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9b188cb2204f93983c0942d697f8f04736db3bf56012b9324e507e4d83a4673d
|
| 3 |
+
size 18845511
|
sparse_ae_50bp_epoch5.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4cc638b7535deeb9c4c10bb9a6c47d746ff09d2aa51cf223b2d262ec468a018a
|
| 3 |
+
size 18845511
|
summarize_tokens.py
ADDED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import re
|
| 2 |
+
import glob
|
| 3 |
+
import numpy as np
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
LOG_FILE = "extract_log.txt"

# 1. Parse N hits from log file
token_counts = {}
pattern = re.compile(r"Saved token (\d+) \(N=(\d+)\)")

with open(LOG_FILE, "r") as f:
    for raw_line in f:
        hit = pattern.search(raw_line)
        if hit:
            # Record "Saved token <id> (N=<count>)" occurrences.
            token_counts[int(hit.group(1))] = int(hit.group(2))

print(f"Found {len(token_counts)} tokens with counts from log.")

# 2. For each token, load PWM + phyloP, compute entropy + avg phyloP
rows = []
|
| 24 |
+
|
| 25 |
+
def pwm_entropy(pwm, eps=1e-8):
    """Mean per-position Shannon entropy of a PWM, in bits.

    pwm: (L, 4) array of mean one-hot probabilities.
    Returns the entropy averaged over the L positions.
    """
    # Re-normalize each row; eps keeps all-zero rows from dividing by zero.
    row_totals = pwm.sum(axis=1, keepdims=True)
    probs = pwm / (row_totals + eps)
    per_position = -(probs * np.log2(probs + eps)).sum(axis=1)  # (L,)
    return per_position.mean()
|
| 33 |
+
|
| 34 |
+
for pwm_file in glob.glob("token*_pwm.npy"):
    # token ID from filename
    match = re.search(r"token(\d+)_pwm\.npy", pwm_file)
    if match is None:
        continue
    token_id = int(match.group(1))

    pwm = np.load(pwm_file)                    # (L, 4)
    phy = np.load(f"token{token_id}_phy.npy")  # (L,)

    rows.append({
        "token_id": token_id,
        "N_hits": token_counts.get(token_id, None),
        "pwm_entropy_bits": pwm_entropy(pwm),
        "avg_phyloP": float(phy.mean()),
    })

df = pd.DataFrame(rows)
# Sharpest motifs (low entropy) first; ties broken by high conservation.
df = df.sort_values(["pwm_entropy_bits", "avg_phyloP"], ascending=[True, False])

print(df.head(20))

df.to_csv("token_summary.tsv", sep="\t", index=False)
print("\nSaved summary to token_summary.tsv")
|
| 62 |
+
|
token1248_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d0ff04bef228fbb4034063112451e0ff3f61ea5010085c896dde823c80995944
|
| 3 |
+
size 328
|
token1248_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:48ecfc793f3988fb8ac3f44fceeb7e19c78e65c153d9f8f3d8ef84ba95901fce
|
| 3 |
+
size 928
|
token1312_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3e7f7df3737fb75cf04dba3b9dace53c6288b5df1c8f1504e756795d16ec587e
|
| 3 |
+
size 328
|
token1312_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:636a911b43f7f1117d8f1350966a3832b21f4ec02891bd96386cc0e3f297bae9
|
| 3 |
+
size 928
|
token138_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:94821977d4c17a4f60989babf4c9404e650160ce56ef0f109e3ac078e4292aaa
|
| 3 |
+
size 328
|
token138_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:00274837ae1e7e223ce799a649edb1ca7e01998780ec0b836e731427e1739ef6
|
| 3 |
+
size 928
|
token1417_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1c51f148b5df3cf4bce2fde39242187600d0d8fe47d10094b2dccfd4c3715870
|
| 3 |
+
size 328
|
token1417_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a294d6621fc0b33bedbf4580ecf3c50f5ae779e15a0c60365015f4a81b9801bd
|
| 3 |
+
size 928
|
token1448_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7818a614c3797dff04b89ae64493fc348c32d774a34c068cc6212d097e02cca8
|
| 3 |
+
size 328
|
token1448_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:328c466c07fa92f28d23a19accbf8a852f584116a234b9e62c96eec4d1243c0b
|
| 3 |
+
size 928
|
token1487_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:e511783f52ad148ca2fb1eaef348fd19c844b9d8ceb9c9ffef07efeb48097960
|
| 3 |
+
size 328
|
token1487_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7859c6c5085e90a0b2fe26c5f63ce02dbb2f9e56e8e20a77ddf664630a4c6849
|
| 3 |
+
size 928
|
token1494_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cc33acaa4223bb72be2190dc196840f870c6c5128ce06cb46c86b7aacc8011d1
|
| 3 |
+
size 328
|
token1494_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9d05893b76c9a7d253d025f8a09f600ec8e6e78914793dd866ed5aae58028330
|
| 3 |
+
size 928
|
token1503_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bb5354c0f0721175f73d3e5a76f30ca9aa4f76ef20d918ca9faf6b4040c3ce55
|
| 3 |
+
size 328
|
token1503_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7c4ddd567056af4a22fe33e4783afb65e0037c2fda7778825a7b3efcffbce4c0
|
| 3 |
+
size 928
|
token1721_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e1b524b3870dd824a06a9113ecd1e08d1f78e133e39cf6700c6734a5d8e487f
|
| 3 |
+
size 328
|
token1721_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5df963ff0dc52052663c9122812da7708299e263f749a2accbb6297b42bfd78
|
| 3 |
+
size 928
|
token175_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d20851e9bed217d47d86f6ef752922ba756a17dcb5f1ba826523161827ea4af0
|
| 3 |
+
size 328
|
token175_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c5ce7c68421a2db5fc5285296599945917c37be115a0eeda6ec9de965be25624
|
| 3 |
+
size 928
|
token1831_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:68b1ef67ff02631fb4b32165652ef6d97c4edb3b1ca292f03f644098637b821a
|
| 3 |
+
size 328
|
token1831_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:43dc041a1c64ea71b531c7ab1b1d4ea30bd77b015b6a5d3997dee5ddb00f5fea
|
| 3 |
+
size 928
|
token192_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:98cff173bd0ec8c9d3f02d217a5bfa96ee0cca52d03f624dd29939eaee52a8df
|
| 3 |
+
size 328
|
token192_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:24c99c84a3db007261a0ab245020b7db930dcc53e8e646e16a77a3bdc31bd259
|
| 3 |
+
size 928
|
token296_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fd142b806afb9d3ecf36bc07eeb4ac05e69f48f64513620059023ad7ea59a5d0
|
| 3 |
+
size 328
|
token296_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3d9722d77f41184dbbc8089b0c909f00121fb114d33b0b09f5fdacc419aa724d
|
| 3 |
+
size 928
|
token363_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:66a992eb090aed56f742f8b84e2bc36eee9dba0849b5de454f65c67c8d433c83
|
| 3 |
+
size 328
|
token363_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dd011451a85cc30d6f4aa5123fab4f6a9a401a74a84ae30859ac1e29262c026c
|
| 3 |
+
size 928
|
token468_phy.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:471273cce914188e0b266f884091cd1b85c8f1c554943bd0d5db3a423c3b6cb5
|
| 3 |
+
size 328
|
token468_pwm.npy
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84493201ce2efb598ceaf6eb6f86b25168b2a4da76611bee68897a677a732fa1
|
| 3 |
+
size 928
|