Umair Khan committed on
Commit
b10cf8e
·
1 Parent(s): 0811027
Files changed (1) hide show
  1. app.py +9 -10
app.py CHANGED
@@ -26,9 +26,8 @@ EMB_KEY = "X_tx1-70m"
26
  APP_TITLE = "Tx1-70M Embeddings"
27
  APP_DESC = """
28
  Upload an AnnData, compute Tx1-70M embeddings,
29
- preview a UMAP, and download the results.
30
-
31
- **Limits:** Files up to 5GB. If an AnnData contains more
32
  than 50K cells, embeddings will be computed **only
33
  for the first 50K cells**.
34
  """
@@ -126,7 +125,7 @@ def ensure_dropdowns(fileobj):
126
 
127
  # custom callback to report progress to Gradio
128
  class GradioProgressCallback(Callback):
129
- def __init__(self, progress, total_batches, start=0.35, end=0.75):
130
  self.progress = progress
131
  self.total = max(1, int(total_batches))
132
  self.seen = 0
@@ -142,7 +141,7 @@ class GradioProgressCallback(Callback):
142
  def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
143
 
144
  # retrieve AnnData from bytes
145
- progress(0.12, desc="loading AnnData")
146
  with tempfile.TemporaryDirectory() as td:
147
 
148
  # persist to a temporary file
@@ -230,7 +229,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
230
  raise gr.Error(f"Feature column '{feature_col}' does not appear to contain Ensembl gene IDs. If the column contains gene symbols, use the checkbox.")
231
 
232
  # load model
233
- progress(0.22, desc="loading model")
234
  model, vocab, _, collator_config = ComposerTX.from_hf(
235
  "tahoebio/TahoeX1",
236
  "70m",
@@ -238,7 +237,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
238
  )
239
 
240
  # prepare AnnData
241
- progress(0.30, desc="preparing AnnData")
242
  gene_id_key = feature_col
243
  adata.var["id_in_vocab"] = [vocab[gene] if gene in vocab else -1 for gene in adata.var[gene_id_key]]
244
  gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
@@ -252,7 +251,7 @@ def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
252
  gene_ids = np.array([vocab[gene] for gene in genes], dtype=int)
253
 
254
  # create data loader
255
- progress(0.35, desc="creating data loader")
256
  count_matrix = _pick_layer(adata, layer_name)
257
  dataset = CountDataset(
258
  count_matrix,
@@ -349,7 +348,7 @@ def run_pipeline(fileobj, layer_choice, var_choice, obs_choice, use_symbols, pro
349
  raise gr.Error("Please select a .var column.")
350
 
351
  # read upload file to bytes so the GPU function can load it
352
- progress(0.05, desc="reading AnnData")
353
  with open(fileobj.name, "rb") as f:
354
  adata_bytes = f.read()
355
 
@@ -370,7 +369,7 @@ def run_pipeline(fileobj, layer_choice, var_choice, obs_choice, use_symbols, pro
370
  adata = sc.read_h5ad(tmp_in, backed=None)
371
 
372
  # compute UMAP
373
- progress(0.85, desc="computing UMAP")
374
  color_series = adata.obs[obs_choice] if (obs_choice and obs_choice in adata.obs) else None
375
  coords = _compute_umap_from_emb(E)
376
  adata.obsm["X_umap"] = coords
 
26
  APP_TITLE = "Tx1-70M Embeddings"
27
  APP_DESC = """
28
  Upload an AnnData, compute Tx1-70M embeddings,
29
+ preview a UMAP, and download the results. **Limits:**
30
+ Files up to 5GB. If an AnnData contains more
 
31
  than 50K cells, embeddings will be computed **only
32
  for the first 50K cells**.
33
  """
 
125
 
126
  # custom callback to report progress to Gradio
127
  class GradioProgressCallback(Callback):
128
+ def __init__(self, progress, total_batches, start=0.25, end=0.75):
129
  self.progress = progress
130
  self.total = max(1, int(total_batches))
131
  self.seen = 0
 
141
  def _embed(adata_bytes, layer_name, feature_col, use_symbols, progress):
142
 
143
  # retrieve AnnData from bytes
144
+ progress(0.05, desc="loading AnnData")
145
  with tempfile.TemporaryDirectory() as td:
146
 
147
  # persist to a temporary file
 
229
  raise gr.Error(f"Feature column '{feature_col}' does not appear to contain Ensembl gene IDs. If the column contains gene symbols, use the checkbox.")
230
 
231
  # load model
232
+ progress(0.15, desc="loading model")
233
  model, vocab, _, collator_config = ComposerTX.from_hf(
234
  "tahoebio/TahoeX1",
235
  "70m",
 
237
  )
238
 
239
  # prepare AnnData
240
+ progress(0.20, desc="preparing AnnData")
241
  gene_id_key = feature_col
242
  adata.var["id_in_vocab"] = [vocab[gene] if gene in vocab else -1 for gene in adata.var[gene_id_key]]
243
  gene_ids_in_vocab = np.array(adata.var["id_in_vocab"])
 
251
  gene_ids = np.array([vocab[gene] for gene in genes], dtype=int)
252
 
253
  # create data loader
254
+ progress(0.22, desc="creating data loader")
255
  count_matrix = _pick_layer(adata, layer_name)
256
  dataset = CountDataset(
257
  count_matrix,
 
348
  raise gr.Error("Please select a .var column.")
349
 
350
  # read upload file to bytes so the GPU function can load it
351
+ progress(0.02, desc="reading AnnData")
352
  with open(fileobj.name, "rb") as f:
353
  adata_bytes = f.read()
354
 
 
369
  adata = sc.read_h5ad(tmp_in, backed=None)
370
 
371
  # compute UMAP
372
+ progress(0.80, desc="computing UMAP")
373
  color_series = adata.obs[obs_choice] if (obs_choice and obs_choice in adata.obs) else None
374
  coords = _compute_umap_from_emb(E)
375
  adata.obsm["X_umap"] = coords