|
""" |
|
Module: calculate_biostats.py |
|
|
|
This module calculates and aggregates biological statistics from single-cell RNA sequencing (scRNA-seq) data |
|
stored in AnnData format. It generates per-category statistics (e.g., disease, cell type, tissue, sex) |
|
and computes the median expression values for genes across datasets. The results are saved as JSON and CSV files |
|
for downstream analysis. |
|
|
|
Main Features: |
|
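- Provides `make_median_list`, which computes per-gene median expression values over the non-zero entries of the "processed" layer of an AnnData file (this function is not invoked by the command-line entry point and must be called separately).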
|
- Generates category-wise statistics (e.g., counts of diseases, cell types, tissues, and sexes). |
|
- Aggregates statistics across multiple training datasets. |
|
- Outputs results in JSON and CSV formats for easy integration with other tools. |
|
|
|
Dependencies:
- anndata: For handling AnnData files.
- numpy: For numerical operations, including median calculations.
- pandas: For creating and exporting tabular data.
- tqdm: For progress visualization during processing.
- datasets: Only for `datasets.utils.logging.disable_progress_bar`.
- glob, json, os, argparse (standard library): For recursive file searching, JSON output, path handling, and argument parsing.
|
|
|
Usage: |
|
- Run this script as a standalone program with the following arguments: |
|
- `--load_dir`: Directory containing the training `.h5ad` files. |
|
- `--stats_dict_name`: Path to save the aggregated statistics JSON file. |
|
""" |
|
|
|
import json |
|
import os |
|
from argparse import ArgumentParser |
|
from glob import glob |
|
|
|
import anndata as ad |
|
import numpy as np |
|
import pandas as pd |
|
from datasets.utils.logging import disable_progress_bar |
|
from tqdm import tqdm |
|
|
|
|
|
def make_median_list(file, out_file):
    """Write the per-gene median of the non-zero "processed" values to JSON.

    The AnnData file at ``file`` is loaded, zeros in its ``"processed"`` layer
    are treated as missing, and the median of the remaining values is taken
    down each column. Column ``k`` is labelled with ``data.var.index[k]``.
    Columns without any non-zero value have no median and are left out of the
    output.

    Args:
        file: Path of the ``.h5ad`` file to read.
        out_file: Path of the JSON file to write; it maps each gene label to
            its median as a plain Python number.

    Raises:
        KeyError: If the file has no ``"processed"`` layer.
    """
    import warnings

    data = ad.read_h5ad(file)
    gene_index = data.var.index

    layer = data.layers["processed"]
    # Sparse layers have to be densified; a dense layer is copied as it is
    # (the original code failed with AttributeError on dense input).
    all_X = layer.toarray() if hasattr(layer, "toarray") else np.array(layer)

    # NaN cannot be stored in integer/bool arrays, so promote those first.
    # Floating-point input keeps its dtype so results are unchanged.
    if not np.issubdtype(all_X.dtype, np.floating):
        all_X = all_X.astype(np.float64)

    # Zeros count as "not expressed" and must not pull the median down.
    all_X[all_X == 0] = np.nan

    with warnings.catch_warnings():
        # Columns that are entirely NaN are expected and filtered out below;
        # silence numpy's "All-NaN slice encountered" warning for them.
        warnings.simplefilter("ignore", category=RuntimeWarning)
        median = np.nanmedian(all_X, axis=0)

    num_median = np.where(~np.isnan(median))[0]
    # ``.item()`` converts numpy scalars to JSON-serialisable Python numbers.
    median_dict = {gene_index[k]: median[k].item() for k in num_median}

    with open(out_file, "w", encoding="utf-8") as f:
        json.dump(median_dict, f, indent=4)
|
|
|
|
|
# Observation columns for which value counts are collected.
_CATEGORIES = ("disease", "cell_type", "tissue", "sex")


def merge_counts(total, counts):
    """Add every value of ``counts`` onto the matching key of ``total``.

    ``total`` is modified in place (keys it does not have yet start at 0)
    and is also returned for convenience.
    """
    for key, value in counts.items():
        total[key] = total.get(key, 0) + value
    return total


def csv_path_for_category(stats_path, category):
    """Return the CSV path for ``category`` derived from ``stats_path``.

    A trailing ``.json`` extension is replaced by ``_<category>.csv``. For any
    other name the suffix is appended, so the CSV path always differs from
    the JSON summary path and directory names are never rewritten.
    """
    root, ext = os.path.splitext(stats_path)
    base = root if ext.lower() == ".json" else stats_path
    return f"{base}_{category}.csv"


def main():
    """Compute category counts for all training files and save the results.

    Steps:
      1. Find every ``train_*.h5ad`` file below ``--load_dir`` (recursively).
      2. Count the values of each column in ``_CATEGORIES`` of ``obs`` and
         write them to ``bio_stats.json`` in the file's directory. Files that
         share a directory are summed into that directory's single JSON file.
      3. Sum the per-directory counts into one summary, written as JSON to
         ``--stats_dict_name`` and as one CSV per category next to it.
    """
    parser = ArgumentParser()
    parser.add_argument("--load_dir", default="")
    parser.add_argument("--stats_dict_name", default="")
    args = parser.parse_args()
    disable_progress_bar()

    all_train = list(glob(args.load_dir + "/**/train_*.h5ad", recursive=True))

    print("Generating individual stats")
    # Keyed by directory: several training files in one directory would
    # otherwise overwrite each other's bio_stats.json, and the surviving file
    # would then be counted once per training file during collection.
    stats_by_dir = {}
    for train in tqdm(all_train):
        # Only ``obs`` is read, so read-only backed mode is sufficient and
        # also works on write-protected data.
        data = ad.read_h5ad(train, backed="r")
        try:
            stats = {cat: data.obs[cat].value_counts().to_dict() for cat in _CATEGORIES}
        finally:
            # Release the HDF5 handle held open by backed mode.
            data.file.close()

        train_dir = os.path.dirname(train)
        dir_stats = stats_by_dir.setdefault(train_dir, {cat: {} for cat in _CATEGORIES})
        for cat in _CATEGORIES:
            merge_counts(dir_stats[cat], stats[cat])

        # Rewritten after every file so finished work survives a later crash.
        with open(os.path.join(train_dir, "bio_stats.json"), "w", encoding="utf-8") as f:
            json.dump(dir_stats, f, indent=4)

    print("Collecting stats")
    summary_dict = {cat: {} for cat in _CATEGORIES}
    # Each directory is visited exactly once. Reading the JSON back (rather
    # than reusing the in-memory dicts) keeps the keys in their serialised
    # string form, as in the original two-pass implementation.
    for train_dir in tqdm(list(stats_by_dir)):
        with open(os.path.join(train_dir, "bio_stats.json"), encoding="utf-8") as f:
            stats = json.load(f)
        for cat in _CATEGORIES:
            merge_counts(summary_dict[cat], stats[cat])

    # ``os.makedirs("")`` raises, so only create a directory when the output
    # path actually has a directory component.
    out_dir = os.path.dirname(args.stats_dict_name)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    with open(args.stats_dict_name, "w", encoding="utf-8") as f:
        json.dump(summary_dict, f, indent=4)

    for cat in _CATEGORIES:
        df = pd.DataFrame.from_dict(summary_dict[cat], orient="index", columns=["Counts"])
        df.to_csv(csv_path_for_category(args.stats_dict_name, cat))


if __name__ == "__main__":
    main()
|
|