Spaces:
No application file
No application file
| import pandas as pd | |
| import numpy as np | |
| from scipy import stats | |
| from .featureEngineering import parse_timepoint | |
| # Map analyte base names to human labels + units + reference ranges | |
| ## To get sub and superscripts in Markdown I used ChatGPT: https://chatgpt.com/share/68d9c8f6-2674-8008-8ff7-0731bec9ad49 | |
ANALYTE_INFO = {
    # Blood chemistry panel
    "albumin": {"label": "Albumin", "unit": "g/dL"},
    "alkaline_phosphatase": {"label": "Alkaline Phosphatase", "unit": "U/L"},
    "alt": {"label": "ALT", "unit": "U/L"},
    "ast": {"label": "AST", "unit": "U/L"},
    "total_bilirubin": {"label": "Bilirubin", "unit": "mg/dL"},
    "bun_to_creatinine_ratio": {"label": "BUN/Creatinine Ratio", "unit": ""},
    "calcium": {"label": "Ca²⁺", "unit": "mg/dL"},
    "carbon_dioxide": {"label": "CO₂", "unit": "mmol/L"},
    "chloride": {"label": "Cl⁻", "unit": "mmol/L"},
    "creatinine": {"label": "Creatinine", "unit": "mg/dL"},
    "egfr_african_american": {"label": "eGFR (AA)", "unit": "mL/min/1.73m²"},
    "egfr_non_african_american": {"label": "eGFR (non-AA)", "unit": "mL/min/1.73m²"},
    "globulin": {"label": "Globulin", "unit": "g/dL"},
    "glucose": {"label": "Glucose", "unit": "mg/dL"},
    "potassium": {"label": "K⁺", "unit": "mmol/L"},
    "total_protein": {"label": "Protein", "unit": "g/dL"},
    "sodium": {"label": "Na⁺", "unit": "mmol/L"},
    "urea_nitrogen_bun": {"label": "BUN", "unit": "mg/dL"},
    # Derived feature: carries a manual reference range because, unlike the
    # entries above, it has no *_range_min/*_range_max columns in the CSV.
    "anion_gap": {
        "label": "Anion Gap",
        "unit": "mmol/L",
        "min": 8,  # manual reference range
        "max": 24
    },
    # Cardiovascular panel
    "a2_macroglobulin": {"label": "α₂-Macroglobulin", "unit": "ng/mL"},
    "agp": {"label": "AGP (α1-acid glycoprotein)", "unit": "ng/mL"},
    "crp": {"label": "CRP (C-reactive protein)", "unit": "pg/mL"},
    "fetuin_a36": {"label": "Fetuin A3/6", "unit": "ng/mL"},
    "fibrinogen": {"label": "Fibrinogen", "unit": "ng/mL"},
    "haptoglobin": {"label": "Haptoglobin", "unit": "ng/mL"},
    "l_selectin": {"label": "L-Selectin", "unit": "pg/mL"},
    "pf4": {"label": "Platelet Factor 4", "unit": "ng/mL"},
}
| # Helpers to find columns by prefix (robust to unit suffixes) | |
| def _first_col_startswith(df: pd.DataFrame, prefixes) -> str | None: | |
| """ | |
| Return the first column whose lowercase name starts with any prefix in `prefixes`. | |
| """ | |
| if isinstance(prefixes, str): | |
| prefixes = [prefixes] | |
| prefixes = [p.lower() for p in prefixes] | |
| for col in df.columns: | |
| cl = col.lower() | |
| for p in prefixes: | |
| if cl.startswith(p): | |
| return col | |
| return None | |
def _value_min_max_cols(df: pd.DataFrame, analyte: str):
    """
    Resolve the (value, min, max) column names for *analyte* in *df*.

    Clinical-chemistry analytes appear as ``<analyte>_value…`` columns while
    the cardiovascular panel uses ``<analyte>_concentration…``. Reference
    ranges may be named ``<analyte>_range_min/max`` or ``<analyte>_min/max``.
    Any column that cannot be found is returned as ``None``.
    """
    value_col = _first_col_startswith(df, f"{analyte}_value")
    if value_col is None:
        # Cardiovascular panel naming convention.
        value_col = _first_col_startswith(df, f"{analyte}_concentration")
    lo_col = _first_col_startswith(df, (f"{analyte}_range_min", f"{analyte}_min"))
    hi_col = _first_col_startswith(df, (f"{analyte}_range_max", f"{analyte}_max"))
    return value_col, lo_col, hi_col
| # Tidy Transformation | |
def tidy_from_wide(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform the wide astronaut CSV (value/min/max triplets per analyte)
    into tidy long format.

    Each output row is one (astronaut, timepoint, analyte) observation with
    columns [astronautID, timepoint, flight_day, analyte, value, min, max,
    label, unit, sex]. Reference ranges come from the CSV's range columns
    when present, otherwise from the manual range in ANALYTE_INFO.

    The derived Anion Gap analyte (Na⁺ − (Cl⁻ + CO₂)) is computed here from
    its component analytes — it has no source column of its own. (Previously
    ANALYTE_INFO declared it and this docstring promised it, but the loop
    skipped it and nothing ever computed it.)

    Raises:
        KeyError: if the input lacks astronautID/timepoint columns.
    """
    tidy_records = []
    # Case-insensitive lookup for the id/timepoint columns.
    colmap = {c.lower(): c for c in df.columns}
    astronaut_col = colmap.get("astronautid")
    timepoint_col = colmap.get("timepoint")
    if astronaut_col is None or timepoint_col is None:
        raise KeyError("Expected astronautID and timepoint columns in input CSV")

    def _record(row, analyte, meta, value, mn, mx):
        # Build one tidy row for a single (astronaut, timepoint, analyte).
        return {
            "astronautID": row[astronaut_col],
            "timepoint": row[timepoint_col],
            "flight_day": parse_timepoint(row[timepoint_col]),
            "analyte": analyte,
            "value": value,
            "min": mn,
            "max": mx,
            "label": meta["label"],
            "unit": meta["unit"],
            # NOTE(review): sex is hard-coded by astronaut ID — confirm
            # against the mission roster before adding new IDs.
            "sex": "Male" if str(row[astronaut_col]) in ["C001", "C004"] else "Female",
        }

    for analyte, meta in ANALYTE_INFO.items():
        if analyte == "anion_gap":
            continue  # derived below from Na⁺/Cl⁻/CO₂; no source column
        value_col, min_col, max_col = _value_min_max_cols(df, analyte)
        if value_col is None:
            continue  # analyte not present in this CSV
        for _, row in df.iterrows():
            tidy_records.append(_record(
                row, analyte, meta,
                row[value_col],
                # CSV range wins; fall back to the manual range (if any).
                row[min_col] if (min_col and pd.notna(row[min_col])) else meta.get("min"),
                row[max_col] if (max_col and pd.notna(row[max_col])) else meta.get("max"),
            ))

    # Derived analyte: Anion Gap = Na⁺ − (Cl⁻ + CO₂) (CO₂ serving as the
    # bicarbonate term), matching the manual 8–24 mmol/L range in ANALYTE_INFO.
    na_col, _, _ = _value_min_max_cols(df, "sodium")
    cl_col, _, _ = _value_min_max_cols(df, "chloride")
    co2_col, _, _ = _value_min_max_cols(df, "carbon_dioxide")
    if na_col and cl_col and co2_col:
        gap_meta = ANALYTE_INFO["anion_gap"]
        for _, row in df.iterrows():
            components = (row[na_col], row[cl_col], row[co2_col])
            if all(pd.notna(v) for v in components):
                gap = float(components[0]) - (float(components[1]) + float(components[2]))
            else:
                gap = np.nan  # any missing component → no gap for this row
            tidy_records.append(_record(
                row, "anion_gap", gap_meta, gap,
                gap_meta.get("min"), gap_meta.get("max"),
            ))

    return pd.DataFrame(tidy_records)
| # Statistical Comparison: R+1 vs L-series | |
def _split_L_vs_R1(adf: pd.DataFrame):
    """Return (L-series values, R+1 values) as float Series for one astronaut."""
    tp = adf["timepoint"].astype(str)
    L_vals = adf.loc[tp.str.startswith("L"), "value"].dropna().astype(float)
    R1_vals = adf.loc[tp.isin(["R+1", "R1", "R+01"]), "value"].dropna().astype(float)
    return L_vals, R1_vals


def analyze_r1_vs_L(tidy: pd.DataFrame) -> pd.DataFrame:
    """
    Compare R+1 (first day post-return) against the L-series (pre-flight)
    baseline for each analyte.

    - Within-astronaut: one-sample t-test of the L-series against the R+1
      value (H0: mean(L) == R+1). Reports n, mean, std, SE, t, two-sided p,
      and Cohen's d = (R+1 − mean(L)) / std(L) (positive when R+1 is above
      baseline).
    - Across-astronauts: paired t-test of per-astronaut mean(L) vs R+1.
      Reports group means, std/SEM of the per-astronaut L-means, t, p, and
      Cohen's d = mean(diff) / std(diff).

    Requires columns [analyte, astronautID, timepoint, value]. An astronaut
    contributes only with >= 2 L-series values and exactly one R+1 value;
    the group row (astronautID == "ALL") appears when >= 2 astronauts qualify.
    """
    results = []
    for analyte, subdf in tidy.groupby("analyte"):
        # Per-astronaut summaries double as inputs to the group-level test,
        # so one pass replaces the two identical filtering loops.
        astronaut_means, astronaut_R1 = [], []
        for astronaut, adf in subdf.groupby("astronautID"):
            L_vals, R1_vals = _split_L_vs_R1(adf)
            if not (len(L_vals) >= 2 and len(R1_vals) == 1):
                continue
            R1 = float(R1_vals.iloc[0])
            mean_L = float(L_vals.mean())
            std_L = float(L_vals.std(ddof=1))
            n_L = int(L_vals.shape[0])
            astronaut_means.append(mean_L)
            astronaut_R1.append(R1)
            if std_L > 0:
                se = std_L / np.sqrt(n_L)
                t_stat = (mean_L - R1) / se
                p_val = 2 * stats.t.sf(abs(t_stat), df=n_L - 1)  # two-sided
                cohen_d = (R1 - mean_L) / std_L
            else:
                # Zero baseline variance: the test statistic is undefined.
                se = t_stat = p_val = cohen_d = np.nan
            results.append({
                "analyte": analyte,
                "astronautID": astronaut,
                "test_type": "within",
                "n_L": n_L,
                "mean_L": round(mean_L, 2),
                "R1": round(R1, 2),
                "std_L": round(std_L, 2),
                "se_L": round(se, 2) if pd.notna(se) else np.nan,
                "t_stat": round(t_stat, 3) if pd.notna(t_stat) else np.nan,
                "p_value": round(p_val, 4) if pd.notna(p_val) else np.nan,
                "effect_size": round(cohen_d, 3) if pd.notna(cohen_d) else np.nan,
            })

        if len(astronaut_means) >= 2:
            diffs = np.array(astronaut_R1) - np.array(astronaut_means)
            t_stat, p_val = stats.ttest_rel(astronaut_R1, astronaut_means)
            std_L = float(np.std(astronaut_means, ddof=1))
            se_L = std_L / np.sqrt(len(astronaut_means))
            cohen_d = diffs.mean() / diffs.std(ddof=1) if diffs.std(ddof=1) > 0 else np.nan
            results.append({
                "analyte": analyte,
                "astronautID": "ALL",
                "test_type": "group",
                "n_L": len(astronaut_means),
                "mean_L": round(float(np.mean(astronaut_means)), 2),
                "R1": round(float(np.mean(astronaut_R1)), 2),
                # std/SEM across astronauts were previously computed but never
                # reported, despite the docstring promising them.
                "std_L": round(std_L, 2),
                "se_L": round(se_L, 2),
                "t_stat": round(float(t_stat), 3),
                "p_value": round(float(p_val), 4),
                "effect_size": round(float(cohen_d), 3) if pd.notna(cohen_d) else np.nan,
            })
    return pd.DataFrame(results)