BioRLHF / create_sft_dataset_expanded.py
jang1563's picture
Initial commit: BioRLHF v0.1.0
c7ebaa1
#!/usr/bin/env python3
"""
BioRLHF SFT Dataset Generator - EXPANDED VERSION
Creates 100+ instruction-tuning examples from KMP 2x2x2 factorial mouse data
"""
import json
import argparse
from typing import List, Dict
import random
# =============================================================================
# GROUND TRUTH DATA
# =============================================================================
# DEG counts per tissue for hindlimb unloading (HU), ionizing radiation (IR)
# and the combined HU+IR condition, plus up/down splits for HU and IR.
STRESSOR_EFFECTS = {
    'Heart': {
        'HU': 165, 'IR': 33, 'HU_IR': 910,
        'HU_up': 67, 'HU_down': 98,
        'IR_up': 17, 'IR_down': 16,
    },
    'Hippocampus': {
        'HU': 1555, 'IR': 5477, 'HU_IR': 5510,
        'HU_up': 711, 'HU_down': 844,
        'IR_up': 2554, 'IR_down': 2923,
    },
    'Liver': {
        'HU': 4110, 'IR': 1273, 'HU_IR': 6213,
        'HU_up': 2189, 'HU_down': 1921,
        'IR_up': 413, 'IR_down': 860,
    },
    'Soleus': {
        'HU': 6425, 'IR': 67, 'HU_IR': 6830,
        'HU_up': 3251, 'HU_down': 3174,
        'IR_up': 28, 'IR_down': 39,
    },
}

# KMP DEG counts per tissue at baseline and within each stress condition.
# Directional (up/down) splits are only present for some tissue/condition pairs.
KMP_EFFECTS = {
    'Heart': {
        'baseline': 112, 'in_HU': 2, 'in_IR': 2, 'in_HU_IR': 2110,
        'in_HU_IR_up': 1336, 'in_HU_IR_down': 774,
    },
    'Hippocampus': {
        'baseline': 4110, 'in_HU': 1, 'in_IR': 243, 'in_HU_IR': 140,
        'baseline_up': 1813, 'baseline_down': 2297,
    },
    'Liver': {
        'baseline': 309, 'in_HU': 17, 'in_IR': 389, 'in_HU_IR': 3,
    },
    'Soleus': {
        'baseline': 0, 'in_HU': 1, 'in_IR': 52, 'in_HU_IR': 491,
        'in_HU_IR_up': 425, 'in_HU_IR_down': 66,
    },
}

# DEG counts attributed to each pairwise interaction term, per tissue.
INTERACTIONS = {
    'Heart': {'HU_x_IR': 244, 'KMP_x_HU': 479, 'KMP_x_IR': 29},
    'Hippocampus': {'HU_x_IR': 93, 'KMP_x_HU': 36, 'KMP_x_IR': 1221},
    'Liver': {'HU_x_IR': 3210, 'KMP_x_HU': 3369, 'KMP_x_IR': 247},
    'Soleus': {'HU_x_IR': 211, 'KMP_x_HU': 8484, 'KMP_x_IR': 484},
}

# KMP response class assigned to each tissue.
TISSUE_TYPES = {
    'Heart': 'Type A (stress-activated)',
    'Soleus': 'Type A (stress-activated)',
    'Hippocampus': 'Type B (baseline-active)',
    'Liver': 'Type C (stress-blocked)',
}

# OXPHOS enrichment scores (NES) under stress and with KMP;
# 'delta' equals KMP_NES - stress_NES.
OXPHOS_PATTERNS = {
    'Heart': {
        'stress_NES': -2.302, 'KMP_NES': 3.691,
        'pattern': 'RESCUE', 'delta': 5.993,
    },
    'Hippocampus': {
        'stress_NES': 0.931, 'KMP_NES': 1.585,
        'pattern': 'KMP Only', 'delta': 0.654,
    },
    'Liver': {
        'stress_NES': 3.596, 'KMP_NES': -1.6,
        'pattern': 'SUPPRESSION', 'delta': -5.196,
    },
    'Soleus': {
        'stress_NES': -2.997, 'KMP_NES': 2.46,
        'pattern': 'RESCUE', 'delta': 5.457,
    },
}

# Per-tissue NES values for additional pathways (stress vs. KMP) together
# with the pattern label reported for each.
PATHWAY_PATTERNS = {
    'Heart': {
        'FATTY_ACID_METABOLISM': {'stress': -2.371, 'kmp': 3.1, 'pattern': 'RESCUE'},
        'ADIPOGENESIS': {'stress': -1.839, 'kmp': 2.81, 'pattern': 'RESCUE'},
        'MTORC1_SIGNALING': {'stress': -1.662, 'kmp': 2.585, 'pattern': 'RESCUE'},
        'INTERFERON_ALPHA': {'stress': -2.072, 'kmp': 1.581, 'pattern': 'RESCUE'},
    },
    'Liver': {
        'MTORC1_SIGNALING': {'stress': 3.075, 'kmp': -1.678, 'pattern': 'SUPPRESSION'},
        'INTERFERON_GAMMA': {'stress': 1.542, 'kmp': -2.336, 'pattern': 'SUPPRESSION'},
    },
    'Soleus': {
        'FATTY_ACID_METABOLISM': {'stress': -2.418, 'kmp': 1.506, 'pattern': 'RESCUE'},
    },
}
def _fmt_deg_count(value) -> str:
    """Format a DEG count with thousands separators.

    Non-integer placeholders (such as the 'N/A' fallback used when a
    directional count is missing) are returned as plain strings.
    """
    if isinstance(value, int):
        return f"{value:,}"
    return str(value)


def generate_factual_basic() -> List[Dict]:
    """Build basic factual Q&A examples about stressor DEG counts.

    For every tissue in STRESSOR_EFFECTS this emits four examples: the HU
    effect, the IR effect, the combined HU+IR effect, and an additivity
    verdict comparing the combined count against the sum of the two
    single-stressor counts.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    examples = []
    for tissue, effects in STRESSOR_EFFECTS.items():
        name = tissue.lower()
        hu, ir, combined = effects['HU'], effects['IR'], effects['HU_IR']

        # Directional counts are optional; format them like the totals so a
        # single sentence never mixes "6,425" with "3251".
        hu_up = _fmt_deg_count(effects.get('HU_up', 'N/A'))
        hu_down = _fmt_deg_count(effects.get('HU_down', 'N/A'))
        ir_up = _fmt_deg_count(effects.get('IR_up', 'N/A'))
        ir_down = _fmt_deg_count(effects.get('IR_down', 'N/A'))

        # HU effects with direction
        examples.append({
            "instruction": f"How many genes are differentially expressed due to hindlimb unloading in {name}?",
            "input": "",
            "output": f"Hindlimb unloading causes {hu:,} DEGs in {name} (padj < 0.05), with {hu_up} upregulated and {hu_down} downregulated."
        })

        # IR effects
        examples.append({
            "instruction": f"What is the transcriptomic response to ionizing radiation in {name}?",
            "input": "",
            "output": f"Ionizing radiation induces {ir:,} DEGs in {name}, with {ir_up} upregulated and {ir_down} downregulated."
        })

        # Combined effects
        examples.append({
            "instruction": f"Describe the combined stress response in {name}.",
            "input": "",
            "output": f"Combined HU+IR stress produces {combined:,} DEGs in {name}, representing the integrated response to simultaneous mechanical unloading and radiation exposure."
        })

        # Additivity check: anything within +/-20% of the plain sum counts
        # as approximately additive.
        additive = hu + ir
        if combined > additive * 1.2:
            add_type = "synergistic (more than additive)"
        elif combined < additive * 0.8:
            add_type = "sub-additive (less than sum)"
        else:
            add_type = "approximately additive"
        examples.append({
            "instruction": f"Is the combined HU+IR effect additive in {name}?",
            "input": "",
            "output": f"In {name}, HU causes {hu:,} DEGs and IR causes {ir:,} DEGs. The combined effect ({combined:,} DEGs) is {add_type}."
        })
    return examples
def generate_factual_kmp() -> List[Dict]:
    """Build factual Q&A examples about KMP effects.

    For every tissue in KMP_EFFECTS this emits a baseline-vs-stress
    comparison and a per-condition summary. A third example about the
    direction of change is added only when both up/down counts for the
    combined-stress condition are present and the total is non-zero.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    examples = []
    for tissue, effects in KMP_EFFECTS.items():
        name = tissue.lower()
        baseline = effects['baseline']
        stressed = effects['in_HU_IR']

        # Baseline vs stress comparison. The denominator is clamped to 1 so
        # a zero-DEG baseline still yields a finite fold change.
        fold = stressed / max(baseline, 1)
        if fold > 5:
            change = "dramatically increases"
        elif fold < 0.2:
            change = "dramatically decreases"
        else:
            change = "moderately changes"
        dynamics = 'stress-activated' if fold > 1 else 'stress-blocked'
        examples.append({
            "instruction": f"How does stress affect KMP's transcriptional activity in {name}?",
            "input": "",
            "output": f"KMP effect {change} from {baseline:,} DEGs at baseline to {stressed:,} DEGs under combined stress in {name}. This indicates {dynamics} pharmacodynamics."
        })

        # Each stress condition (all counts use thousands separators so the
        # same number is rendered identically across examples).
        examples.append({
            "instruction": f"Compare KMP effects across different stress conditions in {name}.",
            "input": "",
            "output": f"In {name}, KMP produces: {baseline:,} DEGs at baseline, {effects['in_HU']:,} DEGs under HU only, {effects['in_IR']:,} DEGs under IR only, and {stressed:,} DEGs under combined HU+IR stress."
        })

        # Direction of KMP effect: needs both directional counts and a
        # non-zero total, otherwise the percentage is undefined.
        if 'in_HU_IR_up' in effects and 'in_HU_IR_down' in effects and stressed:
            up = effects['in_HU_IR_up']
            down = effects['in_HU_IR_down']
            pct_up = up / stressed * 100
            if pct_up > 60:
                signature = 'anabolic/protective'
                implication = 'tissue protection'
            elif pct_up > 40:
                signature = 'mixed'
                implication = 'complex regulation'
            else:
                signature = 'suppressive'
                implication = 'complex regulation'
            examples.append({
                "instruction": f"What is the direction of KMP-induced gene expression changes in {name} under stress?",
                "input": "",
                "output": f"Under combined stress, KMP induces {up:,} upregulated and {down:,} downregulated genes in {name} ({pct_up:.1f}% upregulated). This {signature} signature suggests {implication}."
            })
    return examples
def generate_factual_interactions() -> List[Dict]:
    """Build factual Q&A examples about pairwise interaction terms.

    Each tissue in INTERACTIONS contributes three examples, in this order:
    KMP x HU, KMP x IR and HU x IR. The qualitative wording in each answer
    is chosen from fixed DEG-count thresholds.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    qa_pairs = []
    for tissue, counts in INTERACTIONS.items():
        where = tissue.lower()
        kmp_hu = counts['KMP_x_HU']
        kmp_ir = counts['KMP_x_IR']
        hu_ir = counts['HU_x_IR']

        # KMP x HU: magnitude wording by threshold.
        if kmp_hu > 5000:
            magnitude = 'massive'
        elif kmp_hu > 500:
            magnitude = 'substantial'
        else:
            magnitude = 'moderate'

        # KMP x IR: wording switches above 1000 DEGs.
        if kmp_ir > 1000:
            ir_remark = 'representing the largest radiation-drug interaction'
        else:
            ir_remark = 'indicating modest interaction with radiation stress'

        # HU x IR: wording switches above 1000 DEGs.
        if hu_ir > 1000:
            stressor_remark = 'strong synergistic'
        else:
            stressor_remark = 'moderate non-additive'

        qa_pairs.extend([
            {
                "instruction": f"What is the statistical interaction between KMP and HU in {where}?",
                "input": "",
                "output": f"The KMP × HU interaction produces {kmp_hu:,} DEGs in {where}, indicating {magnitude} non-additive effects."
            },
            {
                "instruction": f"Describe the KMP × IR interaction in {where}.",
                "input": "",
                "output": f"The KMP × IR interaction produces {kmp_ir:,} DEGs in {where}, {ir_remark}."
            },
            {
                "instruction": f"Is there a HU × IR interaction in {where}?",
                "input": "",
                "output": f"Yes, the HU × IR interaction produces {hu_ir:,} DEGs in {where}, indicating the two stressors have {stressor_remark} effects."
            },
        ])
    return qa_pairs
def generate_factual_pathways() -> List[Dict]:
    """Build factual Q&A examples about pathway enrichment patterns.

    Two OXPHOS examples are produced per tissue in OXPHOS_PATTERNS (stress
    response, then KMP shift), followed by one example per tissue/pathway
    pair in PATHWAY_PATTERNS.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    qa_pairs = []

    for tissue, oxphos in OXPHOS_PATTERNS.items():
        where = tissue.lower()
        stress_nes = oxphos['stress_NES']
        label = oxphos['pattern']

        # Negative NES under stress is worded as suppression, else activation.
        if stress_nes < 0:
            stress_direction = 'suppression'
        else:
            stress_direction = 'activation'

        # Interpretation is keyed off the pattern label.
        if 'RESCUE' in label:
            meaning = 'restoration of mitochondrial function'
        elif 'SUPPRESSION' in label:
            meaning = 'metabolic braking'
        else:
            meaning = 'KMP-specific effects'

        qa_pairs.append({
            "instruction": f"What happens to oxidative phosphorylation in {where} under stress?",
            "input": "",
            "output": f"Under combined HU+IR stress, OXPHOS shows NES = {stress_nes:.2f} in {where}, indicating {stress_direction} of mitochondrial respiration."
        })
        qa_pairs.append({
            "instruction": f"How does KMP affect OXPHOS in {where}?",
            "input": "",
            "output": f"KMP shifts OXPHOS NES from {stress_nes:.2f} to {oxphos['KMP_NES']:.2f} in {where} (Δ = {oxphos['delta']:.2f}). This {label} pattern indicates {meaning}."
        })

    for tissue, pathways in PATHWAY_PATTERNS.items():
        where = tissue.lower()
        for pathway, scores in pathways.items():
            # Pathway identifiers use underscores; show them as words.
            readable = pathway.replace('_', ' ')
            qa_pairs.append({
                "instruction": f"What is the {readable.lower()} response in {where}?",
                "input": "",
                "output": f"In {where}, stress produces {readable} NES = {scores['stress']:.2f}, and KMP shifts this to NES = {scores['kmp']:.2f}. Pattern: {scores['pattern']}."
            })

    return qa_pairs
def generate_comparison_questions() -> List[Dict]:
    """Build cross-tissue and cross-condition comparison examples.

    Produces, in order: three ranking examples (HU, IR, KMP under combined
    stress), two examples for every unordered tissue pair, and one
    stressor-dominance example per tissue.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    examples = []

    def ranking(table, key):
        # Descending rank list "1) Tissue (count), 2) ..." for any number
        # of tissues; sorted() is stable, so ties keep table order.
        ordered = sorted(table.items(), key=lambda item: item[1][key], reverse=True)
        return ", ".join(
            f"{rank}) {tissue} ({counts[key]:,})"
            for rank, (tissue, counts) in enumerate(ordered, start=1)
        )

    # Tissue rankings
    examples.append({
        "instruction": "Rank tissues by sensitivity to hindlimb unloading.",
        "input": "",
        "output": f"HU sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'HU')}."
    })
    examples.append({
        "instruction": "Rank tissues by sensitivity to ionizing radiation.",
        "input": "",
        "output": f"IR sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'IR')}."
    })
    examples.append({
        "instruction": "Rank tissues by KMP effect under combined stress.",
        "input": "",
        "output": f"KMP effect under stress: {ranking(KMP_EFFECTS, 'in_HU_IR')}."
    })

    # Pairwise comparisons: every unordered pair once, in alphabetical order.
    tissues = sorted(STRESSOR_EFFECTS)
    for idx, t1 in enumerate(tissues):
        for t2 in tissues[idx + 1:]:
            same_type = TISSUE_TYPES[t1] == TISSUE_TYPES[t2]
            examples.append({
                "instruction": f"Compare {t1.lower()} and {t2.lower()} responses to HU.",
                "input": "",
                "output": f"{t1}: {STRESSOR_EFFECTS[t1]['HU']:,} DEGs. {t2}: {STRESSOR_EFFECTS[t2]['HU']:,} DEGs. {'Same' if same_type else 'Different'} KMP response type."
            })
            # Baseline and stressed counts need an explicit separator;
            # without it the two numbers run together (e.g. "1122,110").
            b1, s1 = KMP_EFFECTS[t1]['baseline'], KMP_EFFECTS[t1]['in_HU_IR']
            b2, s2 = KMP_EFFECTS[t2]['baseline'], KMP_EFFECTS[t2]['in_HU_IR']
            examples.append({
                "instruction": f"Compare KMP context-dependency in {t1.lower()} vs {t2.lower()}.",
                "input": "",
                "output": f"{t1} ({TISSUE_TYPES[t1]}): baseline→stress = {b1:,}→{s1:,}. {t2} ({TISSUE_TYPES[t2]}): {b2:,}→{s2:,}."
            })

    # Stressor dominance: one stressor dominates when it exceeds 3x the other.
    for tissue, effects in STRESSOR_EFFECTS.items():
        hu, ir = effects['HU'], effects['IR']
        if hu > ir * 3:
            dominance = "HU-dominant"
        elif ir > hu * 3:
            dominance = "IR-dominant"
        else:
            dominance = "balanced response"
        examples.append({
            "instruction": f"What stressor dominates the response in {tissue.lower()}?",
            "input": "",
            "output": f"{tissue} shows {dominance}: HU = {hu:,} DEGs, IR = {ir:,} DEGs (ratio = {hu / max(ir, 1):.1f})."
        })
    return examples
def generate_prediction_tasks() -> List[Dict]:
    """Build interaction and cross-tissue prediction examples.

    Produces one "predict combined HU+IR" example per tissue, one "predict
    KMP under stress" example per tissue, and three fixed cross-tissue
    prediction examples.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    examples = []

    # Predict combined from main effects. The verdict uses the same +/-20%
    # tolerance as the additivity question in generate_factual_basic so the
    # dataset never labels one tissue both "approximately additive" and
    # "synergistic".
    for tissue, effects in STRESSOR_EFFECTS.items():
        additive = effects['HU'] + effects['IR']
        combined = effects['HU_IR']
        if combined > additive * 1.2:
            verdict = "The synergistic effect reflects biological interaction between stressors."
        elif combined < additive * 0.8:
            verdict = "The sub-additive effect reflects biological interaction between stressors."
        else:
            verdict = "The combined effect is approximately additive (within 20% of the sum)."
        examples.append({
            "instruction": f"Predict combined HU+IR effect in {tissue.lower()} from main effects.",
            "input": f"HU alone: {effects['HU']:,} DEGs. IR alone: {effects['IR']:,} DEGs.",
            "output": f"Additive prediction: ~{additive:,} DEGs. Actual: {combined:,} DEGs. {verdict}"
        })

    # Predict KMP under stress from baseline
    for tissue, kmp in KMP_EFFECTS.items():
        tissue_type = TISSUE_TYPES[tissue]
        direction = 'increase' if 'stress-activated' in tissue_type else 'decrease'
        # Clamp the denominator so a zero-DEG baseline gives a finite ratio.
        ratio = kmp['in_HU_IR'] / max(kmp['baseline'], 1)
        examples.append({
            "instruction": f"Predict KMP effect under stress in {tissue.lower()}.",
            "input": f"KMP at baseline: {kmp['baseline']:,} DEGs. Tissue type: {tissue_type}.",
            "output": f"Based on tissue type, predict {direction}. Actual: {kmp['in_HU_IR']:,} DEGs. Ratio: {ratio:.1f}x."
        })

    # Cross-tissue predictions
    heart_stressed = KMP_EFFECTS['Heart']['in_HU_IR']
    soleus_stressed = KMP_EFFECTS['Soleus']['in_HU_IR']
    examples.append({
        "instruction": "Given heart (Type A) and soleus (Type A), predict similarity of KMP response.",
        # Taken from the data table rather than repeated as a literal.
        "input": f"Both are Type A (stress-activated). Heart KMP under stress: {heart_stressed:,} DEGs.",
        "output": f"Prediction: Similar stress-activated pattern. Actual soleus: {soleus_stressed:,} DEGs. Both show stress-activated response, confirming Type A classification predicts pharmacodynamics."
    })

    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    examples.append({
        "instruction": "Given heart OXPHOS RESCUE, predict liver OXPHOS pattern.",
        "input": f"Heart: stress {heart_ox['stress_NES']:.1f} → KMP {heart_ox['KMP_NES']:.1f}. Liver is Type C (different).",
        "output": f"Prediction: Different pattern due to metabolic role. Actual: Liver shows SUPPRESSION (stress {liver_ox['stress_NES']:.1f} → KMP {liver_ox['KMP_NES']:.1f}). Opposite pattern confirms tissue-specific mechanisms."
    })

    heart_kmp_hu = INTERACTIONS['Heart']['KMP_x_HU']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    examples.append({
        "instruction": "Predict KMP × HU interaction magnitude in soleus from heart data.",
        "input": f"Heart KMP × HU: {heart_kmp_hu:,} DEGs. Soleus has larger HU response ({STRESSOR_EFFECTS['Soleus']['HU']:,} vs {STRESSOR_EFFECTS['Heart']['HU']:,}).",
        "output": f"Prediction: Larger than heart due to greater HU sensitivity. Actual: {soleus_kmp_hu:,} DEGs ({soleus_kmp_hu / heart_kmp_hu:.0f}x larger). Massive interaction reflects stress-dependent KMP activation."
    })
    return examples
def generate_design_critique() -> List[Dict]:
    """Build experimental-design critique examples.

    Returns seven fixed critique prompts whose answers quote DEG counts
    from KMP_EFFECTS and INTERACTIONS.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    heart = KMP_EFFECTS['Heart']
    liver = KMP_EFFECTS['Liver']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    # Clamp the denominator so a zero-DEG baseline gives a finite ratio.
    heart_fold = heart['in_HU_IR'] / max(heart['baseline'], 1)

    # Quote the range of KMP effects actually present in the table (every
    # tissue, every condition) rather than a single tissue's stressed count.
    conditions = ('baseline', 'in_HU', 'in_IR', 'in_HU_IR')
    all_counts = [
        effects[cond] for effects in KMP_EFFECTS.values() for cond in conditions
    ]
    kmp_min, kmp_max = min(all_counts), max(all_counts)

    examples = []
    examples.append({
        "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection.",
        "input": "",
        "output": f"FLAWED. Heart baseline KMP: {heart['baseline']:,} DEGs. Heart stressed KMP: {heart['in_HU_IR']:,} DEGs. Testing at baseline would miss {heart_fold:.0f}x larger stressed effect. Must include stress conditions."
    })
    examples.append({
        "instruction": "Critique: Extrapolating heart results to predict liver response.",
        "input": "",
        # Counts use thousands separators like every other answer here.
        "output": f"INCORRECT. Heart (Type A) and Liver (Type C) have opposite patterns. Heart OXPHOS: RESCUE. Liver OXPHOS: SUPPRESSION. KMP effect: Heart {heart['baseline']:,} → {heart['in_HU_IR']:,} DEGs, Liver {liver['baseline']:,} → {liver['in_HU_IR']:,} DEGs. Cross-tissue extrapolation invalid."
    })
    examples.append({
        "instruction": "Critique: Using |log2FC| > 1 threshold.",
        "input": "",
        "output": f"PROBLEMATIC. Stringent thresholds miss coordinated small effects. Using padj-only: Heart KMP = {heart['in_HU_IR']:,} DEGs. Many genes with <2-fold change contribute to pathway-level effects like OXPHOS rescue."
    })
    examples.append({
        "instruction": "Critique: Single-tissue study for KMP countermeasure development.",
        "input": "",
        "output": f"INSUFFICIENT. Tissue responses vary dramatically: Heart {heart['in_HU_IR']:,}, Soleus {KMP_EFFECTS['Soleus']['in_HU_IR']:,}, Hippocampus {KMP_EFFECTS['Hippocampus']['in_HU_IR']:,}, Liver {liver['in_HU_IR']:,} DEGs under stress. Type A/B/C classification requires multi-tissue data."
    })
    examples.append({
        "instruction": "Critique: Assuming additive drug-stressor effects.",
        "input": "",
        "output": f"INCORRECT. KMP × HU interactions are highly non-additive. Soleus: {soleus_kmp_hu:,} interaction DEGs. This super-additive effect indicates synergistic mechanisms that additive models cannot capture."
    })
    examples.append({
        "instruction": "Evaluate cell culture KMP data for in vivo predictions.",
        "input": "",
        "output": f"LIMITED VALUE. Cell culture cannot capture: context-dependency (KMP effects range {kmp_min:,}-{kmp_max:,} DEGs), tissue specificity (Type A/B/C), or interaction effects (KMP × HU = {soleus_kmp_hu:,} DEGs). In vivo factorial studies essential."
    })
    examples.append({
        "instruction": "Critique: Human dosing based on mouse efficacy data.",
        "input": "",
        "output": "PREMATURE. Missing: dose-response (single dose tested), human PK, therapeutic window, bioavailability. Mouse-to-human scaling unreliable (10-100x errors). Need: dose-response studies, PK modeling, Phase I trials."
    })
    return examples
def generate_mechanistic_reasoning() -> List[Dict]:
    """Build mechanistic-reasoning examples.

    Returns six fixed prompts whose answers quote counts and NES values
    from KMP_EFFECTS, INTERACTIONS and OXPHOS_PATTERNS.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    hippo = KMP_EFFECTS['Hippocampus']
    liver = KMP_EFFECTS['Liver']
    soleus = KMP_EFFECTS['Soleus']
    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    examples = []
    examples.append({
        "instruction": "Explain hippocampus baseline-active, stress-blocked pattern.",
        "input": "",
        "output": f"Hippocampus: {hippo['baseline']:,} DEGs baseline → {hippo['in_HU_IR']:,} DEGs stressed. Mechanisms: (1) Target saturation by stress signals, (2) Pathway competition with DNA damage response, (3) Altered transcriptional machinery from radiation. Implication: Pre-treatment may be more effective."
    })
    examples.append({
        "instruction": "Explain the massive KMP × HU interaction in soleus.",
        "input": "",
        "output": f"Soleus KMP × HU = {soleus_kmp_hu:,} DEGs (largest interaction). Mechanisms: (1) Mechanosensitive pathways create KMP binding sites, (2) HU upregulates KMP targets normally not expressed, (3) Mitochondrial dysfunction reveals KMP protective capacity. Key insight: KMP specifically protects stressed tissue."
    })
    examples.append({
        "instruction": "Explain opposite OXPHOS patterns in heart vs liver.",
        "input": "",
        "output": f"Heart OXPHOS: stress suppresses ({heart_ox['stress_NES']:.1f}) → KMP rescues ({heart_ox['KMP_NES']:.1f}). Liver OXPHOS: stress activates ({liver_ox['stress_NES']:.1f}) → KMP suppresses ({liver_ox['KMP_NES']:.1f}). Heart needs ATP restoration for contraction. Liver needs metabolic braking to prevent oxidative damage. 'Rescue' is tissue-specific."
    })
    # Question and answer both read the soleus counts from the data table so
    # they cannot drift apart.
    examples.append({
        "instruction": f"Why does soleus show {soleus['baseline']:,} KMP DEGs at baseline but {soleus['in_HU_IR']:,} under stress?",
        "input": "",
        "output": f"Complete stress-dependency. Baseline: KMP targets not expressed or accessible. Under HU stress: Mechanosensitive pathways activate, target expression increases, mitochondrial stress reveals KMP binding sites. Result: {soleus['baseline']:,} → {soleus['in_HU_IR']:,} DEGs. KMP is conditionally active therapeutic."
    })
    # Baseline and stressed counts are joined with an explicit arrow;
    # without it the two numbers run together (e.g. "4,110140").
    examples.append({
        "instruction": "Explain Type A/B/C classification system.",
        "input": "",
        "output": f"Type A (Heart, Soleus): Stress-activated. Baseline minimal, stress maximal. KMP works when needed. Type B (Hippocampus): Baseline-active ({hippo['baseline']:,}→{hippo['in_HU_IR']:,}). Stress blocks. May need pre-treatment. Type C (Liver): Stress-blocked ({liver['baseline']:,}→{liver['in_HU_IR']:,}). Different mechanism needed."
    })
    examples.append({
        "instruction": "Why is KMP × HU larger than KMP × IR in most tissues?",
        "input": "",
        "output": f"KMP × HU: Heart {INTERACTIONS['Heart']['KMP_x_HU']:,}, Liver {INTERACTIONS['Liver']['KMP_x_HU']:,}, Soleus {soleus_kmp_hu:,}. KMP × IR: Heart {INTERACTIONS['Heart']['KMP_x_IR']:,}, Liver {INTERACTIONS['Liver']['KMP_x_IR']:,}, Soleus {INTERACTIONS['Soleus']['KMP_x_IR']:,}. KMP likely targets mechanosensitive/metabolic pathways more than DNA damage pathways."
    })
    return examples
def generate_uncertainty_calibration() -> List[Dict]:
    """Build uncertainty-calibration examples.

    Returns six fixed prompts whose answers state what can and cannot be
    concluded; three of them quote counts from KMP_EFFECTS and INTERACTIONS.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    # Counts quoted in the answers below.
    heart_stressed = KMP_EFFECTS['Heart']['in_HU_IR']
    soleus_stressed = KMP_EFFECTS['Soleus']['in_HU_IR']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    return [
        {
            "instruction": "What is KMP's effect on cardiac contractility?",
            "input": "",
            "output": f"CANNOT DIRECTLY ANSWER. Data shows: {heart_stressed:,} DEGs, OXPHOS rescue. These are transcriptomic changes. NOT shown: functional contractility, protein levels, force generation. Inference: OXPHOS rescue may improve ATP→may improve contractility. Confidence: LOW for function."
        },
        {
            "instruction": "Will KMP prevent human astronaut muscle atrophy?",
            "input": "",
            "output": f"CANNOT CONFIDENTLY ANSWER. Supports: Stress-activated protection ({soleus_stressed} DEGs), KMP × HU interaction ({soleus_kmp_hu:,}), OXPHOS rescue. Missing: Human PK, true microgravity (HU is analog), long-duration effects. Strong rationale but insufficient for prediction."
        },
        {
            "instruction": "Which gene is most important for KMP protection?",
            "input": "",
            "output": "CANNOT DEFINITIVELY ANSWER. Transcriptomics shows correlation, not causation. Large change ≠ causal role. Redundant pathways exist. No knockout validation. Better questions: 'Largest effects?' (answerable), 'Most enriched pathways?' (answerable). Need: CRISPR screening for causality."
        },
        {
            "instruction": "What KMP dose should be used for astronauts?",
            "input": "",
            "output": "CANNOT INFORM DOSING. Missing: Dose-response (one dose tested), human PK, therapeutic window, bioavailability. Mouse-human scaling unreliable. Need: Multi-dose mouse studies, PK modeling, human Phase I. Do not extrapolate."
        },
        {
            "instruction": "Is the 8,484 DEG interaction statistically robust?",
            "input": "",
            "output": f"YES with caveats. Each of {soleus_kmp_hu:,} genes has padj < 0.05. Highly significant. But: exact number is threshold-dependent, effect sizes vary. Appropriate: 'approximately 8,500 DEGs indicating substantial non-additive effect.' Avoid false precision."
        },
        {
            "instruction": "Can we predict KMP effects in kidney from this data?",
            "input": "",
            "output": "CANNOT PREDICT. Kidney not studied. Type A/B/C patterns are tissue-specific and cannot be extrapolated. Kidney has unique metabolic/filtration role. Would need: dedicated kidney experiments. Cross-tissue prediction requires mechanistic validation."
        },
    ]
def generate_application_questions() -> List[Dict]:
    """Build practical application examples.

    Returns four fixed prompts (study design, tissue prioritisation,
    biomarkers, controls) whose answers quote counts from KMP_EFFECTS and
    INTERACTIONS.

    Returns:
        A list of dicts with "instruction", "input" and "output" keys.
    """
    heart = KMP_EFFECTS['Heart']
    soleus = KMP_EFFECTS['Soleus']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    examples = []
    examples.append({
        "instruction": "Design a KMP efficacy study for astronaut protection.",
        "input": "",
        "output": f"Required: (1) Factorial design with stress conditions (not baseline-only), (2) Multiple tissues (Type A/B/C responses differ), (3) Functional endpoints beyond transcriptomics, (4) Dose-response. Rationale: Baseline-only would miss {heart['in_HU_IR']:,}/{heart['baseline']:,} DEG effect in heart."
    })
    examples.append({
        "instruction": "Prioritize tissues for KMP therapeutic development.",
        "input": "",
        "output": f"Priority 1: Soleus - largest interaction ({soleus_kmp_hu:,}), stress-activated, OXPHOS rescue. Priority 2: Heart - strong rescue ({heart['in_HU_IR']:,} DEGs). Lower priority: Hippocampus (stress-blocked), Liver (minimal effect). Focus on Type A tissues."
    })
    examples.append({
        "instruction": "Identify biomarkers for KMP efficacy monitoring.",
        "input": "",
        "output": "Candidate biomarkers: (1) OXPHOS pathway genes (track rescue/suppression), (2) Tissue-type classification (confirm Type A/B/C), (3) Interaction-responsive genes from KMP × HU. Need validation: functional correlation, accessibility (blood-based), temporal dynamics."
    })
    # Baseline and stressed counts are joined with an explicit arrow;
    # without it they run together and read as one number ("0491").
    examples.append({
        "instruction": "What controls are essential for KMP spaceflight studies?",
        "input": "",
        "output": f"Essential controls: (1) Vehicle under all stress conditions (not just baseline), (2) KMP at baseline (to detect context-dependency), (3) Single stressors (HU-only, IR-only) for interaction calculation, (4) Multiple tissues. Missing any control prevents detecting effects like {soleus['baseline']:,}→{soleus['in_HU_IR']:,} shift."
    })
    return examples
def _format_sft_text(example: Dict) -> str:
    """Render one example with the '### Instruction / ### Input / ### Response' template.

    The Input section is omitted when the example's "input" is empty or missing.
    """
    if example.get('input'):
        return (
            f"### Instruction:\n{example['instruction']}\n\n"
            f"### Input:\n{example['input']}\n\n"
            f"### Response:\n{example['output']}"
        )
    return f"### Instruction:\n{example['instruction']}\n\n### Response:\n{example['output']}"


def compile_sft_dataset(output_file: str = 'kmp_sft_dataset.json'):
    """Compile all examples into the final SFT dataset and write it as JSON.

    Runs every generator in a fixed order, renders each example into a
    single "text" field, shuffles deterministically, writes the list to
    ``output_file`` and prints a short summary.

    Args:
        output_file: Path of the JSON file to write.

    Returns:
        The shuffled list of {"text": ...} dicts that was written.
    """
    # (progress label, generator) pairs, in the order examples are collected.
    generators = [
        ("factual basic", generate_factual_basic),
        ("factual KMP", generate_factual_kmp),
        ("factual interaction", generate_factual_interactions),
        ("factual pathway", generate_factual_pathways),
        ("comparison", generate_comparison_questions),
        ("prediction", generate_prediction_tasks),
        ("design critique", generate_design_critique),
        ("mechanistic reasoning", generate_mechanistic_reasoning),
        ("uncertainty calibration", generate_uncertainty_calibration),
        ("application", generate_application_questions),
    ]

    all_examples = []
    for label, generator in generators:
        print(f"Generating {label} examples...")
        all_examples.extend(generator())

    # Format for training
    formatted = [{"text": _format_sft_text(ex)} for ex in all_examples]

    # Shuffle for training. A private Random(42) gives the same order as
    # seeding the module-level generator with 42, without resetting the
    # process-wide random state for the caller.
    random.Random(42).shuffle(formatted)

    # json.dump escapes non-ASCII by default; the explicit encoding keeps the
    # file independent of the platform's locale.
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(formatted, f, indent=2)

    print(f"\n{'='*60}")
    print("SFT Dataset Summary")
    print(f"{'='*60}")
    print(f"Total examples: {len(formatted)}")
    print(f"Output file: {output_file}")
    return formatted
if __name__ == "__main__":
    # Command-line entry point: the only option is the destination JSON path.
    cli = argparse.ArgumentParser()
    cli.add_argument('--output', default='kmp_sft_dataset.json')
    options = cli.parse_args()
    compile_sft_dataset(options.output)