| |
| """ |
| BioRLHF SFT Dataset Generator - EXPANDED VERSION |
| Creates 100+ instruction-tuning examples from KMP 2x2x2 factorial mouse data |
| """ |
|
|
| import json |
| import argparse |
| from typing import List, Dict |
| import random |
|
|
| |
| |
| |
|
|
# Differentially expressed gene (DEG) counts per tissue for each stressor
# contrast, as quoted by the generators below: HU (hindlimb unloading),
# IR (ionizing radiation) and HU_IR (both combined). The *_up / *_down keys
# split a contrast's total into upregulated and downregulated genes.
STRESSOR_EFFECTS = {
    "Heart": {
        "HU": 165, "IR": 33, "HU_IR": 910,
        "HU_up": 67, "HU_down": 98,
        "IR_up": 17, "IR_down": 16,
    },
    "Hippocampus": {
        "HU": 1555, "IR": 5477, "HU_IR": 5510,
        "HU_up": 711, "HU_down": 844,
        "IR_up": 2554, "IR_down": 2923,
    },
    "Liver": {
        "HU": 4110, "IR": 1273, "HU_IR": 6213,
        "HU_up": 2189, "HU_down": 1921,
        "IR_up": 413, "IR_down": 860,
    },
    "Soleus": {
        "HU": 6425, "IR": 67, "HU_IR": 6830,
        "HU_up": 3251, "HU_down": 3174,
        "IR_up": 28, "IR_down": 39,
    },
}
|
|
# DEG counts attributed to KMP per tissue in each context: at baseline,
# under HU only (in_HU), under IR only (in_IR) and under combined stress
# (in_HU_IR). Up/down splits are only recorded for some tissue/context
# pairs, so consumers must treat those keys as optional.
KMP_EFFECTS = {
    "Heart": {
        "baseline": 112, "in_HU": 2, "in_IR": 2, "in_HU_IR": 2110,
        "in_HU_IR_up": 1336, "in_HU_IR_down": 774,
    },
    "Hippocampus": {
        "baseline": 4110, "in_HU": 1, "in_IR": 243, "in_HU_IR": 140,
        "baseline_up": 1813, "baseline_down": 2297,
    },
    "Liver": {
        "baseline": 309, "in_HU": 17, "in_IR": 389, "in_HU_IR": 3,
    },
    "Soleus": {
        "baseline": 0, "in_HU": 1, "in_IR": 52, "in_HU_IR": 491,
        "in_HU_IR_up": 425, "in_HU_IR_down": 66,
    },
}
|
|
# DEG counts for the pairwise interaction terms per tissue
# (HU x IR, KMP x HU, KMP x IR).
INTERACTIONS = {
    "Heart": {"HU_x_IR": 244, "KMP_x_HU": 479, "KMP_x_IR": 29},
    "Hippocampus": {"HU_x_IR": 93, "KMP_x_HU": 36, "KMP_x_IR": 1221},
    "Liver": {"HU_x_IR": 3210, "KMP_x_HU": 3369, "KMP_x_IR": 247},
    "Soleus": {"HU_x_IR": 211, "KMP_x_HU": 8484, "KMP_x_IR": 484},
}
|
|
# KMP response class assigned to each tissue. The generators match on the
# parenthesised phrase (e.g. 'stress-activated'), so the label text is part
# of the program's behaviour and must not be reworded casually.
TISSUE_TYPES = {
    "Heart": "Type A (stress-activated)",
    "Soleus": "Type A (stress-activated)",
    "Hippocampus": "Type B (baseline-active)",
    "Liver": "Type C (stress-blocked)",
}
|
|
# Oxidative phosphorylation (OXPHOS) NES values per tissue under stress and
# after the KMP shift, with the pattern label used in the generated text.
# In every row, delta equals KMP_NES - stress_NES.
OXPHOS_PATTERNS = {
    "Heart": {
        "stress_NES": -2.302,
        "KMP_NES": 3.691,
        "pattern": "RESCUE",
        "delta": 5.993,
    },
    "Hippocampus": {
        "stress_NES": 0.931,
        "KMP_NES": 1.585,
        "pattern": "KMP Only",
        "delta": 0.654,
    },
    "Liver": {
        "stress_NES": 3.596,
        "KMP_NES": -1.6,
        "pattern": "SUPPRESSION",
        "delta": -5.196,
    },
    "Soleus": {
        "stress_NES": -2.997,
        "KMP_NES": 2.46,
        "pattern": "RESCUE",
        "delta": 5.457,
    },
}
|
|
# Additional pathway-level NES values, keyed by tissue and then by pathway
# name. Each entry holds the NES under stress, the NES after the KMP shift,
# and the pattern label. Not every tissue has entries.
PATHWAY_PATTERNS = {
    "Heart": {
        "FATTY_ACID_METABOLISM": {"stress": -2.371, "kmp": 3.1, "pattern": "RESCUE"},
        "ADIPOGENESIS": {"stress": -1.839, "kmp": 2.81, "pattern": "RESCUE"},
        "MTORC1_SIGNALING": {"stress": -1.662, "kmp": 2.585, "pattern": "RESCUE"},
        "INTERFERON_ALPHA": {"stress": -2.072, "kmp": 1.581, "pattern": "RESCUE"},
    },
    "Liver": {
        "MTORC1_SIGNALING": {"stress": 3.075, "kmp": -1.678, "pattern": "SUPPRESSION"},
        "INTERFERON_GAMMA": {"stress": 1.542, "kmp": -2.336, "pattern": "SUPPRESSION"},
    },
    "Soleus": {
        "FATTY_ACID_METABOLISM": {"stress": -2.418, "kmp": 1.506, "pattern": "RESCUE"},
    },
}
|
|
|
|
def generate_factual_basic() -> List[Dict]:
    """Build basic factual Q&A examples about stressor DEG counts.

    For every tissue in ``STRESSOR_EFFECTS`` four examples are produced: the
    HU response, the IR response, the combined HU+IR response, and whether
    the combined response is additive relative to HU + IR.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """

    def fmt(count):
        # Up/down splits are optional in the table. Missing ones render as
        # "N/A"; present ones get the same thousands separators as the
        # totals quoted in the same sentence.
        return "N/A" if count is None else f"{count:,}"

    examples = []

    for tissue, effects in STRESSOR_EFFECTS.items():
        name = tissue.lower()
        hu = effects['HU']
        ir = effects['IR']
        combined = effects['HU_IR']

        examples.append({
            "instruction": f"How many genes are differentially expressed due to hindlimb unloading in {name}?",
            "input": "",
            "output": f"Hindlimb unloading causes {hu:,} DEGs in {name} (padj < 0.05), with {fmt(effects.get('HU_up'))} upregulated and {fmt(effects.get('HU_down'))} downregulated.",
        })

        examples.append({
            "instruction": f"What is the transcriptomic response to ionizing radiation in {name}?",
            "input": "",
            "output": f"Ionizing radiation induces {ir:,} DEGs in {name}, with {fmt(effects.get('IR_up'))} upregulated and {fmt(effects.get('IR_down'))} downregulated.",
        })

        examples.append({
            "instruction": f"Describe the combined stress response in {name}.",
            "input": "",
            "output": f"Combined HU+IR stress produces {combined:,} DEGs in {name}, representing the integrated response to simultaneous mechanical unloading and radiation exposure.",
        })

        # Classify the combined response against the naive HU + IR sum,
        # allowing a +/-20% band before calling it non-additive.
        additive = hu + ir
        if combined > additive * 1.2:
            add_type = "synergistic (more than additive)"
        elif combined < additive * 0.8:
            add_type = "sub-additive (less than sum)"
        else:
            add_type = "approximately additive"

        examples.append({
            "instruction": f"Is the combined HU+IR effect additive in {name}?",
            "input": "",
            "output": f"In {name}, HU causes {hu:,} DEGs and IR causes {ir:,} DEGs. The combined effect ({combined:,} DEGs) is {add_type}.",
        })

    return examples
|
|
|
|
def generate_factual_kmp() -> List[Dict]:
    """Build factual Q&A examples about KMP effects per tissue.

    Each tissue in ``KMP_EFFECTS`` yields a baseline-versus-stress example and
    a cross-condition comparison. A third example on the direction of change
    is added only when both the up and down splits under combined stress are
    available and the combined count is non-zero.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    examples = []

    for tissue, effects in KMP_EFFECTS.items():
        name = tissue.lower()
        baseline = effects['baseline']
        stressed = effects['in_HU_IR']

        # Clamp the denominator so a tissue with zero baseline DEGs does not
        # divide by zero; the fold then equals the stressed count.
        fold = stressed / max(baseline, 1)
        if fold > 5:
            change = "dramatically increases"
        elif fold < 0.2:
            change = "dramatically decreases"
        else:
            change = "moderately changes"
        response = 'stress-activated' if fold > 1 else 'stress-blocked'

        examples.append({
            "instruction": f"How does stress affect KMP's transcriptional activity in {name}?",
            "input": "",
            "output": f"KMP effect {change} from {baseline:,} DEGs at baseline to {stressed:,} DEGs under combined stress in {name}. This indicates {response} pharmacodynamics.",
        })

        examples.append({
            "instruction": f"Compare KMP effects across different stress conditions in {name}.",
            "input": "",
            "output": f"In {name}, KMP produces: {baseline:,} DEGs at baseline, {effects['in_HU']:,} DEGs under HU only, {effects['in_IR']:,} DEGs under IR only, and {stressed:,} DEGs under combined HU+IR stress.",
        })

        # The direction example needs both splits and a non-zero total;
        # skip it rather than raise when the table row is incomplete.
        up = effects.get('in_HU_IR_up')
        down = effects.get('in_HU_IR_down')
        if up is None or down is None or stressed == 0:
            continue

        pct_up = up / stressed * 100
        if pct_up > 60:
            signature, implication = 'anabolic/protective', 'tissue protection'
        elif pct_up > 40:
            signature, implication = 'mixed', 'complex regulation'
        else:
            signature, implication = 'suppressive', 'complex regulation'

        examples.append({
            "instruction": f"What is the direction of KMP-induced gene expression changes in {name} under stress?",
            "input": "",
            "output": f"Under combined stress, KMP induces {up:,} upregulated and {down:,} downregulated genes in {name} ({pct_up:.1f}% upregulated). This {signature} signature suggests {implication}.",
        })

    return examples
|
|
|
|
def generate_factual_interactions() -> List[Dict]:
    """Build factual Q&A examples about interaction-term DEG counts.

    Three examples are emitted per tissue in ``INTERACTIONS``, in the order
    KMP x HU, KMP x IR, HU x IR.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    collected = []

    for tissue_name, counts in INTERACTIONS.items():
        label = tissue_name.lower()
        kmp_hu = counts['KMP_x_HU']
        kmp_ir = counts['KMP_x_IR']
        hu_ir = counts['HU_x_IR']

        # Qualitative size bucket for the KMP x HU term.
        if kmp_hu > 5000:
            magnitude = 'massive'
        elif kmp_hu > 500:
            magnitude = 'substantial'
        else:
            magnitude = 'moderate'

        if kmp_ir > 1000:
            ir_note = 'representing the largest radiation-drug interaction'
        else:
            ir_note = 'indicating modest interaction with radiation stress'

        if hu_ir > 1000:
            strength = 'strong synergistic'
        else:
            strength = 'moderate non-additive'

        collected += [
            {
                "instruction": f"What is the statistical interaction between KMP and HU in {label}?",
                "input": "",
                "output": f"The KMP × HU interaction produces {kmp_hu:,} DEGs in {label}, indicating {magnitude} non-additive effects.",
            },
            {
                "instruction": f"Describe the KMP × IR interaction in {label}.",
                "input": "",
                "output": f"The KMP × IR interaction produces {kmp_ir:,} DEGs in {label}, {ir_note}.",
            },
            {
                "instruction": f"Is there a HU × IR interaction in {label}?",
                "input": "",
                "output": f"Yes, the HU × IR interaction produces {hu_ir:,} DEGs in {label}, indicating the two stressors have {strength} effects.",
            },
        ]

    return collected
|
|
|
|
def generate_factual_pathways() -> List[Dict]:
    """Build factual Q&A examples about pathway-level NES patterns.

    The OXPHOS examples come first (two per tissue in ``OXPHOS_PATTERNS``),
    followed by one example per tissue/pathway pair in ``PATHWAY_PATTERNS``.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    rows = []

    for organ, oxphos in OXPHOS_PATTERNS.items():
        organ_lc = organ.lower()
        stress_nes = oxphos['stress_NES']
        kmp_nes = oxphos['KMP_NES']
        label = oxphos['pattern']

        # A negative stress NES is reported as suppression, otherwise
        # as activation.
        direction = 'suppression' if stress_nes < 0 else 'activation'

        # Map the pattern label onto the interpretation quoted in the answer.
        if 'RESCUE' in label:
            interpretation = 'restoration of mitochondrial function'
        elif 'SUPPRESSION' in label:
            interpretation = 'metabolic braking'
        else:
            interpretation = 'KMP-specific effects'

        rows.append({
            "instruction": f"What happens to oxidative phosphorylation in {organ_lc} under stress?",
            "input": "",
            "output": f"Under combined HU+IR stress, OXPHOS shows NES = {stress_nes:.2f} in {organ_lc}, indicating {direction} of mitochondrial respiration.",
        })
        rows.append({
            "instruction": f"How does KMP affect OXPHOS in {organ_lc}?",
            "input": "",
            "output": f"KMP shifts OXPHOS NES from {stress_nes:.2f} to {kmp_nes:.2f} in {organ_lc} (Δ = {oxphos['delta']:.2f}). This {label} pattern indicates {interpretation}.",
        })

    for organ, gene_sets in PATHWAY_PATTERNS.items():
        organ_lc = organ.lower()
        for set_name, scores in gene_sets.items():
            readable = set_name.replace('_', ' ')
            rows.append({
                "instruction": f"What is the {readable.lower()} response in {organ_lc}?",
                "input": "",
                "output": f"In {organ_lc}, stress produces {readable} NES = {scores['stress']:.2f}, and KMP shifts this to NES = {scores['kmp']:.2f}. Pattern: {scores['pattern']}.",
            })

    return rows
|
|
|
|
def generate_comparison_questions() -> List[Dict]:
    """Build cross-tissue and cross-condition comparison examples.

    Produces three ranking examples (HU, IR, KMP under combined stress), two
    examples for every unordered pair of tissues, and one stressor-dominance
    example per tissue.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """

    def ranking(table, key):
        # Rank every tissue in the table by `key`, largest first, and render
        # the list as "1) Tissue (count), 2) ...". This works for any number
        # of tissues and formats every count with thousands separators.
        ordered = sorted(table.items(), key=lambda item: item[1][key], reverse=True)
        return ", ".join(
            f"{rank}) {tissue} ({values[key]:,})"
            for rank, (tissue, values) in enumerate(ordered, start=1)
        )

    examples = [
        {
            "instruction": "Rank tissues by sensitivity to hindlimb unloading.",
            "input": "",
            "output": f"HU sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'HU')}.",
        },
        {
            "instruction": "Rank tissues by sensitivity to ionizing radiation.",
            "input": "",
            "output": f"IR sensitivity ranking: {ranking(STRESSOR_EFFECTS, 'IR')}.",
        },
        {
            "instruction": "Rank tissues by KMP effect under combined stress.",
            "input": "",
            "output": f"KMP effect under stress: {ranking(KMP_EFFECTS, 'in_HU_IR')}.",
        },
    ]

    # Every unordered tissue pair, in alphabetical order. The tissue list
    # comes from the table so it cannot drift from the data.
    tissues = sorted(STRESSOR_EFFECTS)
    for index, t1 in enumerate(tissues):
        for t2 in tissues[index + 1:]:
            same_type = 'Same' if TISSUE_TYPES[t1] == TISSUE_TYPES[t2] else 'Different'
            examples.append({
                "instruction": f"Compare {t1.lower()} and {t2.lower()} responses to HU.",
                "input": "",
                "output": f"{t1}: {STRESSOR_EFFECTS[t1]['HU']:,} DEGs. {t2}: {STRESSOR_EFFECTS[t2]['HU']:,} DEGs. {same_type} KMP response type.",
            })

            kmp1 = KMP_EFFECTS[t1]
            kmp2 = KMP_EFFECTS[t2]
            examples.append({
                "instruction": f"Compare KMP context-dependency in {t1.lower()} vs {t2.lower()}.",
                "input": "",
                "output": f"{t1} ({TISSUE_TYPES[t1]}): baseline→stress = {kmp1['baseline']:,}→{kmp1['in_HU_IR']:,}. {t2} ({TISSUE_TYPES[t2]}): {kmp2['baseline']:,}→{kmp2['in_HU_IR']:,}.",
            })

    for tissue, effects in STRESSOR_EFFECTS.items():
        # A stressor dominates when it yields more than 3x the DEGs of
        # the other one.
        if effects['HU'] > effects['IR'] * 3:
            dominance = "HU-dominant"
        elif effects['IR'] > effects['HU'] * 3:
            dominance = "IR-dominant"
        else:
            dominance = "balanced response"

        examples.append({
            "instruction": f"What stressor dominates the response in {tissue.lower()}?",
            "input": "",
            "output": f"{tissue} shows {dominance}: HU = {effects['HU']:,} DEGs, IR = {effects['IR']:,} DEGs (ratio = {effects['HU']/max(effects['IR'],1):.1f}).",
        })

    return examples
|
|
|
|
def generate_prediction_tasks() -> List[Dict]:
    """Build interaction and cross-tissue prediction examples.

    Emits one additive-prediction example per tissue in ``STRESSOR_EFFECTS``,
    one KMP-under-stress prediction per tissue in ``KMP_EFFECTS``, and three
    fixed cross-tissue prediction examples.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    examples = []

    for tissue, effects in STRESSOR_EFFECTS.items():
        additive = effects['HU'] + effects['IR']
        combined = effects['HU_IR']

        # Use the same +/-20% band as generate_factual_basic so the dataset
        # never labels one tissue both "approximately additive" and
        # "synergistic".
        if combined > additive * 1.2:
            verdict = "The synergistic effect reflects biological interaction between stressors."
        elif combined < additive * 0.8:
            verdict = "The sub-additive effect reflects biological interaction between stressors."
        else:
            verdict = "The approximately additive effect is within 20% of the additive prediction."

        examples.append({
            "instruction": f"Predict combined HU+IR effect in {tissue.lower()} from main effects.",
            "input": f"HU alone: {effects['HU']:,} DEGs. IR alone: {effects['IR']:,} DEGs.",
            "output": f"Additive prediction: ~{additive:,} DEGs. Actual: {combined:,} DEGs. {verdict}",
        })

    for tissue, kmp in KMP_EFFECTS.items():
        tissue_type = TISSUE_TYPES[tissue]
        expected = 'increase' if 'stress-activated' in tissue_type else 'decrease'
        # Clamp the denominator so a zero-baseline tissue does not divide
        # by zero.
        ratio = kmp['in_HU_IR'] / max(kmp['baseline'], 1)
        examples.append({
            "instruction": f"Predict KMP effect under stress in {tissue.lower()}.",
            "input": f"KMP at baseline: {kmp['baseline']:,} DEGs. Tissue type: {tissue_type}.",
            "output": f"Based on tissue type, predict {expected}. Actual: {kmp['in_HU_IR']:,} DEGs. Ratio: {ratio:.1f}x.",
        })

    heart_kmp_stress = KMP_EFFECTS['Heart']['in_HU_IR']
    soleus_kmp_stress = KMP_EFFECTS['Soleus']['in_HU_IR']
    examples.append({
        "instruction": "Given heart (Type A) and soleus (Type A), predict similarity of KMP response.",
        # The heart count is read from the table rather than typed in, so it
        # stays correct if the table changes.
        "input": f"Both are Type A (stress-activated). Heart KMP under stress: {heart_kmp_stress:,} DEGs.",
        "output": f"Prediction: Similar stress-activated pattern. Actual soleus: {soleus_kmp_stress:,} DEGs. Both show stress-activated response, confirming Type A classification predicts pharmacodynamics.",
    })

    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    examples.append({
        "instruction": "Given heart OXPHOS RESCUE, predict liver OXPHOS pattern.",
        "input": f"Heart: stress {heart_ox['stress_NES']:.1f} → KMP {heart_ox['KMP_NES']:.1f}. Liver is Type C (different).",
        "output": f"Prediction: Different pattern due to metabolic role. Actual: Liver shows SUPPRESSION (stress {liver_ox['stress_NES']:.1f} → KMP {liver_ox['KMP_NES']:.1f}). Opposite pattern confirms tissue-specific mechanisms.",
    })

    heart_kmp_hu = INTERACTIONS['Heart']['KMP_x_HU']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    examples.append({
        "instruction": "Predict KMP × HU interaction magnitude in soleus from heart data.",
        "input": f"Heart KMP × HU: {heart_kmp_hu:,} DEGs. Soleus has larger HU response ({STRESSOR_EFFECTS['Soleus']['HU']:,} vs {STRESSOR_EFFECTS['Heart']['HU']:,}).",
        "output": f"Prediction: Larger than heart due to greater HU sensitivity. Actual: {soleus_kmp_hu:,} DEGs ({soleus_kmp_hu/max(heart_kmp_hu, 1):.0f}x larger). Massive interaction reflects stress-dependent KMP activation.",
    })

    return examples
|
|
|
|
def generate_design_critique() -> List[Dict]:
    """Build experimental-design critique examples.

    Each example names a flawed study design and answers with a critique
    backed by counts from ``KMP_EFFECTS`` and ``INTERACTIONS``. All counts
    are formatted with thousands separators so the same number is never
    spelled two ways within the dataset.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    heart = KMP_EFFECTS['Heart']
    liver = KMP_EFFECTS['Liver']
    soleus = KMP_EFFECTS['Soleus']
    hippocampus = KMP_EFFECTS['Hippocampus']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    # How many times larger the stressed heart effect is than baseline;
    # the denominator is clamped to avoid dividing by zero.
    heart_ratio = heart['in_HU_IR'] / max(heart['baseline'], 1)

    return [
        {
            "instruction": "Critique: Testing KMP only at baseline to assess spaceflight protection.",
            "input": "",
            "output": f"FLAWED. Heart baseline KMP: {heart['baseline']:,} DEGs. Heart stressed KMP: {heart['in_HU_IR']:,} DEGs. Testing at baseline would miss {heart_ratio:.0f}x larger stressed effect. Must include stress conditions.",
        },
        {
            "instruction": "Critique: Extrapolating heart results to predict liver response.",
            "input": "",
            "output": f"INCORRECT. Heart (Type A) and Liver (Type C) have opposite patterns. Heart OXPHOS: RESCUE. Liver OXPHOS: SUPPRESSION. KMP effect: Heart {heart['baseline']:,} → {heart['in_HU_IR']:,} DEGs, Liver {liver['baseline']:,} → {liver['in_HU_IR']:,} DEGs. Cross-tissue extrapolation invalid.",
        },
        {
            "instruction": "Critique: Using |log2FC| > 1 threshold.",
            "input": "",
            "output": f"PROBLEMATIC. Stringent thresholds miss coordinated small effects. Using padj-only: Heart KMP = {heart['in_HU_IR']:,} DEGs. Many genes with <2-fold change contribute to pathway-level effects like OXPHOS rescue.",
        },
        {
            "instruction": "Critique: Single-tissue study for KMP countermeasure development.",
            "input": "",
            "output": f"INSUFFICIENT. Tissue responses vary dramatically: Heart {heart['in_HU_IR']:,}, Soleus {soleus['in_HU_IR']:,}, Hippocampus {hippocampus['in_HU_IR']:,}, Liver {liver['in_HU_IR']:,} DEGs under stress. Type A/B/C classification requires multi-tissue data.",
        },
        {
            "instruction": "Critique: Assuming additive drug-stressor effects.",
            "input": "",
            "output": f"INCORRECT. KMP × HU interactions are highly non-additive. Soleus: {soleus_kmp_hu:,} interaction DEGs. This super-additive effect indicates synergistic mechanisms that additive models cannot capture.",
        },
        {
            "instruction": "Evaluate cell culture KMP data for in vivo predictions.",
            "input": "",
            "output": f"LIMITED VALUE. Cell culture cannot capture: context-dependency (KMP effects range 0-{heart['in_HU_IR']:,} DEGs), tissue specificity (Type A/B/C), or interaction effects (KMP × HU = {soleus_kmp_hu:,} DEGs). In vivo factorial studies essential.",
        },
        {
            "instruction": "Critique: Human dosing based on mouse efficacy data.",
            "input": "",
            "output": "PREMATURE. Missing: dose-response (single dose tested), human PK, therapeutic window, bioavailability. Mouse-to-human scaling unreliable (10-100x errors). Need: dose-response studies, PK modeling, Phase I trials.",
        },
    ]
|
|
|
|
def generate_mechanistic_reasoning() -> List[Dict]:
    """Build mechanistic-reasoning examples.

    Every count quoted in a question or answer is read from ``KMP_EFFECTS``,
    ``INTERACTIONS`` or ``OXPHOS_PATTERNS`` rather than typed in, so the text
    cannot drift from the tables.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    hippocampus = KMP_EFFECTS['Hippocampus']
    liver = KMP_EFFECTS['Liver']
    soleus = KMP_EFFECTS['Soleus']
    heart_ox = OXPHOS_PATTERNS['Heart']
    liver_ox = OXPHOS_PATTERNS['Liver']
    heart_int = INTERACTIONS['Heart']
    liver_int = INTERACTIONS['Liver']
    soleus_int = INTERACTIONS['Soleus']

    return [
        {
            "instruction": "Explain hippocampus baseline-active, stress-blocked pattern.",
            "input": "",
            "output": f"Hippocampus: {hippocampus['baseline']:,} DEGs baseline → {hippocampus['in_HU_IR']:,} DEGs stressed. Mechanisms: (1) Target saturation by stress signals, (2) Pathway competition with DNA damage response, (3) Altered transcriptional machinery from radiation. Implication: Pre-treatment may be more effective.",
        },
        {
            "instruction": "Explain the massive KMP × HU interaction in soleus.",
            "input": "",
            "output": f"Soleus KMP × HU = {soleus_int['KMP_x_HU']:,} DEGs (largest interaction). Mechanisms: (1) Mechanosensitive pathways create KMP binding sites, (2) HU upregulates KMP targets normally not expressed, (3) Mitochondrial dysfunction reveals KMP protective capacity. Key insight: KMP specifically protects stressed tissue.",
        },
        {
            "instruction": "Explain opposite OXPHOS patterns in heart vs liver.",
            "input": "",
            "output": f"Heart OXPHOS: stress suppresses ({heart_ox['stress_NES']:.1f}) → KMP rescues ({heart_ox['KMP_NES']:.1f}). Liver OXPHOS: stress activates ({liver_ox['stress_NES']:.1f}) → KMP suppresses ({liver_ox['KMP_NES']:.1f}). Heart needs ATP restoration for contraction. Liver needs metabolic braking to prevent oxidative damage. 'Rescue' is tissue-specific.",
        },
        {
            # Both soleus counts come from the table instead of being
            # repeated as literals in the question and answer.
            "instruction": f"Why does soleus show {soleus['baseline']:,} KMP DEGs at baseline but {soleus['in_HU_IR']:,} under stress?",
            "input": "",
            "output": f"Complete stress-dependency. Baseline: KMP targets not expressed or accessible. Under HU stress: Mechanosensitive pathways activate, target expression increases, mitochondrial stress reveals KMP binding sites. Result: {soleus['baseline']:,} → {soleus['in_HU_IR']:,} DEGs. KMP is conditionally active therapeutic.",
        },
        {
            "instruction": "Explain Type A/B/C classification system.",
            "input": "",
            "output": f"Type A (Heart, Soleus): Stress-activated. Baseline minimal, stress maximal. KMP works when needed. Type B (Hippocampus): Baseline-active ({hippocampus['baseline']:,}→{hippocampus['in_HU_IR']:,}). Stress blocks. May need pre-treatment. Type C (Liver): Stress-blocked ({liver['baseline']:,}→{liver['in_HU_IR']:,}). Different mechanism needed.",
        },
        {
            "instruction": "Why is KMP × HU larger than KMP × IR in most tissues?",
            "input": "",
            "output": f"KMP × HU: Heart {heart_int['KMP_x_HU']:,}, Liver {liver_int['KMP_x_HU']:,}, Soleus {soleus_int['KMP_x_HU']:,}. KMP × IR: Heart {heart_int['KMP_x_IR']:,}, Liver {liver_int['KMP_x_IR']:,}, Soleus {soleus_int['KMP_x_IR']:,}. KMP likely targets mechanosensitive/metabolic pathways more than DNA damage pathways.",
        },
    ]
|
|
|
|
def generate_uncertainty_calibration() -> List[Dict]:
    """Build uncertainty-calibration examples.

    The answers separate what the transcriptomic tables support from what
    they cannot show. Counts are read from ``KMP_EFFECTS`` and
    ``INTERACTIONS`` rather than repeated as literals.

    Returns:
        A list of dicts with ``instruction``, ``input`` and ``output`` keys.
    """
    heart_stress = KMP_EFFECTS['Heart']['in_HU_IR']
    soleus_stress = KMP_EFFECTS['Soleus']['in_HU_IR']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']
    # The "approximately N" wording uses the interaction count rounded to
    # the nearest hundred.
    soleus_kmp_hu_approx = round(soleus_kmp_hu, -2)

    return [
        {
            "instruction": "What is KMP's effect on cardiac contractility?",
            "input": "",
            "output": f"CANNOT DIRECTLY ANSWER. Data shows: {heart_stress:,} DEGs, OXPHOS rescue. These are transcriptomic changes. NOT shown: functional contractility, protein levels, force generation. Inference: OXPHOS rescue may improve ATP→may improve contractility. Confidence: LOW for function.",
        },
        {
            "instruction": "Will KMP prevent human astronaut muscle atrophy?",
            "input": "",
            "output": f"CANNOT CONFIDENTLY ANSWER. Supports: Stress-activated protection ({soleus_stress:,} DEGs), KMP × HU interaction ({soleus_kmp_hu:,}), OXPHOS rescue. Missing: Human PK, true microgravity (HU is analog), long-duration effects. Strong rationale but insufficient for prediction.",
        },
        {
            "instruction": "Which gene is most important for KMP protection?",
            "input": "",
            "output": "CANNOT DEFINITIVELY ANSWER. Transcriptomics shows correlation, not causation. Large change ≠ causal role. Redundant pathways exist. No knockout validation. Better questions: 'Largest effects?' (answerable), 'Most enriched pathways?' (answerable). Need: CRISPR screening for causality.",
        },
        {
            "instruction": "What KMP dose should be used for astronauts?",
            "input": "",
            "output": "CANNOT INFORM DOSING. Missing: Dose-response (one dose tested), human PK, therapeutic window, bioavailability. Mouse-human scaling unreliable. Need: Multi-dose mouse studies, PK modeling, human Phase I. Do not extrapolate.",
        },
        {
            "instruction": f"Is the {soleus_kmp_hu:,} DEG interaction statistically robust?",
            "input": "",
            "output": f"YES with caveats. Each of {soleus_kmp_hu:,} genes has padj < 0.05. Highly significant. But: exact number is threshold-dependent, effect sizes vary. Appropriate: 'approximately {soleus_kmp_hu_approx:,} DEGs indicating substantial non-additive effect.' Avoid false precision.",
        },
        {
            "instruction": "Can we predict KMP effects in kidney from this data?",
            "input": "",
            "output": "CANNOT PREDICT. Kidney not studied. Type A/B/C patterns are tissue-specific and cannot be extrapolated. Kidney has unique metabolic/filtration role. Would need: dedicated kidney experiments. Cross-tissue prediction requires mechanistic validation.",
        },
    ]
|
|
|
|
def generate_application_questions() -> List[Dict]:
    """Build practical-application examples (study design, prioritisation,
    biomarkers, controls).

    Returns:
        A list of four dicts with ``instruction``, ``input`` and ``output``
        keys.
    """
    heart_kmp = KMP_EFFECTS['Heart']
    soleus_kmp = KMP_EFFECTS['Soleus']
    soleus_kmp_hu = INTERACTIONS['Soleus']['KMP_x_HU']

    design_study = {
        "instruction": "Design a KMP efficacy study for astronaut protection.",
        "input": "",
        "output": f"Required: (1) Factorial design with stress conditions (not baseline-only), (2) Multiple tissues (Type A/B/C responses differ), (3) Functional endpoints beyond transcriptomics, (4) Dose-response. Rationale: Baseline-only would miss {heart_kmp['in_HU_IR']:,}/{heart_kmp['baseline']} DEG effect in heart.",
    }

    prioritize_tissues = {
        "instruction": "Prioritize tissues for KMP therapeutic development.",
        "input": "",
        "output": f"Priority 1: Soleus - largest interaction ({soleus_kmp_hu:,}), stress-activated, OXPHOS rescue. Priority 2: Heart - strong rescue ({heart_kmp['in_HU_IR']:,} DEGs). Lower priority: Hippocampus (stress-blocked), Liver (minimal effect). Focus on Type A tissues.",
    }

    biomarkers = {
        "instruction": "Identify biomarkers for KMP efficacy monitoring.",
        "input": "",
        "output": "Candidate biomarkers: (1) OXPHOS pathway genes (track rescue/suppression), (2) Tissue-type classification (confirm Type A/B/C), (3) Interaction-responsive genes from KMP × HU. Need validation: functional correlation, accessibility (blood-based), temporal dynamics.",
    }

    controls = {
        "instruction": "What controls are essential for KMP spaceflight studies?",
        "input": "",
        "output": f"Essential controls: (1) Vehicle under all stress conditions (not just baseline), (2) KMP at baseline (to detect context-dependency), (3) Single stressors (HU-only, IR-only) for interaction calculation, (4) Multiple tissues. Missing any control prevents detecting effects like {soleus_kmp['baseline']}→{soleus_kmp['in_HU_IR']} shift.",
    }

    return [design_study, prioritize_tissues, biomarkers, controls]
|
|
|
|
def compile_sft_dataset(output_file: str = 'kmp_sft_dataset.json') -> List[Dict]:
    """Compile all examples into the final SFT dataset and write it to disk.

    Runs every generator in a fixed order, renders each example into a single
    ``text`` field, shuffles the result deterministically and dumps it as
    indented JSON.

    Args:
        output_file: Path of the JSON file to write.

    Returns:
        The shuffled list of ``{"text": ...}`` records that was written.
    """
    # Generator order matters: the seeded shuffle permutes positions, so
    # reordering this table changes which example lands where.
    generators = [
        ("factual basic", generate_factual_basic),
        ("factual KMP", generate_factual_kmp),
        ("factual interaction", generate_factual_interactions),
        ("factual pathway", generate_factual_pathways),
        ("comparison", generate_comparison_questions),
        ("prediction", generate_prediction_tasks),
        ("design critique", generate_design_critique),
        ("mechanistic reasoning", generate_mechanistic_reasoning),
        ("uncertainty calibration", generate_uncertainty_calibration),
        ("application", generate_application_questions),
    ]

    all_examples = []
    for label, generate in generators:
        print(f"Generating {label} examples...")
        all_examples.extend(generate())

    # Render each example as one prompt/response string. The "### Input"
    # section is only included when the example has a non-empty input.
    formatted = []
    for ex in all_examples:
        if ex.get('input'):
            text = f"### Instruction:\n{ex['instruction']}\n\n### Input:\n{ex['input']}\n\n### Response:\n{ex['output']}"
        else:
            text = f"### Instruction:\n{ex['instruction']}\n\n### Response:\n{ex['output']}"
        formatted.append({"text": text})

    # A private, seeded generator gives the same permutation as seeding the
    # module-level RNG with 42, without resetting global random state for
    # the rest of the process.
    random.Random(42).shuffle(formatted)

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(formatted, f, indent=2)

    print(f"\n{'='*60}")
    print("SFT Dataset Summary")
    print(f"{'='*60}")
    print(f"Total examples: {len(formatted)}")
    print(f"Output file: {output_file}")

    return formatted
|
|
|
|
if __name__ == "__main__":
    # Command-line entry point: --output selects the destination JSON path.
    cli = argparse.ArgumentParser()
    cli.add_argument('--output', default='kmp_sft_dataset.json')
    compile_sft_dataset(cli.parse_args().output)
|
|