Spaces:
Sleeping
Sleeping
File size: 7,357 Bytes
4d45df4 7e68ee3 ba73d96 7e68ee3 4d45df4 b4aa569 4d45df4 7e68ee3 120bed9 7e68ee3 4d45df4 b4aa569 120bed9 b4aa569 120bed9 b4aa569 4d45df4 b4aa569 4d45df4 b4aa569 120bed9 4d45df4 b4aa569 7e68ee3 b4aa569 7e68ee3 b4aa569 120bed9 7e68ee3 b4aa569 120bed9 7e68ee3 b4aa569 120bed9 7e68ee3 b4aa569 120bed9 7e68ee3 b4aa569 120bed9 7e68ee3 b4aa569 120bed9 7e68ee3 120bed9 7e68ee3 120bed9 4d45df4 7e68ee3 b4aa569 7e68ee3 b4aa569 7e68ee3 b4aa569 7e68ee3 b4aa569 7e68ee3 120bed9 7e68ee3 120bed9 7e68ee3 4d45df4 b4aa569 7e68ee3 120bed9 7e68ee3 4d45df4 120bed9 4d45df4 120bed9 4d45df4 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 |
import streamlit as st
import pandas as pd
import numpy as np
# Seed NumPy's global random state so that the values drawn through
# np.random in this module are reproducible from one run to the next.
np.random.seed(42)
# Function to generate synthetic data
def generate_realistic_data(num_patients=100):
    """Generate a synthetic cohort of breast cancer patients.

    Attributes are sampled per patient, in a fixed order, from NumPy's global
    random state (``np.random``); seeding it beforehand makes the output
    reproducible. Later attributes are conditioned on earlier ones (nodal
    status on tumor size, HER2 and Ki-67 on grade, treatment on everything).

    Args:
        num_patients: Number of patient rows to generate. Zero (or a negative
            value) yields an empty frame that still carries every column.

    Returns:
        pandas.DataFrame with one row per patient and the columns
        'Patient ID', 'Age', 'Menopausal Status', 'Tumor Size (cm)',
        'Lymph Node Involvement', 'Tumor Grade', 'Tumor Stage', 'ER Status',
        'PR Status', 'HER2 Status', 'Ki-67 Level', 'TNBC Status',
        'BRCA Mutation', 'Overall Health', 'Genomic Recurrence Score' and
        'Treatment'.
    """
    columns = [
        'Patient ID',
        'Age',
        'Menopausal Status',
        'Tumor Size (cm)',
        'Lymph Node Involvement',
        'Tumor Grade',
        'Tumor Stage',
        'ER Status',
        'PR Status',
        'HER2 Status',
        'Ki-67 Level',
        'TNBC Status',
        'BRCA Mutation',
        'Overall Health',
        'Genomic Recurrence Score',
        'Treatment',
    ]
    records = []

    # Patient IDs start at 1.
    for patient_id in range(1, num_patients + 1):
        # Age: normal(60, 10), truncated to an int and clamped to [30, 80].
        age = int(np.random.normal(60, 10))
        age = max(30, min(age, 80))

        # Menopausal status is derived from age alone.
        menopausal = 'Post-menopausal' if age >= 50 else 'Pre-menopausal'

        # Tumor size in cm: log-normal, rounded to two decimals.
        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)

        # Lymph node involvement: 60% for tumors > 2 cm, 30% otherwise.
        # Exactly one uniform draw is consumed either way.
        node_probability = 0.6 if tumor_size > 2.0 else 0.3
        lymph_node = 'Positive' if np.random.rand() < node_probability else 'Negative'

        # Tumor grade (1-3): higher grades are more likely for larger tumors.
        grade_weights = [0.1, 0.4, 0.5] if tumor_size > 2.0 else [0.3, 0.5, 0.2]
        grade = np.random.choice([1, 2, 3], p=grade_weights)

        # Stage I-III from size and nodes; 5% of patients are overridden to IV.
        stage = _assign_stage(tumor_size, lymph_node)
        if np.random.rand() < 0.05:
            stage = 'IV'

        # Hormone receptors: PR can only be positive when ER is positive.
        er = np.random.choice(['Positive', 'Negative'], p=[0.75, 0.25])
        pr = 'Positive' if er == 'Positive' and np.random.rand() > 0.1 else 'Negative'

        # HER2 status: more often positive in grade 3 tumors.
        her2_weights = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = np.random.choice(['Positive', 'Negative'], p=her2_weights)

        # Ki-67: only grade 3 tumors can be 'High' (80% of them).
        ki67 = 'High' if grade == 3 and np.random.rand() < 0.8 else 'Low'

        # Triple-negative: ER, PR and HER2 all negative.
        is_triple_negative = er == 'Negative' and pr == 'Negative' and her2 == 'Negative'
        tnbc = 'Positive' if is_triple_negative else 'Negative'

        # BRCA mutation: 20% chance for TNBC or under-40 patients, else none.
        # The parentheses matter: `a or b and c` parses as `a or (b and c)`,
        # which would mark *every* TNBC patient as BRCA-positive.
        brca_eligible = tnbc == 'Positive' or age < 40
        brca = 'Positive' if brca_eligible and np.random.rand() < 0.2 else 'Negative'

        # Overall health: 90% 'Good' under 65, always 'Poor' from 65 on.
        health = 'Good' if age < 65 and np.random.rand() < 0.9 else 'Poor'

        # Genomic recurrence score is only sampled for ER+/HER2- patients.
        if er == 'Positive' and her2 == 'Negative':
            recurrence_score = np.random.choice(
                ['Low', 'Intermediate', 'High'], p=[0.6, 0.3, 0.1]
            )
        else:
            recurrence_score = 'N/A'

        treat = _select_treatment(stage, er, her2, tnbc, brca, recurrence_score, health)

        records.append({
            'Patient ID': patient_id,
            'Age': age,
            'Menopausal Status': menopausal,
            'Tumor Size (cm)': tumor_size,
            'Lymph Node Involvement': lymph_node,
            'Tumor Grade': grade,
            'Tumor Stage': stage,
            'ER Status': er,
            'PR Status': pr,
            'HER2 Status': her2,
            'Ki-67 Level': ki67,
            'TNBC Status': tnbc,
            'BRCA Mutation': brca,
            'Overall Health': health,
            'Genomic Recurrence Score': recurrence_score,
            'Treatment': treat,
        })

    # Passing `columns` keeps the schema intact even when no rows were built.
    return pd.DataFrame(records, columns=columns)


def _assign_stage(tumor_size, lymph_node):
    """Return stage 'I', 'II' or 'III' from tumor size (cm) and nodal status."""
    if lymph_node == 'Positive' or tumor_size > 5.0:
        return 'III'
    # Node-negative from here on: <= 2 cm is stage I, (2, 5] cm is stage II.
    return 'I' if tumor_size <= 2.0 else 'II'


def _select_treatment(stage, er, her2, tnbc, brca, recurrence_score, health):
    """Return the treatment description for one patient's disease profile."""
    if stage in ['I', 'II']:
        if tnbc == 'Positive':
            parp = ', plus PARP Inhibitors' if brca == 'Positive' else ''
            return 'Surgery, Chemotherapy, and Radiation Therapy' + parp
        if er == 'Positive' and recurrence_score != 'N/A':
            if recurrence_score == 'High':
                return 'Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy'
            if recurrence_score == 'Intermediate':
                return 'Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy'
            return 'Surgery, Hormone Therapy, and Radiation Therapy'
        if her2 == 'Positive':
            return 'Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy'
        return 'Surgery, Chemotherapy, and Radiation Therapy'

    if stage == 'III':
        return (
            'Neoadjuvant Chemotherapy, Surgery, Radiation Therapy'
            + (', HER2-Targeted Therapy' if her2 == 'Positive' else '')
            + (', Hormone Therapy' if er == 'Positive' else '')
        )

    # Stage IV: systemic therapy is only offered to patients in good health.
    if health != 'Good':
        return 'Palliative Care Only'
    systemic_options = [
        option
        for option in [
            'Hormone Therapy' if er == 'Positive' else '',
            'HER2-Targeted Therapy' if her2 == 'Positive' else '',
            'Chemotherapy' if tnbc == 'Positive' else '',
        ]
        if option
    ]
    return 'Systemic Therapy (' + ', '.join(systemic_options) + '), Palliative Care'
def main():
    """Render the Streamlit page for generating and downloading the data.

    The page shows a patient-count input and a 'Generate Data' button. Once
    data has been generated it is displayed as a table together with two CSV
    download buttons: one with the 'Treatment' column as-is and one with that
    column renamed to 'CheckTreatment'.

    The generated frame is kept in ``st.session_state`` because Streamlit
    re-executes the script on every widget interaction and ``st.button`` is
    only True during the run triggered by its own click. Rendering the results
    directly under the button would make the table and both download buttons
    disappear as soon as either download button is clicked.
    """
    st.title('Synthetic Breast Cancer Patient Data Generator')
    st.write('This app generates synthetic breast cancer patient data based on NCCN guidelines.')

    # User inputs
    num_patients = st.number_input(
        'Number of Patients to Generate',
        min_value=10,
        max_value=10000,
        value=100,
        step=10,
    )

    if st.button('Generate Data'):
        st.session_state['synthetic_patient_df'] = generate_realistic_data(
            num_patients=num_patients
        )

    # Nothing has been generated in this session yet.
    if 'synthetic_patient_df' not in st.session_state:
        return

    df = st.session_state['synthetic_patient_df']
    st.success(f'Generated data for {len(df)} patients.')

    # Display DataFrame
    st.dataframe(df)

    # Provide download link for data with Treatment column
    csv_with_treatment = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download data as CSV with Treatment",
        data=csv_with_treatment,
        file_name='synthetic_breast_cancer_data_with_treatment.csv',
        mime='text/csv',
    )

    # Provide download link for data with Treatment column renamed to CheckTreatment
    df_check_treatment = df.rename(columns={'Treatment': 'CheckTreatment'})
    csv_check_treatment = df_check_treatment.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download data as CSV with CheckTreatment",
        data=csv_check_treatment,
        file_name='synthetic_breast_cancer_data_with_check_treatment.csv',
        mime='text/csv',
    )
# Run the app only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()
|