Spaces:

eaglelandsonce
/

BreastCancerDataSet

Sleeping

File size: 7,357 Bytes

4d45df4
7e68ee3
 
 
ba73d96
 
 
 
 
7e68ee3
4d45df4
b4aa569
 
 
 
 
4d45df4
7e68ee3
 
 
 
 
 
 
 
120bed9
7e68ee3
4d45df4
 
 
 
 
b4aa569
 
 
 
 
 
 
 
 
 
 
 
 
 
120bed9
b4aa569
 
 
120bed9
b4aa569
 
 
 
 
4d45df4
b4aa569
4d45df4
b4aa569
 
120bed9
4d45df4
b4aa569
7e68ee3
 
b4aa569
7e68ee3
 
 
 
 
b4aa569
120bed9
7e68ee3
 
b4aa569
120bed9
7e68ee3
 
b4aa569
120bed9
7e68ee3
 
b4aa569
120bed9
7e68ee3
 
b4aa569
120bed9
7e68ee3
 
b4aa569
120bed9
7e68ee3
 
120bed9
7e68ee3
 
120bed9
4d45df4
7e68ee3
b4aa569
 
 
7e68ee3
b4aa569
7e68ee3
b4aa569
7e68ee3
b4aa569
7e68ee3
120bed9
 
 
7e68ee3
120bed9
7e68ee3
 
 
4d45df4
b4aa569
 
 
 
 
7e68ee3
 
 
 
 
 
 
 
 
120bed9
7e68ee3
 
 
 
 
4d45df4
 
 
 
 
 
 
 
 
 
 
 
 
 
120bed9
 
 
 
 
 
 
 
 
 
 
 
4d45df4
120bed9
 
 
4d45df4

import streamlit as st
import pandas as pd
import numpy as np

# Seed NumPy's global random state with a fixed value so the sequence of
# random draws is identical every time this module is executed.
# NOTE(review): if the hosting framework re-executes this module on every
# user interaction, the seed is reset each time and repeated generations with
# the same patient count will return the same data -- confirm this is intended.
np.random.seed(42)

# Column order of the DataFrame returned by generate_realistic_data.
_PATIENT_COLUMNS = [
    'Patient ID',
    'Age',
    'Menopausal Status',
    'Tumor Size (cm)',
    'Lymph Node Involvement',
    'Tumor Grade',
    'Tumor Stage',
    'ER Status',
    'PR Status',
    'HER2 Status',
    'Ki-67 Level',
    'TNBC Status',
    'BRCA Mutation',
    'Overall Health',
    'Genomic Recurrence Score',
    'Treatment',
]


def _assign_stage(tumor_size, lymph_node):
    """Return the stage ('I', 'II' or 'III') implied by tumor size and nodes."""
    if lymph_node == 'Positive' or tumor_size > 5.0:
        return 'III'
    if tumor_size > 2.0:
        return 'II'
    return 'I'


def _recommend_treatment(stage, er, her2, tnbc, brca, health, recurrence_score):
    """Return the treatment description for one patient's stage and markers."""
    if stage in ('I', 'II'):
        if tnbc == 'Positive':
            parp = ', plus PARP Inhibitors' if brca == 'Positive' else ''
            return 'Surgery, Chemotherapy, and Radiation Therapy' + parp
        if er == 'Positive' and recurrence_score != 'N/A':
            if recurrence_score == 'High':
                return 'Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy'
            if recurrence_score == 'Intermediate':
                return 'Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy'
            return 'Surgery, Hormone Therapy, and Radiation Therapy'
        if her2 == 'Positive':
            return 'Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy'
        return 'Surgery, Chemotherapy, and Radiation Therapy'

    if stage == 'III':
        return (
            'Neoadjuvant Chemotherapy, Surgery, Radiation Therapy'
            + (', HER2-Targeted Therapy' if her2 == 'Positive' else '')
            + (', Hormone Therapy' if er == 'Positive' else '')
        )

    # Any other stage (i.e. 'IV'): systemic therapy only for patients in good
    # health, otherwise palliative care alone.
    if health != 'Good':
        return 'Palliative Care Only'
    options = []
    if er == 'Positive':
        options.append('Hormone Therapy')
    if her2 == 'Positive':
        options.append('HER2-Targeted Therapy')
    if tnbc == 'Positive':
        options.append('Chemotherapy')
    return 'Systemic Therapy (' + ', '.join(options) + '), Palliative Care'


def generate_realistic_data(num_patients=100):
    """Generate a synthetic breast cancer patient table.

    Each patient's attributes are drawn from NumPy's global random state
    (``np.random``); seed it before calling for reproducible output.

    Args:
        num_patients: Number of patient rows to generate. Patient IDs run
            from 1 to ``num_patients``.

    Returns:
        pandas.DataFrame with one row per patient and the columns listed in
        ``_PATIENT_COLUMNS`` (demographics, tumor characteristics, receptor
        status, BRCA mutation, overall health, recurrence score, treatment).
    """
    rows = []

    for patient_id in range(1, num_patients + 1):
        # Age: normal(60, 10), truncated to an int and clamped to [30, 80].
        age = max(30, min(int(np.random.normal(60, 10)), 80))

        # Menopausal status is determined by age alone.
        menopausal = 'Post-menopausal' if age >= 50 else 'Pre-menopausal'

        # Tumor size in cm: log-normal distribution.
        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
        large_tumor = tumor_size > 2.0

        # Lymph node involvement: 60% for tumors > 2 cm, otherwise 30%.
        node_probability = 0.6 if large_tumor else 0.3
        lymph_node = 'Positive' if np.random.rand() < node_probability else 'Negative'

        # Tumor grade (1-3): higher grades more likely with larger tumors.
        grade_weights = [0.1, 0.4, 0.5] if large_tumor else [0.3, 0.5, 0.2]
        grade = np.random.choice([1, 2, 3], p=grade_weights)

        # Stage from size and nodes, with a 5% chance of overriding to IV.
        stage = _assign_stage(tumor_size, lymph_node)
        if np.random.rand() < 0.05:
            stage = 'IV'

        # Hormone receptors: PR can only be positive when ER is positive.
        er = np.random.choice(['Positive', 'Negative'], p=[0.75, 0.25])
        pr = 'Positive' if er == 'Positive' and np.random.rand() > 0.1 else 'Negative'

        # HER2 status: more often positive in grade 3 tumors.
        her2_weights = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = np.random.choice(['Positive', 'Negative'], p=her2_weights)

        # Ki-67 level: high only in grade 3 tumors (80% of them).
        ki67 = 'High' if grade == 3 and np.random.rand() < 0.8 else 'Low'

        # Triple-negative status: ER, PR and HER2 all negative.
        triple_negative = er == 'Negative' and pr == 'Negative' and her2 == 'Negative'
        tnbc = 'Positive' if triple_negative else 'Negative'

        # BRCA mutation: 20% chance for TNBC or under-40 patients, else none.
        # The parentheses matter: `and` binds tighter than `or`, so without
        # them every TNBC patient would be marked BRCA-positive outright.
        at_risk = tnbc == 'Positive' or age < 40
        brca = 'Positive' if at_risk and np.random.rand() < 0.2 else 'Negative'

        # Overall health: 90% 'Good' below age 65, always 'Poor' from 65 up.
        health = 'Good' if age < 65 and np.random.rand() < 0.9 else 'Poor'

        # Genomic recurrence score is only assigned to ER+, HER2- patients.
        if er == 'Positive' and her2 == 'Negative':
            recurrence_score = np.random.choice(['Low', 'Intermediate', 'High'], p=[0.6, 0.3, 0.1])
        else:
            recurrence_score = 'N/A'

        treat = _recommend_treatment(stage, er, her2, tnbc, brca, health, recurrence_score)

        rows.append({
            'Patient ID': patient_id,
            'Age': age,
            'Menopausal Status': menopausal,
            'Tumor Size (cm)': tumor_size,
            'Lymph Node Involvement': lymph_node,
            'Tumor Grade': grade,
            'Tumor Stage': stage,
            'ER Status': er,
            'PR Status': pr,
            'HER2 Status': her2,
            'Ki-67 Level': ki67,
            'TNBC Status': tnbc,
            'BRCA Mutation': brca,
            'Overall Health': health,
            'Genomic Recurrence Score': recurrence_score,
            'Treatment': treat,
        })

    # Passing the column list keeps the column order fixed and yields the
    # full set of columns even when no rows were generated.
    return pd.DataFrame(rows, columns=_PATIENT_COLUMNS)

def main():
    """Render the Streamlit page for generating and downloading the data.

    Shows a patient-count input and a 'Generate Data' button. The generated
    DataFrame is kept in ``st.session_state`` and rendered from there, together
    with two CSV download buttons: one with the 'Treatment' column as-is and
    one with it renamed to 'CheckTreatment'.

    The data is stored in session state because Streamlit reruns the script on
    every widget interaction and ``st.button`` is only True for the run caused
    by its own click; rendering directly under the button would make the table
    and download buttons vanish as soon as a download button is clicked.
    """
    st.title('Synthetic Breast Cancer Patient Data Generator')
    st.write('This app generates synthetic breast cancer patient data based on NCCN guidelines.')

    # User inputs
    num_patients = st.number_input('Number of Patients to Generate', min_value=10, max_value=10000, value=100, step=10)

    if st.button('Generate Data'):
        # Persist the result so it survives the reruns triggered by other widgets.
        st.session_state['generated_df'] = generate_realistic_data(num_patients=num_patients)

    df = st.session_state.get('generated_df')
    if df is None:
        # Nothing generated yet in this session.
        return

    # Report the size of the data actually held, which may differ from the
    # current value of the number input if it was changed after generating.
    st.success(f'Generated data for {len(df)} patients.')

    # Display DataFrame
    st.dataframe(df)

    # Provide download link for data with Treatment column
    csv_with_treatment = df.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download data as CSV with Treatment",
        data=csv_with_treatment,
        file_name='synthetic_breast_cancer_data_with_treatment.csv',
        mime='text/csv',
    )

    # Provide download link for data with Treatment column renamed to CheckTreatment
    df_check_treatment = df.rename(columns={'Treatment': 'CheckTreatment'})
    csv_check_treatment = df_check_treatment.to_csv(index=False).encode('utf-8')
    st.download_button(
        label="Download data as CSV with CheckTreatment",
        data=csv_check_treatment,
        file_name='synthetic_breast_cancer_data_with_check_treatment.csv',
        mime='text/csv',
    )

# Run the app only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()