Spaces:
Sleeping
Sleeping
eaglelandsonce
committed on
Create 15_Breast_Cancer_Data.py
Browse files- pages/15_Breast_Cancer_Data.py +183 -0
pages/15_Breast_Cancer_Data.py
ADDED
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
import pandas as pd
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
# Seed for reproducibility
|
6 |
+
# Seed NumPy's global random state when the script runs, so the np.random.* draws
# made by the generator functions in this file follow a repeatable sequence.
np.random.seed(42)
|
7 |
+
|
8 |
+
# Function to generate synthetic data
|
9 |
+
# Column order of the generated DataFrame (also used when zero patients are requested).
_PATIENT_COLUMNS = (
    'Patient ID',
    'Age',
    'Menopausal Status',
    'Tumor Size (cm)',
    'Lymph Node Involvement',
    'Tumor Grade',
    'Tumor Stage',
    'ER Status',
    'PR Status',
    'HER2 Status',
    'Ki-67 Level',
    'TNBC Status',
    'BRCA Mutation',
    'Overall Health',
    'Genomic Recurrence Score',
    'Treatment',
)


def _assign_stage(tumor_size, lymph_node):
    """Return stage 'I', 'II' or 'III' from tumor size (cm) and nodal status."""
    if lymph_node == 'Positive' or tumor_size > 5.0:
        return 'III'
    if lymph_node == 'Negative' and tumor_size <= 2.0:
        return 'I'
    return 'II'


def _recommend_treatment(stage, er, her2, tnbc, brca, recurrence_score, health):
    """Return the treatment description string for one synthetic patient."""
    if stage in ('I', 'II'):
        if tnbc == 'Positive':
            parp = ', plus PARP Inhibitors' if brca == 'Positive' else ''
            return 'Surgery, Chemotherapy, and Radiation Therapy' + parp
        if er == 'Positive' and recurrence_score != 'N/A':
            if recurrence_score == 'High':
                return 'Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy'
            if recurrence_score == 'Intermediate':
                return 'Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy'
            return 'Surgery, Hormone Therapy, and Radiation Therapy'
        if her2 == 'Positive':
            return 'Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy'
        return 'Surgery, Chemotherapy, and Radiation Therapy'

    if stage == 'III':
        her2_part = ', HER2-Targeted Therapy' if her2 == 'Positive' else ''
        hormone_part = ', Hormone Therapy' if er == 'Positive' else ''
        return 'Neoadjuvant Chemotherapy, Surgery, Radiation Therapy' + her2_part + hormone_part

    # Remaining stage (IV): systemic options are only listed for patients in good health.
    if health != 'Good':
        return 'Palliative Care Only'
    options = []
    if er == 'Positive':
        options.append('Hormone Therapy')
    if her2 == 'Positive':
        options.append('HER2-Targeted Therapy')
    if tnbc == 'Positive':
        options.append('Chemotherapy')
    return 'Systemic Therapy (' + ', '.join(options) + '), Palliative Care'


def generate_realistic_data(num_patients=100):
    """Generate a synthetic breast cancer patient table.

    Every value is drawn from NumPy's global random state (``np.random.*``),
    so seed it beforehand for repeatable output. The sequence of random draws
    per patient is significant: several draws are skipped by short-circuit
    conditions, and reordering them would change seeded results.

    Args:
        num_patients: Number of rows to generate. Patient IDs run from 1 to
            ``num_patients``.

    Returns:
        A ``pandas.DataFrame`` with one row per patient and the columns listed
        in ``_PATIENT_COLUMNS``. With ``num_patients`` of 0 the frame is empty
        but still has all columns.
    """
    rand = np.random.rand
    choice = np.random.choice
    status_options = ['Positive', 'Negative']

    records = []
    for patient_id in range(1, num_patients + 1):
        # Age: normal(60, 10), truncated to an integer and clamped to 30..80.
        age = int(np.random.normal(60, 10))
        age = max(30, min(age, 80))

        menopausal = 'Post-menopausal' if age >= 50 else 'Pre-menopausal'

        # Tumor size in cm, log-normally distributed, kept to two decimals.
        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
        is_large = tumor_size > 2.0

        # Exactly one uniform draw; larger tumors are more likely node-positive.
        node_probability = 0.6 if is_large else 0.3
        lymph_node = 'Positive' if rand() < node_probability else 'Negative'

        grade = choice([1, 2, 3], p=[0.1, 0.4, 0.5] if is_large else [0.3, 0.5, 0.2])

        # Base stage from size/nodes, then a 5% chance of overriding to stage IV.
        stage = _assign_stage(tumor_size, lymph_node)
        if rand() < 0.05:
            stage = 'IV'

        # PR can only be positive when ER is positive (draw skipped otherwise).
        er = choice(status_options, p=[0.75, 0.25])
        pr = 'Positive' if er == 'Positive' and rand() > 0.1 else 'Negative'

        her2 = choice(status_options, p=[0.3, 0.7] if grade == 3 else [0.15, 0.85])

        ki67 = 'High' if grade == 3 and rand() < 0.8 else 'Low'

        all_negative = er == 'Negative' and pr == 'Negative' and her2 == 'Negative'
        tnbc = 'Positive' if all_negative else 'Negative'

        brca_eligible = tnbc == 'Positive' or age < 40
        brca = 'Positive' if brca_eligible and rand() < 0.2 else 'Negative'

        health = 'Good' if age < 65 and rand() < 0.9 else 'Poor'

        # A recurrence score is only drawn for ER-positive, HER2-negative patients.
        if er == 'Positive' and her2 == 'Negative':
            recurrence_score = choice(['Low', 'Intermediate', 'High'], p=[0.6, 0.3, 0.1])
        else:
            recurrence_score = 'N/A'

        records.append({
            'Patient ID': patient_id,
            'Age': age,
            'Menopausal Status': menopausal,
            'Tumor Size (cm)': tumor_size,
            'Lymph Node Involvement': lymph_node,
            'Tumor Grade': grade,
            'Tumor Stage': stage,
            'ER Status': er,
            'PR Status': pr,
            'HER2 Status': her2,
            'Ki-67 Level': ki67,
            'TNBC Status': tnbc,
            'BRCA Mutation': brca,
            'Overall Health': health,
            'Genomic Recurrence Score': recurrence_score,
            'Treatment': _recommend_treatment(
                stage, er, her2, tnbc, brca, recurrence_score, health
            ),
        })

    return pd.DataFrame(records, columns=list(_PATIENT_COLUMNS))
|
140 |
+
|
141 |
+
# Function to generate fuzzy data
|
142 |
+
def generate_fuzzy_data(df, error_rate=0.1):
    """Return a copy of ``df`` with randomly injected errors.

    ``int(rows * cols * error_rate)`` cells are picked at random (with
    replacement, using NumPy's global random state) and altered as follows:

    * 'Pre-menopausal' / 'Post-menopausal' and 'Positive' / 'Negative' strings
      are flipped to their opposite.
    * Other strings are left unchanged.
    * ``int`` / ``float`` values get Gaussian noise whose standard deviation
      is 10% of the value's magnitude.

    NOTE(review): values read from integer-dtype columns are NumPy integer
    scalars, which are not ``int`` instances, so those cells are not perturbed.
    Confirm whether integer columns were meant to receive noise.

    Args:
        df: Source DataFrame; it is not modified.
        error_rate: Fraction of cells to attempt to corrupt.

    Returns:
        A new DataFrame of the same shape as ``df``.
    """
    flips = {
        'Post-menopausal': 'Pre-menopausal',
        'Pre-menopausal': 'Post-menopausal',
        'Positive': 'Negative',
        'Negative': 'Positive',
    }

    fuzzy_df = df.copy()
    num_rows, num_cols = fuzzy_df.shape
    num_errors = int(num_rows * num_cols * error_rate)

    for _ in range(num_errors):
        row = np.random.randint(0, num_rows)
        col = np.random.randint(0, num_cols)
        value = fuzzy_df.iloc[row, col]

        if isinstance(value, str):
            flipped = flips.get(value)
            if flipped is not None:
                fuzzy_df.iloc[row, col] = flipped
        elif isinstance(value, (int, float)):
            # The scale must be non-negative; use the magnitude so negative
            # values are perturbed instead of raising ValueError.
            noise = np.random.normal(0, 0.1 * abs(value))
            fuzzy_df.iloc[row, col] = value + noise

    return fuzzy_df
|
163 |
+
|
164 |
+
def main():
    """Render the Streamlit page for generating and downloading the datasets.

    Shows a patient-count input and a 'Generate Data' button. On click, a clean
    dataset and a fuzzy copy (error rate 0.1) are generated and stored in
    ``st.session_state``. Keeping them there lets the tables and download
    buttons survive the script rerun that any later widget interaction
    (including clicking a download button) triggers; a plain ``st.button``
    is only true for the single run right after it is clicked.
    """
    st.title('Synthetic Data Generator: Clean and Fuzzy (Noisy)')
    st.write('This app generates synthetic breast cancer patient data and provides downloads for both clean and fuzzy datasets.')

    num_patients = st.number_input(
        'Number of Patients to Generate',
        min_value=10,
        max_value=10000,
        value=100,
        step=10,
    )

    # Session-state keys are prefixed because the state is shared by the whole app.
    perfect_key = 'breast_cancer_perfect_data'
    fuzzy_key = 'breast_cancer_fuzzy_data'

    if st.button('Generate Data'):
        generated = generate_realistic_data(num_patients)
        st.session_state[perfect_key] = generated
        st.session_state[fuzzy_key] = generate_fuzzy_data(generated, error_rate=0.1)

    perfect_data = st.session_state.get(perfect_key)
    fuzzy_data = st.session_state.get(fuzzy_key)
    if perfect_data is None or fuzzy_data is None:
        # Nothing has been generated in this session yet.
        return

    st.subheader('Perfect Data')
    st.dataframe(perfect_data)
    st.download_button(
        'Download Perfect Data',
        perfect_data.to_csv(index=False),
        file_name='perfect_data.csv',
        mime='text/csv',
    )

    st.subheader('Fuzzy Data (10% Error Rate)')
    st.dataframe(fuzzy_data)
    st.download_button(
        'Download Fuzzy Data',
        fuzzy_data.to_csv(index=False),
        file_name='fuzzy_data.csv',
        mime='text/csv',
    )
|
181 |
+
|
182 |
+
# Script entry point: build the page only when this file is executed directly.
if __name__ == '__main__':
    main()
|