eaglelandsonce committed on
Commit
1d766b2
·
verified ·
1 Parent(s): 1c0dc24

Create 15_Breast_Cancer_Data.py

Browse files
Files changed (1) hide show
  1. pages/15_Breast_Cancer_Data.py +183 -0
pages/15_Breast_Cancer_Data.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+
5
# Fix the seed of NumPy's global random generator so that every run of this
# script produces the same synthetic dataset.
np.random.seed(42)
7
+
8
+ # Function to generate synthetic data
9
def _select_treatment(stage, er, her2, tnbc, brca, health, recurrence_score):
    """Return the treatment plan string for one patient's disease profile.

    Pure function: it draws no random numbers, it only maps the already
    generated attributes to a plan description.
    """
    if stage in ('I', 'II'):
        if tnbc == 'Positive':
            plan = 'Surgery, Chemotherapy, and Radiation Therapy'
            if brca == 'Positive':
                plan += ', plus PARP Inhibitors'
            return plan
        if er == 'Positive' and recurrence_score != 'N/A':
            if recurrence_score == 'High':
                return 'Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy'
            if recurrence_score == 'Intermediate':
                return 'Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy'
            return 'Surgery, Hormone Therapy, and Radiation Therapy'
        if her2 == 'Positive':
            return 'Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy'
        return 'Surgery, Chemotherapy, and Radiation Therapy'

    if stage == 'III':
        plan = 'Neoadjuvant Chemotherapy, Surgery, Radiation Therapy'
        if her2 == 'Positive':
            plan += ', HER2-Targeted Therapy'
        if er == 'Positive':
            plan += ', Hormone Therapy'
        return plan

    # Remaining case is stage IV: systemic therapy only for patients in good health.
    if health != 'Good':
        return 'Palliative Care Only'
    systemic = []
    if er == 'Positive':
        systemic.append('Hormone Therapy')
    if her2 == 'Positive':
        systemic.append('HER2-Targeted Therapy')
    if tnbc == 'Positive':
        systemic.append('Chemotherapy')
    return 'Systemic Therapy (' + ', '.join(systemic) + '), Palliative Care'


def generate_realistic_data(num_patients=100):
    """Generate a synthetic breast cancer patient table.

    Each patient is simulated independently with NumPy's global random
    generator; later attributes are conditioned on earlier ones (for example
    lymph node involvement depends on tumor size, and the treatment depends
    on stage and receptor status).

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        A pandas DataFrame with one row per patient and one column per
        simulated attribute, ending with the derived 'Treatment' column.
    """
    # One list per output column, in the final column order.
    columns = {
        name: []
        for name in (
            'Patient ID',
            'Age',
            'Menopausal Status',
            'Tumor Size (cm)',
            'Lymph Node Involvement',
            'Tumor Grade',
            'Tumor Stage',
            'ER Status',
            'PR Status',
            'HER2 Status',
            'Ki-67 Level',
            'TNBC Status',
            'BRCA Mutation',
            'Overall Health',
            'Genomic Recurrence Score',
            'Treatment',
        )
    }

    # NOTE: the order of the random draws below determines the output for a
    # given seed, so the statements must stay in this sequence.
    for index in range(num_patients):
        columns['Patient ID'].append(index + 1)

        # Age: normal around 60, clamped to the 30-80 range.
        age = int(np.random.normal(60, 10))
        age = min(max(age, 30), 80)
        columns['Age'].append(age)

        columns['Menopausal Status'].append(
            'Pre-menopausal' if age < 50 else 'Post-menopausal'
        )

        # Tumor size in cm, log-normally distributed.
        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)
        columns['Tumor Size (cm)'].append(tumor_size)
        is_large = tumor_size > 2.0

        # Larger tumors are more likely to have positive lymph nodes.
        node_probability = 0.6 if is_large else 0.3
        if np.random.rand() < node_probability:
            lymph_node = 'Positive'
        else:
            lymph_node = 'Negative'
        columns['Lymph Node Involvement'].append(lymph_node)

        # Larger tumors skew towards higher grades.
        grade_weights = [0.1, 0.4, 0.5] if is_large else [0.3, 0.5, 0.2]
        grade = np.random.choice([1, 2, 3], p=grade_weights)
        columns['Tumor Grade'].append(grade)

        # Stage from size and nodes, then a 5% chance of being stage IV.
        if lymph_node == 'Positive' or tumor_size > 5.0:
            stage = 'III'
        elif tumor_size <= 2.0:
            stage = 'I'
        else:
            stage = 'II'
        if np.random.rand() < 0.05:
            stage = 'IV'
        columns['Tumor Stage'].append(stage)

        # Hormone receptors: PR can only be positive when ER is positive.
        er = np.random.choice(['Positive', 'Negative'], p=[0.75, 0.25])
        pr = 'Negative'
        if er == 'Positive' and np.random.rand() > 0.1:
            pr = 'Positive'
        columns['ER Status'].append(er)
        columns['PR Status'].append(pr)

        # HER2 positivity is more likely for grade 3 tumors.
        her2_weights = [0.3, 0.7] if grade == 3 else [0.15, 0.85]
        her2 = np.random.choice(['Positive', 'Negative'], p=her2_weights)
        columns['HER2 Status'].append(her2)

        ki67 = 'Low'
        if grade == 3 and np.random.rand() < 0.8:
            ki67 = 'High'
        columns['Ki-67 Level'].append(ki67)

        # Triple-negative: ER, PR and HER2 are all negative.
        if all(marker == 'Negative' for marker in (er, pr, her2)):
            tnbc = 'Positive'
        else:
            tnbc = 'Negative'
        columns['TNBC Status'].append(tnbc)

        brca = 'Negative'
        if (tnbc == 'Positive' or age < 40) and np.random.rand() < 0.2:
            brca = 'Positive'
        columns['BRCA Mutation'].append(brca)

        health = 'Poor'
        if age < 65 and np.random.rand() < 0.9:
            health = 'Good'
        columns['Overall Health'].append(health)

        # A recurrence score is only drawn for ER-positive, HER2-negative cases.
        if er == 'Positive' and her2 == 'Negative':
            recurrence_score = np.random.choice(
                ['Low', 'Intermediate', 'High'], p=[0.6, 0.3, 0.1]
            )
        else:
            recurrence_score = 'N/A'
        columns['Genomic Recurrence Score'].append(recurrence_score)

        columns['Treatment'].append(
            _select_treatment(stage, er, her2, tnbc, brca, health, recurrence_score)
        )

    return pd.DataFrame(columns)
140
+
141
+ # Function to generate fuzzy data
142
def generate_fuzzy_data(df, error_rate=0.1, protected_columns=('Patient ID',)):
    """Return a copy of *df* with random errors injected into some cells.

    ``int(rows * cols * error_rate)`` cells are sampled uniformly at random
    (with replacement) using NumPy's global random generator. Depending on
    the value found in the sampled cell:

    * 'Pre-menopausal' / 'Post-menopausal' and 'Positive' / 'Negative' are
      flipped to their opposite;
    * floating-point values receive Gaussian noise with a standard deviation
      of 10% of their magnitude;
    * integer values receive the same noise rounded to a whole number, so
      the column keeps its integer dtype;
    * any other value is left unchanged.

    Because cells are sampled with replacement and many cells are not
    eligible for a change, the fraction of cells that actually differ from
    the input is lower than ``error_rate``.

    Args:
        df: Source DataFrame. It is not modified.
        error_rate: Fraction of the total cell count to sample.
        protected_columns: Iterable of column names that must never be
            altered (identifiers by default).

    Returns:
        A new DataFrame of the same shape as *df* containing the errors.
    """
    fuzzy_df = df.copy()
    num_rows, num_cols = fuzzy_df.shape

    protected_names = set(protected_columns)
    protected_positions = {
        position
        for position, name in enumerate(fuzzy_df.columns)
        if name in protected_names
    }

    for _ in range(int(num_rows * num_cols * error_rate)):
        row = np.random.randint(0, num_rows)
        col = np.random.randint(0, num_cols)
        if col in protected_positions:
            continue

        value = fuzzy_df.iloc[row, col]

        if isinstance(value, str):
            if value == 'Pre-menopausal':
                fuzzy_df.iloc[row, col] = 'Post-menopausal'
            elif value == 'Post-menopausal':
                fuzzy_df.iloc[row, col] = 'Pre-menopausal'
            elif value == 'Positive':
                fuzzy_df.iloc[row, col] = 'Negative'
            elif value == 'Negative':
                fuzzy_df.iloc[row, col] = 'Positive'
        elif isinstance(value, (float, np.floating)):
            # abs() keeps the scale non-negative; np.random.normal rejects
            # a negative standard deviation.
            noise = np.random.normal(0, 0.1 * abs(value))
            fuzzy_df.iloc[row, col] = value + noise
        elif isinstance(value, (int, np.integer)):
            # Values read from integer columns are NumPy integer scalars,
            # which are not instances of the built-in int, so they need an
            # explicit check. The noise is rounded to preserve the dtype.
            noise = np.random.normal(0, 0.1 * abs(value))
            fuzzy_df.iloc[row, col] = int(value) + int(round(noise))

    return fuzzy_df
163
+
164
def main():
    """Render the Streamlit page for generating and downloading the datasets.

    The user picks a patient count and clicks 'Generate Data'; the clean
    dataset and its noisy counterpart are then shown with CSV download
    buttons.

    The generated frames are kept in ``st.session_state``. Streamlit reruns
    the script on every widget interaction and ``st.button`` is only True on
    the run triggered by its own click, so without this the tables and
    download buttons would vanish as soon as a download button is clicked.
    """
    st.title('Synthetic Data Generator: Clean and Fuzzy (Noisy)')
    st.write('This app generates synthetic breast cancer patient data and provides downloads for both clean and fuzzy datasets.')

    num_patients = st.number_input(
        'Number of Patients to Generate',
        min_value=10,
        max_value=10000,
        value=100,
        step=10,
    )

    # Keys are prefixed to avoid clashing with other pages of the app that
    # share the same session state.
    perfect_key = 'breast_cancer_perfect_data'
    fuzzy_key = 'breast_cancer_fuzzy_data'

    if st.button('Generate Data'):
        generated = generate_realistic_data(int(num_patients))
        st.session_state[perfect_key] = generated
        st.session_state[fuzzy_key] = generate_fuzzy_data(generated, error_rate=0.1)

    perfect_data = st.session_state.get(perfect_key)
    fuzzy_data = st.session_state.get(fuzzy_key)
    if perfect_data is None or fuzzy_data is None:
        # Nothing generated yet in this session.
        return

    st.subheader('Perfect Data')
    st.dataframe(perfect_data)
    st.download_button(
        'Download Perfect Data',
        perfect_data.to_csv(index=False),
        file_name='perfect_data.csv',
        mime='text/csv',
    )

    st.subheader('Fuzzy Data (10% Error Rate)')
    st.dataframe(fuzzy_data)
    st.download_button(
        'Download Fuzzy Data',
        fuzzy_data.to_csv(index=False),
        file_name='fuzzy_data.csv',
        mime='text/csv',
    )
181
+
182
# Entry point: build the page only when this file is executed as the main script.
if __name__ == "__main__":
    main()