eaglelandsonce committed on
Commit
95a0a03
·
verified ·
1 Parent(s): 6e8dad4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -135
app.py CHANGED
@@ -5,142 +5,126 @@ import numpy as np
5
  # Seed NumPy's global random generator so repeated runs produce the same data
6
  np.random.seed(42)
7
 
8
# Function to generate synthetic breast cancer data
def _assign_stage(tumor_size, lymph_node):
    """Return the stage ("I", "II" or "III") implied by tumor size and nodal status."""
    if lymph_node == "Positive" or tumor_size > 5.0:
        return "III"
    # Node-negative tumors of at most 5 cm: small ones are stage I, the rest stage II.
    return "I" if tumor_size <= 2.0 else "II"


def _recommend_treatment(stage, er, her2, tnbc, recurrence_score):
    """Return the treatment description for one patient's stage and marker profile."""
    if stage in ["I", "II"]:
        if tnbc == "Positive":
            return "Surgery, Chemotherapy, and Radiation Therapy"
        if er == "Positive" and recurrence_score != "N/A":
            if recurrence_score == "High":
                return "Surgery, Chemotherapy, Hormone Therapy, and Radiation Therapy"
            if recurrence_score == "Intermediate":
                return "Surgery, Consider Chemotherapy, Hormone Therapy, and Radiation Therapy"
            return "Surgery, Hormone Therapy, and Radiation Therapy"
        if her2 == "Positive":
            return "Surgery, HER2-Targeted Therapy, Chemotherapy, and Radiation Therapy"
        return "Surgery, Chemotherapy, and Radiation Therapy"
    if stage == "III":
        return (
            "Neoadjuvant Chemotherapy, Surgery, Radiation Therapy"
            + (", HER2-Targeted Therapy" if her2 == "Positive" else "")
            + (", Hormone Therapy" if er == "Positive" else "")
        )
    return "Systemic Therapy (Palliative Care)"


def generate_breast_cancer_data(num_patients=100):
    """Generate a table of synthetic breast cancer patients.

    Each patient is simulated in turn; later attributes depend on earlier ones
    (for example stage on tumor size and nodal status, treatment on stage and
    receptor status). Random numbers come from NumPy's global generator, so
    seeding it beforehand makes the output reproducible.

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        A pandas DataFrame with one row per patient and 16 columns, from
        "Patient ID" through "Treatment".
    """
    columns = [
        "Patient ID",
        "Age",
        "Menopausal Status",
        "Tumor Size (cm)",
        "Lymph Node Involvement",
        "Tumor Grade",
        "Tumor Stage",
        "ER Status",
        "PR Status",
        "HER2 Status",
        "Ki-67 Level",
        "TNBC Status",
        "BRCA Mutation",
        "Overall Health",
        "Genomic Recurrence Score",
        "Treatment",
    ]
    rows = []

    # NOTE: the order of the random draws below is deliberate; changing it
    # changes the data produced for a given seed.
    for i in range(num_patients):
        # Age is normally distributed around 60 and clamped to 30..80.
        age = max(30, min(int(np.random.normal(60, 10)), 80))
        menopausal = "Post-menopausal" if age >= 50 else "Pre-menopausal"

        tumor_size = round(np.random.lognormal(mean=0.7, sigma=0.5), 2)

        # Larger tumors are more likely to be node-positive.
        node_positive_rate = 0.6 if tumor_size > 2.0 else 0.3
        lymph_node = "Positive" if np.random.rand() < node_positive_rate else "Negative"

        grade = np.random.choice([1, 2, 3], p=[0.1, 0.4, 0.5] if tumor_size > 2.0 else [0.3, 0.5, 0.2])

        stage = _assign_stage(tumor_size, lymph_node)
        # A small share of patients is metastatic regardless of local findings.
        if np.random.rand() < 0.05:
            stage = "IV"

        er = np.random.choice(["Positive", "Negative"], p=[0.75, 0.25])
        # PR can only be positive when ER is positive.
        pr = "Positive" if er == "Positive" and np.random.rand() > 0.1 else "Negative"

        her2 = np.random.choice(["Positive", "Negative"], p=[0.3, 0.7] if grade == 3 else [0.15, 0.85])

        ki67 = "High" if grade == 3 and np.random.rand() < 0.8 else "Low"

        # Triple-negative means ER, PR and HER2 are all negative.
        tnbc = "Positive" if er == "Negative" and pr == "Negative" and her2 == "Negative" else "Negative"

        brca = "Positive" if (tnbc == "Positive" or age < 40) and np.random.rand() < 0.2 else "Negative"

        health = "Good" if age < 65 and np.random.rand() < 0.9 else "Poor"

        # A recurrence score is only assigned to ER-positive, HER2-negative patients.
        recurrence_score = (
            np.random.choice(["Low", "Intermediate", "High"], p=[0.6, 0.3, 0.1])
            if er == "Positive" and her2 == "Negative"
            else "N/A"
        )

        rows.append(
            {
                "Patient ID": f"BC_{i+1:05d}",
                "Age": age,
                "Menopausal Status": menopausal,
                "Tumor Size (cm)": tumor_size,
                "Lymph Node Involvement": lymph_node,
                "Tumor Grade": grade,
                "Tumor Stage": stage,
                "ER Status": er,
                "PR Status": pr,
                "HER2 Status": her2,
                "Ki-67 Level": ki67,
                "TNBC Status": tnbc,
                "BRCA Mutation": brca,
                "Overall Health": health,
                "Genomic Recurrence Score": recurrence_score,
                "Treatment": _recommend_treatment(stage, er, her2, tnbc, recurrence_score),
            }
        )

    # Passing columns explicitly keeps the schema intact when no rows were generated.
    return pd.DataFrame(rows, columns=columns)
132
 
 
 
 
 
 
133
 
134
# Main Streamlit App
st.title("Synthetic Data Generator")
num_patients = st.slider("Number of Patients to Generate", 10, 1000, 100)

# st.button is True only during the rerun triggered by its own click, and
# clicking the download button triggers another rerun. Keeping the generated
# frame in session state lets the table and download button survive that rerun.
if st.button("Generate Data"):
    st.session_state["breast_cancer_data"] = generate_breast_cancer_data(num_patients)

breast_cancer_data = st.session_state.get("breast_cancer_data")
if breast_cancer_data is not None:
    st.write("Breast Cancer Data")
    st.dataframe(breast_cancer_data)
    st.download_button(
        "Download Breast Cancer Data",
        data=breast_cancer_data.to_csv(index=False),
        file_name="breast_cancer_data.csv",
        mime="text/csv",
    )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  # Seed NumPy's global random generator so repeated runs produce the same data
6
  np.random.seed(42)
7
 
8
# Function to generate synthetic Enrollments
def generate_enrollments(num_members):
    """Generate a synthetic enrollment table with one row per member.

    Args:
        num_members: Number of enrollment rows to generate.

    Returns:
        A pandas DataFrame whose MEMBER_ID and PRIMARY_PERSON_KEY columns both
        hold the keys "PPK_00001", "PPK_00002", ...; every other column is
        drawn independently from NumPy's global random generator.
    """
    primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
    # Sample only real calendar months (YYYYMM). A plain integer draw over
    # 202201..202412 would also produce impossible values such as 202250.
    valid_yearmo = [year * 100 + month for year in (2022, 2023, 2024) for month in range(1, 13)]
    enrollments_data = {
        "MEM_AGE": np.random.randint(18, 80, num_members),
        "MEM_MSA_NAME": np.random.choice(["DETROIT", "HONOLULU", "LOS ANGELES"], num_members),
        "MEM_STAT": np.random.choice(["ACTIVE", "INACTIVE"], num_members),
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
        "PAYER_LOB": np.random.choice(["MEDICAID", "COMMERCIAL", "MEDICARE"], num_members),
        "PAYER_TYPE": np.random.choice(["PPO", "HMO"], num_members),
        "PRIMARY_CHRONIC_CONDITION_ROLLUP_DESC": np.random.choice(["Cancer", "Diabetes", "Hypertension"], num_members),
        "Count of PRIMARY_CHRONIC_CONDITION_ROLLUP_ID": np.random.randint(1, 5, num_members),
        "PROD_TYPE": np.random.choice(["DENTAL", "VISION", "MEDICAL"], num_members),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_members),
        "YEARMO": np.random.choice(valid_yearmo, num_members),
    }
    return pd.DataFrame(enrollments_data)
26
+
27
# Function to generate synthetic Members
def generate_members(num_members):
    """Generate a synthetic member demographics table with one row per member.

    Args:
        num_members: Number of member rows to generate.

    Returns:
        A pandas DataFrame keyed by "PPK_00001", "PPK_00002", ... in both the
        MEMBER_ID and PRIMARY_PERSON_KEY columns. MEM_STATE always matches the
        state of the row's MEM_MSA_NAME; MEM_ETHNICITY and MEM_RACE may be None.
    """
    primary_keys = [f"PPK_{i+1:05d}" for i in range(num_members)]
    # Each metro area lies in exactly one state, so the state is derived from
    # the MSA rather than drawn independently (which paired e.g. DETROIT with HI).
    msa_to_state = {"DETROIT": "MI", "HONOLULU": "HI", "LOS ANGELES": "CA"}

    ethnicities = np.random.choice(["Hispanic", "Non-Hispanic", None], num_members)
    msa_names = np.random.choice(list(msa_to_state), num_members)
    races = np.random.choice(["White", "Black", "Asian", None], num_members)
    zip3 = np.random.randint(100, 999, num_members)

    members_data = {
        "MEM_ETHNICITY": ethnicities,
        "MEM_GENDER": ["F"] * num_members,
        "MEM_MSA_NAME": msa_names,
        "MEM_RACE": races,
        "MEM_STATE": [msa_to_state[msa] for msa in msa_names],
        "MEM_ZIP3": zip3,
        "MEMBER_ID": primary_keys,
        "PRIMARY_PERSON_KEY": primary_keys,
    }
    return pd.DataFrame(members_data)
41
+
42
# Function to generate synthetic Providers
def generate_providers(num_providers):
    """Build a synthetic provider table.

    Args:
        num_providers: Number of provider rows to generate.

    Returns:
        A pandas DataFrame with one row per provider. PROV_KEY runs
        "PK_00001", "PK_00002", ...; the remaining columns are random draws
        from NumPy's global generator.
    """
    # Draws are made in column order so a seeded run yields the same values.
    clinic_states = np.random.choice(["MI", "HI", "CA"], num_providers)
    clinic_zips = np.random.randint(10000, 99999, num_providers)
    provider_keys = [f"PK_{number:05d}" for number in range(1, num_providers + 1)]
    npi_org_sums = np.random.randint(1, 50, num_providers)
    taxonomies = np.random.choice(["208100000X", "207RE0101X"], num_providers)
    provider_types = np.random.choice(["Type1", "Type2"], num_providers)

    return pd.DataFrame(
        {
            "PROV_CLINIC_STATE": clinic_states,
            "PROV_CLINIC_ZIP": clinic_zips,
            "PROV_KEY": provider_keys,
            "Sum of PROV_NPI_ORG": npi_org_sums,
            "PROV_TAXONOMY": taxonomies,
            "PROV_TYPE": provider_types,
        }
    )
53
+
54
# Function to generate synthetic Services
def generate_services(num_services, primary_keys, provider_keys=None):
    """Generate a synthetic service (claim line) table.

    Args:
        num_services: Number of service rows to generate.
        primary_keys: Member keys to sample PRIMARY_PERSON_KEY from.
        provider_keys: Optional provider keys to sample ATT_PROV_KEY and
            BILL_PROV_KEY from. Pass the PROV_KEY values of the provider table
            so every service references an existing provider. When omitted,
            keys "PK_00001" .. "PK_<len(primary_keys)>" are used.

    Returns:
        A pandas DataFrame with one row per service.
    """
    if provider_keys is None:
        # Fallback pool sized by the member count. The previous index draw,
        # randint(1, n) followed by i+1, could never produce PK_00001 and
        # raised ValueError when there was only one member.
        provider_keys = [f"PK_{i+1:05d}" for i in range(len(primary_keys))]
    provider_keys = list(provider_keys)

    # Sample only real calendar months (YYYYMM). A plain integer draw over
    # 202201..202412 would also produce impossible values such as 202250.
    valid_yearmo = [year * 100 + month for year in (2022, 2023, 2024) for month in range(1, 13)]

    services_data = {
        "PRIMARY_PERSON_KEY": np.random.choice(primary_keys, num_services),
        "Sum of AMT_ALLOWED": np.random.uniform(1000, 10000, num_services),
        "Sum of AMT_BILLED": np.random.uniform(1000, 15000, num_services),
        "Count of AMT_PAID": np.random.randint(1, 5, num_services),
        "ATT_PROV_KEY": np.random.choice(provider_keys, num_services),
        "BILL_PROV_KEY": np.random.choice(provider_keys, num_services),
        "CLAIM_IN_NETWORK": np.random.choice(["Y", "N", None], num_services),
        "RELATION": np.random.choice(["SUBSCRIBER", "DEPENDENT"], num_services),
        "SERVICE_SETTING": np.random.choice(["OUTPATIENT", "INPATIENT"], num_services),
        "Sum of SERVICE_LINE": np.random.randint(1, 10, num_services),
        "Sum of SV_UNITS": np.random.randint(1, 100, num_services),
        "YEARMO": np.random.choice(valid_yearmo, num_services),
    }
    return pd.DataFrame(services_data)
71
+
72
# Function to generate synthetic BreastCancer data
def generate_breast_cancer_data(num_patients):
    """Generate a synthetic breast cancer table with one row per patient.

    Most columns are independent random draws from NumPy's global generator.
    Two columns are derived so that each record is self-consistent:
    "Menopausal Status" follows from "Age" (post-menopausal from 50 onwards),
    and "TNBC Status" is positive exactly when ER, PR and HER2 are all negative.

    Args:
        num_patients: Number of patient rows to generate.

    Returns:
        A pandas DataFrame with 16 columns; "Patient ID" runs "PPK_00001",
        "PPK_00002", ...
    """
    patient_ids = [f"PPK_{i+1:05d}" for i in range(num_patients)]

    ages = np.random.randint(30, 80, num_patients)
    er_status = np.random.choice(["Positive", "Negative"], num_patients)
    pr_status = np.random.choice(["Positive", "Negative"], num_patients)
    her2_status = np.random.choice(["Positive", "Negative"], num_patients)

    # Triple-negative is defined by the three receptor results, so it must not
    # be drawn independently of them.
    triple_negative = (er_status == "Negative") & (pr_status == "Negative") & (her2_status == "Negative")

    breast_cancer_data = {
        "Patient ID": patient_ids,
        "Age": ages,
        "Menopausal Status": np.where(ages >= 50, "Post-menopausal", "Pre-menopausal"),
        "Tumor Size (cm)": np.round(np.random.lognormal(mean=0.7, sigma=0.5, size=num_patients), 2),
        "Lymph Node Involvement": np.random.choice(["Positive", "Negative"], num_patients),
        "Tumor Grade": np.random.choice([1, 2, 3], num_patients),
        "Tumor Stage": np.random.choice(["I", "II", "III", "IV"], num_patients),
        "ER Status": er_status,
        "PR Status": pr_status,
        "HER2 Status": her2_status,
        "Ki-67 Level": np.random.choice(["High", "Low"], num_patients),
        "TNBC Status": np.where(triple_negative, "Positive", "Negative"),
        "BRCA Mutation": np.random.choice(["Positive", "Negative"], num_patients),
        "Overall Health": np.random.choice(["Good", "Poor"], num_patients),
        "Genomic Recurrence Score": np.random.choice(["Low", "Intermediate", "High", "N/A"], num_patients),
        "Treatment": np.random.choice(["Surgery", "Chemotherapy", "Radiation Therapy"], num_patients),
    }
    return pd.DataFrame(breast_cancer_data)
94
 
95
# Main Streamlit App
st.title("Synthetic Medical Data Generator")

# Input parameters
num_members = st.slider("Number of Members to Generate", 10, 1000, 100)
num_providers = st.slider("Number of Providers to Generate", 10, 500, 100)
num_services = st.slider("Number of Services to Generate", 10, 2000, 500)
num_patients = st.slider("Number of Breast Cancer Patients to Generate", 10, 500, 100)

# st.button is True only during the rerun triggered by its own click, and each
# download click triggers another rerun. Keeping the generated frames in
# session state lets the tables and download buttons survive those reruns.
if st.button("Generate Data"):
    enrollments_df = generate_enrollments(num_members)
    members_df = generate_members(num_members)
    providers_df = generate_providers(num_providers)
    services_df = generate_services(num_services, enrollments_df["PRIMARY_PERSON_KEY"].tolist())
    breast_cancer_df = generate_breast_cancer_data(num_patients)

    # Each entry: (section heading, frame, download button label, file name).
    st.session_state["generated_tables"] = [
        ("Enrollments Data", enrollments_df, "Download Enrollments", "enrollments.csv"),
        ("Members Data", members_df, "Download Members", "members.csv"),
        ("Providers Data", providers_df, "Download Providers", "providers.csv"),
        ("Services Data", services_df, "Download Services", "services.csv"),
        ("Breast Cancer Data", breast_cancer_df, "Download Breast Cancer Data", "breast_cancer.csv"),
    ]

# Display data: a preview of the first rows on screen, the full table in each CSV.
for heading, table, button_label, file_name in st.session_state.get("generated_tables", []):
    st.subheader(heading)
    st.dataframe(table.head())
    st.download_button(button_label, table.to_csv(index=False), file_name, mime="text/csv")