vkt1414 committed
Commit 6330aeb · 1 Parent(s): c6d0240

add violin plots, several other enhancements (violin plots of per-patient volume standard deviation before vs. after filtering, a numeric "<= value" filter for connected_volumes, and IDC viewer links in the filtered-data table)

Files changed (1)
  1. filter_data_app.py +128 -57
filter_data_app.py CHANGED
@@ -5,12 +5,12 @@ import pandas as pd
 from upsetplot import UpSet
 import matplotlib.pyplot as plt
 import polars as pl
+from polars import col, lit
 
 # Set page configuration
 st.set_page_config(layout="wide")
 
-# URL and local path to the Parquet file
-PARQUET_URL = 'https://github.com/vkt1414/idc-index-data/releases/download/0.1/qualitative_checks.parquet'
+# Local path to the Parquet file
 LOCAL_PARQUET_FILE = 'qual-checks-and-quant-values.parquet'
 
 @st.cache_data
@@ -27,13 +27,23 @@ def load_data():
         'connected_volumes',
         'Volume from Voxel Summation'
     ]
-    return pl.read_parquet(LOCAL_PARQUET_FILE, columns=cols)
+    df = pl.read_parquet(LOCAL_PARQUET_FILE, columns=cols)
+    df = df.with_columns([
+        pl.when(pl.col('connected_volumes') == 'pass').then(pl.lit(1)).otherwise(
+            pl.col('connected_volumes').cast(pl.Int32, strict=False)
+        ).alias('connected_volumes')
+    ])
+
+    return df
 
 # Function to filter data based on user input
 def filter_data(df, filters):
     for col, value in filters.items():
-        if value:
-            df = df.filter(pl.col(col) == value)
+        if value is not None:
+            if col == 'connected_volumes' and value:
+                df = df.filter((pl.col(col) <= value) & (pl.col(col).is_not_null()))
+            else:
+                df = df.filter(pl.col(col) == value)
     return df
 
 # Function to create an UpSet plot for failed checks
@@ -43,7 +53,7 @@ def create_upset_plot_failures(df):
     # Treat 'pass' and null values as passing
     df = df.set_index(~((df['segmentation_completeness'] == 'pass') | df['segmentation_completeness'].isnull())).set_index(~((df['laterality_check'] == 'pass') | df['laterality_check'].isnull()), append=True)
     df = df.set_index(~((df['series_with_vertabra_on_every_slice'] == 'pass') | df['series_with_vertabra_on_every_slice'].isnull()), append=True)
-    df = df.set_index(~((df['connected_volumes'] == 'pass') | df['connected_volumes'].isnull()), append=True)
+    df = df.set_index(~((df['connected_volumes'] == '1') | df['connected_volumes'].isnull()), append=True)
     df = df[df.index.to_frame().any(axis=1)] # Ignore the case when all conditions are false
 
     fig = plt.figure()
@@ -64,6 +74,13 @@ def create_upset_plot_passes(df):
     upset.plot(fig=fig)
     st.pyplot(fig)
 
+# Function to calculate standard deviation of volumes within a patient
+def calculate_std_dev(df):
+    df=df.to_pandas()
+    # Group by 'PatientID' and calculate the standard deviation of 'Volume from Voxel Summation'
+    std_dev_df = df.groupby(['PatientID','bodyPart'])['Volume from Voxel Summation'].std()
+    return std_dev_df
+
 # Main function to run the Streamlit app
 def main():
     st.title("Qualitative Checks of TotalSegmentator Segmentations on NLST")
@@ -115,13 +132,22 @@ def main():
     # Apply the current filters to update options for other filters
     filtered_df = filter_data(df, filters)
 
+
     # Update options for other filters based on the current selection
     segmentation_completeness_options = [""] + filtered_df['segmentation_completeness'].unique().to_list()
     laterality_check_options = [""] + filtered_df['laterality_check'].unique().to_list()
     series_with_vertabra_on_every_slice_options = [""] + filtered_df['series_with_vertabra_on_every_slice'].unique().to_list()
-    connected_volumes_options = [""] + filtered_df['connected_volumes'].unique().to_list()
+    connected_volumes_options = filtered_df['connected_volumes'].unique().to_list()
     laterality_options = [""] + filtered_df['laterality'].unique().to_list()
-
+
+    laterality = st.selectbox(
+        "Laterality",
+        options=laterality_options,
+        index=laterality_options.index(filters['laterality']) if filters['laterality'] else 0,
+        key='laterality',
+        on_change=lambda: apply_filter('laterality', st.session_state.laterality)
+    )
+
     # Add remaining filters with default values from session state
     segmentation_completeness = st.selectbox(
         "Segmentation Completeness",
@@ -147,62 +173,29 @@ def main():
        on_change=lambda: apply_filter('series_with_vertabra_on_every_slice', st.session_state.series_with_vertabra_on_every_slice)
     )
 
+    # connected_volumes = st.selectbox(
+    #     "Connected Volumes (<= value)",
+    #     options=connected_volumes_options,
+    #     index=connected_volumes_options.index(filters['connected_volumes']) if filters['connected_volumes'] else 0,
+    #     key='connected_volumes',
+    #     on_change=lambda: apply_filter('connected_volumes', st.session_state.connected_volumes)
+    # )
     connected_volumes = st.selectbox(
-        "Connected Volumes",
-        options=connected_volumes_options,
-        index=connected_volumes_options.index(filters['connected_volumes']) if filters['connected_volumes'] else 0,
+        "Connected Volumes (<= value)",
+        options=[None] + connected_volumes_options,
+        index=connected_volumes_options.index(filters['connected_volumes'])+1 if filters['connected_volumes'] else 0,
         key='connected_volumes',
        on_change=lambda: apply_filter('connected_volumes', st.session_state.connected_volumes)
     )
 
-    laterality = st.selectbox(
-        "Laterality",
-        options=laterality_options,
-        index=laterality_options.index(filters['laterality']) if filters['laterality'] else 0,
-        key='laterality',
-        on_change=lambda: apply_filter('laterality', st.session_state.laterality)
-    )
-
     st.session_state.filters = filters
+
+    if laterality:
+        body_part_df = df.filter((col('bodyPart') == lit(body_part)) & (col('laterality') == lit(laterality)))
+    else:
+        body_part_df = df.filter(col('bodyPart') == lit(body_part))
 
-    # Define the pages
-    if page == "Summary":
-        st.header("Summary of Qualitative Checks")
-        # Execute the SQL to get summary statistics
-        summary_df = duckdb.query("""
-        WITH Checks AS (
-            SELECT
-                bodyPart,
-                laterality,
-                COUNT(*) AS total_count,
-                SUM(CASE WHEN segmentation_completeness = 'pass' THEN 1 ELSE 0 END) AS pass_count,
-                SUM(CASE WHEN laterality_check = 'pass' THEN 1 ELSE 0 END) AS laterality_pass_count,
-                SUM(CASE WHEN series_with_vertabra_on_every_slice = 'pass' THEN 1 ELSE 0 END) AS vertabra_pass_count,
-                SUM(CASE WHEN connected_volumes = 'pass' THEN 1 ELSE 0 END) AS volumes_pass_count
-            FROM
-                'qual-checks-and-quant-values.parquet'
-            GROUP BY
-                bodyPart, laterality
-        )
-
-        SELECT
-            bodyPart,
-            laterality,
-            ROUND((pass_count * 100.0) / total_count, 2) || '% (' || pass_count || '/' || total_count || ')' AS segmentation_completeness,
-            CASE WHEN laterality IS NOT NULL
-                THEN ROUND((laterality_pass_count * 100.0) / NULLIF(total_count, 0), 2) || '% (' || laterality_pass_count || '/' || total_count || ')'
-                ELSE 'N/A' END AS laterality_check,
-            ROUND((vertabra_pass_count * 100.0) / total_count, 2) || '% (' || vertabra_pass_count || '/' || total_count || ')' AS vertabra_check,
-            ROUND((volumes_pass_count * 100.0) / total_count, 2) || '% (' || volumes_pass_count || '/' || total_count || ')' AS volumes_check
-        FROM
-            Checks
-        ORDER BY
-            bodyPart, laterality;
-        """).pl()
-        summary_df = summary_df.to_pandas()
-        st.data_editor(summary_df, hide_index=True,use_container_width=True,height=1500)
-
-    elif page == "UpSet Plots":
+
     st.header("UpSet Plots of Qualitative Checks")
 
     # Pagination for the filtered dataframe
@@ -223,6 +216,7 @@ def main():
     start_idx = (page_number - 1) * page_size
     end_idx = min(start_idx + page_size, len(filtered_df)) # Ensure end_idx does not go beyond the dataframe length
     paginated_df = filtered_df[start_idx:end_idx].to_pandas() # Convert to Pandas DataFrame
+    paginated_df['Viewer Url'] = 'https://viewer.imaging.datacommons.cancer.gov/viewer/'+paginated_df['StudyInstanceUID']
 
     # Display the paginated dataframe
     st.header("Filtered Data")
@@ -230,7 +224,16 @@ def main():
 
     st.data_editor(
        paginated_df,
+        column_config={
+            "Viewer Url":st.column_config.LinkColumn("StudyInstanceUID",
+                display_text="https:\/\/viewer\.imaging\.datacommons\.cancer\.gov\/viewer\/(.*)"
+
+            ),
+
+        },
+        column_order=("PatientID", "Viewer Url", "seriesNumber", "bodyPart", "laterality", "segmentation_completeness", "laterality_check", "series_with_vertabra_on_every_slice", "connected_volumes"),
        hide_index=True,
+        use_container_width=True
     )
 
     # Explanation about the UpSet plot
@@ -251,5 +254,73 @@ def main():
     if not filtered_df.is_empty():
        create_upset_plot_passes(filtered_df)
 
+    import seaborn as sns
+    import pandas as pd
+
+    # Assuming calculate_std_dev returns a Series
+    std_dev_before_filtering = calculate_std_dev(body_part_df)
+    std_dev_after_filtering = calculate_std_dev(filtered_df)
+
+    # Convert Series to DataFrame and add 'Filtering' column
+    std_dev_before_filtering = std_dev_before_filtering.reset_index().rename(columns={0: 'Volume from Voxel Summation'})
+    std_dev_before_filtering['Filtering'] = 'Before Filtering'
+
+    std_dev_after_filtering = std_dev_after_filtering.reset_index().rename(columns={0: 'Volume from Voxel Summation'})
+    std_dev_after_filtering['Filtering'] = 'After Filtering'
+
+    # Combine the dataframes for easier plotting
+    combined_df = pd.concat([std_dev_before_filtering, std_dev_after_filtering])
+
+    # Reset the index of the DataFrame
+    combined_df = combined_df.reset_index(drop=True)
+
+    # Display violin plots for the distribution of standard deviation of volumes
+    st.header("Violin Plots for Standard Deviation of Volumes")
+    st.write("This plot shows the distribution of standard deviation of volumes within a patient.")
+    fig2, ax = plt.subplots()
+    sns.violinplot(x='Filtering', y='Volume from Voxel Summation', data=combined_df, ax=ax)
+    ax.set_ylabel("Standard Deviation of Volumes")
+    st.pyplot(fig2)
+
+
+    # Define the pages
+    if page == "Summary":
+        st.header("Summary of Qualitative Checks")
+        # Execute the SQL to get summary statistics
+        summary_df = duckdb.query("""
+        WITH Checks AS (
+            SELECT
+                bodyPart,
+                laterality,
+                COUNT(*) AS total_count,
+                SUM(CASE WHEN segmentation_completeness = 'pass' THEN 1 ELSE 0 END) AS pass_count,
+                SUM(CASE WHEN laterality_check = 'pass' THEN 1 ELSE 0 END) AS laterality_pass_count,
+                SUM(CASE WHEN series_with_vertabra_on_every_slice = 'pass' THEN 1 ELSE 0 END) AS vertabra_pass_count,
+                SUM(CASE WHEN connected_volumes = 'pass' THEN 1 ELSE 0 END) AS volumes_pass_count
+            FROM
+                'qual-checks-and-quant-values.parquet'
+            GROUP BY
+                bodyPart, laterality
+        )
+
+        SELECT
+            bodyPart,
+            laterality,
+            ROUND((pass_count * 100.0) / total_count, 2) || '% (' || pass_count || '/' || total_count || ')' AS segmentation_completeness,
+            CASE WHEN laterality IS NOT NULL
+                THEN ROUND((laterality_pass_count * 100.0) / NULLIF(total_count, 0), 2) || '% (' || laterality_pass_count || '/' || total_count || ')'
+                ELSE 'N/A' END AS laterality_check,
+            ROUND((vertabra_pass_count * 100.0) / total_count, 2) || '% (' || vertabra_pass_count || '/' || total_count || ')' AS vertabra_check,
+            ROUND((volumes_pass_count * 100.0) / total_count, 2) || '% (' || volumes_pass_count || '/' || total_count || ')' AS volumes_check
+        FROM
+            Checks
+        ORDER BY
+            bodyPart, laterality;
+        """).pl()
+        summary_df = summary_df.to_pandas()
+        st.data_editor(summary_df, hide_index=True,use_container_width=True,height=1500)
+
+    # elif page == "UpSet Plots":
+
 if __name__ == "__main__":
     main()
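For readers skimming the diff, the filtering change boils down to one polars pattern: load_data() recodes connected_volumes from a pass/fail string into an integer count, and filter_data() then applies it as a "<= threshold" condition instead of an equality match. A minimal, self-contained sketch of that pattern follows; the values are synthetic and only the column name connected_volumes is taken from the app.

import polars as pl

# Synthetic stand-in for the qualitative-checks table (values are made up)
df = pl.DataFrame({
    "PatientID": ["p1", "p2", "p3", "p4"],
    "connected_volumes": ["pass", "2", "3", None],
})

# Recode 'pass' as 1 and cast everything else to an integer count,
# mirroring the transformation added to load_data()
df = df.with_columns(
    pl.when(pl.col("connected_volumes") == "pass")
    .then(pl.lit(1))
    .otherwise(pl.col("connected_volumes").cast(pl.Int32, strict=False))
    .alias("connected_volumes")
)

# Keep rows whose count is at most the chosen threshold and not null,
# mirroring the new connected_volumes branch in filter_data()
threshold = 2
print(df.filter((pl.col("connected_volumes") <= threshold) & (pl.col("connected_volumes").is_not_null())))

Note that strict=False turns unparseable strings into nulls instead of raising, which is why the filter also checks is_not_null().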
 
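The table change relies on Streamlit's st.column_config.LinkColumn with a display_text regex, so the raw viewer URL renders as a clickable link showing only the captured StudyInstanceUID. A minimal sketch of that pattern, with hypothetical UIDs, to be launched via `streamlit run`:

import pandas as pd
import streamlit as st

# Hypothetical UIDs; the real app builds this column from the filtered dataframe
df = pd.DataFrame({"StudyInstanceUID": ["1.2.840.1111", "1.2.840.2222"]})
df["Viewer Url"] = "https://viewer.imaging.datacommons.cancer.gov/viewer/" + df["StudyInstanceUID"]

st.data_editor(
    df,
    column_config={
        # Render the URL as a link and display only the UID captured by the regex group
        "Viewer Url": st.column_config.LinkColumn(
            "StudyInstanceUID",
            display_text=r"https://viewer\.imaging\.datacommons\.cancer\.gov/viewer/(.*)",
        ),
    },
    hide_index=True,
    use_container_width=True,
)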
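Finally, the feature that names the commit: calculate_std_dev() measures the spread of segmented volumes within each patient and body part, and the app compares its distribution before and after filtering with a seaborn violin plot. Below is a standalone sketch of the same idea using synthetic volumes and plain matplotlib instead of st.pyplot, so it runs outside Streamlit; the "after filtering" subset is an arbitrary stand-in for the app's qualitative-check filters.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)

# Synthetic per-series volumes for a handful of patients and body parts
volumes = pd.DataFrame({
    "PatientID": rng.integers(0, 20, size=200).astype(str),
    "bodyPart": rng.choice(["liver", "spleen"], size=200),
    "Volume from Voxel Summation": rng.normal(1500, 200, size=200),
})

def calculate_std_dev(df):
    # Same grouping as the app: spread of volumes within a patient / body part
    return df.groupby(["PatientID", "bodyPart"])["Volume from Voxel Summation"].std()

before = calculate_std_dev(volumes).reset_index()
before["Filtering"] = "Before Filtering"

# Stand-in for the app's filters: drop the largest volumes
after = calculate_std_dev(volumes[volumes["Volume from Voxel Summation"] < 1600]).reset_index()
after["Filtering"] = "After Filtering"

combined = pd.concat([before, after], ignore_index=True)

fig, ax = plt.subplots()
sns.violinplot(x="Filtering", y="Volume from Voxel Summation", data=combined, ax=ax)
ax.set_ylabel("Standard Deviation of Volumes")
plt.show()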