Update Quality_Control.py

Quality_Control.py  CHANGED  (+296 −185)
@@ -12,12 +12,6 @@ import hvplot.pandas
 import pandas as pd
 import numpy as np
 import json
-import panel as pn
-import pandas as pd
-import os
-import pandas as pd
-import random
-import asyncio
 import matplotlib.pyplot as plt
 from bokeh.plotting import figure
 from bokeh.io import push_notebook, show
@@ -29,56 +23,47 @@ from bokeh.models import Span, Label
 from bokeh.models import ColumnDataSource, Button
 from my_modules import *
 from datasets import load_dataset
-
+os.getcwd()
 #Silence FutureWarnings & UserWarnings
 warnings.filterwarnings('ignore', category= FutureWarning)
 warnings.filterwarnings('ignore', category= UserWarning)
 
-#input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
-present_dir = os.path.dirname(os.path.realpath(__file__))
-# Construct the full path to the stored_variables.json file
-json_path = os.path.join(present_dir, 'stored_variables.json')
-with open(json_path, 'r') as file:
-    stored_vars = json.load(file)
-directory = stored_vars['base_dir']
-input_path = os.path.join(present_dir,directory)
-set_path = stored_vars['set_path']
-selected_metadata_files = stored_vars['selected_metadata_files']
-ls_samples = stored_vars['ls_samples']
-base_dir = input_path
+
+#present_dir = os.path.dirname(os.path.realpath(__file__))
+#input_path = os.path.join(present_dir, 'wetransfer_data-zip_2024-05-17_1431')
+base_dir = '/code/wetransfer_data-zip_2024-05-17_1431'
+set_path = 'test'
+selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
+ls_samples = ['DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
+
 
-#
-#
-
-
 pn.extension()
 
 update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
 def update_samples(event):
-    with open(
+    with open('stored_variables.json', 'r') as file:
         stored_vars = json.load(file)
-
-
-    return f'CSV Files Selected: {ls_samples}'
+        # ls_samples = stored_vars['ls_samples']
+    print(ls_samples)
 update_button.on_click(update_samples)
 
 csv_files_button = pn.widgets.Button(icon="clipboard", button_type="primary")
 indicator = pn.indicators.LoadingSpinner(value=False, size=25)
 
 def handle_click(clicks):
-    with open(
+    with open('stored_variables.json', 'r') as file:
         stored_vars = json.load(file)
-
-
-    #return f'CSV Files Selected: {ls_samples}'
+        # ls_samples = stored_vars['ls_samples']
+    return f'CSV Files Selected: {ls_samples}'
 
-
-
+pn.Row(
+    csv_files_button,
+    pn.bind(handle_click, csv_files_button.param.clicks),
+)
 
 
 # ## I.2. *DIRECTORIES
 
-
+set_path = 'test'
 
 # Set base directory
 
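Note: the rewritten `update_samples`/`handle_click` pair leans on `pn.bind` — binding a function to the button's `clicks` parameter re-runs it, and re-renders its return value, on every press. A minimal self-contained sketch of that pattern (the widget and function names here are illustrative, not the app's):

import panel as pn

pn.extension()

button = pn.widgets.Button(name='CSV Files', button_type='primary')

def handle_click(clicks):
    # Re-executed each time the button is pressed; `clicks` is the running count.
    return f'Button pressed {clicks} times'

pn.Row(button, pn.bind(handle_click, button.param.clicks)).servable()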
@@ -132,7 +117,7 @@ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata
         print("The", d, "directory already exists !")
 
 os.chdir(input_data_dir)
-with open(
+with open('stored_variables.json', 'r') as file:
     stored_vars = json.load(file)
     # ls_samples = stored_vars['ls_samples']
     selected_metadata_files = stored_vars['selected_metadata_files']
@@ -180,6 +165,13 @@ print('metadata_images_dir :', metadata_images_dir)
 #ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
 print("The following CSV files were detected:\n\n",[sample for sample in ls_samples], "\n\nin", input_data_dir, "directory.")
 
+
+# In[26]:
+
+
+import os
+import pandas as pd
+
 def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
     if len(selected_metadata_files) == []:
         if not file:
@@ -200,44 +192,38 @@ def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
         return combined_metadata_df
 
     else:
-
+        if selected_metadata_files:
             single_file_path = os.path.join(metadata_dir, selected_metadata_files[0])
             single_file_df = pd.read_csv(single_file_path)
             print(f"Only one file selected: {selected_metadata_files[0]}")
-
-
-
-
-
-
-
-
-            combined_metadata_df = pd.read_csv(combined_metadata_path)
-        else:
-            if selected_metadata_files:
-                combined_metadata_df = pd.DataFrame()
-                for file in selected_metadata_files:
-                    file_path = os.path.join(metadata_dir, file)
-                    metadata_df = pd.read_csv(file_path)
-                    combined_metadata_df = pd.concat([combined_metadata_df, metadata_df], ignore_index=True)
-
-                combined_metadata_df.to_csv(combined_metadata_path, index=False)
-                print(f"Combined metadata saved to: {combined_metadata_path}")
-            else:
-                print("No metadata files selected.")
-                combined_metadata_df = pd.DataFrame()
-
-    return combined_metadata_df
+            return single_file_df
+        else:
+            print("No metadata files selected.")
+            return pd.DataFrame()
+
+
+# In[27]:
+
 
 print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))
 
+
+# In[28]:
+
+
 ls_samples
 
+
+# In[29]:
 path = os.path.join(input_data_dir, ls_samples[0])
 #df = load_dataset('csv', data_files = path )
 df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]),index_col = 0, nrows = 1)
 df.head(10)
 
+
+# In[30]:
+
+
 # First gather information on expected headers using first file in ls_samples
 # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
 df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]) , index_col = 0, nrows = 1)
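One caveat kept as context above: `if len(selected_metadata_files) == []:` compares an integer with an empty list, so it is always False and the empty-selection branch is unreachable, while the single-file branch now runs for any non-empty selection. A corrected sketch of the intended logic, assuming the same inputs (this is not the committed code):

import os
import pandas as pd

def combine_metadata(metadata_dir, selected_metadata_files):
    if not selected_metadata_files:           # empty list or None
        print("No metadata files selected.")
        return pd.DataFrame()
    if len(selected_metadata_files) == 1:     # single file: read it directly
        print(f"Only one file selected: {selected_metadata_files[0]}")
        return pd.read_csv(os.path.join(metadata_dir, selected_metadata_files[0]))
    # Several files: concatenate them row-wise.
    frames = [pd.read_csv(os.path.join(metadata_dir, f)) for f in selected_metadata_files]
    return pd.concat(frames, ignore_index=True)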
@@ -248,7 +234,17 @@ print("df :\n", df.head(), "\n")
 print("df's columns :\n", df.columns, "\n")
 print("df's index :\n", df.index, "\n")
 print("df's index name :\n", df.index.name)
+
+
+# In[31]:
+
+
 df.head()
+
+
+# In[32]:
+
+
 # Verify that the ID column in input file became the index
 # Verify that the index name column is "ID", if not, rename it
 if df.index.name != "ID":
@@ -276,15 +272,40 @@ print("\ndf :\n", df.head(), "\n")
 print("df's columns :\n", df.columns, "\n")
 print("df's index :\n", df.index, "\n")
 print("df's index name :\n", df.index.name)
+
+
+# In[33]:
+
+
+df.head()
+
+
+# In[34]:
+
+
+df.head()
+
+
+# In[35]:
+
+
 print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
 print("These headers are: \n" + ", ".join([h for h in expected_headers]))
 
 corrected_headers = True
+
+# In[36]:
+
+
 
 for sample in ls_samples:
     file_path = os.path.join(input_data_dir,sample)
     print(file_path)
 
+
+# In[37]:
+
+
 # Import all the others files
 dfs = {}
 ###############################
@@ -439,16 +460,32 @@ else:
         file_not_intensities.write(item + "\n")
     file_not_intensities.close()
 
+
+# In[46]:
+
+
 not_intensities_df = pd.read_csv(path_not_intensities)
 not_intensities_df
 
+
+# In[47]:
+
+
 # Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average' (drop any intensity marker column that is not a mean intensity)
 to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]
 
 to_keep
 
+
+# In[48]:
+
+
 print(len(to_keep) - 1)
 
+
+# In[49]:
+
+
 # However, our to_keep list contains items that might not be in our df headers!
 # These items are from our not_intensities list. So let's ask for only those items from to_keep that are actually found in our df
 # Retains only the columns from the to_keep list that are found in the df's headers (columns).
@@ -458,17 +495,14 @@ df = df[[x for x in to_keep if x in df.columns.values]]
 
 df.head()
 
-# Assuming you have a DataFrame named 'df'
-# df = pd.read_csv('your_file.csv')
 
-#
-json_file_path = os.path.join(present_dir,"stored_variables.json")
+# In[50]:
 
-
-
-
-
-
+
+import pandas as pd
+
+# Assuming you have a DataFrame named 'df'
+# df = pd.read_csv('your_file.csv')
 
 # Get all column names
 all_columns = df.columns.tolist()
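The code that follows derives `intensity_marker` by scanning `all_columns`; the commented-out loop deleted further down did the same by splitting each `*_Intensity_Average` header on underscores and keeping the first token. A compact equivalent, assuming headers shaped like `CD45_Cell_Intensity_Average` (the data here is illustrative):

# Derive unique marker names from intensity column headers (illustrative data).
all_columns = ['CD45_Cell_Intensity_Average', 'CD45_Nucleus_Intensity_Average',
               'CKs_Cytoplasm_Intensity_Average', 'Nucleus_Size']
intensity_marker = sorted({c.split('_')[0] for c in all_columns
                           if 'Intensity_Average' in c})
print(intensity_marker)  # ['CD45', 'CKs']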
@@ -495,95 +529,6 @@ intensity_marker = list(set(intensity_marker))
 print("Intensity Markers:")
 print(intensity_marker)
 
-# Create a DataFrame with the intensity markers and default values
-marker_options_df = pd.DataFrame({
-    'Marker': intensity_marker,
-    'Cell': [True] * len(intensity_marker),
-    'Cytoplasm': [False] * len(intensity_marker),
-    'Nucleus': [False] * len(intensity_marker)
-})
-
-# Define formatters for the Tabulator widget
-tabulator_formatters = {
-    'Cell': {'type': 'tickCross'},
-    'Cytoplasm': {'type': 'tickCross'},
-    'Nucleus': {'type': 'tickCross'}
-}
-
-# Create the Tabulator widget
-tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
-
-# Create a DataFrame to store the initial intensities
-new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker if True]
-new_data_df = pd.DataFrame(new_data)
-
-# Create a widget to display the new data as a DataFrame
-new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
-
-# Create a button to start the update process
-run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
-
-# Function to update stored_variables.json
-def update_stored_variables(selected_columns):
-    stored_variables["selected_intensities"] = selected_columns
-    with open(json_file_path, "w") as file:
-        json.dump(stored_variables, file, indent=4)
-
-# Define the update_intensities function
-def update_intensities(event=None):
-    global new_data, new_data_df
-    new_data = []
-    selected_columns = []
-    for _, row in tabulator.value.iterrows():
-        marker = row['Marker']
-        if row['Cell']:
-            new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
-            selected_columns.append(f"{marker}_Cell_Intensity_Average")
-        if row['Cytoplasm']:
-            new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
-            selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
-        if row['Nucleus']:
-            new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
-            selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
-    new_data_df = pd.DataFrame(new_data)
-    new_data_table.value = new_data_df
-    update_stored_variables(selected_columns)
-    print("Updated intensities DataFrame:")
-    print(new_data_df)
-
-# Define the runner function
-async def runner(event):
-    update_intensities()
-
-# Bind the runner function to the button
-run_button.on_click(runner)
-
-# Attach the update_intensities function to changes in the Tabulator widget
-tabulator.param.watch(update_intensities, 'value')
-
-# Layout
-updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
-pn.extension('tabulator')
-'''
-# Iterate over each column name
-for column in all_columns:
-    # Check if the column name contains 'Intensity_Average'
-    if 'Intensity_Average' in column:
-        # Split the column name by underscore
-        parts = column.split('_')
-
-        # Extract the word before the first underscore
-        marker = parts[0]
-
-        # Add the marker to the intensity_marker list
-        intensity_marker.append(marker)
-
-# Remove duplicates from the intensity_marker list
-intensity_marker = list(set(intensity_marker))
-
-print("Intensity Markers:")
-print(intensity_marker)
-
 # Create a callback function to update the intensities array
 def update_intensities(event):
     global intensities
@@ -608,6 +553,10 @@ def update_intensities(event):
     print("Updated intensities DataFrame:")
     print(intensities_df)
 
+
+# In[54]:
+
+
 tabulator_formatters = {
     'bool': {'type': 'tickCross'}
 }
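The `tickCross` formatter kept here renders a boolean column as a check/cross cell in a Tabulator table; the block deleted above used the same idea per marker compartment. A minimal sketch (marker names illustrative):

import panel as pn
import pandas as pd

pn.extension('tabulator')

marker_options_df = pd.DataFrame({
    'Marker': ['CD45', 'CKs'],
    'Cell': [True, True],
    'Cytoplasm': [False, False],
    'Nucleus': [False, False],
})
# Render each boolean column as an editable tick/cross cell.
tabulator = pn.widgets.Tabulator(
    marker_options_df,
    formatters={c: {'type': 'tickCross'} for c in ('Cell', 'Cytoplasm', 'Nucleus')},
    sizing_mode='stretch_width',
)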
@@ -626,6 +575,12 @@ tabulator.param.watch(update_intensities,'value')
 
 # Create a Panel layout with the Tabulator widget
 marker_options_layout = pn.Column(tabulator, sizing_mode="stretch_width")
+
+import panel as pn
+import pandas as pd
+import random
+import asyncio
+
 # Initialize the Panel extension with Tabulator
 pn.extension('tabulator')
 
@@ -682,13 +637,17 @@ run_button.on_click(runner)
 # Layout
 updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
 
-pn.extension()
+pn.extension()
 # Serve the layout
 #updated_intensities.servable()
 
 
 intensities_df = new_data_table
+intensities_df
+
 intensities_df = pn.pane.DataFrame(intensities_df)
+intensities_df
+
 print(intensities_df)
 # ## I.4. QC CHECKS
 
@@ -745,6 +704,10 @@ def check_index_format(index_str, ls_samples):
     # If all checks pass, return True
     return True
 
+
+# In[70]:
+
+
 # Let's take a look at a few features to make sure our dataframe is as expected
 df.index
 def check_format_ofindex(index):
@@ -758,11 +721,19 @@ def check_format_ofindex(index):
     return index_format
 print(check_format_ofindex(df.index))
 
+
+# In[71]:
+
+
 df.shape
 check_index = df.index
 check_shape = df.shape
 print(check_shape)
 
+
+# In[72]:
+
+
 # Check for NaN entries (should not be any unless columns do not align)
 # False means no NaN entries
 # True means NaN entries
@@ -770,6 +741,10 @@ df.isnull().any().any()
 
 check_no_null = df.isnull().any().any()
 
+
+# In[73]:
+
+
 # Check that all expected files were imported into final dataframe
 if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
     print("All expected filenames are present in big df Sample_ID column.")
@@ -780,6 +755,10 @@ else:
 
 print(df.Sample_ID)
 
+
+# In[74]:
+
+
 # Delete rows that have 0 value mean intensities for intensity columns
 print("df.shape before removing 0 mean values: ", df.shape)
 
@@ -846,6 +825,9 @@ for key, value in quality_check_results.items():
     print(f"{key}: {value}")
 
 
+# In[80]:
+
+
 import panel as pn
 import pandas as pd
 
@@ -943,6 +925,8 @@ def create_line_graph2(quantile):
 
     return p
 
+
+
 # Bind the create_line_graph function to the quantile slider
 nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)
 
@@ -967,8 +951,17 @@ qs = [quantile, 0.50, 1.00 - quantile]
 quantiles = df['Nucleus_Size'].quantile(q=qs).values
 threshold = quantiles[2]
 
+
+# In[89]:
+
+
 print(threshold)
 
+
+# In[90]:
+
+
+
 import panel as pn
 import pandas as pd
 import numpy as np
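The `qs = [quantile, 0.50, 1.00 - quantile]` layout means `quantiles[0]` is the lower cutoff, `quantiles[1]` the median, and `quantiles[2]` (used as `threshold`) the upper cutoff. A worked example on synthetic data:

import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
df = pd.DataFrame({'Nucleus_Size': rng.normal(100, 15, 1_000)})

quantile = 0.05
qs = [quantile, 0.50, 1.00 - quantile]
quantiles = df['Nucleus_Size'].quantile(q=qs).values
threshold = quantiles[2]  # the 0.95 quantile here
print(threshold, (df['Nucleus_Size'] > threshold).sum())  # ~50 cells above the cutoff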
@@ -1006,6 +999,10 @@ results_display = pn.bind(update_threshold_and_display, quantile_slider)
 # Layout the components in a Panel app
 layout2 = results_display
 
+
+# In[91]:
+
+
 print("Number of cells before filtering :", df.shape[0])
 cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
 # Delete small cells and objects w/high AF555 Signal (RBCs)
@@ -1113,6 +1110,10 @@ def calculate_quantiles(column, quantile):
     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
     return quantiles
 
+
+# In[105]:
+
+
 quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
 
 
@@ -1122,10 +1123,30 @@ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99,
 # Layout the components in a Panel app
 #nucleus_size_graph = pn.Column(nucleus_size_line_graph)
 
+
+# In[106]:
+
+
+#df["CKs_Cytoplasm_Intensity_Average"].quantile(q=qs)
+
+
+# In[107]:
+
+
 len(intensities)
+if 'CKs_Cytoplasm_Intensity_Average' in intensities:
+    print(1)
+
+
+# In[108]:
+
 
 df
 
+
+# In[109]:
+
+
 def calculate_cytoplasm_quantiles(column, quantile):
     # Print the columns of the DataFrame
     print("DataFrame columns:", df.columns)
@@ -1143,9 +1164,14 @@ def create_cytoplasm_intensity_df(column, quantile):
     return pn.pane.DataFrame(output)
 
 # Bind the create_app function to the quantile slider
-cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column=
+cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)
 
 pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+
+
+# In[110]:
+
+
 def calculate_cytoplasm_quantiles(column, quantile):
     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
     return quantiles
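Both previously truncated `pn.bind` calls are completed in this commit with a concrete column plus the slider's `param.value`, so the quantile table recomputes as the slider moves. The same wiring in isolation, on a stand-in dataframe:

import numpy as np
import pandas as pd
import panel as pn

pn.extension()

df = pd.DataFrame({'CKs_Cytoplasm_Intensity_Average': np.random.rand(100)})
quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99,
                                         step=0.01, value=0.05)

def create_cytoplasm_intensity_df(column, quantile):
    # Lower / median / upper quantiles of the selected column.
    return pn.pane.DataFrame(df[column].quantile(q=[quantile, 0.50, 1 - quantile]).to_frame())

cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df,
                                        column='CKs_Cytoplasm_Intensity_Average',
                                        quantile=quantile_slider.param.value)
pn.Column(quantile_slider, cytoplasm_quantile_output_app)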
@@ -1159,12 +1185,15 @@ def create_cytoplasm_intensity_df(column, quantile):
 
 
 # Bind the create_app function to the quantile slider
-cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column=
+cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile = quantile_slider.param.value)
 pn.Column(quantile_slider,cytoplasm_quantile_output_app)
 
 
 # ## I.5. COLUMNS OF INTERESTS
 
+# In[111]:
+
+
 # Remove columns containing "DAPI"
 df = df[[x for x in df.columns.values if 'DAPI' not in x]]
 
@@ -1172,6 +1201,9 @@ print("Columns are now...")
 print([c for c in df.columns.values])
 
 
+# In[112]:
+
+
 # Create lists of full names and shortened names to use in plotting
 full_to_short_names, short_to_full_names = \
     shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])
@@ -1179,6 +1211,9 @@ full_to_short_names, short_to_full_names = \
 short_to_full_names
 
 
+# In[113]:
+
+
 # Save this data to a metadata file
 filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
 fh = open(filename, "w")
@@ -1189,6 +1224,10 @@ for k,v in full_to_short_names.items():
 fh.close()
 print("The full_to_short_column_names.csv file was created !")
 
+
+# In[114]:
+
+
 # Save this data to a metadata file
 filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
 fh = open(filename, "w")
@@ -1202,11 +1241,18 @@ print("The short_to_full_column_names.csv file was created !")
 
 # ## I.6. EXPOSURE TIME
 
+# In[115]:
+
 
 #import the ashlar analysis file
 file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
 ashlar_analysis = pd.read_csv(file_path)
 ashlar_analysis
+
+
+# In[116]:
+
+
 # Extracting and renaming columns
 new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
 new_df.rename(columns={
@@ -1225,6 +1271,10 @@ new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)
 # Print the new dataframe
 print(new_df)
 
+
+# In[117]:
+
+
 # Here, we want to end up with a data structure that incorporates metadata on each intensity marker column used in our big dataframe in an easy-to-use format.
 # This is going to include the full name of the intensity marker columns in the big data frame,
 # the corresponding round and channel,
@@ -1254,21 +1304,41 @@ else:
     print("\nNo null values detected.")
 
 
+# In[118]:
+
+
 if len(exp_df['Target']) > len(exp_df['Target'].unique()):
     print("One or more non-unique Target values in exp_df. Currently not supported.")
     exp_df = exp_df.drop_duplicates(subset = 'Target').reindex()
 
+
+# In[119]:
+
+
 # sort exp_df by the values in the 'Target' column in ascending order and then retrieve the first few rows of the sorted df
 exp_df.sort_values(by = ['Target']).head()
 
+
+# In[120]:
+
+
 # Create lowercase version of target
 exp_df['target_lower'] = exp_df['Target'].str.lower()
 exp_df.head()
 
+
+# In[121]:
+
+
 # Create df that contains marker intensity columns in our df that aren't in not_intensities
 intensities = pd.DataFrame({'full_column':df.columns.values[~df.columns.isin(not_intensities)]})
 
 intensities
+
+
+# In[122]:
+
+
 # Extract the marker information from the `full_column`, which corresponds to full column in big dataframe
 # Use regular expressions (regex) to isolate the part of the field that begins (^) with an alphanumeric value (W), and ends with an underscore (_)
 # '$' is end of line
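The regex in the context below, `r'([^\W_]+)'`, captures the leading run of word characters up to (but excluding) the first underscore, i.e. the marker prefix of each column name. For example:

import pandas as pd

intensities = pd.DataFrame({'full_column': ['CD45_Cell_Intensity_Average',
                                            'CKs_Cytoplasm_Intensity_Average']})
intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
print(intensities['marker'].tolist())  # ['CD45', 'CKs']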
@@ -1277,10 +1347,20 @@ intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
 intensities['marker_lower'] = intensities['marker'].str.lower()
 
 intensities
+
+
+# In[123]:
+
+
 # Subset the intensities df to exclude any column pertaining to DAPI
 intensities = intensities.loc[intensities['marker_lower'] != 'dapi']
 
 intensities.head()
+
+
+# In[124]:
+
+
 # Merge the intensities andexp_df together to create metadata
 metadata = pd.merge(exp_df, intensities, how = 'left', left_on = 'target_lower',right_on = 'marker_lower')
 metadata = metadata.drop(columns = ['marker_lower'])
@@ -1290,14 +1370,27 @@ metadata = metadata.dropna()
 # target_lower is Target in small caps
 # marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
 metadata
+
+
+# In[125]:
+
+
 # Add a column to signify marker target localisation.
 # Use a lambda to determine segmented location of intensity marker column and update metadata accordingly
 # Using the add_metadata_location() function in my_modules.py
 metadata['localisation'] = metadata.apply(
     lambda row: add_metadata_location(row), axis = 1)
 
+
+# In[126]:
+
+
 mlid = metadata
 
+
+# In[127]:
+
+
 # Save this data structure to the metadata folder
 # don't want to add color in because that's better off treating color the same for round, channel, and sample
 filename = "marker_intensity_metadata.csv"
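`add_metadata_location()` is defined in `my_modules.py` and not shown in this diff; a hypothetical stand-in illustrating the row-wise `apply` pattern used above:

import pandas as pd

def add_metadata_location(row):
    # Hypothetical: infer the compartment from the full column name.
    for loc in ('Cell', 'Cytoplasm', 'Nucleus'):
        if f'_{loc}_' in row['full_column']:
            return loc
    return 'unknown'

metadata = pd.DataFrame({'full_column': ['CD45_Cell_Intensity_Average']})
metadata['localisation'] = metadata.apply(lambda row: add_metadata_location(row), axis=1)
print(metadata)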
@@ -1336,6 +1429,10 @@ custom_colors_values = sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue'
 print("Unique channels are:", metadata.Channel.unique())
 sb.palplot(sb.color_palette(channel_color_values))
 
+
+# In[131]:
+
+
 # Function to create a palette plot with custom colors
 def create_palette_plot():
     # Get unique channels
@@ -1398,6 +1495,9 @@ app_palette_plot = create_palette_plot(custom_colors)
 #app_palette_plot.servable()
 
 
+# In[133]:
+
+
 # Store in a dictionary
 channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
 channel_color_dict
@@ -1406,6 +1506,10 @@ for k,v in channel_color_dict.items():
 
 channel_color_dict
 
+
+# In[134]:
+
+
 color_df_channel = color_dict_to_df(channel_color_dict, "Channel")
 
 # Save to file in metadatadirectory
@@ -1415,6 +1519,10 @@ color_df_channel.to_csv(filename, index = False)
 
 color_df_channel
 
+
+# In[135]:
+
+
 # Legend of channel info only
 g = plt.figure(figsize = (1,1)).add_subplot(111)
 g.axis('off')
@@ -1448,6 +1556,10 @@ sb.palplot(sb.color_palette(round_color_values))
 
 ## TO-DO: write what these parameters mean
 
+
+# In[137]:
+
+
 # Store in a dictionary
 round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))
 
@@ -1456,6 +1568,10 @@ for k,v in round_color_dict.items():
 
 round_color_dict
 
+
+# In[138]:
+
+
 color_df_round = color_dict_to_df(round_color_dict, "Round")
 
 # Save to file in metadatadirectory
@@ -1485,6 +1601,9 @@ plt.savefig(filename, bbox_inches = 'tight')
 
 # ### I.7.3. SAMPLES COLORS
 
+# In[140]:
+
+
 # we want colors that are neither sequential nor categorical.
 # Categorical would be ideal if we could generate an arbitrary number of colors, but I do not think that we can.
 # Hense, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.
@@ -1496,10 +1615,18 @@ color_values = sb.color_palette("husl",n_colors = len(ls_samples))#'HLS'
 # Display those unique colors
 sb.palplot(sb.color_palette(color_values))
 
+
+# In[141]:
+
+
 TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
 TMA_color_values = sb.color_palette(n_colors = len(TMA_samples),palette = "gray")
 sb.palplot(sb.color_palette(TMA_color_values))
 
+
+# In[142]:
+
+
 # Store in a dictionary
 color_dict = dict()
 color_dict = dict(zip(df.Sample_ID.unique(), color_values))
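`sb.color_palette("husl", n_colors=len(ls_samples))` returns evenly spaced hues, one per sample, which `dict(zip(...))` then pins to sample IDs; TMA samples are later reassigned to grays. A small sketch using the hard-coded sample list from this commit:

import seaborn as sb

ls_samples = ['DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
color_values = sb.color_palette('husl', n_colors=len(ls_samples))
color_dict = dict(zip(ls_samples, color_values))  # one RGB tuple per sample
print(color_dict['TMA.csv'])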
@@ -1615,34 +1742,16 @@ variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(met
 window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
 sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)
 
-# Function to save files
-def save_files(event):
-    for sample in ls_samples:
-        sample_id = sample.split('.csv')[0]
-        filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
-
-        df_save = df.loc[df['Sample_ID'] == sample, :]
-        if os.path.exists(filename):
-            df_save.to_csv(filename, index=True, index_label='ID', mode='w') # Overwrite by default
-            print(f"File {filename} was overwritten!")
-        else:
-            df_save.to_csv(filename, index=True, index_label='ID') # Save normally if the file doesn't exist
-            print(f"File {filename} was created and saved!")
-
-# Button to download files
-download_button = pn.widgets.Button(name='Download Files', button_type='primary')
-download_button.on_click(save_files)
-
 app = pn.template.GoldenTemplate(
     site="Cyc-IF",
     title="Quality Control",
     main=[
         pn.Tabs(
             ("Dataframes", pn.Column(
-                pn.Row(csv_files_button,pn.bind(handle_click, csv_files_button.param.clicks)
+                pn.Row(csv_files_button,pn.bind(handle_click, csv_files_button.param.clicks)),
                 pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(intial_dataframe),
                 #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
-                pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head(
-            )),
+                pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head()),
+            )),
             ("Quality Control", pn.Column(
                 quality_check(quality_control_df, not_intensities)
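A stripped-down version of the template/tab wiring assembled above, with placeholder panes standing in for the app's dataframes and plots:

import panel as pn

pn.extension()

app = pn.template.GoldenTemplate(
    site="Cyc-IF",
    title="Quality Control",
    main=[
        pn.Tabs(
            ("Dataframes", pn.Column(pn.pane.Markdown("### The Dataframe uploaded:"))),
            ("Plots", pn.Column(pn.pane.Markdown("### Intensity Average Plot:"))),
        )
    ],
)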
@@ -1656,17 +1765,19 @@ app = pn.template.GoldenTemplate(
             )),
             ("Plots", pn.Column(
                 #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
-                pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1,layout2),
+                #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1,layout2),
                 #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
                 pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot,num_of_cell_removal_intensity ),
                 #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
                 #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
                 #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
-                #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot)
-                pn.Column(download_button),
+                #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot)
             )),
 
         ),
     ])
 
-app.servable()
+app.servable()
+
+if __name__ == "__main__":
+    pn.serve(app, port=5007)
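With `app.servable()` the script is picked up by `panel serve Quality_Control.py`, while the new `__main__` guard lets the same file be launched directly with `python Quality_Control.py`; the guard only fires on direct execution, so the two entry points do not conflict. A minimal sketch of that dual setup:

import panel as pn

app = pn.Column("Quality Control")  # stand-in for the GoldenTemplate above
app.servable()                      # used by `panel serve`

if __name__ == "__main__":
    pn.serve(app, port=5007)        # used by direct `python` execution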