KashyapiNagaHarshitha committed on
Commit
b10249c
1 Parent(s): 5a89c15

Upload 3 files

Browse files
Files changed (3)
  1. Quality_Control.py +1796 -0
  2. my_modules.py +468 -0
  3. stored_variables.json +6 -0
Quality_Control.py ADDED
@@ -0,0 +1,1796 @@
+ #!/usr/bin/env python
+ # coding: utf-8
+
+ import warnings
+ import os
+ import plotly
+ import seaborn as sb
+ import plotly.express as px
+ import panel as pn
+ import holoviews as hv
+ import hvplot.pandas
+ import pandas as pd
+ import numpy as np
+ import json
+ import matplotlib.pyplot as plt
+ from bokeh.plotting import figure
+ from bokeh.io import push_notebook, show
+ from bokeh.io.export import export_png
+ from bokeh.resources import INLINE
+ from bokeh.embed import file_html
+ from bokeh.io import curdoc
+ from bokeh.models import Span, Label
+ from bokeh.models import ColumnDataSource, Button
+ from my_modules import *
+
+ # Silence FutureWarnings & UserWarnings
+ warnings.filterwarnings('ignore', category=FutureWarning)
+ warnings.filterwarnings('ignore', category=UserWarning)
+
+
+ '''get_ipython().run_line_magic('store', '-r base_dir')
+ get_ipython().run_line_magic('store', '-r set_path')
+ get_ipython().run_line_magic('store', '-r ls_samples')
+ get_ipython().run_line_magic('store', '-r selected_metadata_files')'''
+
+
+ '''# Retrieve the variables from the JSON file
+ with open('stored_variables.json', 'r') as file:
+     stored_vars = json.load(file)
+
+ base_dir = stored_vars['base_dir']
+ set_path = stored_vars['set_path']
+ selected_metadata_files = stored_vars['selected_metadata_files']
+ ls_samples = stored_vars['ls_samples']
+ print(f"Base Directory: {base_dir}")
+ print(f"Set Path: {set_path}")
+ print(f"Selected_metadata_files: {selected_metadata_files}")
+
+
+ print(base_dir)
+ print(set_path)
+ print(ls_samples)
+ print(selected_metadata_files)'''
+
+
+ # Default values; ls_samples and selected_metadata_files are reloaded from
+ # stored_variables.json further below. These must be real lists, not strings.
+ base_dir = '/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431'
+ set_path = 'test'
+ selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
+ ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
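+
+ # For reference, stored_variables.json is expected to hold these four keys
+ # (an illustrative shape; actual values come from the earlier setup step):
+ # {
+ #     "base_dir": "/path/to/data",
+ #     "set_path": "test",
+ #     "selected_metadata_files": ["Slide_B_DD1s1.one_1.tif.csv"],
+ #     "ls_samples": ["DD3S1.csv", "DD3S2.csv"]
+ # }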
+
+ pn.extension()
+
+ update_button = pn.widgets.Button(name='CSV Files', button_type='primary')
+ def update_samples(event):
+     with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+         stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     print(ls_samples)
+ update_button.on_click(update_samples)
+
+ csv_files_button = pn.widgets.Button(icon="clipboard", name=" Click on the clipboard to display the selected files", button_type="primary")
+ indicator = pn.indicators.LoadingSpinner(value=False, size=25)
+
+ def handle_click(clicks):
+     with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+         stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     return f'CSV Files Selected: {ls_samples}'
+
+ pn.Row(
+     csv_files_button,
+     pn.bind(handle_click, csv_files_button.param.clicks),
+ )
+
+
+ # ## I.2. DIRECTORIES
+
+ set_path = 'test'
+
+ # Set base directory
+
+ directorio_actual = os.getcwd()
+ print(directorio_actual)
+
+ ##### MAC WORKSTATION #####
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
+ ###########################
+
+ ##### WINDOWS WORKSTATION #####
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
+ ###############################
+ input_path = base_dir
+
+ ##### LOCAL WORKSTATION #####
+ #base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
+ base_dir = input_path
+ print(base_dir)
+ #############################
+
+ #set_name = 'Set_A'
+ #set_name = 'test'
+ set_name = set_path
+
+ project_name = set_name            # Project name
+ step_suffix = 'qc_eda'             # Current part (here part I)
+ previous_step_suffix_long = ""     # Previous part (here empty)
+
+ # Initial input data directory
+ input_data_dir = os.path.join(base_dir, project_name + "_data")
+
+ # QC/EDA output directories
+ # global output
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
+ # images subdirectory
+ output_images_dir = os.path.join(output_data_dir, "images")
+
+ # Data and Metadata directories
+ # global data
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
+ # images subdirectory
+ metadata_images_dir = os.path.join(metadata_dir, "images")
+
+ # Create directories if they don't already exist
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     if not os.path.exists(d):
+         print("Creating the", d, "directory...")
+         os.makedirs(d)
+     else:
+         print("The", d, "directory already exists!")
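+
+ # A minimal equivalent sketch: os.makedirs with exist_ok=True creates any
+ # missing parent directories and silently skips ones that already exist,
+ # so the existence check above could be collapsed to:
+ # for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+ #     os.makedirs(d, exist_ok=True)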
+
+ os.chdir(input_data_dir)
+ with open('/Users/harshithakolipaka/Desktop/CycIF_platform_py/stored_variables.json', 'r') as file:
+     stored_vars = json.load(file)
+     ls_samples = stored_vars['ls_samples']
+     selected_metadata_files = stored_vars['selected_metadata_files']
+
+ directories = []
+ for i in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
+     directories.append(i)
+
+ directories
+
+ def print_directories(directories):
+     label_path = []
+     labels = [
+         "base_dir",
+         "input_data_dir",
+         "output_data_dir",
+         "output_images_dir",
+         "metadata_dir",
+         "metadata_images_dir"
+     ]
+
+     for label, path in zip(labels, directories):
+         label_path.append(f"{label} : {path}")
+
+     return label_path
+
+ print("\n".join(print_directories(directories)))
+
+
+ # Verify paths
+ print('base_dir :', base_dir)
+ print('input_data_dir :', input_data_dir)
+ print('output_data_dir :', output_data_dir)
+ print('output_images_dir :', output_images_dir)
+ print('metadata_dir :', metadata_dir)
+ print('metadata_images_dir :', metadata_images_dir)
+
+
+ # ## I.3. FILES
+
+ # List all the .csv files in the metadata/data directory
+ # Don't forget to move the csv files into the proj_data directory;
+ # if the data dir is empty, this will not work
+ #ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith(".csv")]
+ print("The following CSV files were detected:\n\n", ls_samples, "\n\nin", input_data_dir, "directory.")
+
+
+ # In[26]:
+
+
+ def combine_and_save_metadata_files(metadata_dir, selected_metadata_files):
+     if len(selected_metadata_files) == 0:
+         warnings.warn("No Ashlar file uploaded. Please upload a valid file.", UserWarning)
+         print("No metadata files selected.")
+         return pd.DataFrame()
+
+     elif len(selected_metadata_files) > 1:
+         combined_metadata_df = pd.DataFrame()
+
+         for file in selected_metadata_files:
+             file_path = os.path.join(metadata_dir, file)
+             df = pd.read_csv(file_path)
+             combined_metadata_df = pd.concat([combined_metadata_df, df], ignore_index=True)
+
+         combined_metadata_df.to_csv(os.path.join(metadata_dir, "combined_metadata.csv"), index=False)
+         print(f"Combined metadata file saved as 'combined_metadata.csv' in {metadata_dir}")
+
+         return combined_metadata_df
+
+     else:
+         single_file_path = os.path.join(metadata_dir, selected_metadata_files[0])
+         single_file_df = pd.read_csv(single_file_path)
+         print(f"Only one file selected: {selected_metadata_files[0]}")
+         return single_file_df
+
+
+ # In[27]:
+
+
+ print(combine_and_save_metadata_files(metadata_dir, selected_metadata_files))
+
+
+ # In[28]:
+
+
+ ls_samples
+
+
+ # In[29]:
+
+
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
+ df.head(10)
+
+
+ # In[30]:
+
+
+ # First gather information on expected headers using the first file in ls_samples
+ # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]), index_col=0, nrows=1)
+
+ # Make sure the file was imported correctly
+ print("df :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+
+
+ # In[31]:
+
+
+ df.head()
+
+
+ # In[32]:
+
+
+ # Verify that the ID column in the input file became the index
+ # Verify that the index name is "ID"; if not, rename it
+ if df.index.name != "ID":
+     print("Expected the first column in input file (index_col = 0) to be 'ID'. \n"
+           "This column will be used to set the index names (cell number for each sample). \n"
+           "It appears that the column '" + df.index.name + "' was actually imported as the index column.")
+     #df.index.name = 'ID'
+     print("A new index name (first column) will be given ('ID') to replace the current one '" + df.index.name + "'\n")
+
+ # Apply the changes to the headers as specified with the apply_header_changes() function (in my_modules.py)
+ # Apply the changes to the dataframe rows as specified with the apply_df_changes() function (in my_modules.py)
+ #df = apply_header_changes(df)
+ print(df.index)
+ df.index = df.index.str.replace(r'@1$', '', regex=True)
+ df = apply_df_changes(df)
+
+ # Set variable to hold default header values
+ expected_headers = df.columns.values
+ expected_header = True
+ print(expected_header)
+
+ initial_dataframe = df
+ # Make sure the file is now formatted correctly
+ print("\ndf :\n", df.head(), "\n")
+ print("df's columns :\n", df.columns, "\n")
+ print("df's index :\n", df.index, "\n")
+ print("df's index name :\n", df.index.name)
+
+
+ # In[33]:
+
+
+ df.head()
+
+
+ # In[34]:
+
+
+ df.head()
+
+
+ # In[35]:
+
+
+ print("Used " + ls_samples[0] + " to determine the expected and corrected headers for all files.\n")
+ print("These headers are: \n" + ", ".join(expected_headers))
+
+ corrected_headers = True
+
+
+ # In[36]:
+
+
+ for sample in ls_samples:
+     file_path = os.path.join(input_data_dir, sample)
+     print(file_path)
+
+
+ # In[37]:
+
+
+ # Import all the other files
+ dfs = {}
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ errors = []
+
+ # Iterate over a copy of the list so that removing a sample inside the loop is safe
+ for sample in list(ls_samples):
+     file_path = os.path.join(input_data_dir, sample)
+
+     try:
+         # Read the CSV file
+         df = pd.read_csv(file_path, index_col=0)
+         # Check if the DataFrame is empty; if so, don't continue processing it and remove it
+
+         if not df.empty:
+             # Manipulations necessary for concatenation
+             df = apply_header_changes(df)
+             df = apply_df_changes(df)
+             # Reorder the columns to match the expected headers list
+             #df = df.reindex(columns=expected_headers)
+             print(df.head(1))
+             print(sample, "file is processed!\n")
+             #print(df)
+
+             # Compare df's headers against what is expected
+             compare_headers(expected_headers, df.columns.values, sample)
+             #print(df.columns.values)
+         # Add a new column to identify the csv file (sample) the df comes from
+         df['Sample_ID'] = sample
+
+     except pd.errors.EmptyDataError:
+         errors.append(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         print(f'\nEmpty data error in {sample} file. Removing from analysis...')
+         ls_samples.remove(sample)
+         continue  # skip adding this sample's (nonexistent) df to dfs
+
+     # Add df to dfs
+     dfs[sample] = df
+
+ print(dfs)
+
+
+ dfs.values()
+
+ # Merge dfs into one df
+ df = pd.concat(dfs.values(), ignore_index=False, sort=False)
+ del dfs
+ merge = True
+ merged_dataframe = df
+ df.head()
+
+ # Set index to Sample_ID + cell number:
+ # create a new custom index for df based on the sample names and integer cell numbers, then remove the temporary columns 'level_0' and 'index' that were introduced during the operations
+
+ # Create a copy of the DataFrame df and reset its index without creating a new column for the old index
+ # This essentially removes the old index column and replaces it with a default integer index
+ df = df.copy().reset_index(drop=True)
+
+ #print(df)
+
+ # Initialize an empty list to store the new index labels for the DataFrame
+ index = []
+
+ for sample in ls_samples:
+     # Extract a chunk of data from the original df where the 'Sample_ID' column matches the current sample name
+     # This chunk is stored in the df_chunk df, which is a subset of the original data for that specific sample
+     df_chunk = df.loc[df['Sample_ID'] == sample, :].copy()
+     old_index = df_chunk.index
+     # Reset the index of the df_chunk df, removing the old index and replacing it with a default integer index
+     df_chunk = df_chunk.reset_index(drop=True)
+     # A new index is created for the df_chunk df. It combines the sample name with 'Cell_' and the integer index values, converting them to strings
+     # This new index will have labels like 'SampleName_Cell_0', 'SampleName_Cell_1', and so on.
+     sample = sample.split('.')[0]
+     df_chunk = df_chunk.set_index(f'{sample}_Cell_' + df_chunk.index.astype(str))
+     # The index values of df_chunk are then added to the index list
+     index = index + df_chunk.index.values.tolist()
+
+ # After processing all the samples in the loop, assign the index list as the new index of the original df.
+ df.index = index
+ # Remove the 'level_0' and 'index' columns from df
+ df = df.loc[:, ~df.columns.isin(['level_0', 'index'])]
+ assigned_new_index = True
+ df.head()
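+
+ # For example, with ls_samples containing 'DD3S1.csv', the rows of the DD3S1
+ # chunk now carry index labels 'DD3S1_Cell_0', 'DD3S1_Cell_1', ... (the sample
+ # name without its extension, then 'Cell_', then the row number).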
+
+
+ # ### I.3.2. NOT_INTENSITIES
+
+ # not_intensities is the list of the columns unrelated to the markers' fluorescence intensities
+ # It can include items that aren't in a given header.
+ #not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
+ #                   'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID', 'cell_type', 'cell_subtype', 'cluster', 'ID',
+ #                   'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
+
+ # Get all column names
+ all_columns = df.columns.tolist()
+
+ # Create a list to store non-intensity column names
+ not_intensities = []
+ intensity_columns = []
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' not in column:
+         not_intensities.append(column)
+     else:
+         intensity_columns.append(column)
+
+
+ # Create a new DataFrame with non-intensity columns
+ not_intensities_df = pd.DataFrame(not_intensities)
+ print("Non-intensity columns:")
+ print(not_intensities)
+
+ print("Non-intensity DataFrame:")
+ not_intensities_df
+ #print(len(intensity_columns))
+
+
+ pd.DataFrame(not_intensities)
+
+ path_not_intensities = os.path.join(metadata_dir, "not_intensities.csv")
+
+ # If this file already exists, add only the not_intensities items not already present in the file
+ if os.path.exists(path_not_intensities):
+     print("'not_intensities.csv' already exists.")
+     print("Reconciling file and Jupyter notebook lists.")
+     file_not_intensities = open(path_not_intensities, "r")
+     file_ni = file_not_intensities.read().splitlines()
+     # Set difference to identify items not already in file
+     to_add = set(not_intensities) - set(file_ni)
+     # We want not_intensities to be a complete list
+     not_intensities = list(set(file_ni) | set(not_intensities))
+     file_not_intensities.close()
+     file_not_intensities = open(path_not_intensities, "a")
+     for item in to_add:
+         file_not_intensities.write(item + "\n")
+     file_not_intensities.close()
+
+ else:
+     # The file does not yet exist
+     print("Could not find " + path_not_intensities + ". Creating now.")
+     file_not_intensities = open(path_not_intensities, "w")
+     for item in not_intensities:
+         file_not_intensities.write(item + "\n")
+     file_not_intensities.close()
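+
+ # Behavior sketch: the reconciliation above is an append-only set union. If the
+ # file holds {'ID', 'Sample_ID'} and the notebook list holds {'Sample_ID', 'Patient'},
+ # only 'Patient' is appended to the file, and not_intensities becomes the union.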
+
+
+ # In[46]:
+
+
+ not_intensities_df = pd.read_csv(path_not_intensities)
+ not_intensities_df
+
+
+ # In[47]:
+
+
+ # Columns we want to keep: not_intensities, and any intensity column that contains 'Intensity_Average' (drop any intensity marker column that is not a mean intensity)
+ to_keep = not_intensities + [x for x in df.columns.values[~df.columns.isin(not_intensities)] if 'Intensity_Average' in x]
+
+ to_keep
+
+
+ # In[48]:
+
+
+ print(len(to_keep) - 1)
+
+
+ # In[49]:
+
+
+ # However, our to_keep list contains items that might not be in our df headers!
+ # These items come from our not_intensities list. So let's ask for only those items from to_keep that are actually found in our df
+ # Retain only the columns from the to_keep list that are found in the df's headers (columns).
+ # This ensures that we are only keeping the columns that exist in the df, avoiding any potential issues with non-existent column names.
+ # The result is a df containing only the specified columns.
+ df = df[[x for x in to_keep if x in df.columns.values]]
+
+ df.head()
+
+
+ # In[50]:
+
+
+ # Assuming you have a DataFrame named 'df'
+ # df = pd.read_csv('your_file.csv')
+
+ # Get all column names
+ all_columns = df.columns.tolist()
+
+ # Create an empty list to store intensity markers
+ intensity_marker = []
+
+ # Iterate over each column name
+ for column in all_columns:
+     # Check if the column name contains 'Intensity_Average'
+     if 'Intensity_Average' in column:
+         # Split the column name by underscore
+         parts = column.split('_')
+
+         # Extract the word before the first underscore
+         marker = parts[0]
+
+         # Add the marker to the intensity_marker list
+         intensity_marker.append(marker)
+
+ # Remove duplicates from the intensity_marker list
+ intensity_marker = list(set(intensity_marker))
+
+ print("Intensity Markers:")
+ print(intensity_marker)
+
+ # Create a callback function to update the intensities array
+ def update_intensities(event):
+     global intensities
+     global intensities_df
+     new_intensities = []
+     selected_columns = []
+     for marker, cell, cytoplasm, nucleus in zip(marker_options_df['Marker'], marker_options_df['Cell'], marker_options_df['Cytoplasm'], marker_options_df['Nucleus']):
+         if cell:
+             new_intensities.append(f"{marker}_Cell_Intensity_Average")
+             selected_columns.append(f"{marker}_Cell_Intensity_Average")
+         if cytoplasm:
+             new_intensities.append(f"{marker}_Cytoplasm_Intensity_Average")
+             selected_columns.append(f"{marker}_Cytoplasm_Intensity_Average")
+         if nucleus:
+             new_intensities.append(f"{marker}_Nucleus_Intensity_Average")
+             selected_columns.append(f"{marker}_Nucleus_Intensity_Average")
+     intensities = new_intensities
+     if selected_columns:
+         intensities_df = merged_dataframe[selected_columns]
+     else:
+         intensities_df = pd.DataFrame()
+     print("Updated intensities DataFrame:")
+     print(intensities_df)
+
+
+ # In[54]:
+
+
+ tabulator_formatters = {
+     'bool': {'type': 'tickCross'}
+ }
+
+ # Create a DataFrame with the intensity markers and default values
+ marker_options_df = pd.DataFrame({
+     'Marker': intensity_marker,
+     'Cell': [False] * len(intensity_marker),
+     'Cytoplasm': [False] * len(intensity_marker),
+     'Nucleus': [False] * len(intensity_marker)
+ })
+
+ # Create the Tabulator widget and link the callback function
+ tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
+ tabulator.param.watch(update_intensities, 'value')
+
+ # Create a Panel layout with the Tabulator widget
+ marker_options_layout = pn.Column(tabulator, sizing_mode="stretch_width")
+
+ # Initialize the Panel extension with Tabulator
+ pn.extension('tabulator')
+
+ # Create a DataFrame with the intensity markers and default values
+ marker_options_df = pd.DataFrame({
+     'Marker': intensity_marker,
+     'Cell': [True] * len(intensity_marker),
+     'Cytoplasm': [False] * len(intensity_marker),
+     'Nucleus': [False] * len(intensity_marker)
+ })
+
+ # Define formatters for the Tabulator widget
+ tabulator_formatters = {
+     'Cell': {'type': 'tickCross'},
+     'Cytoplasm': {'type': 'tickCross'},
+     'Nucleus': {'type': 'tickCross'}
+ }
+
+ # Create the Tabulator widget
+ tabulator = pn.widgets.Tabulator(marker_options_df, formatters=tabulator_formatters, sizing_mode='stretch_width')
+
+ # Create a DataFrame to store the initial intensities
+ new_data = [{'Description': f"{marker}_Cell_Intensity_Average"} for marker in intensity_marker]
+ new_data_df = pd.DataFrame(new_data)
+
+ # Create a widget to display the new data as a DataFrame
+ new_data_table = pn.widgets.Tabulator(new_data_df, name='New Data Table', sizing_mode='stretch_width')
+
+ # Create a button to start the update process
+ run_button = pn.widgets.Button(name="Save Selection", button_type='primary')
+
+ # Define the update_intensities function (this zero-argument version replaces the
+ # event-handler version above for the button-driven workflow)
+ def update_intensities():
+     global new_data, new_data_df
+     new_data = []
+     for _, row in tabulator.value.iterrows():
+         marker = row['Marker']
+         if row['Cell']:
+             new_data.append({'Description': f"{marker}_Cell_Intensity_Average"})
+         if row['Cytoplasm']:
+             new_data.append({'Description': f"{marker}_Cytoplasm_Intensity_Average"})
+         if row['Nucleus']:
+             new_data.append({'Description': f"{marker}_Nucleus_Intensity_Average"})
+     new_data_df = pd.DataFrame(new_data)
+     new_data_table.value = new_data_df
+
+ # Define the runner function
+ async def runner(event):
+     update_intensities()
+
+ # Bind the runner function to the button
+ run_button.on_click(runner)
+
+ # Layout
+ updated_intensities = pn.Column(tabulator, run_button, new_data_table, sizing_mode="stretch_width")
+
+ pn.extension()
+ # Serve the layout
+ #updated_intensities.servable()
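+
+ # To actually serve this layout, one option (sketch) is to uncomment the
+ # .servable() call above and run `panel serve Quality_Control.py` from a
+ # terminal, or call pn.serve(updated_intensities) from a Python session.
+ # Edits made in the table are read back via tabulator.value.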
+
+
+ intensities_df = new_data_table
+ intensities_df
+
+ intensities_df = pn.pane.DataFrame(intensities_df)
+ intensities_df
+
+ print(intensities_df)
+ # ## I.4. QC CHECKS
+
+ def quality_check_results(check_index, check_shape, check_no_null, check_zero_intensities):
+     results = [
+         f"Check Index: {check_index}",
+         f"Check Shape: {check_shape}",
+         f"Check No Null: {check_no_null}",
+         f"Check Zero Intensities: {check_zero_intensities}"
+     ]
+     return pn.Column(*[pn.Row(result) for result in results], sizing_mode="stretch_width")
+
+ print(ls_samples)
+
+ def check_index_format(index_str, ls_samples):
+     """
+     Checks if the given index string follows the specified format.
+
+     Args:
+         index_str (str): The index string to be checked.
+         ls_samples (list): A list of valid sample names.
+
+     Returns:
+         bool: True if the index string follows the format, False otherwise.
+     """
+     # Split the index string into parts
+     parts = index_str.split('_')
+
+     # Check if there are exactly 3 parts
+     if len(parts) != 3:
+         print(len(parts))
+         return False
+
+     # Check if the first part is in ls_samples
+     sample_name = parts[0]
+     if f'{sample_name}.csv' not in ls_samples:
+         print(sample_name)
+         return False
+
+     # Check if the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
+     location = parts[1]
+     valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
+     if location not in valid_locations:
+         print(location)
+         return False
+
+     # Check if the third part is a number
+     try:
+         index = int(parts[2])
+     except ValueError:
+         print(parts[2])
+         return False
+
+     # If all checks pass, return True
+     return True
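+
+ # Usage sketch (illustrative sample names): with ls_samples containing 'DD3S1.csv',
+ # check_index_format('DD3S1_Cell_0', ls_samples)     -> True
+ # check_index_format('DD3S1_Membrane_0', ls_samples) -> False (invalid location part)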
+
+
+ # In[70]:
+
+
+ # Let's take a look at a few features to make sure our dataframe is as expected
+ df.index
+ def check_format_ofindex(index):
+     for idx in index:
+         check_index = check_index_format(idx, ls_samples)
+         if check_index is False:
+             index_format = "Bad"
+             return index_format
+
+     index_format = "Good"
+     return index_format
+ print(check_format_ofindex(df.index))
+
+
+ # In[71]:
+
+
+ df.shape
+ check_index = df.index
+ check_shape = df.shape
+ print(check_shape)
+
+
+ # In[72]:
+
+
+ # Check for NaN entries (should not be any unless columns do not align)
+ # False means no NaN entries
+ # True means NaN entries
+ df.isnull().any().any()
+
+ check_no_null = df.isnull().any().any()
+
+
+ # In[73]:
+
+
+ # Check that all expected files were imported into the final dataframe
+ if sorted(df.Sample_ID.unique()) == sorted(ls_samples):
+     print("All expected filenames are present in big df Sample_ID column.")
+     check_all_expected_files_present = "All expected filenames are present in big df Sample_ID column."
+ else:
+     compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+     check_all_expected_files_present = compare_headers(['no samples'], df.Sample_ID.unique(), "big df Sample_ID column")
+
+ print(df.Sample_ID)
+
+
+ # In[74]:
+
+
+ # Delete rows that have 0-value mean intensities for the intensity columns
+ print("df.shape before removing 0 mean values: ", df.shape)
+
+ # Compute the row-wise mean of the intensity columns only, i.e. all columns
+ # not listed in not_intensities (which are excluded from the calculation).
+ ###############################
+ # !! This may take a while !! #
+ ###############################
+ # Calculate mean intensity excluding 'not_intensities' columns
+ mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+
+ # Check if there are any 0 mean intensity values
+ if (mean_intensity == 0).any():
+     df = df.loc[mean_intensity > 0, :]
+     print("Shape after removing 0 mean values: ", df.shape)
+     check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+ else:
+     print("No zero intensity values.")
+     check_zero_intensities = "No zero intensity values found in the DataFrame."
+
+
+ # Get quantiles (5th, 50th, 95th)
+ # List of nucleus size percentiles to extract
+ #qs = [0.05,0.50,0.95]
+
+ #df["Nucleus_Size"].quantile(q=qs)
+
+ quality_control_df = df
+ quality_control_df.head()
+
+ # Function to perform quality checks
+ def perform_quality_checks(df, ls_samples, not_intensities):
+     results = {}
+     errors = []
+     # Check index
+     results['index'] = df.index
+
+     # Check shape
+     results['shape'] = df.shape
+
+     # Check for NaN entries
+     results['nan_entries'] = df.isnull().any().any()
+
+     # Remove rows with 0 mean intensity values
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         results['zero_intensity_removal'] = f"Zero intensity entries were found and removed. Shape after removing: {df.shape}"
+     else:
+         results['zero_intensity_removal'] = "No zero intensity values found in the DataFrame."
+
+     return results
+
+ # Example usage of the function
+ # (note: this rebinds the name quality_check_results from the Panel helper above to a dict)
+ quality_check_results = perform_quality_checks(df, ls_samples, not_intensities)
+
+ # Print results
+ for key, value in quality_check_results.items():
+     print(f"{key}: {value}")
+
+
+ # In[80]:
+
+
+ def quality_check(file, not_intensities):
+     # Load the output file
+     df = file
+
+     # Check Index
+     check_index = check_format_ofindex(df.index)
+
+     # Check Shape
+     check_shape = df.shape
+
+     # Check for NaN entries
+     check_no_null = df.isnull().any().any()
+
+     mean_intensity = df.loc[:, ~df.columns.isin(not_intensities)].mean(axis=1)
+     if (mean_intensity == 0).any():
+         df = df.loc[mean_intensity > 0, :]
+         print("df.shape after removing 0 mean values: ", df.shape)
+         check_zero_intensities = f'df.shape after removing 0 mean values: {df.shape}'
+     else:
+         print("No zero intensity values found in the DataFrame.")
+         check_zero_intensities = "No zero intensities."
+
+     # Create a quality check results table
+     quality_check_results_table = pd.DataFrame({
+         'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
+         'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities]
+     })
+
+     # Create a quality check results component
+     quality_check_results_component = pn.Card(
+         pn.pane.DataFrame(quality_check_results_table),
+         title="Quality Control Results",
+         header_background="#2196f3",
+         header_color="white",
+     )
+
+     return quality_check_results_component
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Function to calculate quantile values
+ def calculate_quantiles(quantile):
+     quantile_value_intensity = df["AF555_Cell_Intensity_Average"].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantile_value_intensity
+
+ # Function to create the Panel app
+ def create_app(quantile=quantile_slider.param.value):
+     quantiles = calculate_quantiles(quantile)
+     output = pd.DataFrame(quantiles)
+
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+
+     return output_widget
+
+
+ # Bind the create_app function to the quantile slider
+ quantile_output_app = pn.bind(create_app, quantile_slider.param.value)
+ #pn.Column(quantile_slider, quantile_output_app).servable()
+
+ # Function to create the line graph plot using Bokeh
+ def create_line_graph2(quantile):
+     # Calculate histogram
+     hist, edges = np.histogram(df['Nucleus_Size'], bins=30)
+
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+
+     # Calculate quantiles
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+
+     # Create Bokeh line graph plot
+     p = figure(title='Frequency vs. Nucleus_Size',
+                x_axis_label='Nucleus_Size',
+                y_axis_label='Frequency',
+                width=800, height=400)
+
+     # Plot histogram
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.6)
+
+     # Plot line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+
+     return p
+
+ # Bind the create_line_graph function to the quantile slider
+ nucleus_size_line_graph_with_histogram = pn.bind(create_line_graph2, quantile=quantile_slider.param.value)
+
+ # Layout the components in a Panel app
+ #nucleus_size_line_graph_with_histogram = pn.Column(create_line_graph2(quantile = quantile_slider.param.value))
+ #nucleus_size_line_graph_with_histogram.servable()
+ # Layout the components in a Panel app
+ plot1 = pn.Column(quantile_slider, pn.pane.Bokeh(nucleus_size_line_graph_with_histogram))
+ #plot1.servable()
+
+ # Removing cells based on nucleus size
+
+ quantile = quantile_slider.value
+ qs = [quantile, 0.50, 1.00 - quantile]
+ quantiles = df['Nucleus_Size'].quantile(q=qs).values
+ threshold = quantiles[2]
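+
+ # Worked example: with the slider at its default quantile = 0.05,
+ # qs = [0.05, 0.50, 0.95], so threshold = quantiles[2] is the 95th percentile
+ # of Nucleus_Size; cells above it are dropped below.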
+
+
+ # In[89]:
+
+
+ print(threshold)
+
+
+ # In[90]:
+
+
+ # Define the quantile slider
+ #quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+ # Function to update the threshold and display the number of cells removed
+ def update_threshold_and_display(quantile):
+     qs = [quantile, 0.50, 1.00 - quantile]
+     quantiles = df['Nucleus_Size'].quantile(q=qs).values
+     threshold = quantiles[2]
+
+     # Filter the DataFrame based on the new threshold
+     df_filtered = df.loc[(df['Nucleus_Size'] > 42) & (df['Nucleus_Size'] < threshold)]
+
+     # Calculate the number of cells removed
+     cells_before_filter = df.shape[0]
+     cells_after_filter = df_filtered.shape[0]
+     cells_removed = cells_before_filter - cells_after_filter
+
+     # Display the results
+     results = pn.Column(
+         f"Number of cells before filtering: {cells_before_filter}",
+         f"Number of cells after filtering on nucleus size: {cells_after_filter}",
+         f"Number of cells removed: {cells_removed}"
+     )
+
+     return results
+
+ # Bind the update function to the quantile slider
+ results_display = pn.bind(update_threshold_and_display, quantile_slider)
+
+ # Layout the components in a Panel app
+ layout2 = results_display
+
+
+ # In[91]:
+
+
+ print("Number of cells before filtering:", df.shape[0])
+ cells_before_filter = f"Number of cells before filtering: {df.shape[0]}"
+ # Delete small cells and objects w/ high AF555 signal (RBCs)
+ # We usually use the 95th percentile calculated during QC_EDA
+ df = df.loc[(df['Nucleus_Size'] > 42)]
+ df = df.loc[(df['Nucleus_Size'] < threshold)]
+ cells_after_filter_nucleus_shape = df.shape[0]
+ print("Number of cells after filtering on nucleus size:", df.shape[0])
+
+ df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
+ print("Number of cells after filtering on AF555 intensity:", df.shape[0])
+ cells_after_filter_intensity_shape = df.shape[0]
+ cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {cells_after_filter_nucleus_shape}"
+ cells_after_filter_intensity = f"Number of cells after filtering on AF555 intensity: {cells_after_filter_intensity_shape}"
+
+ num_of_cell_removal_intensity = cells_after_filter_intensity
+
+ print(num_of_cell_removal_intensity)
+
+ num_of_cell_removal = pn.Column(cells_before_filter, cells_after_filter_nucleus)
+
+
+ # Collect the intensity columns from df
+ intensities = df.filter(like='Intensity').columns.tolist()
+
+ # Create a ColumnDataSource from the DataFrame
+ source = ColumnDataSource(df)
+
+ # Function to calculate quantile values
+ def calculate_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile]).values
+     return quantiles
+
+ # Create the dropdown menu
+ column_dropdown = pn.widgets.Select(name='Select Column', options=intensities)
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Function to create the Bokeh plot
+ def create_intensity_plot(column, quantile):
+     quantiles = calculate_quantiles(column, quantile)
+     hist, edges = np.histogram(df[column], bins=30)
+     # Calculate the midpoints of bins for plotting
+     midpoints = (edges[:-1] + edges[1:]) / 2
+
+     # Create Bokeh plot
+     p = figure(title=f'Distribution of {column} with Quantiles',
+                x_axis_label=f'{column} Values',
+                y_axis_label='Frequency',
+                width=800, height=400)
+
+     p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
+            fill_color='skyblue', line_color='black', alpha=0.7)
+
+     # Plot line graph
+     p.line(midpoints, hist, line_width=2, color='blue', alpha=0.7)
+
+     # Add quantile lines
+     for q in quantiles:
+         span = Span(location=q, dimension='height', line_color='red', line_dash='dashed', line_width=2)
+         p.add_layout(span)
+         p.add_layout(Label(x=q, y=max(hist), text=f'{q:.1f}', text_color='red'))
+
+     return p
+
+
+ # Bind the create_plot function to the quantile slider, column dropdown, and button click
+ marker_intensity_with_histogram = pn.bind(create_intensity_plot, column_dropdown.param.value, quantile_slider.param.value, watch=True)
+
+ # Create the button
+ generate_plot_button = Button(label='Generate Plot', button_type='primary')
+
+ def update_plot(column, quantile):
+     plot = create_intensity_plot(column, quantile)
+     plot.renderers[0].data_source = source  # Update the data source for the renderer
+     return plot
+
+ # Display the dropdown menu, quantile slider, button, and plot
+ #plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+
+ def generate_plot(event):
+     updated_plot = update_plot(column_dropdown.param.value, quantile_slider.param.value)
+     #pn.Column(pn.Row(column_dropdown, generate_plot_button), quantile_slider, updated_plot).servable()
+
+ generate_plot_button.on_click(generate_plot)
+ selected_marker_plot = pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram)))
+ #pn.Column(pn.Row(pn.Column(column_dropdown, marker_intensity_with_histogram), generate_plot_button)).servable()
+
+
+ # (Re-declared helpers from the cell above, kept from the notebook conversion)
+ # Assuming you have a DataFrame 'df' with the intensity columns
+ intensities = df.filter(like='Intensity').columns.tolist()
+
+ # Create a ColumnDataSource from the DataFrame
+ source = ColumnDataSource(df)
+
+ # Function to calculate quantile values
+ def calculate_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+
+ # In[105]:
+
+
+ quantile_slider = pn.widgets.FloatSlider(name='Quantile', start=0.01, end=0.99, step=0.01, value=0.05)
+
+
+ # Bind the create_line_graph function to the quantile slider
+ #nucleus_size_line_graph = pn.bind(create_line_graph, quantile=quantile_slider.param.value)
+
+ # Layout the components in a Panel app
+ #nucleus_size_graph = pn.Column(nucleus_size_line_graph)
+
+
+ # In[106]:
+
+
+ #df["CKs_Cytoplasm_Intensity_Average"].quantile(q=qs)
+
+
+ # In[107]:
+
+
+ len(intensities)
+ if 'CKs_Cytoplasm_Intensity_Average' in intensities:
+     print(1)
+
+
+ # In[108]:
+
+
+ df
+
+
+ # In[109]:
+
+
+ def calculate_cytoplasm_quantiles(column, quantile):
+     # Print the columns of the DataFrame
+     print("DataFrame columns:", df.columns)
+
+     # Check if the column exists in the DataFrame
+     if column not in df.columns:
+         raise KeyError(f"Column '{column}' does not exist in the DataFrame.")
+
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+ def create_cytoplasm_intensity_df(column, quantile):
+     quantiles = calculate_cytoplasm_quantiles(column, quantile)
+     output = pd.DataFrame(quantiles)
+     return pn.pane.DataFrame(output)
+
+ # Bind the create_app function to the quantile slider
+ cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)
+
+ pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+
+
+ # In[110]:
+
+
+ def calculate_cytoplasm_quantiles(column, quantile):
+     quantiles = df[column].quantile(q=[quantile, 0.50, 1 - quantile])
+     return quantiles
+
+ def create_cytoplasm_intensity_df(column, quantile):
+     quantiles = calculate_cytoplasm_quantiles(column, quantile)
+     output = pd.DataFrame(quantiles)
+     # Create a DataFrame pane to display the output
+     output_widget = pn.pane.DataFrame(output)
+     return output_widget
+
+
+ # Bind the create_app function to the quantile slider
+ cytoplasm_quantile_output_app = pn.bind(create_cytoplasm_intensity_df, column='CKs_Cytoplasm_Intensity_Average', quantile=quantile_slider.param.value)
+ pn.Column(quantile_slider, cytoplasm_quantile_output_app)
+
+
+ # ## I.5. COLUMNS OF INTEREST
+
+ # In[111]:
+
+
+ # Remove columns containing "DAPI"
+ df = df[[x for x in df.columns.values if 'DAPI' not in x]]
+
+ print("Columns are now...")
+ print([c for c in df.columns.values])
+
+
+ # In[112]:
+
+
+ # Create lists of full names and shortened names to use in plotting
+ full_to_short_names, short_to_full_names = \
+     shorten_feature_names(df.columns.values[~df.columns.isin(not_intensities)])
+
+ short_to_full_names
+
+
+ # In[113]:
+
+
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "full_to_short_column_names.csv")
+ fh = open(filename, "w")
+ fh.write("full_name,short_name\n")
+ for k, v in full_to_short_names.items():
+     fh.write(k + "," + v + "\n")
+
+ fh.close()
+ print("The full_to_short_column_names.csv file was created!")
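+
+ # Equivalent sketch using pandas instead of manual file handling:
+ # pd.DataFrame(list(full_to_short_names.items()), columns=['full_name', 'short_name']) \
+ #     .to_csv(filename, index=False)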
+
+
+ # In[114]:
+
+
+ # Save this data to a metadata file
+ filename = os.path.join(metadata_dir, "short_to_full_column_names.csv")
+ fh = open(filename, "w")
+ fh.write("short_name,full_name\n")
+ for k, v in short_to_full_names.items():
+     fh.write(k + "," + v + "\n")
+
+ fh.close()
+ print("The short_to_full_column_names.csv file was created!")
+
+
+ # ## I.6. EXPOSURE TIME
+
+ # In[115]:
+
+
+ # Import the ashlar analysis file
+ file_path = os.path.join(metadata_dir, 'combined_metadata.csv')
+ ashlar_analysis = pd.read_csv(file_path)
+ ashlar_analysis
+
+
+ # In[116]:
+
+
+ # Extract and rename columns
+ new_df = ashlar_analysis[['Name', 'Cycle', 'ChannelIndex', 'ExposureTime']].copy()
+ new_df.rename(columns={
+     'Name': 'Target',
+     'Cycle': 'Round',
+     'ChannelIndex': 'Channel'
+ }, inplace=True)
+
+ # Apply prefixes to the Round and Channel values
+ new_df['Round'] = 'R' + new_df['Round'].astype(str)
+ new_df['Channel'] = 'c' + new_df['Channel'].astype(str)
+
+ # Save to CSV
+ new_df.to_csv('Ashlar_Exposure_Time.csv', index=False)
+
+ # Print the new dataframe
+ print(new_df)
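+
+ # Illustrative result: a Cycle of 1 and a ChannelIndex of 2 become Round 'R1'
+ # and Channel 'c2', matching the keys used for the color palettes below.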
+
+
+ # In[117]:
+
+
+ # Here, we want to end up with a data structure that incorporates metadata on each intensity marker column used in our big dataframe in an easy-to-use format.
+ # This is going to include the full name of the intensity marker columns in the big data frame,
+ # the corresponding round and channel,
+ # the target protein (e.g., CD45),
+ # and the segmentation localization information (cell, cytoplasm, nucleus)
+
+ # We can use this data structure to assign unique colors to all channels and rounds, for example, for use in later visualizations
+ # Exposure_time file from ASHLAR analysis
+ filename = "Exposure_Time.csv"
+ filename = os.path.join(metadata_dir, filename)
+ exp_df = pd.read_csv(filename)
+
+ print(exp_df)
+
+ # Verify the file imported correctly
+ # File length
+ print("df's shape: ", exp_df.shape)
+ # Headers
+ expected_headers = ['Round', 'Target', 'Exp', 'Channel']
+ compare_headers(expected_headers, exp_df.columns.values, "Imported metadata file")
+
+ # Missingness
+ if exp_df.isnull().any().any():
+     print("\nexp_df has null value(s) in row(s):")
+     print(exp_df[exp_df.isna().any(axis=1)])
+ else:
+     print("\nNo null values detected.")
+
+
+ # In[118]:
+
+
+ if len(exp_df['Target']) > len(exp_df['Target'].unique()):
+     print("One or more non-unique Target values in exp_df. Currently not supported.")
+     exp_df = exp_df.drop_duplicates(subset='Target').reset_index(drop=True)
+
+
+ # In[119]:
+
+
+ # Sort exp_df by the values in the 'Target' column in ascending order and then retrieve the first few rows of the sorted df
+ exp_df.sort_values(by=['Target']).head()
+
+
+ # In[120]:
+
+
+ # Create lowercase version of target
+ exp_df['target_lower'] = exp_df['Target'].str.lower()
+ exp_df.head()
+
+
+ # In[121]:
+
+
+ # Create a df that contains the marker intensity columns in our df that aren't in not_intensities
+ intensities = pd.DataFrame({'full_column': df.columns.values[~df.columns.isin(not_intensities)]})
+
+ intensities
+
+
+ # In[122]:
+
+
+ # Extract the marker information from the `full_column`, which corresponds to the full column name in the big dataframe
+ # The regex below captures the leading run of word characters (letters and digits,
+ # excluding underscores), i.e. everything before the first underscore
+ intensities['marker'] = intensities['full_column'].str.extract(r'([^\W_]+)')
+ # convert to lowercase
+ intensities['marker_lower'] = intensities['marker'].str.lower()
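+
+ # Example: 'CD45_Cell_Intensity_Average' -> marker 'CD45', marker_lower 'cd45'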
+
+ intensities
+
+
+ # In[123]:
+
+
+ # Subset the intensities df to exclude any column pertaining to DAPI
+ intensities = intensities.loc[intensities['marker_lower'] != 'dapi']
+
+ intensities.head()
+
+
+ # In[124]:
+
+
+ # Merge the intensities and exp_df together to create metadata
+ metadata = pd.merge(exp_df, intensities, how='left', left_on='target_lower', right_on='marker_lower')
+ metadata = metadata.drop(columns=['marker_lower'])
+ metadata = metadata.dropna()
+
+ # Target is the capitalization from the Exposure_Time.csv
+ # target_lower is Target in lowercase
+ # marker is the extracted first component of the full column in segmentation data, with corresponding capitalization
+ metadata
+
+
+ # In[125]:
+
+
+ # Add a column to signify marker target localisation.
+ # Use a lambda to determine the segmented location of each intensity marker column and update metadata accordingly
+ # Using the add_metadata_location() function in my_modules.py
+ metadata['localisation'] = metadata.apply(
+     lambda row: add_metadata_location(row), axis=1)
+
+
+ # In[126]:
+
+
+ mlid = metadata
+
+
+ # In[127]:
+
+
+ # Save this data structure to the metadata folder
+ # We don't want to add color in here, because color is better treated the same way for round, channel, and sample
+ filename = "marker_intensity_metadata.csv"
+ filename = os.path.join(metadata_dir, filename)
+ metadata.to_csv(filename, index=False)
+ print("The marker_intensity_metadata.csv file was created!")
+
+
+ # ## I.7. COLORS WORKFLOW
+
+ # ### I.7.1. CHANNELS COLORS
+
+
+ # We want colors that are categorical, since Channel is a non-ordered category (yes, they are numbered, but arbitrarily).
+ # A categorical color palette will have dissimilar colors.
+ # Get those unique colors
+ if len(metadata.Channel.unique()) > 10:
+     print("WARNING: There are more unique channel values than there are colors to choose from. Select a different palette, e.g., continuous palette 'husl'.")
+ channel_color_values = sb.color_palette("bright", n_colors=len(metadata.Channel.unique()))
+ # chose 'bright' because it is categorical and we're unlikely to have > 10
+
+ # You can customize the colors for each channel here
+ custom_colors = {
+     'c2': 'lightgreen',
+     'c3': 'tomato',
+     'c4': 'pink',
+     'c5': 'turquoise'
+ }
+
+ # sb.palplot draws the palette; it returns None, so there is nothing useful to assign
+ sb.palplot(sb.color_palette([custom_colors.get(ch, 'blue') for ch in metadata.Channel.unique()]))
+
+ # Display those unique custom colors
+ print("Unique channels are:", metadata.Channel.unique())
+ sb.palplot(sb.color_palette(channel_color_values))
1444
+
1445
+
1446
+ # In[131]:
1447
+
1448
+
1449
+ # Function to create a palette plot with custom colors
1450
+ def create_palette_plot():
1451
+ # Get unique channels
1452
+ unique_channels = metadata.Channel.unique()
1453
+
1454
+ # Define custom colors for each channel
1455
+ custom_colors = {
1456
+ 'c2': 'lightgreen',
1457
+ 'c3': 'tomato',
1458
+ 'c4': 'pink',
1459
+ 'c5': 'turquoise'
1460
+ }
1461
+
1462
+ # Get custom colors for each channel
1463
+ colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1464
+
1465
+ # Create a palette plot (palplot)
1466
+ palette_plot = sb.palplot(sb.color_palette(colors))
1467
+ channel_color_values = sb.color_palette("bright",n_colors = len(metadata.Channel.unique()))
1468
+ channel_color_values = sb.palplot(channel_color_values)
1469
+ return palette_plot, channel_color_values
1470
+
1471
+
1472
+ # Create the palette plot directly
1473
+ palette_plot = create_palette_plot()
1474
+
1475
+ # Define the Panel app layout
1476
+ app_palette_plot = pn.Column(
1477
+ pn.pane.Markdown("### Custom Color Palette"),
1478
+ palette_plot,
1479
+ )
1480
+
1481
+ # Function to create a palette plot with custom colors
1482
+ def create_palette_plot(custom_colors):
1483
+ # Get unique channels
1484
+ unique_channels = metadata.Channel.unique()
1485
+
1486
+ # Get custom colors for each channel
1487
+ colors = [custom_colors.get(ch, 'blue') for ch in unique_channels]
1488
+
1489
+ # Create a palette plot (palplot)
1490
+ palette_plot = sb.palplot(sb.color_palette(colors))
1491
+
1492
+ return palette_plot
1493
+
1494
+ # Define custom colors for each channel
1495
+ custom_colors = {
1496
+ 'c2': 'lightgreen',
1497
+ 'c3': 'tomato',
1498
+ 'c4': 'pink',
1499
+ 'c5': 'turquoise'
1500
+ }
1501
+
1502
+ # Display those unique custom colors
1503
+ print("Unique channels are:", metadata.Channel.unique())
1504
+ # Re-create the palette plot with the custom colors (palplot displays it inline)
1505
+ app_palette_plot = create_palette_plot(custom_colors)
1506
+
1507
+
1508
+ #app_palette_plot.servable()
1509
+
1510
+
1511
+ # In[133]:
1512
+
1513
+
1514
+ # Store in a dictionary
1515
+ channel_color_dict = dict(zip(metadata.Channel.unique(), channel_color_values))
1516
+ channel_color_dict
1517
+ for k,v in channel_color_dict.items():
1518
+ channel_color_dict[k] = tuple(np.float64(c) for c in v)  # keep each color as an (R, G, B) tuple of float64
1519
+
1520
+ channel_color_dict
1521
+
1522
+
1523
+ # In[134]:
1524
+
1525
+
1526
+ color_df_channel = color_dict_to_df(channel_color_dict, "Channel")
1527
+
1528
+ # Save to file in the metadata directory
1529
+ filename = "channel_color_data.csv"
1530
+ filename = os.path.join(metadata_dir, filename)
1531
+ color_df_channel.to_csv(filename, index = False)
1532
+
1533
+ color_df_channel
1534
+
1535
+
1536
+ # In[135]:
1537
+
1538
+
1539
+ # Legend of channel info only
1540
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1541
+ g.axis('off')
1542
+ handles = []
1543
+ for item in channel_color_dict.keys():
1544
+ h = g.bar(0,0, color = channel_color_dict[item],
1545
+ label = item, linewidth =0)
1546
+ handles.append(h)
1547
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Channel')
1548
+ # bbox_to_anchor=(10,10),
1549
+ # bbox_transform=plt.gcf().transFigure)
1550
+
1551
+ filename = "Channel_legend.png"
1552
+ filename = os.path.join(metadata_images_dir, filename)
1553
+ plt.savefig(filename, bbox_inches = 'tight')
1554
+
1555
+ # ### I.7.2. ROUNDS COLORS
1556
+
1557
+
1558
+ # we want colors that are sequential, since Round is an ordered category.
1559
+ # We can still generate colors that are easy to distinguish. Also, many of the categorical palettes cap at about 10 unique colors and repeat from there.
1560
+ # We do not want any repeats!
1561
+ round_color_values = sb.cubehelix_palette(
1562
+ len(metadata.Round.unique()), start=1, rot= -0.75, dark=0.19, light=.85, reverse=True)
1563
+ # round_color_values = sb.color_palette("cubehelix",n_colors = len(metadata.Round.unique()))
1564
+ # chose 'cubehelix' because it is sequential, and round is a continuous process
1565
+ # each color value is a tuple of three values: (R, G, B)
1566
+ print(metadata.Round.unique())
1567
+
1568
+ sb.palplot(sb.color_palette(round_color_values))
1569
+
1570
+ ## cubehelix_palette parameters: 'start' sets the starting hue, 'rot' is the number of rotations around the hue wheel (negative reverses direction), 'dark' and 'light' bound the lightness of the darkest and lightest colors, and 'reverse' flips the palette order.
1571
+
1572
+
1573
+ # In[137]:
1574
+
1575
+
1576
+ # Store in a dictionary
1577
+ round_color_dict = dict(zip(metadata.Round.unique(), round_color_values))
1578
+
1579
+ for k,v in round_color_dict.items():
1580
+ round_color_dict[k] = tuple(np.float64(c) for c in v)  # keep each color as an (R, G, B) tuple of float64
1581
+
1582
+ round_color_dict
1583
+
1584
+
1585
+ # In[138]:
1586
+
1587
+
1588
+ color_df_round = color_dict_to_df(round_color_dict, "Round")
1589
+
1590
+ # Save to file in the metadata directory
1591
+ filename = "round_color_data.csv"
1592
+ filename = os.path.join(metadata_dir, filename)
1593
+ color_df_round.to_csv(filename, index = False)
1594
+
1595
+ color_df_round
1596
+
1597
+ # Legend of round info only
1598
+
1599
+ round_legend = plt.figure(figsize = (1,1)).add_subplot(111)
1600
+ round_legend.axis('off')
1601
+ handles = []
1602
+ for item in round_color_dict.keys():
1603
+ h = round_legend.bar(0,0, color = round_color_dict[item],
1604
+ label = item, linewidth =0)
1605
+ handles.append(h)
1606
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Round')
1607
+ # bbox_to_anchor=(10,10),
1608
+ # bbox_transform=plt.gcf().transFigure)
1609
+
1610
+ filename = "Round_legend.png"
1611
+ filename = os.path.join(metadata_images_dir, filename)
1612
+ plt.savefig(filename, bbox_inches = 'tight')
1613
+
1614
+
1615
+ # ### I.7.3. SAMPLES COLORS
1616
+
1617
+ # In[140]:
1618
+
1619
+
1620
+ # we want colors that are neither sequential nor categorical.
1621
+ # Categorical would be ideal if we could generate an arbitrary number of colors, but I do not think that we can.
1622
+ # Hence, we will choose `n` colors from a continuous palette. First we will generate the right number of colors. Later, we will assign TMA samples to gray.
1623
+
1624
+ # Get those unique colors
1625
+ color_values = sb.color_palette("husl",n_colors = len(ls_samples))#'HLS'
1626
+ # each color value is a tuple of three values: (R, G, B)
1627
+
1628
+ # Display those unique colors
1629
+ sb.palplot(sb.color_palette(color_values))
1630
+
1631
+
1632
+ # In[141]:
1633
+
1634
+
1635
+ TMA_samples = [s for s in df.Sample_ID.unique() if 'TMA' in s]
1636
+ TMA_color_values = sb.color_palette(n_colors = len(TMA_samples),palette = "gray")
1637
+ sb.palplot(sb.color_palette(TMA_color_values))
1638
+
1639
+
1640
+ # In[142]:
1641
+
1642
+
1643
+ # Store in a dictionary
1644
+
1645
+ color_dict = dict(zip(df.Sample_ID.unique(), color_values))
1646
+
1647
+ # Replace all TMA samples' colors with gray
1648
+ i = 0
1649
+ for key in color_dict.keys():
1650
+ if 'TMA' in key:
1651
+ color_dict[key] = TMA_color_values[i]
1652
+ i += 1
1653
+
1654
+ color_dict
1655
+
1656
+ color_df_sample = color_dict_to_df(color_dict, "Sample_ID")
1657
+
1658
+ # Save to file in the metadata directory
1659
+ filename = "sample_color_data.csv"
1660
+ filename = os.path.join(metadata_dir, filename)
1661
+ color_df_sample.to_csv(filename, index = False)
1662
+
1663
+ color_df_sample
1664
+
1665
+
1666
+ # Legend of sample info only
1667
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1668
+ g.axis('off')
1669
+ handles = []
1670
+ for item in color_dict.keys():
1671
+ h = g.bar(0,0, color = color_dict[item],
1672
+ label = item, linewidth =0)
1673
+ handles.append(h)
1674
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Sample')
1675
+
1676
+ filename = "Sample_legend.png"
1677
+ filename = os.path.join(metadata_images_dir, filename)
1678
+ plt.savefig(filename, bbox_inches = 'tight')
1679
+
1680
+
1681
+ # ### I.7.4. CLUSTERS COLORS
1682
+
1683
+ '''if 'cluster' in df.columns:
1684
+ cluster_color_values = sb.color_palette("hls",n_colors = len(df.cluster.unique()))
1685
+
1686
+ #print(sorted(test_df.cluster.unique()))
1687
+ # Display those unique colors
1688
+ sb.palplot(sb.color_palette(cluster_color_values))
1689
+
1690
+ cluster_color_dict = dict(zip(sorted(df.cluster.unique()), cluster_color_values))
1691
+ print(cluster_color_dict)
1692
+
1693
+ # Create dataframe
1694
+ cluster_color_df = color_dict_to_df(cluster_color_dict, "cluster")
1695
+ cluster_color_df.head()
1696
+
1697
+ # Save to file in the metadata directory
1698
+ filename = "cluster_color_data.csv"
1699
+ filename = os.path.join(metadata_dir, filename)
1700
+ cluster_color_df.to_csv(filename, index = False)
1701
+
1702
+
1703
+
1704
+ # Legend of cluster info only
1705
+
1706
+ if 'cluster' in df.columns:
1707
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
1708
+ g.axis('off')
1709
+ handles = []
1710
+ for item in sorted(cluster_color_dict.keys()):
1711
+ h = g.bar(0,0, color = cluster_color_dict[item],
1712
+ label = item, linewidth =0)
1713
+ handles.append(h)
1714
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cluster'),
1715
+
1716
+
1717
+ filename = "Clustertype_legend.png"
1718
+ filename = os.path.join(metadata_images_dir, filename)
1719
+ plt.savefig(filename, bbox_inches = 'tight')'''
1720
+
1721
+ mlid.head()
1722
+
1723
+
1724
+ metadata
1725
+
1726
+
1727
+
1728
+ import io
1729
+ import panel as pn
1730
+ pn.extension()
1731
+
1732
+ file_input = pn.widgets.FileInput()
1733
+
1734
+ file_input
1735
+
1736
+
1737
+ def transform_data(variable, window, sigma):
1738
+ """Calculates the rolling average and identifies outliers"""
1739
+ avg = metadata[variable].rolling(window=window).mean()
1740
+ residual = metadata[variable] - avg
1741
+ std = residual.rolling(window=window).std()
1742
+ outliers = np.abs(residual) > std * sigma
1743
+ return avg, avg[outliers]
1744
+
1745
+
1746
+ def get_plot(variable="Exp", window=30, sigma=10):
1747
+ """Plots the rolling average and the outliers"""
1748
+ avg, highlight = transform_data(variable, window, sigma)
1749
+ return avg.hvplot(
1750
+ height=300, legend=False,
1751
+ ) * highlight.hvplot.scatter(padding=0.1, legend=False)
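+
+ # Note: the .hvplot accessor used above assumes `import hvplot.pandas` has been
+ # run earlier in this script; without it, pandas objects have no .hvplot attribute.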
1752
+
1753
+
1754
+ variable_widget = pn.widgets.Select(name="Target", value="Exp", options=list(metadata.columns))
1755
+ window_widget = pn.widgets.IntSlider(name="window", value=30, start=1, end=60)
1756
+ sigma_widget = pn.widgets.IntSlider(name="sigma", value=10, start=0, end=20)
1757
+
1758
+ app = pn.template.GoldenTemplate(
1759
+ site="Cyc-IF",
1760
+ title="Quality Control",
1761
+ main=[
1762
+ pn.Tabs(
1763
+ ("Dataframes", pn.Column(
1764
+ pn.Row(csv_files_button,pn.bind(handle_click, csv_files_button.param.clicks)),
1765
+ pn.pane.Markdown("### The Dataframe uploaded:"), pn.pane.DataFrame(intial_dataframe),
1766
+ #pn.pane.Markdown("### The Exposure time DataFrame is :"), pn.pane.DataFrame(exp_df.head()),
1767
+ pn.pane.Markdown("### The DataFrame after merging CycIF data x metadata :"), pn.pane.DataFrame(merged_dataframe.head()),
1768
+ )),
1769
+ ("Quality Control", pn.Column(
1770
+ quality_check(quality_control_df, not_intensities)
1771
+ #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
1772
+ )),
1773
+ ("Intensities", pn.Column(
1774
+ pn.pane.Markdown("### The Not Intensities DataFrame after processing is :"), pn.pane.DataFrame(not_intensities_df, height=250),
1775
+ pn.pane.Markdown("### Select Intensities to be included"), updated_intensities,
1776
+ #pn.pane.Markdown("### The Intensities DataFrame"), intensities_df,
1777
+ #pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), pn.pane.DataFrame(mlid.head())
1778
+ )),
1779
+ ("Plots", pn.Column(
1780
+ #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(nucleus_size_line_graph_with_histogram, num_of_cell_removal),
1781
+ #pn.pane.Markdown(" ### Nucleus Size Distribution: "), pn.Row(plot1,layout2),
1782
+ #pn.pane.Markdown("### Nucleus Distribution Plot:"), pn.Column(nucleus_size_plot, nucleus_size_graph),
1783
+ pn.pane.Markdown(" ### Intensity Average Plot:"), pn.Row(selected_marker_plot,num_of_cell_removal_intensity ),
1784
+ #pn.Column(pn.Column(column_dropdown, generate_plot_button), quantile_slider, plot),
1785
+ #pn.pane.Markdown("### Cytoplasm Intensity Plot:"), cytoplasm_intensity_plot,
1786
+ #pn.pane.Markdown("### AF555_Cell_Intensity_Average:"), quantile_output_app,
1787
+ #pn.pane.Markdown("### Distribution of AF555_Cell_Intensity_Average with Quantiles:"), quantile_intensity_plot)
1788
+ )),
1789
+
1790
+ ),
1791
+ ])
1792
+
1793
+ app.servable()
1794
+
1795
+ if __name__ == "__main__":
1796
+ pn.serve(app, port=5007)
my_modules.py ADDED
@@ -0,0 +1,468 @@
1
+ import os
2
+ import numpy as np
3
+ import pandas as pd
4
+ import subprocess
5
+
6
+ import random
7
+ import re
8
+
9
+
10
+ import seaborn as sb
11
+ import matplotlib.pyplot as plt
12
+ import matplotlib.colors as mplc
13
+
14
+
15
+
16
+ from scipy import signal
17
+
18
+ import plotly.figure_factory as ff
19
+ import plotly
20
+ import plotly.graph_objs as go
21
+ from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
22
+
23
+
24
+ # This function takes in a dataframe, changes the names
25
+ # of the column in various ways, and returns the dataframe.
26
+ # For best accuracy and generalizability, the code uses
27
+ # regular expressions (regex) to find strings for replacement.
28
+ def apply_header_changes(df):
29
+ # remove lowercase x at beginning of name
30
+ df.columns = df.columns.str.replace("^x","")
31
+ # remove space at beginning of name
32
+ df.columns = df.columns.str.replace("^ ","")
33
+ # replace space with underscore
34
+ df.columns = df.columns.str.replace(" ","_")
35
+ # fix typos
36
+ df.columns = df.columns.str.replace("AF_AF","AF")
37
+ # change "Cell Id" into "ID"
38
+ df.columns = df.columns.str.replace("Cell Id","ID")
39
+ # name the index "ID" (the cell ID is stored in the index)
40
+ df.index.name = "ID"
41
+
42
+
43
+ return df
44
+
45
+ def apply_df_changes(df):
46
+ # Remove "@1" after the ID in the index
47
+ df.index = df.index.str.replace(r'@1$', '', regex = True)
48
+ return df
49
+
50
+ def compare_headers(expected, actual, name):
51
+ missing_actual = np.setdiff1d(expected, actual)
52
+ extra_actual = np.setdiff1d(actual, expected)
53
+ if len(missing_actual) > 0:
54
+ #print("WARNING: File '" + name + "' lacks the following expected header(s) after import header reformatting: \n"
55
+ # + str(missing_actual))
56
+ print("WARNING: File '" + name + "' lacks the following expected item(s): \n" + str(missing_actual))
57
+ if len(extra_actual) > 0:
58
+ #print("WARNING: '" + name + "' has the following unexpected header(s) after import header reformatting: \n"
59
+ # + str(extra_actual))
60
+ print("WARNING: '" + name + "' has the following unexpected item(s): \n" + str(extra_actual))
61
+
62
+ return None
63
+
64
+
65
+ def add_metadata_location(row):
66
+ fc = row['full_column'].lower()
67
+ if 'cytoplasm' in fc and 'cell' not in fc and 'nucleus' not in fc:
68
+ return 'cytoplasm'
69
+ elif 'cell' in fc and 'cytoplasm' not in fc and 'nucleus' not in fc:
70
+ return 'cell'
71
+ elif 'nucleus' in fc and 'cell' not in fc and 'cytoplasm' not in fc:
72
+ return 'nucleus'
73
+ else:
74
+ return 'unknown'
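+
+ # Illustrative example (hypothetical column name):
+ # add_metadata_location({'full_column': 'CKs_Cytoplasm_Intensity_Average'})
+ # # -> 'cytoplasm'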
75
+
76
+
77
+ def get_perc(row, cell_type):
78
+ total = row['stroma'] + row['immune'] + row['cancer']+row['endothelial']
79
+ return round(row[cell_type]/total *100,1)
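+
+ # Illustrative example (hypothetical counts): for a row with stroma=50,
+ # immune=25, cancer=20, endothelial=5, get_perc(row, 'immune') -> 25.0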
80
+
81
+
82
+
83
+ # Divide each marker (and its localisation) by the right exposure setting for each group of samples
84
+ def divide_exp_time(col, exp_col, metadata):
85
+ exp_time = metadata.loc[metadata['full_column'] == col.name, exp_col].values[0]
86
+ return col/exp_time
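+
+ # Hypothetical usage sketch (column names assumed): normalize every intensity
+ # column of df by the exposure time stored in metadata:
+ # df.loc[:, intensity_cols] = df.loc[:, intensity_cols].apply(
+ #     divide_exp_time, exp_col = 'Exp_ms', metadata = metadata)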
87
+
88
+
89
+ def do_background_sub(col, df, metadata):
90
+ #print(col.name)
91
+ location = metadata.loc[metadata['full_column'] == col.name, 'localisation'].values[0]
92
+ #print('location = ' + location)
93
+ channel = metadata.loc[metadata['full_column'] == col.name, 'Channel'].values[0]
94
+ #print('channel = ' + channel)
95
+ af_target = metadata.loc[
96
+ (metadata['Channel']==channel) \
97
+ & (metadata['localisation']==location) \
98
+ & (metadata['target_lower'].str.contains(r'^af\d{3}$')),\
99
+ 'full_column'].values[0]
100
+ return col - df.loc[:,af_target]
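+
+ # Hypothetical usage sketch: subtract each marker's matching autofluorescence
+ # (AF) channel, looked up per channel and localisation as above:
+ # df.loc[:, intensity_cols] = df.loc[:, intensity_cols].apply(
+ #     do_background_sub, df = df, metadata = metadata)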
101
+
102
+
103
+ """
104
+ This function plots marker intensity distributions per sample. It takes in a
105
+ string plot title (title), an output directory for the saved png (location),
106
+ a list of dataframes from which to plot (dfs), a list of dataframe names for
107
+ the legend (df_names), a list of the desired colors for the plotted samples
108
+ (colors), a string for the x-axis label (x_label), and a boolean to show the
109
+ legend or not (legend).
110
+ Optional: x-axis limits (xlims, a two-item [min, max] list), the name(s) of the
111
+ marker(s) to plot (markers), and the list of column names that are not
112
+ intensity markers (not_intensities). markers can either be a single string,
113
+ e.g., 'my_marker', or a list, e.g., ['my_marker1','my_marker2']; by default
114
+ ('all'), every intensity column in the first dataframe is plotted.
115
+
116
+ The function creates a plotly distribution plot and saves it to png in `location`.
117
+ """
118
+
119
+
120
+
121
+ def make_distr_plot_per_sample(title, location, dfs, df_names, colors, x_label, legend, xlims = None, markers = ['all'],not_intensities = None):
122
+ ### GET LIST OF MARKERS TO PLOT ###
123
+ # Get list of markers to plot if not specified by user, using columns in first df
124
+ # Writing function(parameter = FILLER) makes that parameter optional when user calls function,
125
+ # since it is given a default value!
126
+ if markers == ["all"]:
127
+ markers = [c for c in dfs[0].columns.values if c not in not_intensities]
128
+ elif not isinstance(markers, list):
129
+ markers = [markers]
130
+ # Make input labels a set to get only unique values, then put back into list
131
+ markers = list(set(markers))
132
+
133
+ ### GET XLIMS ###
134
+ if xlims is None:
135
+ mins = [df.loc[:,markers].min().min() for df in dfs]
136
+ maxes = [df.loc[:,markers].max().max() for df in dfs]
137
+ xlims = [min(mins), max(maxes)]
138
+ if not isinstance(xlims, list):
139
+ print("Problem - xlmis not list. Exiting method...")
140
+ return None
141
+ ### CHECK DATA CAN BE PLOTTED ###
142
+ # Check for data with only 1 unique value - this will cause error if plotted
143
+ group_labels = []
144
+ hist_data = []
145
+ # Iterate through all dataframes (dfs)
146
+ for i in range(len(dfs)):
147
+ # Iterate through all marker labels
148
+ for f in markers:
149
+ # If there is only one unique value in the marker data for this dataframe,
150
+ # you cannot plot a distribution plot. It gives you a linear algebra
151
+ # singular value matrix error
152
+ if dfs[i][f].nunique() != 1:
153
+ # Add df name and marker name to labels list
154
+ # If we have >1 df, we want to make clear
155
+ # which legend label is associated with which df
156
+ if len(df_names) > 1:
157
+ group_labels.append(df_names[i]+"_"+f)
158
+ else:
159
+ group_labels.append(f)
160
+ # add the data to the data list
161
+ hist_data.append(dfs[i][f])
162
+ # if no data had >1 unique values, there is nothing to plot
163
+ if len(group_labels) < 1:
164
+ print("No markers plotted - all were singular value. Names and markers were " + str(df_names) + ", " + str(markers))
165
+ return None
166
+
167
+ ### TRANSFORM COLOR ITEMS TO CORRECT TYPE ###
168
+ if isinstance(colors[0], tuple):
169
+ colors = ['rgb' + str(color) for color in colors]
170
+
171
+ ### PLOT DATA ###
172
+ # Create plot
173
+ fig = ff.create_distplot(hist_data, group_labels, bin_size=0.1,
174
+ #colors=colors, bin_size=bin_size, show_rug=False)#show_hist=False,
175
+ colors=colors, show_rug=False)
176
+ # Adjust title, font, background color, legend...
177
+ fig.update_layout(title_text=title, font=dict(size=18),
178
+ plot_bgcolor = 'white', showlegend = legend)#, legend_x = 3)
179
+ # Adjust opacity
180
+ fig.update_traces(opacity=0.6)
181
+ # Adjust x-axis parameters
182
+ fig.update_xaxes(title_text = x_label, showline=True, linewidth=2, linecolor='black',
183
+ tickfont=dict(size=18), range = xlims) # x lims was here
184
+ # Adjust y-axis parameters
185
+ fig.update_yaxes(title_text = "Kernel density estimate",showline=True, linewidth=1, linecolor='black',
186
+ tickfont=dict(size=18))
187
+
188
+
189
+ ### SAVE/DISPLAY PLOT ###
190
+ # Save plot to HTML
191
+ # plotly.io.write_html(fig, file = output_dir + "/" + title + ".html")
192
+ # Plot in new tab
193
+ #plot(fig)
194
+ # Save to png
195
+ filename = os.path.join(location, title.replace(" ","_") + ".png")
196
+ fig.write_image(filename)
197
+ return None
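+
+ # Hypothetical usage sketch (dataframe, color, and directory names assumed):
+ # make_distr_plot_per_sample("CD45 distribution", output_images_dir, [df],
+ #     ["all samples"], [(0.2, 0.4, 0.8)], "Intensity", legend = True,
+ #     markers = ['CD45_Cell_Intensity_Average'], not_intensities = not_intensities)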
198
+
199
+
200
+
201
+
202
+
203
+ # this could be changed to use recursion and make it 'smarter'
204
+
205
+ def shorten_feature_names(long_names):
206
+ name_dict = dict(zip(long_names,[n.split('_')[0] for n in long_names]))
207
+ names_lts, long_names, iteration = shorten_feature_names_helper(name_dict, long_names, 1)
208
+ # names_lts = names long-to-short
209
+ # names_stl = names stl
210
+ names_stl = {}
211
+ for n in names_lts.items():
212
+ names_stl[n[1]] = n[0]
213
+ return names_lts, names_stl
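+
+ # Illustrative example: first-token nicknames for ['CD45_Cell', 'CD45_Nucleus']
+ # collide on 'CD45', so the helper extends them until unique:
+ # shorten_feature_names(['CD45_Cell', 'CD45_Nucleus'])[0]
+ # # -> {'CD45_Cell': 'CD45_Cell', 'CD45_Nucleus': 'CD45_Nucleus'}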
214
+
215
+
216
+ def shorten_feature_names_helper(name_dict, long_names, iteration):
217
+ #print("\nThis is iteration #"+str(iteration))
218
+ #print("name_dict is: " + str(name_dict))
219
+ #print("long_names is: " + str(long_names))
220
+ ## If the number of unique nicknames == number of long names
221
+ ## then the work here is done
222
+ #print('\nCompare lengths: ' + str(len(set(name_dict.values()))) + ", " + str(len(long_names)))
223
+ #print('set(name_dict.values()): ' + str(set(name_dict.values())))
224
+ #print('long_names: ' + str(long_names))
225
+ if len(set(name_dict.values())) == len(long_names):
226
+ #print('All done!')
227
+ return name_dict, long_names, iteration
228
+
229
+ ## otherwise, if the number of unique nicknames is not
230
+ ## equal to the number of long names (must be shorter than),
231
+ ## then we need to find more unique names
232
+ iteration += 1
233
+ nicknames_set = set()
234
+ non_unique_nicknames = set()
235
+ # construct set of current nicknames
236
+ for long_name in long_names:
237
+ #print('long_name is ' + long_name + ' and non_unique_nicknames set is ' + str(non_unique_nicknames))
238
+ short_name = name_dict[long_name]
239
+ if short_name in nicknames_set:
240
+ non_unique_nicknames.add(short_name)
241
+ else:
242
+ nicknames_set.add(short_name)
243
+ #print('non_unique_nicknames are: ' + str(non_unique_nicknames))
244
+
245
+ # figure out all long names associated
246
+ # with the non-unique short names
247
+ trouble_long_names = set()
248
+ for long_name in long_names:
249
+ short_name = name_dict[long_name]
250
+ if short_name in non_unique_nicknames:
251
+ trouble_long_names.add(long_name)
252
+
253
+ #print('troublesome long names are: ' + str(trouble_long_names))
254
+ #print('name_dict: ' + str(name_dict))
255
+ # operate on all names that are associated with
256
+ # the non-unique short nicknames
257
+ for long_name in trouble_long_names:
258
+ #print('trouble long name is: ' + long_name)
259
+ #print('old nickname is: ' + name_dict[long_name])
260
+ name_dict[long_name] = '_'.join(long_name.split('_')[0:iteration])
261
+ #print('new nickname is: ' + name_dict[long_name])
262
+ return shorten_feature_names_helper(name_dict, long_names, iteration)
263
+
264
+
265
+
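+ # NOTE: heatmap_function2 below appears unfinished -- it only prepares the
+ # row/column color mappings (at a larger font scale); heatmap_function further
+ # down is the complete implementation.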
266
+ def heatmap_function2(title,
267
+ data,
268
+ method, metric, cmap,
269
+ cbar_kws, xticklabels, save_loc,
270
+ row_cluster, col_cluster,
271
+ annotations = {'rows':[],'cols':[]}):
272
+
273
+ sb.set(font_scale= 6.0)
274
+
275
+ # Extract row and column mappings
276
+ row_mappings = []
277
+ col_mappings = []
278
+ for ann in annotations['rows']:
279
+ row_mappings.append(ann['mapping'])
280
+ for ann in annotations['cols']:
281
+ col_mappings.append(ann['mapping'])
282
+ # If empty lists, convert to None so seaborn accepts
283
+ # as the row_colors or col_colors objects
284
+ if len(row_mappings) == 0:
285
+ row_mappings = None
286
+ if len(col_mappings) == 0:
287
+ col_mappings = None
288
+
289
+ def heatmap_function(title,
290
+ data,
291
+ method, metric, cmap,
292
+ cbar_kws, xticklabels, save_loc,
293
+ row_cluster, col_cluster,
294
+ annotations = {'rows':[],'cols':[]}):
295
+
296
+ sb.set(font_scale= 2.0)
297
+
298
+ # Extract row and column mappings
299
+ row_mappings = []
300
+ col_mappings = []
301
+ for ann in annotations['rows']:
302
+ row_mappings.append(ann['mapping'])
303
+ for ann in annotations['cols']:
304
+ col_mappings.append(ann['mapping'])
305
+ # If empty lists, convert to None so seaborn accepts
306
+ # as the row_colors or col_colors objects
307
+ if len(row_mappings) == 0:
308
+ row_mappings = None
309
+ if len(col_mappings) == 0:
310
+ col_mappings = None
311
+
312
+ # Create clustermap
313
+ g = sb.clustermap(data = data,
314
+ robust = True,
315
+ method = method, metric = metric,
316
+ cmap = cmap,
317
+ row_cluster = row_cluster, col_cluster = col_cluster,
318
+ figsize = (40,30),
319
+ row_colors=row_mappings, col_colors=col_mappings,
320
+ yticklabels = False,
321
+ cbar_kws = cbar_kws,
322
+ xticklabels = xticklabels)
323
+
324
+ # To rotate slightly the x labels
325
+ plt.setp(g.ax_heatmap.xaxis.get_majorticklabels(), rotation=45)
326
+
327
+ # Add title
328
+ g.fig.suptitle(title, fontsize = 60.0)
329
+
330
+ #And now for the legends:
331
+ # iterate through 'rows', 'cols'
332
+ for ann_type in annotations.keys():
333
+ # iterate through each individual annotation feature
334
+ for ann in annotations[ann_type]:
335
+ color_dict = ann['dict']
336
+ handles = []
337
+ for item in color_dict.keys():
338
+ h = g.ax_col_dendrogram.bar(0,0, color = color_dict[item], label = item,
339
+ linewidth = 0)
340
+ handles.append(h)
341
+ legend = plt.legend(handles = handles, loc = ann['location'], title = ann['label'],
342
+ bbox_to_anchor=ann['bbox_to_anchor'],
343
+ bbox_transform=plt.gcf().transFigure)
344
+ ax = plt.gca().add_artist(legend)
345
+
346
+ # Save image
347
+ filename = os.path.join(save_loc, title.lower().replace(" ","_") + ".png")
348
+ g.savefig(filename)
349
+
350
+ return None
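+
+ # Hypothetical usage sketch (data and directory names assumed; annotations
+ # left at their empty default):
+ # heatmap_function("Marker intensities", data = df[intensity_cols],
+ #     method = 'average', metric = 'euclidean', cmap = 'coolwarm',
+ #     cbar_kws = {'label': 'intensity'}, xticklabels = True,
+ #     save_loc = output_images_dir, row_cluster = True, col_cluster = True)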
351
+
352
+
353
+
354
+ # sources -
355
+ #https://stackoverflow.com/questions/27988846/how-to-express-classes-on-the-axis-of-a-heatmap-in-seaborn
356
+ # https://matplotlib.org/3.1.1/tutorials/intermediate/legend_guide.html
357
+
358
+
359
+ def verify_line_no(filename, lines_read):
360
+ # Use Linux "wc -l" command to get the number of lines in the unopened file
361
+ wc = subprocess.check_output(['wc', '-l', filename]).decode("utf-8")
362
+ # Take that string, turn it into a list, extract the first item,
363
+ # and make that an int - this is the number of lines in the file
364
+ wc = int(wc.split()[0])
365
+ if lines_read != wc:
366
+ print("WARNING: '" + filename + "' has " + str(wc) +
367
+ " lines, but imported dataframe has "
368
+ + str(lines_read) + " (including header).")
369
+ return None
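+
+ # Portable sketch (an assumption, not part of the original pipeline): a
+ # pure-Python line count for systems without the Unix `wc` binary:
+ # def count_file_lines(filename):
+ #     with open(filename, 'rb') as f:
+ #         return sum(1 for _ in f)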
370
+
371
+
372
+ def rgb_tuple_from_str(rgb_str):
373
+ rgb_str = rgb_str.replace("(","").replace(")","").replace(" ","")
374
+ rgb = list(map(float,rgb_str.split(",")))
375
+ return tuple(rgb)
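+
+ # e.g. rgb_tuple_from_str("(0.1, 0.2, 0.3)") -> (0.1, 0.2, 0.3)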
376
+
377
+ def color_dict_to_df(cd, column_name):
378
+ df = pd.DataFrame.from_dict(cd, orient = 'index')
379
+ df['rgb'] = df.apply(lambda row: (np.float64(row[0]), np.float64(row[1]), np.float64(row[2])), axis = 1)
380
+ df = df.drop(columns = [0,1,2])
381
+ df['hex'] = df.apply(lambda row: mplc.to_hex(row['rgb']), axis = 1)
382
+ df[column_name] = df.index
383
+ return df
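+
+ # Illustrative example (hypothetical entry): color_dict_to_df({'c2': (0.0, 1.0, 0.0)}, "Channel")
+ # yields one row with 'rgb' = (0.0, 1.0, 0.0), 'hex' = '#00ff00', 'Channel' = 'c2'.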
384
+
385
+
386
+ # Format p-values in scientific notation, adding a star to those <= 0.05
387
+ def p_add_star(row):
388
+ m = [str('{:0.3e}'.format(m)) + "*"
389
+ if m <= 0.05 \
390
+ else str('{:0.3e}'.format(m))
391
+ for m in row ]
392
+ return pd.Series(m)
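+
+ # e.g. p_add_star(pd.Series([0.04, 0.2])) -> pd.Series(['4.000e-02*', '2.000e-01'])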
393
+
394
+ # assigns a specific number of asterisks based on the thresholds
395
+ def p_to_star(row):
396
+ output = []
397
+ for item in row:
398
+ if item <= 0.001:
399
+ stars = 3
400
+ elif item <= 0.01:
401
+ stars = 2
402
+ elif item <= 0.05:
403
+ stars = 1
404
+ else:
405
+ stars = 0
406
+ value = ''
407
+ for i in range(stars):
408
+ value += '*'
409
+ output.append(value)
410
+ return pd.Series(output)
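+
+ # e.g. p_to_star(pd.Series([0.0005, 0.03, 0.2])) -> pd.Series(['***', '*', ''])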
411
+
412
+
413
+
414
+ def plot_gaussian_distributions(df):
415
+ # Initialize thresholds list to store all calculated thresholds
416
+ all_thresholds = []
417
+
418
+ # Iterate over every column (the caller should pass only numeric marker columns)
419
+ for column in df.columns:
420
+ # Extract the marker data
421
+ marker_data = df[column]
422
+
423
+ # Calculating mean and standard deviation for each marker
424
+ m_mean, m_std = np.mean(marker_data), np.std(marker_data)
425
+
426
+ # Generating x values for the Gaussian curve
427
+ x_vals = np.linspace(marker_data.min(), marker_data.max(), 100)
428
+
429
+ # Calculating Gaussian distribution curve
430
+ gaussian_curve = (1 / (m_std * np.sqrt(2 * np.pi))) * np.exp(-(x_vals - m_mean) ** 2 / (2 * m_std ** 2))
431
+
432
+ # Creating figure for Gaussian distribution for each marker
433
+ fig = go.Figure()
434
+ fig.add_trace(go.Scatter(x=x_vals, y=gaussian_curve, mode='lines', name=f'{column} Gaussian Distribution'))
435
+ fig.update_layout(title=f'Gaussian Distribution for {column} Marker')
436
+
437
+ # Calculating thresholds based on each marker's distribution
438
+ seuil_1sigma = m_mean + m_std
439
+ seuil_2sigma = m_mean + 2 * m_std
440
+ seuil_3sigma = m_mean + 3 * m_std
441
+
442
+ # Display the figures with thresholds
443
+ fig.add_shape(type='line', x0=seuil_1sigma, y0=0, x1=seuil_1sigma, y1=np.max(gaussian_curve),
444
+ line=dict(color='red', dash='dash'), name=f'Threshold 1σ: {seuil_1sigma:.2f}')
445
+ fig.add_shape(type='line', x0=seuil_2sigma, y0=0, x1=seuil_2sigma, y1=np.max(gaussian_curve),
446
+ line=dict(color='green', dash='dash'), name=f'Threshold 2σ: {seuil_2sigma:.2f}')
447
+ fig.add_shape(type='line', x0=seuil_3sigma, y0=0, x1=seuil_3sigma, y1=np.max(gaussian_curve),
448
+ line=dict(color='blue', dash='dash'), name=f'Threshold 3σ: {seuil_3sigma:.2f}')
449
+
450
+ # Add markers and values to the plot
451
+ fig.add_trace(go.Scatter(x=[seuil_1sigma, seuil_2sigma, seuil_3sigma],
452
+ y=[0, 0, 0],
453
+ mode='markers+text',
454
+ text=[f'{seuil_1sigma:.2f}', f'{seuil_2sigma:.2f}', f'{seuil_3sigma:.2f}'],
455
+ textposition="top center",
456
+ marker=dict(size=10, color=['red', 'green', 'blue']),
457
+ name='Threshold Values'))
458
+
459
+ fig.show()
460
+
461
+ # Append thresholds for each marker to the list
462
+ all_thresholds.append((column, seuil_1sigma, seuil_2sigma, seuil_3sigma)) # Include the column name
463
+
464
+ # Return thresholds for all markers
465
+ return all_thresholds
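+
+ # Hypothetical usage sketch (column subset assumed): pass only numeric marker
+ # columns, e.g. thresholds = plot_gaussian_distributions(df[intensity_cols]),
+ # which returns one (column, mean+1σ, mean+2σ, mean+3σ) tuple per marker.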
466
+
467
+
468
+
stored_variables.json ADDED
@@ -0,0 +1,6 @@
1
+ {
2
+ "base_dir": "/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431",
3
+ "set_path": "test",
4
+ "ls_samples": ["DD3S1.csv", "DD3S2.csv", "DD3S3.csv", "TMA.csv"],
5
+ "selected_metadata_files": ["Slide_B_DD1s1.one_1.tif.csv"]
6
+ }