KashyapiNagaHarshitha commited on
Commit
a5013aa
1 Parent(s): 160dd0c

Create Background_substraction

Browse files
Files changed (1) hide show
  1. Background_substraction +1069 -0
Background_substraction ADDED
@@ -0,0 +1,1069 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ # coding: utf-8
3
+
4
+
5
+ # In[1]:
6
+ import os
7
+ import random
8
+ import re
9
+ import pandas as pd
10
+ import numpy as np
11
+ import seaborn as sb
12
+ import matplotlib.pyplot as plt
13
+ import matplotlib.colors as mplc
14
+ import subprocess
15
+ import warnings
16
+
17
+ from scipy import signal
18
+
19
+ import plotly.figure_factory as ff
20
+ import plotly
21
+ import plotly.graph_objs as go
22
+ from plotly.offline import download_plotlyjs, plot
23
+ import plotly.express as px
24
+ from my_modules import *
25
+
26
+
27
+ # In[2]:
28
+
29
+
30
+ #Silence FutureWarnings & UserWarnings
31
+ warnings.filterwarnings('ignore', category= FutureWarning)
32
+ warnings.filterwarnings('ignore', category= UserWarning)
33
+
34
+
35
+ # ## II.2. *DIRECTORIES
36
+
37
+ # In[5]:
38
+
39
+
40
+ # Set base directory
41
+
42
+ ##### MAC WORKSTATION #####
43
+ #base_dir = r'/Volumes/LaboLabrie/Projets/OC_TMA_Pejovic/Temp/Zoe/CyCIF_pipeline/'
44
+ ###########################
45
+
46
+ ##### WINDOWS WORKSTATION #####
47
+ #base_dir = r'C:\Users\LaboLabrie\gerz2701\cyCIF-pipeline\Set_B'
48
+ ###############################
49
+
50
+ ##### LOCAL WORKSTATION #####
51
+ #base_dir = r'/Users/harshithakolipaka/Downloads/wetransfer_data-zip_2024-05-17_1431/'
52
+ #############################
53
+
54
+ #set_name = 'Set_A'
55
+ #set_name = 'test'
56
+ input_path = 'wetransfer_data-zip_2024-05-17_1431'
57
+ base_dir = os.path.join(home_dir, input_path)
58
+ set_path = 'test'
59
+ selected_metadata_files = ['Slide_B_DD1s1.one_1.tif.csv', 'Slide_B_DD1s1.one_2.tif.csv']
60
+ ls_samples = ['Ashlar_Exposure_Time.csv', 'new_data.csv', 'DD3S1.csv', 'DD3S2.csv', 'DD3S3.csv', 'TMA.csv']
61
+
62
+ set_name = set_path
63
+
64
+
65
+ # In[7]:
66
+
67
+
68
+ project_name = set_name # Project name
69
+ step_suffix = 'bs' # Curent part (here part II)
70
+ previous_step_suffix_long = "_qc_eda" # Previous part (here QC/EDA NOTEBOOK)
71
+
72
+ # Initial input data directory
73
+ input_data_dir = os.path.join(base_dir, project_name + previous_step_suffix_long)
74
+
75
+ # BS output directories
76
+ output_data_dir = os.path.join(base_dir, project_name + "_" + step_suffix)
77
+ # BS images subdirectory
78
+ output_images_dir = os.path.join(output_data_dir,"images")
79
+
80
+ # Data and Metadata directories
81
+ # Metadata directories
82
+ metadata_dir = os.path.join(base_dir, project_name + "_metadata")
83
+ # images subdirectory
84
+ metadata_images_dir = os.path.join(metadata_dir,"images")
85
+
86
+ # Create directories if they don't already exist
87
+ for d in [base_dir, input_data_dir, output_data_dir, output_images_dir, metadata_dir, metadata_images_dir]:
88
+ if not os.path.exists(d):
89
+ print("Creation of the" , d, "directory...")
90
+ os.makedirs(d)
91
+ else :
92
+ print("The", d, "directory already exists !")
93
+
94
+ os.chdir(input_data_dir)
95
+
96
+
97
+ # In[8]:
98
+
99
+
100
+ # Verify paths
101
+ print('base_dir :', base_dir)
102
+ print('input_data_dir :', input_data_dir)
103
+ print('output_data_dir :', output_data_dir)
104
+ print('output_images_dir :', output_images_dir)
105
+ print('metadata_dir :', metadata_dir)
106
+ print('metadata_images_dir :', metadata_images_dir)
107
+
108
+
109
+ # ## II.3. FILES
110
+ #Don't forget to put your data in the projname_data directory !
111
+ # ### II.3.1. METADATA
112
+
113
+ # In[9]:
114
+
115
+
116
+ # Import all metadata we need from the QC/EDA chapter
117
+
118
+ # METADATA
119
+ filename = "marker_intensity_metadata.csv"
120
+ filename = os.path.join(metadata_dir, filename)
121
+
122
+ # Check file exists
123
+ if not os.path.exists(filename):
124
+ print("WARNING: Could not find desired file: "+filename)
125
+ else :
126
+ print("The",filename,"file was imported for further analysis!")
127
+
128
+ # Open, read in information
129
+ metadata = pd.read_csv(filename)
130
+
131
+ # Verify size with verify_line_no() function in my_modules.py
132
+ #verify_line_no(filename, metadata.shape[0] + 1)
133
+
134
+ # Verify headers
135
+ exp_cols = ['Round','Target','Channel','target_lower','full_column','marker','localisation']
136
+ compare_headers(exp_cols, metadata.columns.values, "Marker metadata file")
137
+
138
+ metadata = metadata.dropna()
139
+ metadata.head()
140
+
141
+
142
+ # ### II.3.2. NOT_INTENSITIES
143
+
144
+ # In[10]:
145
+
146
+
147
+ # NOT_INTENSITIES
148
+ filename = "not_intensities.csv"
149
+ filename = os.path.join(metadata_dir, filename)
150
+
151
+ # Check file exists
152
+ if not os.path.exists(filename):
153
+ print("WARNING: Could not find desired file: "+filename)
154
+ else :
155
+ print("The",filename,"file was imported for further analysis!")
156
+
157
+ # Open, read in information
158
+ #not_intensities = []
159
+ with open(filename, 'r') as fh:
160
+ not_intensities = fh.read().strip().split("\n")
161
+ # take str, strip whitespace, split on new line character
162
+
163
+ not_intensities = ['Nuc_X', 'Nuc_X_Inv', 'Nuc_Y', 'Nuc_Y_Inv', 'Nucleus_Roundness', 'Nucleus_Size', 'Cell_Size',
164
+ 'ROI_index', 'Sample_ID', 'replicate_ID', 'Cell_ID','cell_type', 'cell_subtype', 'cluster','ID',
165
+ 'Cytoplasm_Size', 'immune_checkpoint', 'Unique_ROI_index', 'Patient', 'Primary_chem(1)_vs_surg(0)']
166
+
167
+ # Verify size
168
+ print("Verifying data read from file is the correct length...\n")
169
+ verify_line_no(filename, len(not_intensities))
170
+
171
+ # Print to console
172
+ print("not_intensities =\n", not_intensities)
173
+
174
+
175
+ # ### II.3.3. FULL_TO_SHORT_COLUMN_NAMES
176
+
177
+ # In[11]:
178
+
179
+
180
+ # FULL_TO_SHORT_COLUMN_NAMES
181
+ filename = "full_to_short_column_names.csv"
182
+ filename = os.path.join(metadata_dir, filename)
183
+
184
+ # Check file exists
185
+ if not os.path.exists(filename):
186
+ print("WARNING: Could not find desired file: " + filename)
187
+ else :
188
+ print("The",filename,"file was imported for further analysis!")
189
+
190
+ # Open, read in information
191
+ df = pd.read_csv(filename, header = 0)
192
+
193
+ # Verify size
194
+ print("Verifying data read from file is the correct length...\n")
195
+ #verify_line_no(filename, df.shape[0] + 1)
196
+
197
+ # Turn into dictionary
198
+ full_to_short_names = df.set_index('full_name').T.to_dict('records')[0]
199
+
200
+ # Print information
201
+ print('full_to_short_names =\n',full_to_short_names)
202
+
203
+
204
+ # ### II.3.4. SHORT_TO_FULL_COLUMN_NAMES
205
+
206
+ # In[12]:
207
+
208
+
209
+ # SHORT_TO_FULL_COLUMN_NAMES
210
+ filename = "short_to_full_column_names.csv"
211
+ filename = os.path.join(metadata_dir, filename)
212
+
213
+ # Check file exists
214
+ if not os.path.exists(filename):
215
+ print("WARNING: Could not find desired file: " + filename)
216
+ else :
217
+ print("The",filename,"file was imported for further analysis!")
218
+
219
+ # Open, read in information
220
+ df = pd.read_csv(filename, header = 0)
221
+
222
+ # Verify size
223
+ print("Verifying data read from file is the correct length...\n")
224
+ #verify_line_no(filename, df.shape[0] + 1)
225
+
226
+ # Turn into dictionary
227
+ short_to_full_names = df.set_index('short_name').T.to_dict('records')[0]
228
+
229
+ # Print information
230
+ print('short_to_full_names =\n',short_to_full_names)
231
+
232
+
233
+ # ### II.3.5. SAMPLES COLORS
234
+
235
+ # In[13]:
236
+
237
+
238
+ # COLORS INFORMATION
239
+ filename = "sample_color_data.csv"
240
+ filename = os.path.join(metadata_dir, filename)
241
+
242
+ # Check file exists
243
+ if not os.path.exists(filename):
244
+ print("WARNING: Could not find desired file: " + filename)
245
+ else :
246
+ print("The",filename,"file was imported for further analysis!")
247
+
248
+ # Open, read in information
249
+ df = pd.read_csv(filename, header = 0)
250
+ df = df.drop(columns = ['hex'])
251
+
252
+
253
+ # our tuple of float values for rgb, (r, g, b) was read in
254
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
255
+ # substrings and convert them back into floats
256
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
257
+
258
+ # Verify size
259
+ print("Verifying data read from file is the correct length...\n")
260
+ #verify_line_no(filename, df.shape[0] + 1)
261
+
262
+ # Turn into dictionary
263
+ sample_color_dict = df.set_index('Sample_ID')['rgb'].to_dict()
264
+
265
+ # Print information
266
+ print('sample_color_dict =\n',sample_color_dict)
267
+ sample_color_dict = pd.DataFrame.from_dict(sample_color_dict, orient='index', columns=['R', 'G', 'B'])
268
+
269
+
270
+ # In[14]:
271
+
272
+
273
+ sample_color_dict
274
+
275
+
276
+ # ### II.3.6. CHANNELS COLORS
277
+
278
+ # In[15]:
279
+
280
+
281
+ # CHANNELS
282
+ filename = "channel_color_data.csv"
283
+ filename = os.path.join(metadata_dir, filename)
284
+
285
+ # Check file exists
286
+ if not os.path.exists(filename):
287
+ print("WARNING: Could not find desired file: "+filename)
288
+ else :
289
+ print("The",filename,"file was imported for further analysis!")
290
+
291
+ # Open, read in information
292
+ df = pd.read_csv(filename, header = 0)
293
+ df = df.drop(columns = ['hex'])
294
+
295
+ # our tuple of float values for rgb, (r, g, b) was read in
296
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
297
+ # substrings and convert them back into floats
298
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
299
+
300
+ # Verify size
301
+ print("Verifying data read from file is the correct length...\n")
302
+ #verify_line_no(filename, df.shape[0] + 1)
303
+
304
+ # Turn into dictionary
305
+ channel_color_dict = df.set_index('Channel')['rgb'].to_dict()
306
+
307
+ # Print information
308
+ print('channel_color_dict =\n',channel_color_dict)
309
+ channel_color_dict = pd.DataFrame.from_dict(channel_color_dict, orient='index', columns=['R', 'G', 'B'])
310
+
311
+
312
+ # In[16]:
313
+
314
+
315
+ channel_color_dict
316
+
317
+
318
+ # ### II.3.7. ROUNDS COLORS
319
+
320
+ # In[17]:
321
+
322
+
323
+ # ROUND
324
+ filename = "round_color_data.csv"
325
+ filename = os.path.join(metadata_dir, filename)
326
+
327
+ # Check file exists
328
+ if not os.path.exists(filename):
329
+ print("WARNING: Could not find desired file: "+filename)
330
+ else :
331
+ print("The",filename,"file was imported for further analysis!")
332
+
333
+ # Open, read in information
334
+ df = pd.read_csv(filename, header = 0)
335
+ df = df.drop(columns = ['hex'])
336
+
337
+ # our tuple of float values for rgb, (r, g, b) was read in
338
+ # as a string '(r, g, b)'. We need to extract the r-, g-, and b-
339
+ # substrings and convert them back into floats
340
+ df['rgb'] = df.apply(lambda row: rgb_tuple_from_str(row['rgb']), axis = 1)
341
+
342
+ # Verify size
343
+ print("Verifying data read from file is the correct length...\n")
344
+ #verify_line_no(filename, df.shape[0] + 1)
345
+
346
+ # Turn into dictionary
347
+ round_color_dict = df.set_index('Round')['rgb'].to_dict()
348
+
349
+ # Print information
350
+ print('round_color_dict =\n',round_color_dict)
351
+ round_color_dict = pd.DataFrame.from_dict(round_color_dict, orient='index', columns=['R', 'G', 'B'])
352
+
353
+
354
+ # In[18]:
355
+
356
+
357
+ round_color_dict
358
+
359
+
360
+ # ### II.3.8. DATA
361
+
362
+ # In[19]:
363
+
364
+
365
+ # DATA
366
+ # List files in the directory
367
+ # Check if the directory exists
368
+ if os.path.exists(input_data_dir):
369
+ ls_samples = [sample for sample in os.listdir(input_data_dir) if sample.endswith("_qc_eda.csv")]
370
+
371
+ print("The following CSV files were detected:")
372
+ print([sample for sample in ls_samples])
373
+ else:
374
+ print(f"The directory {input_data_dir} does not exist.")
375
+
376
+
377
+ # In[20]:
378
+
379
+
380
+ # Import all the others files
381
+ dfs = {}
382
+
383
+ # Set variable to hold default header values
384
+ # First gather information on expected headers using first file in ls_samples
385
+ # Read in the first row of the file corresponding to the first sample (index = 0) in ls_samples
386
+ df = pd.read_csv(os.path.join(input_data_dir, ls_samples[0]) , index_col = 0, nrows = 1)
387
+ expected_headers = df.columns.values
388
+ print(expected_headers)
389
+
390
+ ###############################
391
+ # !! This may take a while !! #
392
+ ###############################
393
# Load every sample CSV into `dfs`, keyed by file name.
# Iterate over a snapshot of ls_samples: the original removed bad samples from
# the list *while iterating it*, which silently skips the element following
# each removal. It also assigned `dfs[sample] = df` even after an
# EmptyDataError, registering the previous iteration's dataframe under the
# failed sample's name — `continue` prevents that.
for sample in list(ls_samples):
    file_path = os.path.join(input_data_dir, sample)

    try:
        # Read the CSV file
        df = pd.read_csv(file_path, index_col=0)
    except pd.errors.EmptyDataError:
        print(f'\nEmpty data error in {sample} file. Removing from analysis...')
        ls_samples.remove(sample)
        continue  # do not register a stale dataframe for this sample

    if not df.empty:
        # Reorder the columns to match the expected headers list
        df = df.reindex(columns=expected_headers)
        print(sample, "file is processed !\n")
        #print(df)

    # Add df to dfs (only samples that were actually read get an entry)
    dfs[sample] = df
413
+
414
+ #print(dfs)
415
+
416
+
417
+ # In[21]:
418
+
419
+
420
+ # Merge dfs into one df
421
+ df = pd.concat(dfs.values(), ignore_index=False , sort = False)
422
+ #del dfs
423
+ df.head()
424
+
425
+
426
+ # In[22]:
427
+
428
+
429
+ df.shape
430
+
431
+
432
+ # In[23]:
433
+
434
+
435
+ # Check for NaN entries (should not be any unless columns do not align)
436
+ # False means no NaN entries
437
+ # True means NaN entries
438
+ df.isnull().any().any()
439
+
440
+
441
+ # ## II.4. *FILTERING
442
+
443
+ # In[24]:
444
+
445
+
446
+ print("Number of cells before filtering :", df.shape[0])
447
+ cells_before_filter = f"Number of cells before filtering :{df.shape[0]}"
448
+
449
+
450
+ # In[25]:
451
+
452
+
453
+ #print(df)
454
+
455
+
456
+ # In[26]:
457
+
458
+
459
+ # Delete small cells and objects w/high AF555 Signal (RBCs)
460
+ # We usually use the 95th percentile calculated during QC_EDA
461
+ df = df.loc[(df['Nucleus_Size'] > 42 )]
462
+ df = df.loc[(df['Nucleus_Size'] < 216)]
463
+ print("Number of cells after filtering on nucleus size:", df.shape[0])
464
+
465
+ df = df.loc[(df['AF555_Cell_Intensity_Average'] < 2000)]
466
+ print("Number of cells after filtering on AF555A ___ intensity:", df.shape[0])
467
+ cells_after_filter_nucleus = f"Number of cells after filtering on nucleus size: {df.shape[0]}"
468
+ cells_after_filter_intensity = f"Number of cells after filtering on AF555A ___ intensity: {df.shape[0]}"
469
+
470
+
471
+ # In[27]:
472
+
473
+
474
+ # Assign cell type
475
+ # Assign tumor cells at each row at first (random assigning here just for development purposes)
476
+ # Generate random values for cell_type column
477
+ random_values = np.random.randint(0, 10, size=len(df))
478
+
479
+ # Assign cell type based on random values
480
+ def assign_cell_type(n):
481
+ return np.random.choice(['STROMA','CANCER','IMMUNE','ENDOTHELIAL'])
482
+
483
+ df['cell_type'] = np.vectorize(assign_cell_type)(random_values)
484
+ df['cell_subtype'] = df['cell_type'].copy()
485
+
486
+
487
+ # In[28]:
488
+
489
+
490
+ filtered_dataframe = df
491
+ df.head()
492
+
493
+
494
+ # In[29]:
495
+
496
+
497
+ quality_control_df = filtered_dataframe
498
+
499
+
500
+ # In[30]:
501
+
502
+
503
def check_index_format(index_str, ls_samples):
    """
    Checks if the given index string follows the specified format.

    Expected format: '<sample>_<location>_<number>', where <sample> has a
    matching '<sample>_qc_eda.csv' entry in ls_samples, <location> is one of
    'Cell', 'Cytoplasm' or 'Nucleus', and <number> parses as an int.

    Args:
        index_str (str): The index string to be checked.
        ls_samples (list): A list of valid sample file names.

    Returns:
        bool: True if the index string follows the format, False otherwise.
    """
    # Split the index string into parts
    parts = index_str.split('_')

    # Check if there are exactly 3 parts
    if len(parts) != 3:
        print(len(parts))
        return False

    # Check if the first part is in ls_samples
    sample_name = parts[0]
    if f'{sample_name}_qc_eda.csv' not in ls_samples:
        print(sample_name)
        return False

    # Check if the second part is in ['Cell', 'Cytoplasm', 'Nucleus']
    location = parts[1]
    valid_locations = ['Cell', 'Cytoplasm', 'Nucleus']
    if location not in valid_locations:
        print(location)
        return False

    # Check if the third part is a number.
    # BUG FIX: the original printed `index` in the except branch, but `index`
    # is only bound when int() succeeds — guaranteed UnboundLocalError on the
    # very path it was meant to report. Print the offending substring instead.
    try:
        int(parts[2])
    except ValueError:
        print(parts[2])
        return False

    # If all checks pass, return True
    return True
544
+
545
+
546
+ # In[31]:
547
+
548
+
549
+ # Let's take a look at a few features to make sure our dataframe is as expected
550
+ df.index
551
def check_format_ofindex(index):
    """Return "Good" if every entry of `index` is well-formed, else "Bad".

    Each entry is validated with check_index_format() against the
    module-level ls_samples list.

    BUG FIX: the original ignored its `index` parameter — it shadowed it and
    iterated the module-level df.index instead, so callers (e.g.
    quality_check) that passed a different dataframe's index were silently
    validating the wrong data. We now iterate the index actually passed in.
    """
    for entry in index:
        # Stop at the first badly-formatted entry
        if check_index_format(entry, ls_samples) is False:
            return "Bad"
    return "Good"
560
+ print(check_format_ofindex(df.index))
561
+
562
+
563
+ # In[32]:
564
+
565
+
566
+ import panel as pn
567
+ import pandas as pd
568
+
569
def quality_check(file, not_intensities):
    """Run basic quality checks on a dataframe and return a Panel card
    summarising the results.

    Checks performed: index format (via check_format_ofindex), dataframe
    shape, presence of NaN entries, and rows whose mean intensity over the
    intensity columns is zero (such rows are dropped from the local copy
    before reporting).
    """
    df = file

    # Index format, overall shape, and NaN presence
    check_index = check_format_ofindex(df.index)
    check_shape = df.shape
    check_no_null = df.isnull().any().any()

    # Mean over intensity columns only (everything not in not_intensities)
    intensity_mask = ~df.columns.isin(not_intensities)
    mean_intensity = df.loc[:, intensity_mask].mean(axis=1)
    if (mean_intensity == 0).any():
        df = df.loc[mean_intensity > 0, :]
        print("df.shape after removing 0 mean values: ", df.shape)
        check_zero_intensities = f'Shape after removing 0 mean values: {df.shape}'
    else:
        print("No zero intensity values.")
        check_zero_intensities = "No zero intensity values."

    # Tabulate the results
    results_table = pd.DataFrame({
        'Check': ['Index', 'Shape', 'Check for NaN Entries', 'Check for Zero Intensities'],
        'Result': [str(check_index), str(check_shape), str(check_no_null), check_zero_intensities],
    })

    # Wrap the table in a Panel card for display
    return pn.Card(
        pn.pane.DataFrame(results_table),
        title="Quality Control Results",
        header_background="#2196f3",
        header_color="white",
    )
606
+
607
+
608
+ # ## II.5. CELL TYPES COLORS
609
+ # Establish colors to use throughout workflow
610
+
611
+ # we want colors that are categorical, since Cell Type is a non-ordered category.
612
+ # A categorical color palette will have dissimilar colors.
613
+ # Get those unique colors
614
+ cell_types = ['STROMA','CANCER','IMMUNE','ENDOTHELIAL']
615
+ color_values = sb.color_palette("hls", n_colors = len(cell_types))
616
+ # each color value is a tuple of three values: (R, G, B)
617
+
618
+ print("Unique cell types are:",df.cell_type.unique())
619
+ # Display those unique colors
620
+ sb.palplot(sb.color_palette(color_values))
621
+ # In[33]:
622
+
623
+
624
+ # Define your custom colors for each cell type
625
+ custom_colors = {
626
+ 'CANCER': (0.1333, 0.5451, 0.1333),
627
+ 'STROMA': (0.4, 0.4, 0.4),
628
+ 'IMMUNE': (1, 1, 0),
629
+ 'ENDOTHELIAL': (0.502, 0, 0.502)
630
+ }
631
+
632
+ # Retrieve the list of cell types
633
+ cell_types = list(custom_colors.keys())
634
+
635
+ # Extract the corresponding colors from the dictionary
636
+ color_values = [custom_colors[cell] for cell in cell_types]
637
+
638
+ # Display the colors
639
+ sb.palplot(sb.color_palette(color_values))
640
+
641
+
642
+ # In[34]:
643
+
644
+
645
+ # Store in a dctionnary
646
+ celltype_color_dict = dict(zip(cell_types, color_values))
647
+ celltype_color_dict
648
+
649
+
650
+ # In[35]:
651
+
652
+
653
+ celltype_color_df = pd.DataFrame.from_dict(celltype_color_dict, orient='index', columns=['R', 'G', 'B'])
654
+
655
+
656
+ # In[36]:
657
+
658
+
659
+ # Save color information (mapping and legend) to metadata directory
660
+ # Create dataframe
661
+ celltype_color_df = color_dict_to_df(celltype_color_dict, "cell_type")
662
+ celltype_color_df.head()
663
+
664
+ # Save to file in metadatadirectory
665
+ filename = "celltype_color_data.csv"
666
+ filename = os.path.join(metadata_dir, filename)
667
+ celltype_color_df.to_csv(filename, index = False)
668
+ print("File" + filename + " was created!")
669
+
670
+
671
+ # In[37]:
672
+
673
+
674
+ celltype_color_df.head()
675
+
676
+
677
+ # In[38]:
678
+
679
+
680
+ # Legend of cell type info only
681
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
682
+ g.axis('off')
683
+ handles = []
684
+ for item in celltype_color_dict.keys():
685
+ h = g.bar(0,0, color = celltype_color_dict[item],
686
+ label = item, linewidth =0)
687
+ handles.append(h)
688
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cell type'),
689
+
690
+
691
+ filename = "Celltype_legend.png"
692
+ filename = os.path.join(metadata_images_dir, filename)
693
+ plt.savefig(filename, bbox_inches = 'tight')
694
+
695
+
696
+ # In[39]:
697
+
698
+
699
+ metadata
700
+
701
+
702
+ # In[40]:
703
+
704
+
705
+ df.columns.values
706
+
707
+
708
+ # In[41]:
709
+
710
+
711
+ df.shape
712
+
713
+
714
+ # In[42]:
715
+
716
+
717
+ metadata.shape
718
+
719
+
720
+ # ## II.6. *CELL SUBTYPES COLORS
721
+
722
+ # In[43]:
723
+
724
+
725
+ # Establish colors to use throughout workflow
726
+
727
+ # we want colors that are categorical, since Cell Type is a non-ordered category.
728
+ # A categorical color palette will have dissimilar colors.
729
+ # Get those unique colors
730
+ cell_subtypes = ['DC','B', 'TCD4','TCD8','M1','M2','Treg', \
731
+ 'IMMUNE_OTHER', 'CANCER', 'αSMA_myCAF',\
732
+ 'STROMA_OTHER', 'ENDOTHELIAL']
733
+ color_values = sb.color_palette("Paired",n_colors = len(cell_subtypes))
734
+ # each color value is a tuple of three values: (R, G, B)
735
+
736
+ print("Unique cell types are:",df.cell_subtype.unique())
737
+ # Display those unique colors
738
+ sb.palplot(sb.color_palette(color_values))
739
+
740
+
741
+ # In[44]:
742
+
743
+
744
+ # Store in a dctionnary
745
+ cellsubtype_color_dict = dict(zip(cell_subtypes, color_values))
746
+ cellsubtype_color_dict
747
+
748
+
749
+ # In[45]:
750
+
751
+
752
+ cellsubtype_color_df = pd.DataFrame.from_dict(cellsubtype_color_dict, orient='index', columns=['R', 'G', 'B'])
753
+
754
+
755
+ # In[46]:
756
+
757
+
758
+ # Save color information (mapping and legend) to metadata directory
759
+ # Create dataframe
760
+ cellsubtype_color_df = color_dict_to_df(cellsubtype_color_dict, "cell_subtype")
761
+
762
+ # Save to file in metadatadirectory
763
+ filename = "cellsubtype_color_data.csv"
764
+ filename = os.path.join(metadata_dir, filename)
765
+ cellsubtype_color_df.to_csv(filename, index = False)
766
+ print("File" + filename + " was created!")
767
+
768
+
769
+ # In[47]:
770
+
771
+
772
+ cellsubtype_color_df.head()
773
+
774
+
775
+ # In[48]:
776
+
777
+
778
+ # Legend of cell type info only
779
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
780
+ g.axis('off')
781
+ handles = []
782
+ for item in cellsubtype_color_dict.keys():
783
+ h = g.bar(0,0, color = cellsubtype_color_dict[item],
784
+ label = item, linewidth =0)
785
+ handles.append(h)
786
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Cell subtype'),
787
+
788
+
789
+ filename = "Cellsubtype_legend.png"
790
+ filename = os.path.join(metadata_images_dir, filename)
791
+ plt.savefig(filename, bbox_inches = 'tight')
792
+
793
+
794
+ # ## II.7. IMMUNE CHECKPOINT COLORS
795
+
796
+ # In[49]:
797
+
798
+
799
+ # Assign IMMUNE SUBTYPES
800
+ df['cell_subtype'] = df['cell_type'].copy()
801
+ df['immune_checkpoint'] = 'none'
802
+ df
803
+
804
+ immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'None']
805
+ color_values = sb.color_palette("husl",n_colors=len(immune_checkpoint))
806
+ # each color value is a tuple of three values: (R, G, B)
807
+
808
+ print("Unique immune checkpoint are:",df.immune_checkpoint.unique())
809
+ # Display those unique colors
810
+ sb.palplot(sb.color_palette(color_values))
811
+ # In[50]:
812
+
813
+
814
+ immune_checkpoint = ['B7H4', 'PDL1', 'PD1', 'B7H4_PDL1', 'None']
815
+
816
+ # Base colors for the primary checkpoints
817
+ base_colors = sb.color_palette("husl", n_colors=3) # Three distinct colors
818
+
819
+ # Function to mix two RGB colors
820
+ def mix_colors(color1, color2):
821
+ return tuple((c1 + c2) / 2 for c1, c2 in zip(color1, color2))
822
+
823
+ # Generate mixed colors for the combinations of checkpoints
824
+ mixed_colors = [
825
+ mix_colors(base_colors[0], base_colors[1]), # Mix B7H4 and PDL1
826
+ # mix_colors(base_colors[0], base_colors[2]), # Mix B7H4 and PD1
827
+ # mix_colors(base_colors[1], base_colors[2]), # Mix PDL1 and PD1
828
+ tuple(np.mean(base_colors, axis=0)) # Mix B7H4, PDL1, and PD1
829
+ ]
830
+
831
+ # Adding the color for 'None'
832
+ #none_color = [(0.8, 0.8, 0.8)] # A shade of gray
833
+
834
+ # Combine all colors into one list
835
+ color_values = base_colors + mixed_colors #+ none_color
836
+
837
+ # Display unique immune checkpoint combinations
838
+ print("Unique immune checkpoint combinations are:", immune_checkpoint)
839
+ # Display the unique colors
840
+ sb.palplot(color_values)
841
+
842
+
843
+ # In[51]:
844
+
845
+
846
+ # Store in a dctionnary
847
+ immunecheckpoint_color_dict = dict(zip(immune_checkpoint, color_values))
848
+ immunecheckpoint_color_dict
849
+
850
+
851
+ # In[52]:
852
+
853
+
854
+ # Save color information (mapping and legend) to metadata directory
855
+ # Create dataframe
856
+ immunecheckpoint_color_df = color_dict_to_df(immunecheckpoint_color_dict, "immune_checkpoint")
857
+ immunecheckpoint_color_df.head()
858
+
859
+ # Save to file in metadatadirectory
860
+ filename = "immunecheckpoint_color_data.csv"
861
+ filename = os.path.join(metadata_dir, filename)
862
+ immunecheckpoint_color_df.to_csv(filename, index = False)
863
+ print("File " + filename + " was created!")
864
+
865
+
866
+ # In[53]:
867
+
868
+
869
+ # Legend of cell type info only
870
+ g = plt.figure(figsize = (1,1)).add_subplot(111)
871
+ g.axis('off')
872
+ handles = []
873
+ for item in immunecheckpoint_color_dict.keys():
874
+ h = g.bar(0,0, color = immunecheckpoint_color_dict[item],
875
+ label = item, linewidth =0)
876
+ handles.append(h)
877
+ first_legend = plt.legend(handles=handles, loc='upper right', title = 'Immune checkpoint'),
878
+
879
+
880
+ filename = "Cellsubtype_legend.png"
881
+ filename = os.path.join(metadata_images_dir, filename)
882
+ plt.savefig(filename, bbox_inches = 'tight')
883
+
884
+
885
+ # ## II.7. BACKGROUND SUBSTRACTION
886
+
887
+ # In[54]:
888
+
889
+
890
+ def do_background_sub(col, df, metadata):
891
+ #print(col.name)
892
+ location = metadata.loc[metadata['full_column'] == col.name, 'localisation'].values[0]
893
+ #print('location = ' + location)
894
+ channel = metadata.loc[metadata['full_column'] == col.name, 'Channel'].values[0]
895
+ #print('channel = ' + channel)
896
+ af_target = metadata.loc[
897
+ (metadata['Channel']==channel) \
898
+ & (metadata['localisation']==location) \
899
+ & (metadata['target_lower'].str.contains(r'^af\d{3}$')),\
900
+ 'full_column'].values[0]
901
+ return col - df.loc[:,af_target]
902
+
903
+
904
+ # In[55]:
905
+
906
+
907
+ metadata_with_localisation = metadata
908
+ metadata_with_localisation
909
+
910
+
911
+ # In[56]:
912
+
913
+
914
+ #Normalization
915
+
916
+ df.loc[:, ~df.columns.isin(not_intensities)] = \
917
+ df.loc[:, ~df.columns.isin(not_intensities)].apply(lambda column: divide_exp_time(column, 'Exp', metadata), axis = 0)
918
+
919
+
920
+ # In[57]:
921
+
922
+
923
+ normalization_df = df
924
+ normalization_df.head()
925
+
926
+
927
+ # In[58]:
928
+
929
+
930
+ # Do background subtraction
931
+ # this uses a df (metadata) outside of
932
+ # the scope of the lambda...
933
+ # careful that this might break inside of a script...
934
+
935
+ df.loc[:,~df.columns.isin(not_intensities)] = \
936
+ df.loc[:,~df.columns.isin(not_intensities)].apply(lambda column: do_background_sub(column, df, metadata),axis = 0)
937
+
938
+
939
+ # In[59]:
940
+
941
+
942
+ df
943
+ background_substraction_df = df
944
+ background_substraction_df.head()
945
+
946
+
947
+ # In[60]:
948
+
949
+
950
# Drop the autofluorescence (AF) columns — they were consumed by the
# background-subtraction step above and are no longer needed.
# BUG FIX: the pattern must be a raw string; in '^(?!AF\d{3}).*' the '\d' is
# an invalid string escape (SyntaxWarning on Python 3.12+, slated to become
# an error), even though it happened to reach the regex engine intact.
df = df.filter(regex=r'^(?!AF\d{3}).*')
print(df.columns.values)
953
+
954
+
955
+ # In[61]:
956
+
957
+
958
+ intensities_df = df.loc[:, ~df.columns.isin(not_intensities)]
959
+ intensities_df
960
+
961
+
962
+ # In[62]:
963
+
964
+
965
+ normalization_df.head()
966
+
967
+
968
+ # In[63]:
969
+
970
+
971
+ metadata_df = metadata_with_localisation
972
+ intensities_df = intensities_df # Assuming you have loaded the intensities DataFrame
973
+
974
+ # Create a list of column names from the intensities DataFrame
975
+ column_names = intensities_df.columns.tolist()
976
+
977
+ # Create a Select widget for choosing a column
978
+ column_selector = pn.widgets.Select(name='Select Column', options=column_names)
979
+
980
+ # Create a Markdown widget to display the selected column's information
981
+ column_info_md = pn.pane.Markdown(name='Column Information', width=400, object='Select a column to view its information.')
982
+
983
+ # Define a function to update the column information
984
def update_column_info(event):
    """Panel watcher callback: refresh column_info_md for the newly selected column.

    Reads the module-level intensities_df and metadata_df; writes the
    formatted summary into the module-level column_info_md Markdown pane.
    `event.new` carries the newly selected column name (or a falsy value when
    the selection is cleared).
    """
    selected_column = event.new
    if selected_column:
        # Get the selected column's intensity values (full per-cell array;
        # rendered verbatim into the Markdown below)
        intensity = intensities_df[selected_column].values

        # Look up channel / localisation / exposure for this column in the
        # metadata table (matched by 'full_column').
        # NOTE(review): .values[0] assumes the column is present in
        # metadata_df — an unknown column raises IndexError; confirm the
        # selector options always come from metadata-backed columns.
        channel = metadata_df.loc[metadata_df['full_column'] == selected_column, 'Channel'].values[0]
        localization = metadata_df.loc[metadata_df['full_column'] == selected_column, 'localisation'].values[0]
        exposure = metadata_df.loc[metadata_df['full_column'] == selected_column, 'Exp'].values[0]

        # Create a Markdown string with the column information
        column_info_text = f"**Intensity:** {intensity}\n\n**Channel:** {channel}\n\n**Localization:** {localization}\n\n**Exposure:** {exposure}"

        # Update the Markdown widget with the column information
        column_info_md.object = column_info_text
    else:
        column_info_md.object = 'Select a column to view its information.'
1002
+
1003
+ # Watch for changes in the column selector and update the column information
1004
+ column_selector.param.watch(update_column_info, 'value')
1005
+
1006
+ # Create a Panel app and display the widgets
1007
+ bs_info = pn.Column(column_selector, column_info_md)
1008
+ pn.extension()
1009
+ bs_info.servable()
1010
+
1011
+
1012
+ # In[64]:
1013
+
1014
+
1015
+ normalization_df.head()
1016
+
1017
+
1018
+ # In[65]:
1019
+
1020
+
1021
+ import panel as pn
1022
+ df_widget = pn.widgets.DataFrame(metadata, name="MetaData")
1023
+ app2 = pn.template.GoldenTemplate(
1024
+ site="Cyc-IF",
1025
+ title=" Background-Substraction",
1026
+ main=[pn.Tabs(("Background-Substraction",pn.Column(
1027
+ #pn.Column(pn.pane.Markdown("### Celltype thresholds"), pn.pane.DataFrame(celltype_color_df)),
1028
+ #pn.Column(pn.pane.Markdown("### Cell Subtype thresholds"), pn.pane.DataFrame(cellsubtype_color_df)),
1029
+ #pn.Column(pn.pane.Markdown("### Cells Before Filtering"),pn.pane.Str(cells_before_filter)),
1030
+ #pn.Column(pn.pane.Markdown("### Cells After Filtering Nucleus"),pn.pane.Str(cells_after_filter_nucleus)),
1031
+ #pn.Column(pn.pane.Markdown("### Cells After Filtering Intensity"),pn.pane.Str(cells_after_filter_intensity)),
1032
+ #pn.Column(pn.pane.Markdown("### Dataframe after filtering"), pn.pane.DataFrame(filtered_dataframe.head())),
1033
+ pn.Column(pn.pane.Markdown("### The metadata obtained that specifies the localisation:"), metadata_with_localisation.head(8)),
1034
+ pn.Column(pn.pane.Markdown("### The channels and exposure of each intensities column"), bs_info),
1035
+ pn.Column(pn.pane.Markdown("### Dataframe after perfroming normalization"),pn.pane.DataFrame(normalization_df.head(), width = 1500)),
1036
+ pn.Column(pn.pane.Markdown("### Dataframe after background Substraction"), pn.Feed(background_substraction_df.head(),),
1037
+ ))),
1038
+ ("Quality Control", pn.Column(
1039
+ quality_check(quality_control_df, not_intensities)
1040
+ #pn.pane.Markdown("### The Quality check results are:"), quality_check_results(check_shape, check_no_null, check_all_expected_files_present, check_zero_intensities)
1041
+ ))
1042
+ )],)
1043
+
1044
+
1045
+ # In[66]:
1046
+
1047
+
1048
+ app2.show(port = 1003)
1049
+
1050
+
1051
+ # ## II.8. SAVE
1052
+
1053
+ # In[67]:
1054
+
1055
+
1056
+ # Save the data by Sample_ID
1057
+ # Check for the existence of the output file first
1058
+ for sample in ls_samples:
1059
+ sample_id = sample.split('_')[0]
1060
+ filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
1061
+ if os.path.exists(filename):
1062
+ print("File by name "+filename+" already exists.")
1063
+ else:
1064
+ sample_id_csv = sample_id + '.csv'
1065
+ df_save = df.loc[df['Sample_ID'] == sample_id_csv, :]
1066
+ #print(df_save)
1067
+ filename = os.path.join(output_data_dir, sample_id + "_" + step_suffix + ".csv")
1068
+ df_save.to_csv(filename, index=True, index_label='ID') # Set index parameter to True to retain the index column
1069
+ print("File " + filename + " was created!")